docx_file.py 751 B

123456789101112131415161718192021222324
  1. import hashlib
  2. from langchain.document_loaders import Docx2txtLoader
  3. from embedchain.helper.json_serializable import register_deserializable
  4. from embedchain.loaders.base_loader import BaseLoader
  5. @register_deserializable
  6. class DocxFileLoader(BaseLoader):
  7. def load_data(self, url):
  8. """Load data from a .docx file."""
  9. loader = Docx2txtLoader(url)
  10. output = []
  11. data = loader.load()
  12. content = data[0].page_content
  13. meta_data = data[0].metadata
  14. meta_data["url"] = "local"
  15. output.append({"content": content, "meta_data": meta_data})
  16. doc_id = hashlib.sha256((content + url).encode()).hexdigest()
  17. return {
  18. "doc_id": doc_id,
  19. "data": output,
  20. }