docx_file.py 902 B

1234567891011121314151617181920212223242526
  1. import hashlib
  2. try:
  3. from langchain_community.document_loaders import Docx2txtLoader
  4. except ImportError:
  5. raise ImportError("Docx file requires extra dependencies. Install with `pip install docx2txt==0.8`") from None
  6. from embedchain.helpers.json_serializable import register_deserializable
  7. from embedchain.loaders.base_loader import BaseLoader
  8. @register_deserializable
  9. class DocxFileLoader(BaseLoader):
  10. def load_data(self, url):
  11. """Load data from a .docx file."""
  12. loader = Docx2txtLoader(url)
  13. output = []
  14. data = loader.load()
  15. content = data[0].page_content
  16. metadata = data[0].metadata
  17. metadata["url"] = "local"
  18. output.append({"content": content, "meta_data": metadata})
  19. doc_id = hashlib.sha256((content + url).encode()).hexdigest()
  20. return {
  21. "doc_id": doc_id,
  22. "data": output,
  23. }