docx_file.py 938 B

12345678910111213141516171819202122232425262728
  1. import hashlib
  2. try:
  3. from langchain_community.document_loaders import Docx2txtLoader
  4. except ImportError:
  5. raise ImportError(
  6. 'Docx file requires extra dependencies. Install with `pip install --upgrade "embedchain[dataloaders]"`'
  7. ) from None
  8. from embedchain.helpers.json_serializable import register_deserializable
  9. from embedchain.loaders.base_loader import BaseLoader
  10. @register_deserializable
  11. class DocxFileLoader(BaseLoader):
  12. def load_data(self, url):
  13. """Load data from a .docx file."""
  14. loader = Docx2txtLoader(url)
  15. output = []
  16. data = loader.load()
  17. content = data[0].page_content
  18. metadata = data[0].metadata
  19. metadata["url"] = "local"
  20. output.append({"content": content, "meta_data": metadata})
  21. doc_id = hashlib.sha256((content + url).encode()).hexdigest()
  22. return {
  23. "doc_id": doc_id,
  24. "data": output,
  25. }