docx_file.py 610 B

123456789101112131415161718
  1. from langchain.document_loaders import Docx2txtLoader
  2. from embedchain.helper_classes.json_serializable import register_deserializable
  3. from embedchain.loaders.base_loader import BaseLoader
  4. @register_deserializable
  5. class DocxFileLoader(BaseLoader):
  6. def load_data(self, url):
  7. """Load data from a .docx file."""
  8. loader = Docx2txtLoader(url)
  9. output = []
  10. data = loader.load()
  11. content = data[0].page_content
  12. meta_data = data[0].metadata
  13. meta_data["url"] = "local"
  14. output.append({"content": content, "meta_data": meta_data})
  15. return output