excel_file.py 1.3 KB

12345678910111213141516171819202122232425262728293031323334353637383940
  1. import hashlib
  2. import importlib.util
  3. try:
  4. from langchain_community.document_loaders import UnstructuredExcelLoader
  5. except ImportError:
  6. raise ImportError(
  7. 'Excel file requires extra dependencies. Install with `pip install --upgrade "embedchain[dataloaders]"`'
  8. ) from None
  9. if importlib.util.find_spec("openpyxl") is None and importlib.util.find_spec("xlrd") is None:
  10. raise ImportError("Excel file requires extra dependencies. Install with `pip install openpyxl xlrd`") from None
  11. from embedchain.helpers.json_serializable import register_deserializable
  12. from embedchain.loaders.base_loader import BaseLoader
  13. from embedchain.utils.misc import clean_string
  14. @register_deserializable
  15. class ExcelFileLoader(BaseLoader):
  16. def load_data(self, excel_url):
  17. """Load data from a Excel file."""
  18. loader = UnstructuredExcelLoader(excel_url)
  19. pages = loader.load_and_split()
  20. data = []
  21. for page in pages:
  22. content = page.page_content
  23. content = clean_string(content)
  24. metadata = page.metadata
  25. metadata["url"] = excel_url
  26. data.append({"content": content, "meta_data": metadata})
  27. doc_id = hashlib.sha256((content + excel_url).encode()).hexdigest()
  28. return {
  29. "doc_id": doc_id,
  30. "data": data,
  31. }