unstructured_file.py 1.4 KB

1234567891011121314151617181920212223242526272829303132333435363738394041
  1. import hashlib
  2. from embedchain.helper.json_serializable import register_deserializable
  3. from embedchain.loaders.base_loader import BaseLoader
  4. from embedchain.utils import clean_string
  5. @register_deserializable
  6. class UnstructuredLoader(BaseLoader):
  7. def load_data(self, url):
  8. """Load data from a Unstructured file."""
  9. try:
  10. from langchain.document_loaders import UnstructuredFileLoader
  11. except ImportError:
  12. raise ImportError(
  13. 'Unstructured file requires extra dependencies. Install with `pip install --upgrade "embedchain[dataloaders]"`' # noqa: E501
  14. ) from None
  15. loader = UnstructuredFileLoader(url)
  16. data = []
  17. all_content = []
  18. pages = loader.load_and_split()
  19. if not len(pages):
  20. raise ValueError("No data found")
  21. for page in pages:
  22. content = page.page_content
  23. content = clean_string(content)
  24. meta_data = page.metadata
  25. meta_data["url"] = url
  26. data.append(
  27. {
  28. "content": content,
  29. "meta_data": meta_data,
  30. }
  31. )
  32. all_content.append(content)
  33. doc_id = hashlib.sha256((" ".join(all_content) + url).encode()).hexdigest()
  34. return {
  35. "doc_id": doc_id,
  36. "data": data,
  37. }