unstructured_file.py 1.4 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243
  1. import hashlib
  2. from embedchain.helpers.json_serializable import register_deserializable
  3. from embedchain.loaders.base_loader import BaseLoader
  4. from embedchain.utils.misc import clean_string
  @register_deserializable
  class UnstructuredLoader(BaseLoader):
      """Loader that reads a file through langchain's ``UnstructuredFileLoader``."""

      def load_data(self, url):
          """Load a file with ``UnstructuredFileLoader`` and return its cleaned pieces.

          Args:
              url: Location of the file. It is passed unchanged to
                  ``UnstructuredFileLoader`` and is also stored under the
                  ``"url"`` key of every piece's metadata.
                  NOTE(review): the loader presumably expects a local file
                  path despite the parameter name -- confirm.

          Returns:
              A dict with two keys: ``"doc_id"``, the SHA-256 hex digest of the
              space-joined cleaned contents followed by ``url``; and ``"data"``,
              a list holding one ``{"content": ..., "meta_data": ...}`` dict
              per piece returned by the loader.

          Raises:
              ImportError: If ``unstructured`` or ``langchain_community``
                  cannot be imported.
              ValueError: If the loader returns no pieces.
          """
          # The heavy optional dependencies are imported lazily so that the
          # module itself can be imported without them installed.
          try:
              import unstructured  # noqa: F401
              from langchain_community.document_loaders import (
                  UnstructuredFileLoader,
              )
          except ImportError:
              raise ImportError(
                  'Unstructured file requires extra dependencies. Install with `pip install "unstructured[local-inference, all-docs]"`'  # noqa: E501
              ) from None

          documents = UnstructuredFileLoader(url).load_and_split()
          if not documents:
              raise ValueError("No data found")

          entries = []
          cleaned_texts = []
          for document in documents:
              text = clean_string(document.page_content)
              # The document's own metadata dict is updated in place (not
              # copied), so the returned entry shares it with the document.
              meta = document.metadata
              meta["url"] = url
              entries.append({"content": text, "meta_data": meta})
              cleaned_texts.append(text)

          # The id depends on both the cleaned content and the source location.
          digest_source = " ".join(cleaned_texts) + url
          return {
              "doc_id": hashlib.sha256(digest_source.encode()).hexdigest(),
              "data": entries,
          }