unstructured_file.py 1.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142
  1. import hashlib
  2. from embedchain.helpers.json_serializable import register_deserializable
  3. from embedchain.loaders.base_loader import BaseLoader
  4. from embedchain.utils.misc import clean_string
  @register_deserializable
  class UnstructuredLoader(BaseLoader):
      """Loader that reads a file through langchain's ``UnstructuredFileLoader``."""

      def load_data(self, url):
          """Load a file via ``UnstructuredFileLoader`` and return its cleaned pieces.

          Args:
              url: Location handed unchanged to ``UnstructuredFileLoader``. It is
                  also stored under the ``"url"`` key of every piece's metadata
                  and concatenated (as a string) into the hashed ``doc_id`` input.

          Returns:
              A dict with two keys:
                  ``"doc_id"``: SHA-256 hex digest of all cleaned contents joined
                      by single spaces, followed by ``url``.
                  ``"data"``: list of ``{"content": ..., "meta_data": ...}`` dicts,
                      one per object returned by ``load_and_split()``.

          Raises:
              ImportError: If ``langchain_community.document_loaders`` cannot be
                  imported.
              ValueError: If ``load_and_split()`` returns nothing.
          """
          # Imported lazily so the optional dependency is only needed when this
          # loader is actually used.
          try:
              from langchain_community.document_loaders import UnstructuredFileLoader
          except ImportError:
              raise ImportError(
                  'Unstructured file requires extra dependencies. Install with `pip install --upgrade "embedchain[dataloaders]"`'  # noqa: E501
              ) from None

          documents = UnstructuredFileLoader(url).load_and_split()
          if len(documents) == 0:
              raise ValueError("No data found")

          records = []
          for document in documents:
              # Clean the text first, then tag the metadata, mirroring the
              # original order of operations.
              cleaned_text = clean_string(document.page_content)
              metadata = document.metadata
              # The metadata dict is updated in place (not copied) before being
              # attached to the record.
              metadata["url"] = url
              records.append({"content": cleaned_text, "meta_data": metadata})

          # The identifier depends on both the cleaned text and the source url.
          joined_text = " ".join(record["content"] for record in records)
          digest = hashlib.sha256((joined_text + url).encode()).hexdigest()
          return {"doc_id": digest, "data": records}