# xml.py (1.1 KB)

  import hashlib

  # Optional-dependency guard: both imports must succeed for this loader to work.
  # `unstructured` is imported only to check that it is installed (hence the
  # noqa for the unused name). On failure, a fresh ImportError carrying install
  # instructions is raised, with the original traceback suppressed by `from None`.
  try:
      import unstructured  # noqa: F401
      from langchain_community.document_loaders import UnstructuredXMLLoader
  except ImportError:
      raise ImportError(
          'XML file requires extra dependencies. Install with `pip install "unstructured[local-inference, all-docs]"`'
      ) from None

  from embedchain.helpers.json_serializable import register_deserializable
  from embedchain.loaders.base_loader import BaseLoader
  from embedchain.utils.misc import clean_string
  @register_deserializable
  class XmlLoader(BaseLoader):
      """Loader that turns an XML source into a single content/metadata record."""

      def load_data(self, xml_url) -> dict:
          """Load data from a XML file.

          Args:
              xml_url: Location of the XML document, passed unchanged to
                  ``UnstructuredXMLLoader``. It is also concatenated with the
                  cleaned content to derive ``doc_id``, so it must be a string.

          Returns:
              A dict with two keys:
                  ``"doc_id"``: SHA-256 hex digest of the cleaned content
                      followed by ``xml_url``.
                  ``"data"``: a one-element list holding
                      ``{"content": ..., "meta_data": ...}`` built from the first
                      document the loader returned.

          Raises:
              IndexError: If the loader returns no documents.
              KeyError: If the first document's metadata has no ``"source"`` key.
          """
          loader = UnstructuredXMLLoader(xml_url)
          data = loader.load()

          # Only the first document returned by the loader is used.
          first_document = data[0]
          content = clean_string(first_document.page_content)

          # Rename the "source" key to "url". The metadata dict is modified in
          # place, exactly as the assign-then-delete sequence it replaces did.
          metadata = first_document.metadata
          metadata["url"] = metadata.pop("source")

          output = [{"content": content, "meta_data": metadata}]
          # The id depends on both the cleaned text and the location it came from.
          doc_id = hashlib.sha256((content + xml_url).encode()).hexdigest()
          return {
              "doc_id": doc_id,
              "data": output,
          }