xml.py 1.0 KB

123456789101112131415161718192021222324252627282930
  1. import hashlib
  2. try:
  3. from langchain_community.document_loaders import UnstructuredXMLLoader
  4. except ImportError:
  5. raise ImportError(
  6. 'XML file requires extra dependencies. Install with `pip install --upgrade "embedchain[dataloaders]"`'
  7. ) from None
  8. from embedchain.helpers.json_serializable import register_deserializable
  9. from embedchain.loaders.base_loader import BaseLoader
  10. from embedchain.utils.misc import clean_string
  11. @register_deserializable
  12. class XmlLoader(BaseLoader):
  13. def load_data(self, xml_url):
  14. """Load data from a XML file."""
  15. loader = UnstructuredXMLLoader(xml_url)
  16. data = loader.load()
  17. content = data[0].page_content
  18. content = clean_string(content)
  19. meta_data = data[0].metadata
  20. meta_data["url"] = meta_data["source"]
  21. del meta_data["source"]
  22. output = [{"content": content, "meta_data": meta_data}]
  23. doc_id = hashlib.sha256((content + xml_url).encode()).hexdigest()
  24. return {
  25. "doc_id": doc_id,
  26. "data": output,
  27. }