xml.py 872 B

1234567891011121314151617181920212223242526
  1. import hashlib
  2. from langchain.document_loaders import UnstructuredXMLLoader
  3. from embedchain.helper.json_serializable import register_deserializable
  4. from embedchain.loaders.base_loader import BaseLoader
  5. from embedchain.utils import clean_string
  6. @register_deserializable
  7. class XmlLoader(BaseLoader):
  8. def load_data(self, xml_url):
  9. """Load data from a XML file."""
  10. loader = UnstructuredXMLLoader(xml_url)
  11. data = loader.load()
  12. content = data[0].page_content
  13. content = clean_string(content)
  14. meta_data = data[0].metadata
  15. meta_data["url"] = meta_data["source"]
  16. del meta_data["source"]
  17. output = [{"content": content, "meta_data": meta_data}]
  18. doc_id = hashlib.sha256((content + xml_url).encode()).hexdigest()
  19. return {
  20. "doc_id": doc_id,
  21. "data": output,
  22. }