mdx.py 796 B

12345678910111213141516171819202122232425262728
  1. import hashlib
  2. from langchain.document_loaders import PyPDFLoader
  3. from embedchain.helper.json_serializable import register_deserializable
  4. from embedchain.loaders.base_loader import BaseLoader
  5. from embedchain.utils import clean_string
  6. @register_deserializable
  7. class MdxLoader(BaseLoader):
  8. def load_data(self, url):
  9. """Load data from a mdx file."""
  10. with open(url, 'r', encoding="utf-8") as infile:
  11. content = infile.read()
  12. meta_data = {
  13. "url": url,
  14. }
  15. doc_id = hashlib.sha256((content + url).encode()).hexdigest()
  16. return {
  17. "doc_id": doc_id,
  18. "data": [
  19. {
  20. "content": content,
  21. "meta_data": meta_data,
  22. }
  23. ],
  24. }