pdf_file.py 1.1 KB

123456789101112131415161718192021222324252627282930313233343536
  1. import hashlib
  2. from langchain.document_loaders import PyPDFLoader
  3. from embedchain.helper.json_serializable import register_deserializable
  4. from embedchain.loaders.base_loader import BaseLoader
  5. from embedchain.utils import clean_string
  6. @register_deserializable
  7. class PdfFileLoader(BaseLoader):
  8. def load_data(self, url):
  9. """Load data from a PDF file."""
  10. loader = PyPDFLoader(url)
  11. data = []
  12. all_content = []
  13. pages = loader.load_and_split()
  14. if not len(pages):
  15. raise ValueError("No data found")
  16. for page in pages:
  17. content = page.page_content
  18. content = clean_string(content)
  19. meta_data = page.metadata
  20. meta_data["url"] = url
  21. data.append(
  22. {
  23. "content": content,
  24. "meta_data": meta_data,
  25. }
  26. )
  27. all_content.append(content)
  28. doc_id = hashlib.sha256((" ".join(all_content) + url).encode()).hexdigest()
  29. return {
  30. "doc_id": doc_id,
  31. "data": data,
  32. }