pdf_file.py 1.3 KB

1234567891011121314151617181920212223242526272829303132333435363738
  1. import hashlib
  2. from langchain_community.document_loaders import PyPDFLoader
  3. from embedchain.helpers.json_serializable import register_deserializable
  4. from embedchain.loaders.base_loader import BaseLoader
  5. from embedchain.utils.misc import clean_string
  6. @register_deserializable
  7. class PdfFileLoader(BaseLoader):
  8. def load_data(self, url):
  9. """Load data from a PDF file."""
  10. headers = {
  11. "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36", # noqa:E501
  12. }
  13. loader = PyPDFLoader(url, headers=headers)
  14. data = []
  15. all_content = []
  16. pages = loader.load_and_split()
  17. if not len(pages):
  18. raise ValueError("No data found")
  19. for page in pages:
  20. content = page.page_content
  21. content = clean_string(content)
  22. metadata = page.metadata
  23. metadata["url"] = url
  24. data.append(
  25. {
  26. "content": content,
  27. "meta_data": metadata,
  28. }
  29. )
  30. all_content.append(content)
  31. doc_id = hashlib.sha256((" ".join(all_content) + url).encode()).hexdigest()
  32. return {
  33. "doc_id": doc_id,
  34. "data": data,
  35. }