pdf_file.py 1.5 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243
  1. import hashlib
  2. try:
  3. from langchain_community.document_loaders import PyPDFLoader
  4. except ImportError:
  5. raise ImportError(
  6. 'PDF File requires extra dependencies. Install with `pip install --upgrade "embedchain[dataloaders]"`'
  7. ) from None
  8. from embedchain.helpers.json_serializable import register_deserializable
  9. from embedchain.loaders.base_loader import BaseLoader
  10. from embedchain.utils.misc import clean_string
  11. @register_deserializable
  12. class PdfFileLoader(BaseLoader):
  13. def load_data(self, url):
  14. """Load data from a PDF file."""
  15. headers = {
  16. "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36", # noqa:E501
  17. }
  18. loader = PyPDFLoader(url, headers=headers)
  19. data = []
  20. all_content = []
  21. pages = loader.load_and_split()
  22. if not len(pages):
  23. raise ValueError("No data found")
  24. for page in pages:
  25. content = page.page_content
  26. content = clean_string(content)
  27. meta_data = page.metadata
  28. meta_data["url"] = url
  29. data.append(
  30. {
  31. "content": content,
  32. "meta_data": meta_data,
  33. }
  34. )
  35. all_content.append(content)
  36. doc_id = hashlib.sha256((" ".join(all_content) + url).encode()).hexdigest()
  37. return {
  38. "doc_id": doc_id,
  39. "data": data,
  40. }