pdf_file.py 1.3 KB

12345678910111213141516171819202122232425262728293031323334353637383940
  1. import hashlib
  2. try:
  3. from langchain.document_loaders import PyPDFLoader
  4. except ImportError:
  5. raise ImportError(
  6. 'PDF File requires extra dependencies. Install with `pip install --upgrade "embedchain[dataloaders]"`'
  7. ) from None
  8. from embedchain.helper.json_serializable import register_deserializable
  9. from embedchain.loaders.base_loader import BaseLoader
  10. from embedchain.utils import clean_string
  @register_deserializable
  class PdfFileLoader(BaseLoader):
      """Loader that reads a PDF through ``PyPDFLoader`` and returns cleaned chunks."""

      def load_data(self, url):
          """Load data from a PDF file.

          Args:
              url: Location of the PDF; handed unchanged to ``PyPDFLoader`` and
                  also recorded under the ``"url"`` key of every chunk's metadata.

          Returns:
              A dict with two keys:
                  ``"doc_id"``: SHA-256 hex digest of all cleaned chunk texts
                  joined by single spaces, followed by ``url``.
                  ``"data"``: one ``{"content", "meta_data"}`` dict per chunk,
                  in the order the loader produced them.

          Raises:
              ValueError: If the loader yields no chunks for the given PDF.
          """
          documents = PyPDFLoader(url).load_and_split()
          if not documents:
              raise ValueError("No data found")

          records = []
          for document in documents:
              cleaned_text = clean_string(document.page_content)
              # The chunk's own metadata dict is tagged in place (not copied),
              # so the record and the source document share the same object.
              chunk_metadata = document.metadata
              chunk_metadata["url"] = url
              records.append({"content": cleaned_text, "meta_data": chunk_metadata})

          # The id covers both the text and the source location, so identical
          # content loaded from a different url hashes to a different id.
          joined_text = " ".join(record["content"] for record in records)
          digest = hashlib.sha256((joined_text + url).encode()).hexdigest()

          return {"doc_id": digest, "data": records}