pdf_file.py 893 B

12345678910111213141516171819202122232425262728
  1. from langchain.document_loaders import PyPDFLoader
  2. from embedchain.helper_classes.json_serializable import register_deserializable
  3. from embedchain.loaders.base_loader import BaseLoader
  4. from embedchain.utils import clean_string
  5. @register_deserializable
  6. class PdfFileLoader(BaseLoader):
  7. def load_data(self, url):
  8. """Load data from a PDF file."""
  9. loader = PyPDFLoader(url)
  10. output = []
  11. pages = loader.load_and_split()
  12. if not len(pages):
  13. raise ValueError("No data found")
  14. for page in pages:
  15. content = page.page_content
  16. content = clean_string(content)
  17. meta_data = page.metadata
  18. meta_data["url"] = url
  19. output.append(
  20. {
  21. "content": content,
  22. "meta_data": meta_data,
  23. }
  24. )
  25. return output