pdf_file.py 788 B

1234567891011121314151617181920212223242526
  1. from langchain.document_loaders import PyPDFLoader
  2. from embedchain.loaders.base_loader import BaseLoader
  3. from embedchain.utils import clean_string
  4. class PdfFileLoader(BaseLoader):
  5. def load_data(self, url):
  6. """Load data from a PDF file."""
  7. loader = PyPDFLoader(url)
  8. output = []
  9. pages = loader.load_and_split()
  10. if not len(pages):
  11. raise ValueError("No data found")
  12. for page in pages:
  13. content = page.page_content
  14. content = clean_string(content)
  15. meta_data = page.metadata
  16. meta_data["url"] = url
  17. output.append(
  18. {
  19. "content": content,
  20. "meta_data": meta_data,
  21. }
  22. )
  23. return output