pdf_file.py 722 B

12345678910111213141516171819202122232425
  1. from langchain.document_loaders import PyPDFLoader
  2. from embedchain.utils import clean_string
  3. class PdfFileLoader:
  4. def load_data(self, url):
  5. """Load data from a PDF file."""
  6. loader = PyPDFLoader(url)
  7. output = []
  8. pages = loader.load_and_split()
  9. if not len(pages):
  10. raise ValueError("No data found")
  11. for page in pages:
  12. content = page.page_content
  13. content = clean_string(content)
  14. meta_data = page.metadata
  15. meta_data["url"] = url
  16. output.append(
  17. {
  18. "content": content,
  19. "meta_data": meta_data,
  20. }
  21. )
  22. return output