pdf_file.py 644 B

1234567891011121314151617181920212223
  1. from langchain.document_loaders import PyPDFLoader
  2. from embedchain.utils import clean_string
  3. class PdfFileLoader:
  4. def load_data(self, url):
  5. loader = PyPDFLoader(url)
  6. output = []
  7. pages = loader.load_and_split()
  8. if not len(pages):
  9. raise ValueError("No data found")
  10. for page in pages:
  11. content = page.page_content
  12. content = clean_string(content)
  13. meta_data = page.metadata
  14. meta_data["url"] = url
  15. output.append({
  16. "content": content,
  17. "meta_data": meta_data,
  18. })
  19. return output