notion.py 1.2 KB

1234567891011121314151617181920212223242526272829303132333435363738394041
  1. import logging
  2. import os
  3. try:
  4. from llama_index import download_loader
  5. except ImportError:
  6. raise ImportError("Notion requires extra dependencies. Install with `pip install embedchain[community]`") from None
  7. from embedchain.loaders.base_loader import BaseLoader
  8. from embedchain.utils import clean_string
  class NotionLoader(BaseLoader):
      """Loader that fetches the text of a single Notion page."""

      def load_data(self, source):
          """Load the text of a Notion page.

          Args:
              source: String whose last 32 characters are used as the page
                  id (presumably a Notion page URL or a bare undashed id --
                  confirm against callers).

          Returns:
              A one-element list containing a dict with the page text,
              passed through ``clean_string``, under ``"content"`` and
              ``{"url": "notion-<formatted id>"}`` under ``"meta_data"``.
          """
          NotionPageReader = download_loader("NotionPageReader")

          # Reformat the id into the dashed 8-4-4-4-12 form Notion expects.
          page_id = source[-32:]
          formatted_id = (
              f"{page_id[:8]}-{page_id[8:12]}-{page_id[12:16]}"
              f"-{page_id[16:20]}-{page_id[20:]}"
          )
          logging.debug("Extracted notion page id as: %s", formatted_id)

          # Get the page through the Notion API; the integration token is
          # read from the NOTION_INTEGRATION_TOKEN environment variable.
          integration_token = os.getenv("NOTION_INTEGRATION_TOKEN")
          reader = NotionPageReader(integration_token=integration_token)
          documents = reader.load_data(page_ids=[formatted_id])

          # Only one page id is requested, so only the first document is used.
          raw_text = documents[0].text

          # Clean text
          text = clean_string(raw_text)

          return [
              {
                  "content": text,
                  "meta_data": {"url": f"notion-{formatted_id}"},
              }
          ]