notion.py 1.3 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243
  1. import logging
  2. import os
  3. try:
  4. from llama_index import download_loader
  5. except ImportError:
  6. raise ImportError("Notion requires extra dependencies. Install with `pip install embedchain[community]`") from None
  7. from embedchain.helper_classes.json_serializable import register_deserializable
  8. from embedchain.loaders.base_loader import BaseLoader
  9. from embedchain.utils import clean_string
  10. @register_deserializable
  11. class NotionLoader(BaseLoader):
  12. def load_data(self, source):
  13. """Load data from a PDF file."""
  14. NotionPageReader = download_loader("NotionPageReader")
  15. # Reformat Id to match notion expectation
  16. id = source[-32:]
  17. formatted_id = f"{id[:8]}-{id[8:12]}-{id[12:16]}-{id[16:20]}-{id[20:]}"
  18. logging.debug(f"Extracted notion page id as: {formatted_id}")
  19. # Get page through the notion api
  20. integration_token = os.getenv("NOTION_INTEGRATION_TOKEN")
  21. reader = NotionPageReader(integration_token=integration_token)
  22. documents = reader.load_data(page_ids=[formatted_id])
  23. # Extract text
  24. raw_text = documents[0].text
  25. # Clean text
  26. text = clean_string(raw_text)
  27. return [
  28. {
  29. "content": text,
  30. "meta_data": {"url": f"notion-{formatted_id}"},
  31. }
  32. ]