notion.py 1.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445
  1. import logging
  2. import os
  3. try:
  4. from llama_index import download_loader
  5. except ImportError:
  6. raise ImportError(
  7. "Notion requires extra dependencies. Install with `pip install --upgrade embedchain[community]`"
  8. ) from None
  9. from embedchain.helper.json_serializable import register_deserializable
  10. from embedchain.loaders.base_loader import BaseLoader
  11. from embedchain.utils import clean_string
  12. @register_deserializable
  13. class NotionLoader(BaseLoader):
  14. def load_data(self, source):
  15. """Load data from a PDF file."""
  16. NotionPageReader = download_loader("NotionPageReader")
  17. # Reformat Id to match notion expectation
  18. id = source[-32:]
  19. formatted_id = f"{id[:8]}-{id[8:12]}-{id[12:16]}-{id[16:20]}-{id[20:]}"
  20. logging.debug(f"Extracted notion page id as: {formatted_id}")
  21. # Get page through the notion api
  22. integration_token = os.getenv("NOTION_INTEGRATION_TOKEN")
  23. reader = NotionPageReader(integration_token=integration_token)
  24. documents = reader.load_data(page_ids=[formatted_id])
  25. # Extract text
  26. raw_text = documents[0].text
  27. # Clean text
  28. text = clean_string(raw_text)
  29. return [
  30. {
  31. "content": text,
  32. "meta_data": {"url": f"notion-{formatted_id}"},
  33. }
  34. ]