notion.py 1.4 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647
  1. import hashlib
  2. import logging
  3. import os
  4. try:
  5. from llama_hub.notion.base import NotionPageReader
  6. except ImportError:
  7. raise ImportError(
  8. "Notion requires extra dependencies. Install with `pip install --upgrade embedchain[community]`"
  9. ) from None
  10. from embedchain.helper.json_serializable import register_deserializable
  11. from embedchain.loaders.base_loader import BaseLoader
  12. from embedchain.utils import clean_string
  13. @register_deserializable
  14. class NotionLoader(BaseLoader):
  15. def load_data(self, source):
  16. """Load data from a PDF file."""
  17. # Reformat Id to match notion expectation
  18. id = source[-32:]
  19. formatted_id = f"{id[:8]}-{id[8:12]}-{id[12:16]}-{id[16:20]}-{id[20:]}"
  20. logging.debug(f"Extracted notion page id as: {formatted_id}")
  21. # Get page through the notion api
  22. integration_token = os.getenv("NOTION_INTEGRATION_TOKEN")
  23. reader = NotionPageReader(integration_token=integration_token)
  24. documents = reader.load_data(page_ids=[formatted_id])
  25. # Extract text
  26. raw_text = documents[0].text
  27. # Clean text
  28. text = clean_string(raw_text)
  29. doc_id = hashlib.sha256((text + source).encode()).hexdigest()
  30. return {
  31. "doc_id": doc_id,
  32. "data": [
  33. {
  34. "content": text,
  35. "meta_data": {"url": f"notion-{formatted_id}"},
  36. }
  37. ],
  38. }