2 years ago · ce6eb39009
--- a/docs/advanced/data_types.mdx
+++ b/docs/advanced/data_types.mdx
@@ -54,6 +54,18 @@ To add any code documentation website as a loader, use the data_type as `docs_si
 
				 app.add("docs_site", "https://docs.embedchain.ai/")
			
 
				 ```
			
 
				 
			
 
				+### Notion
			
 
				+To use Notion, you must install the extra dependencies with `pip install embedchain[community]`.
			
 
				+
			
 
				+To load a Notion page, use the data_type as `notion`. The integration token is read from the `NOTION_INTEGRATION_TOKEN` environment variable.
			
 
				+The next argument must **end** with the `notion page id`. The id is a 32-character string. Eg:
			
 
				+
			
 
				+```python
			
 
				+app.add("notion", "cfbc134ca6464fc980d0391613959196")
			
 
				+app.add("notion", "my-page-cfbc134ca6464fc980d0391613959196")
			
 
				+app.add("notion", "https://www.notion.so/my-page-cfbc134ca6464fc980d0391613959196")
			
 
				+```
			
 
				+
			
 
				 ### Text
			
 
				 
			
 
				 To supply your own text, use the data_type as `text` and enter a string. The text is not processed, this can be very versatile. Eg:
			
--- a/docs/advanced/query_configuration.mdx
+++ b/docs/advanced/query_configuration.mdx
@@ -46,6 +46,7 @@ Default values of chunker config parameters for different `data_type`:
 
				 |pdf_file|1000|0|len|
			
 
				 |youtube_video|2000|0|len|
			
 
				 |docs_site|500|50|len|
			
 
				+|notion|300|0|len|
			
 
				 
			
 
				 ### LoaderConfig
			
 
				 
			
--- a/embedchain/chunkers/notion.py
+++ b/embedchain/chunkers/notion.py
@@ -0,0 +1,20 @@
 
				+from typing import Optional
			
 
				+
			
 
				+from langchain.text_splitter import RecursiveCharacterTextSplitter
			
 
				+
			
 
				+from embedchain.chunkers.base_chunker import BaseChunker
			
 
				+from embedchain.config.AddConfig import ChunkerConfig
			
 
				+
			
 
				+
			
 
				+class NotionChunker(BaseChunker):
				+    """Chunker for Notion pages, backed by a recursive character text splitter."""
				+
				+    def __init__(self, config: Optional[ChunkerConfig] = None):
				+        """Build the text splitter from `config` and hand it to the base chunker.
				+
				+        When `config` is None, a config with chunk_size=300, chunk_overlap=0 and
				+        length_function=len is used.
				+        """
				+        # Fall back to the Notion defaults when the caller supplies no config.
				+        chunker_config = (
				+            ChunkerConfig(chunk_size=300, chunk_overlap=0, length_function=len) if config is None else config
				+        )
				+        splitter_kwargs = {
				+            "chunk_size": chunker_config.chunk_size,
				+            "chunk_overlap": chunker_config.chunk_overlap,
				+            "length_function": chunker_config.length_function,
				+        }
				+        super().__init__(RecursiveCharacterTextSplitter(**splitter_kwargs))
			
--- a/embedchain/data_formatter/data_formatter.py
+++ b/embedchain/data_formatter/data_formatter.py
@@ -1,5 +1,6 @@
 
				 from embedchain.chunkers.docs_site import DocsSiteChunker
			
 
				 from embedchain.chunkers.docx_file import DocxFileChunker
			
 
				+from embedchain.chunkers.notion import NotionChunker
			
 
				 from embedchain.chunkers.pdf_file import PdfFileChunker
			
 
				 from embedchain.chunkers.qna_pair import QnaPairChunker
			
 
				 from embedchain.chunkers.text import TextChunker
			
@@ -10,6 +11,7 @@ from embedchain.loaders.docs_site_loader import DocsSiteLoader
 
				 from embedchain.loaders.docx_file import DocxFileLoader
			
 
				 from embedchain.loaders.local_qna_pair import LocalQnaPairLoader
			
 
				 from embedchain.loaders.local_text import LocalTextLoader
			
 
				+from embedchain.loaders.notion import NotionLoader
			
 
				 from embedchain.loaders.pdf_file import PdfFileLoader
			
 
				 from embedchain.loaders.sitemap import SitemapLoader
			
 
				 from embedchain.loaders.web_page import WebPageLoader
			
@@ -44,6 +46,7 @@ class DataFormatter:
 
				             "docx": DocxFileLoader(),
			
 
				             "sitemap": SitemapLoader(),
			
 
				             "docs_site": DocsSiteLoader(),
			
 
				+            "notion": NotionLoader(),
			
 
				         }
			
 
				         if data_type in loaders:
			
 
				             return loaders[data_type]
			
@@ -67,6 +70,7 @@ class DataFormatter:
 
				             "docx": DocxFileChunker,
			
 
				             "sitemap": WebPageChunker,
			
 
				             "docs_site": DocsSiteChunker,
			
 
				+            "notion": NotionChunker,
			
 
				         }
			
 
				         if data_type in chunker_classes:
			
 
				             chunker_class = chunker_classes[data_type]
			
--- a/embedchain/loaders/notion.py
+++ b/embedchain/loaders/notion.py
@@ -0,0 +1,41 @@
 
				+import logging
			
 
				+import os
			
 
				+
			
 
				+# llama-index is an optional dependency; fail at import time with install instructions.
				+try:
				+    from llama_index import download_loader
				+except ImportError:
				+    # The extra that provides llama-index is declared as "community" in the packaging metadata.
				+    raise ImportError("Notion requires extra dependencies. Install with `pip install embedchain[community]`") from None
			
 
				+
			
 
				+
			
 
				+from embedchain.loaders.base_loader import BaseLoader
			
 
				+from embedchain.utils import clean_string
			
 
				+
			
 
				+
			
 
				+class NotionLoader(BaseLoader):
				+    """Loader that fetches a single Notion page through the `NotionPageReader` loader."""
				+
				+    def load_data(self, source):
				+        """Load the text of a Notion page.
				+
				+        :param source: string that ends with the page id: the bare 32-character id,
				+            a page slug, or a page URL. Hyphens are ignored, so a dashed UUID is
				+            accepted as well.
				+        :return: a one-element list holding the cleaned page text under `content`
				+            and a `notion-<formatted id>` identifier under `meta_data["url"]`.
				+        :raises ValueError: if no 32-character hexadecimal id can be extracted from
				+            `source`, or if the reader returns no document for the page.
				+        """
				+        # Hyphens never occur inside the 32-character hex id, so dropping them leaves
				+        # plain ids, slugs and URLs unaffected while also accepting a dashed UUID.
				+        page_id = source.replace("-", "")[-32:]
				+        if len(page_id) != 32 or any(char not in "0123456789abcdefABCDEF" for char in page_id):
				+            raise ValueError(f"Could not extract a 32-character notion page id from: {source}")
				+
				+        # Reformat id into the dashed 8-4-4-4-12 layout passed to the reader
				+        formatted_id = f"{page_id[:8]}-{page_id[8:12]}-{page_id[12:16]}-{page_id[16:20]}-{page_id[20:]}"
				+        logging.debug(f"Extracted notion page id as: {formatted_id}")
				+
				+        # Get page through the notion api (only fetch the reader once the id is known to be valid)
				+        NotionPageReader = download_loader("NotionPageReader")
				+        integration_token = os.getenv("NOTION_INTEGRATION_TOKEN")
				+        reader = NotionPageReader(integration_token=integration_token)
				+        documents = reader.load_data(page_ids=[formatted_id])
				+        if not documents:
				+            # Surface an explicit error instead of an IndexError on documents[0]
				+            raise ValueError(f"No content returned for notion page: {formatted_id}")
				+
				+        # Extract and clean text
				+        text = clean_string(documents[0].text)
				+
				+        return [
				+            {
				+                "content": text,
				+                "meta_data": {"url": f"notion-{formatted_id}"},
				+            }
				+        ]
			
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -90,6 +90,7 @@ youtube-transcript-api = "^0.6.1"
 
				 beautifulsoup4 = "^4.12.2"
			
 
				 pypdf = "^3.11.0"
			
 
				 pytube = "^15.0.0"
			
 
				+llama-index = { version = "^0.7.21", optional = true }
			
 
				 
			
 
				 
			
 
				 
			
@@ -105,7 +106,7 @@ isort = "^5.12.0"
 
				 
			
 
				 [tool.poetry.extras]
			
 
				 streamlit = ["streamlit"]
			
 
				-
			
 
				+community = ["llama-index"]
			
 
				 
			
 
				 [tool.poetry.group.docs.dependencies]
			
 
				 
			
--- a/setup.py
+++ b/setup.py
@@ -37,5 +37,5 @@ setuptools.setup(
 
				         "replicate==0.9.0",
			
 
				         "duckduckgo-search==3.8.4",
			
 
				     ],
			
 
				-    extras_require={"dev": ["black", "ruff", "isort", "pytest"]},
			
 
				+    extras_require={"dev": ["black", "ruff", "isort", "pytest"], "community": ["llama-index==0.7.21"]},
			
 
				 )