Ver código fonte

[Bug fix] Fix issue with gmail loader (#1228)

Deshraj Yadav 1 ano atrás
pai
commit
2985b667b0

+ 15 - 0
docs/api-reference/app/chat.mdx

@@ -129,3 +129,18 @@ app.chat("What is the net worth of Bill Gates?", session_id="user2")
 app.chat("What was my last question", session_id="user1")
 # 'Your last question was "What is the net worth of Elon Musk?"'
 ```
+
+### With custom context window
+
+If you want to customize the context window used during chat (the default context window is 3 document chunks), you can do so using the following code snippet:
+
+```python with custom context window
+from embedchain import App
+from embedchain.config import BaseLlmConfig
+
+app = App()
+app.add("https://www.forbes.com/profile/elon-musk")
+
+query_config = BaseLlmConfig(number_documents=5)
+app.chat("What is the net worth of Elon Musk?", config=query_config)
+```

+ 4 - 3
docs/components/data-sources/custom.mdx

@@ -7,11 +7,12 @@ When we say "custom", we mean that you can customize the loader and chunker to y
 ```python
 from embedchain import App
 import your_loader
-import your_chunker
+from my_module import CustomLoader
+from my_module import CustomChunker
 
 app = App()
-loader = your_loader()
-chunker = your_chunker()
+loader = CustomLoader()
+chunker = CustomChunker()
 
 app.add("source", data_type="custom", loader=loader, chunker=chunker)
 ```

+ 8 - 5
embedchain/chunkers/base_chunker.py

@@ -39,11 +39,14 @@ class BaseChunker(JSONSerializable):
         for data in data_records:
             content = data["content"]
 
-            meta_data = data["meta_data"]
+            metadata = data["meta_data"]
             # add data type to meta data to allow query using data type
-            meta_data["data_type"] = self.data_type.value
-            meta_data["doc_id"] = doc_id
-            url = meta_data["url"]
+            metadata["data_type"] = self.data_type.value
+            metadata["doc_id"] = doc_id
+
+            # TODO: Currently defaulting to the src as the url. This is done intentionally since some
+            # data types, like the 'gmail' loader, don't have the url in the meta data.
+            url = metadata.get("url", src)
 
             chunks = self.get_chunks(content)
             for chunk in chunks:
@@ -53,7 +56,7 @@ class BaseChunker(JSONSerializable):
                     id_map[chunk_id] = True
                     chunk_ids.append(chunk_id)
                     documents.append(chunk)
-                    metadatas.append(meta_data)
+                    metadatas.append(metadata)
         return {
             "documents": documents,
             "ids": chunk_ids,

+ 1 - 1
pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "embedchain"
-version = "0.1.71"
+version = "0.1.72"
 description = "Simplest open source retrieval(RAG) framework"
 authors = [
     "Taranjeet Singh <taranjeet@embedchain.ai>",