Ver Fonte

[Feature] Add support for hybrid search for pinecone vector database (#1259)

Deshraj Yadav há 1 ano atrás
pai
commit
38b4e06963

+ 1 - 1
docs/_snippets/missing-vector-db-tip.mdx

@@ -1,6 +1,6 @@
 
 
-<p>If you can't find the specific vector database, please feel free to request through one of the following channels and help us prioritize.</p>
+<p>If you can't find specific feature or run into issues, please feel free to reach out through one of the following channels.</p>
 
 <CardGroup cols={2}>
   <Card title="Slack" icon="slack" href="https://embedchain.ai/slack" color="#4A154B">

+ 1 - 1
docs/components/data-sources/google-drive.mdx

@@ -25,4 +25,4 @@ app = App()
 
 url = "https://drive.google.com/drive/u/0/folders/xxx-xxx"
 app.add(url, data_type="google_drive")
-```
+```

+ 29 - 28
docs/components/data-sources/overview.mdx

@@ -5,34 +5,35 @@ title: Overview
 Embedchain comes with built-in support for various data sources. We handle the complexity of loading unstructured data from these data sources, allowing you to easily customize your app through a user-friendly interface.
 
 <CardGroup cols={4}>
-  <Card title="📰 PDF file" href="/components/data-sources/pdf-file"></Card>
-  <Card title="📊 CSV file" href="/components/data-sources/csv"></Card>
-  <Card title="📃 JSON file" href="/components/data-sources/json"></Card>
-  <Card title="📝 Text" href="/components/data-sources/text"></Card>
-  <Card title="📁 Directory/ Folder" href="/components/data-sources/directory"></Card>
-  <Card title="🌐 HTML Web page" href="/components/data-sources/web-page"></Card>
-  <Card title="📽️ Youtube Channel" href="/components/data-sources/youtube-channel"></Card>
-  <Card title="📺 Youtube Video" href="/components/data-sources/youtube-video"></Card>
-  <Card title="📚 Docs website" href="/components/data-sources/docs-site"></Card>
-  <Card title="📝 MDX file" href="/components/data-sources/mdx"></Card>
-  <Card title="📄 DOCX file" href="/components/data-sources/docx"></Card>
-  <Card title="📓 Notion" href="/components/data-sources/notion"></Card>
-  <Card title="🗺️ Sitemap" href="/components/data-sources/sitemap"></Card>
-  <Card title="🧾 XML file" href="/components/data-sources/xml"></Card>
-  <Card title="❓💬 Q&A pair" href="/components/data-sources/qna"></Card>
-  <Card title="🙌 OpenAPI" href="/components/data-sources/openapi"></Card>
-  <Card title="📬 Gmail" href="/components/data-sources/gmail"></Card>
-  <Card title="📝 Github" href="/components/data-sources/github"></Card>
-  <Card title="🐘 Postgres" href="/components/data-sources/postgres"></Card>
-  <Card title="🐬 MySQL" href="/components/data-sources/mysql"></Card>
-  <Card title="🤖 Slack" href="/components/data-sources/slack"></Card>
-  <Card title="💬 Discord" href="/components/data-sources/discord"></Card>
-  <Card title="🗨️ Discourse" href="/components/data-sources/discourse"></Card>
-  <Card title="📝 Substack" href="/components/data-sources/substack"></Card>
-  <Card title="🐝 Beehiiv" href="/components/data-sources/beehiiv"></Card>
-  <Card title="💾 Dropbox" href="/components/data-sources/dropbox"></Card>
-  <Card title="🖼️ Image" href="/components/data-sources/image"></Card>
-  <Card title="⚙️ Custom" href="/components/data-sources/custom"></Card>
+  <Card title="PDF file" href="/components/data-sources/pdf-file"></Card>
+  <Card title="CSV file" href="/components/data-sources/csv"></Card>
+  <Card title="JSON file" href="/components/data-sources/json"></Card>
+  <Card title="Text" href="/components/data-sources/text"></Card>
+  <Card title="Directory" href="/components/data-sources/directory"></Card>
+  <Card title="Web page" href="/components/data-sources/web-page"></Card>
+  <Card title="Youtube Channel" href="/components/data-sources/youtube-channel"></Card>
+  <Card title="Youtube Video" href="/components/data-sources/youtube-video"></Card>
+  <Card title="Docs website" href="/components/data-sources/docs-site"></Card>
+  <Card title="MDX file" href="/components/data-sources/mdx"></Card>
+  <Card title="DOCX file" href="/components/data-sources/docx"></Card>
+  <Card title="Notion" href="/components/data-sources/notion"></Card>
+  <Card title="Sitemap" href="/components/data-sources/sitemap"></Card>
+  <Card title="XML file" href="/components/data-sources/xml"></Card>
+  <Card title="Q&A pair" href="/components/data-sources/qna"></Card>
+  <Card title="OpenAPI" href="/components/data-sources/openapi"></Card>
+  <Card title="Gmail" href="/components/data-sources/gmail"></Card>
+  <Card title="Google Drive" href="/components/data-sources/google-drive"></Card>
+  <Card title="GitHub" href="/components/data-sources/github"></Card>
+  <Card title="Postgres" href="/components/data-sources/postgres"></Card>
+  <Card title="MySQL" href="/components/data-sources/mysql"></Card>
+  <Card title="Slack" href="/components/data-sources/slack"></Card>
+  <Card title="Discord" href="/components/data-sources/discord"></Card>
+  <Card title="Discourse" href="/components/data-sources/discourse"></Card>
+  <Card title="Substack" href="/components/data-sources/substack"></Card>
+  <Card title="Beehiiv" href="/components/data-sources/beehiiv"></Card>
+  <Card title="Dropbox" href="/components/data-sources/dropbox"></Card>
+  <Card title="Image" href="/components/data-sources/image"></Card>
+  <Card title="Custom" href="/components/data-sources/custom"></Card>
 </CardGroup>
 
 <br/ >

+ 0 - 238
docs/components/vector-databases.mdx

@@ -17,242 +17,4 @@ Utilizing a vector database alongside Embedchain is a seamless process. All you
   <Card title="Weaviate" href="#weaviate"></Card>
 </CardGroup>
 
-## ChromaDB
-
-<CodeGroup>
-
-```python main.py
-from embedchain import App
-
-# load chroma configuration from yaml file
-app = App.from_config(config_path="config1.yaml")
-```
-
-```yaml config1.yaml
-vectordb:
-  provider: chroma
-  config:
-    collection_name: 'my-collection'
-    dir: db
-    allow_reset: true
-```
-
-```yaml config2.yaml
-vectordb:
-  provider: chroma
-  config:
-    collection_name: 'my-collection'
-    host: localhost
-    port: 5200
-    allow_reset: true
-```
-
-</CodeGroup>
-
-
-## Elasticsearch
-
-Install related dependencies using the following command:
-
-```bash
-pip install --upgrade 'embedchain[elasticsearch]'
-```
-
-<Note>
-You can configure the Elasticsearch connection by providing either `es_url` or `cloud_id`. If you are using the Elasticsearch Service on Elastic Cloud, you can find the `cloud_id` on the [Elastic Cloud dashboard](https://cloud.elastic.co/deployments).
-</Note>
-
-You can authorize the connection to Elasticsearch by providing either `basic_auth`, `api_key`, or `bearer_auth`.
-
-<CodeGroup>
-
-```python main.py
-from embedchain import App
-
-# load elasticsearch configuration from yaml file
-app = App.from_config(config_path="config.yaml")
-```
-
-```yaml config.yaml
-vectordb:
-  provider: elasticsearch
-  config:
-    collection_name: 'es-index'
-    cloud_id: 'deployment-name:xxxx'
-    basic_auth:
-      - elastic
-      - <your_password>
-    verify_certs: false
-```
-</CodeGroup>
-
-## OpenSearch
-
-Install related dependencies using the following command:
-
-```bash
-pip install --upgrade 'embedchain[opensearch]'
-```
-
-<CodeGroup>
-
-```python main.py
-from embedchain import App
-
-# load opensearch configuration from yaml file
-app = App.from_config(config_path="config.yaml")
-```
-
-```yaml config.yaml
-vectordb:
-  provider: opensearch
-  config:
-    collection_name: 'my-app'
-    opensearch_url: 'https://localhost:9200'
-    http_auth:
-      - admin
-      - admin
-    vector_dimension: 1536
-    use_ssl: false
-    verify_certs: false
-```
-
-</CodeGroup>
-
-## Zilliz
-
-Install related dependencies using the following command:
-
-```bash
-pip install --upgrade 'embedchain[milvus]'
-```
-
-Set the Zilliz environment variables `ZILLIZ_CLOUD_URI` and `ZILLIZ_CLOUD_TOKEN` which you can find it on their [cloud platform](https://cloud.zilliz.com/).
-
-<CodeGroup>
-
-```python main.py
-import os
-from embedchain import App
-
-os.environ['ZILLIZ_CLOUD_URI'] = 'https://xxx.zillizcloud.com'
-os.environ['ZILLIZ_CLOUD_TOKEN'] = 'xxx'
-
-# load zilliz configuration from yaml file
-app = App.from_config(config_path="config.yaml")
-```
-
-```yaml config.yaml
-vectordb:
-  provider: zilliz
-  config:
-    collection_name: 'zilliz_app'
-    uri: https://xxxx.api.gcp-region.zillizcloud.com
-    token: xxx
-    vector_dim: 1536
-    metric_type: L2
-```
-
-</CodeGroup>
-
-## LanceDB
-
-_Coming soon_
-
-## Pinecone
-
-Install pinecone related dependencies using the following command:
-
-```bash
-pip install --upgrade 'embedchain[pinecone]'
-```
-
-In order to use Pinecone as vector database, set the environment variable `PINECONE_API_KEY` which you can find on [Pinecone dashboard](https://app.pinecone.io/).
-
-<CodeGroup>
-
-```python main.py
-from embedchain import App
-
-# load pinecone configuration from yaml file
-app = App.from_config(config_path="pod_config.yaml")
-# or
-app = App.from_config(config_path="serverless_config.yaml")
-```
-
-```yaml pod_config.yaml
-vectordb:
-  provider: pinecone
-  config:
-    metric: cosine
-    vector_dimension: 1536
-    index_name: my-pinecone-index
-    pod_config:
-      environment: gcp-starter
-      metadata_config:
-        indexed:
-          - "url"
-          - "hash"
-```
-
-```yaml serverless_config.yaml
-vectordb:
-  provider: pinecone
-  config:
-    metric: cosine
-    vector_dimension: 1536
-    index_name: my-pinecone-index
-    serverless_config:
-      cloud: aws
-      region: us-west-2
-```
-
-</CodeGroup>
-
-<br />
-<Note>
-You can find more information about Pinecone configuration [here](https://docs.pinecone.io/docs/manage-indexes#create-a-pod-based-index).
-You can also optionally provide `index_name` as a config param in yaml file to specify the index name. If not provided, the index name will be `{collection_name}-{vector_dimension}`.
-</Note>
-
-## Qdrant
-
-In order to use Qdrant as a vector database, set the environment variables `QDRANT_URL` and `QDRANT_API_KEY` which you can find on [Qdrant Dashboard](https://cloud.qdrant.io/).
-
-<CodeGroup>
-```python main.py
-from embedchain import App
-
-# load qdrant configuration from yaml file
-app = App.from_config(config_path="config.yaml")
-```
-
-```yaml config.yaml
-vectordb:
-  provider: qdrant
-  config:
-    collection_name: my_qdrant_index
-```
-</CodeGroup>
-
-## Weaviate
-
-In order to use Weaviate as a vector database, set the environment variables `WEAVIATE_ENDPOINT` and `WEAVIATE_API_KEY` which you can find on [Weaviate dashboard](https://console.weaviate.cloud/dashboard).
-
-<CodeGroup>
-```python main.py
-from embedchain import App
-
-# load weaviate configuration from yaml file
-app = App.from_config(config_path="config.yaml")
-```
-
-```yaml config.yaml
-vectordb:
-  provider: weaviate
-  config:
-    collection_name: my_weaviate_index
-```
-</CodeGroup>
-
 <Snippet file="missing-vector-db-tip.mdx" />

+ 35 - 0
docs/components/vector-databases/chromadb.mdx

@@ -0,0 +1,35 @@
+---
+title: ChromaDB
+---
+
+<CodeGroup>
+
+```python main.py
+from embedchain import App
+
+# load chroma configuration from yaml file
+app = App.from_config(config_path="config1.yaml")
+```
+
+```yaml config1.yaml
+vectordb:
+  provider: chroma
+  config:
+    collection_name: 'my-collection'
+    dir: db
+    allow_reset: true
+```
+
+```yaml config2.yaml
+vectordb:
+  provider: chroma
+  config:
+    collection_name: 'my-collection'
+    host: localhost
+    port: 5200
+    allow_reset: true
+```
+
+</CodeGroup>
+
+<Snippet file="missing-vector-db-tip.mdx" />

+ 39 - 0
docs/components/vector-databases/elasticsearch.mdx

@@ -0,0 +1,39 @@
+---
+title: Elasticsearch
+---
+
+Install related dependencies using the following command:
+
+```bash
+pip install --upgrade 'embedchain[elasticsearch]'
+```
+
+<Note>
+You can configure the Elasticsearch connection by providing either `es_url` or `cloud_id`. If you are using the Elasticsearch Service on Elastic Cloud, you can find the `cloud_id` on the [Elastic Cloud dashboard](https://cloud.elastic.co/deployments).
+</Note>
+
+You can authorize the connection to Elasticsearch by providing either `basic_auth`, `api_key`, or `bearer_auth`.
+
+<CodeGroup>
+
+```python main.py
+from embedchain import App
+
+# load elasticsearch configuration from yaml file
+app = App.from_config(config_path="config.yaml")
+```
+
+```yaml config.yaml
+vectordb:
+  provider: elasticsearch
+  config:
+    collection_name: 'es-index'
+    cloud_id: 'deployment-name:xxxx'
+    basic_auth:
+      - elastic
+      - <your_password>
+    verify_certs: false
+```
+</CodeGroup>
+
+<Snippet file="missing-vector-db-tip.mdx" />

+ 36 - 0
docs/components/vector-databases/opensearch.mdx

@@ -0,0 +1,36 @@
+---
+title: OpenSearch
+---
+
+Install related dependencies using the following command:
+
+```bash
+pip install --upgrade 'embedchain[opensearch]'
+```
+
+<CodeGroup>
+
+```python main.py
+from embedchain import App
+
+# load opensearch configuration from yaml file
+app = App.from_config(config_path="config.yaml")
+```
+
+```yaml config.yaml
+vectordb:
+  provider: opensearch
+  config:
+    collection_name: 'my-app'
+    opensearch_url: 'https://localhost:9200'
+    http_auth:
+      - admin
+      - admin
+    vector_dimension: 1536
+    use_ssl: false
+    verify_certs: false
+```
+
+</CodeGroup>
+
+<Snippet file="missing-vector-db-tip.mdx" />

+ 106 - 0
docs/components/vector-databases/pinecone.mdx

@@ -0,0 +1,106 @@
+---
+title: Pinecone
+---
+
+## Overview
+
+Install pinecone related dependencies using the following command:
+
+```bash
+pip install --upgrade 'embedchain[pinecone]'
+```
+
+In order to use Pinecone as a vector database, set the environment variable `PINECONE_API_KEY` which you can find on [Pinecone dashboard](https://app.pinecone.io/).
+
+<CodeGroup>
+
+```python main.py
+from embedchain import App
+
+# Load pinecone configuration from yaml file
+app = App.from_config(config_path="pod_config.yaml")
+# Or
+app = App.from_config(config_path="serverless_config.yaml")
+```
+
+```yaml pod_config.yaml
+vectordb:
+  provider: pinecone
+  config:
+    metric: cosine
+    vector_dimension: 1536
+    index_name: my-pinecone-index
+    pod_config:
+      environment: gcp-starter
+      metadata_config:
+        indexed:
+          - "url"
+          - "hash"
+```
+
+```yaml serverless_config.yaml
+vectordb:
+  provider: pinecone
+  config:
+    metric: cosine
+    vector_dimension: 1536
+    index_name: my-pinecone-index
+    serverless_config:
+      cloud: aws
+      region: us-west-2
+```
+
+</CodeGroup>
+
+<br />
+<Note>
+You can find more information about Pinecone configuration [here](https://docs.pinecone.io/docs/manage-indexes#create-a-pod-based-index).
+You can also optionally provide `index_name` as a config param in yaml file to specify the index name. If not provided, the index name will be `{collection_name}-{vector_dimension}`.
+</Note>
+
+## Usage
+
+### Hybrid search
+
+Here is an example of how you can do hybrid search using Pinecone as a vector database through Embedchain.
+
+```python
+import os
+
+from embedchain import App
+
+config = {
+    'app': {
+        "config": {
+            "id": "ec-docs-hybrid-search"
+        }
+    },
+    'vectordb': {
+        'provider': 'pinecone',
+        'config': {
+            'metric': 'dotproduct',
+            'vector_dimension': 1536,
+            'index_name': 'my-index',
+            'serverless_config': {
+                'cloud': 'aws',
+                'region': 'us-west-2'
+            },
+            'hybrid_search': True, # Remember to set this for hybrid search
+        }
+    }
+}
+
+# Initialize app
+app = App.from_config(config=config)
+
+# Add documents
+app.add("/path/to/file.pdf", data_type="pdf_file", namespace="my-namespace")
+
+# Query
+app.query("<YOUR QUESTION HERE>", namespace="my-namespace")
+```
+
+Under the hood, Embedchain fetches the relevant chunks from the documents you added by doing hybrid search on the pinecone index.
+If you have questions on how pinecone hybrid search works, please refer to their [official documentation here](https://docs.pinecone.io/docs/hybrid-search).
+
+<Snippet file="missing-vector-db-tip.mdx" />

+ 23 - 0
docs/components/vector-databases/qdrant.mdx

@@ -0,0 +1,23 @@
+---
+title: Qdrant
+---
+
+In order to use Qdrant as a vector database, set the environment variables `QDRANT_URL` and `QDRANT_API_KEY` which you can find on [Qdrant Dashboard](https://cloud.qdrant.io/).
+
+<CodeGroup>
+```python main.py
+from embedchain import App
+
+# load qdrant configuration from yaml file
+app = App.from_config(config_path="config.yaml")
+```
+
+```yaml config.yaml
+vectordb:
+  provider: qdrant
+  config:
+    collection_name: my_qdrant_index
+```
+</CodeGroup>
+
+<Snippet file="missing-vector-db-tip.mdx" />

+ 24 - 0
docs/components/vector-databases/weaviate.mdx

@@ -0,0 +1,24 @@
+---
+title: Weaviate
+---
+
+
+In order to use Weaviate as a vector database, set the environment variables `WEAVIATE_ENDPOINT` and `WEAVIATE_API_KEY` which you can find on [Weaviate dashboard](https://console.weaviate.cloud/dashboard).
+
+<CodeGroup>
+```python main.py
+from embedchain import App
+
+# load weaviate configuration from yaml file
+app = App.from_config(config_path="config.yaml")
+```
+
+```yaml config.yaml
+vectordb:
+  provider: weaviate
+  config:
+    collection_name: my_weaviate_index
+```
+</CodeGroup>
+
+<Snippet file="missing-vector-db-tip.mdx" />

+ 39 - 0
docs/components/vector-databases/zilliz.mdx

@@ -0,0 +1,39 @@
+---
+title: Zilliz
+---
+
+Install related dependencies using the following command:
+
+```bash
+pip install --upgrade 'embedchain[milvus]'
+```
+
+Set the Zilliz environment variables `ZILLIZ_CLOUD_URI` and `ZILLIZ_CLOUD_TOKEN`, which you can find on their [cloud platform](https://cloud.zilliz.com/).
+
+<CodeGroup>
+
+```python main.py
+import os
+from embedchain import App
+
+os.environ['ZILLIZ_CLOUD_URI'] = 'https://xxx.zillizcloud.com'
+os.environ['ZILLIZ_CLOUD_TOKEN'] = 'xxx'
+
+# load zilliz configuration from yaml file
+app = App.from_config(config_path="config.yaml")
+```
+
+```yaml config.yaml
+vectordb:
+  provider: zilliz
+  config:
+    collection_name: 'zilliz_app'
+    uri: https://xxxx.api.gcp-region.zillizcloud.com
+    token: xxx
+    vector_dim: 1536
+    metric_type: L2
+```
+
+</CodeGroup>
+
+<Snippet file="missing-vector-db-tip.mdx" />

+ 1 - 1
docs/deployment/embedchain_ai.mdx

@@ -5,7 +5,7 @@ description: 'Deploy your RAG application to embedchain.ai platform'
 
 ## Deploy on Embedchain Platform
 
-Embedchain enables developers to deploy their LLM-powered apps in production using the [Embedchain platform](https://app.embedchain.ai). The platform offers free access to context on your data through its REST API. Once the pipeline is deployed, you can update your data sources anytime after deployment.
+Embedchain enables developers to deploy their LLM-powered apps in production using the Embedchain platform. The platform offers free access to context on your data through its REST API. Once the pipeline is deployed, you can update your data sources anytime after deployment.
 
 Deployment to Embedchain Platform is currently available on an invitation-only basis. To request access, please submit your information via the provided [Google Form](https://forms.gle/vigN11h7b4Ywat668). We will review your request and respond promptly.
 

+ 13 - 3
docs/mint.json

@@ -88,9 +88,8 @@
       "pages": [
         "components/introduction",
         {
-          "group": "Data sources",
+          "group": "🗂️ Data sources",
           "pages": [
-
             "components/data-sources/overview",
             {
               "group": "Data types",
@@ -129,8 +128,19 @@
             "components/data-sources/data-type-handling"
           ]
         },
+        {
+          "group": "🗄️ Vector databases",
+          "pages": [
+                "components/vector-databases/chromadb",
+                "components/vector-databases/elasticsearch",
+                "components/vector-databases/pinecone",
+                "components/vector-databases/opensearch",
+                "components/vector-databases/qdrant",
+                "components/vector-databases/weaviate",
+                "components/vector-databases/zilliz"
+          ]
+        },
         "components/llms",
-        "components/vector-databases",
         "components/embedding-models",
         "components/evaluation"
       ]

+ 7 - 0
embedchain/config/vectordb/pinecone.py

@@ -15,6 +15,7 @@ class PineconeDBConfig(BaseVectorDbConfig):
         metric: Optional[str] = "cosine",
         pod_config: Optional[dict[str, any]] = None,
         serverless_config: Optional[dict[str, any]] = None,
+        hybrid_search: bool = False,
         **extra_params: dict[str, any],
     ):
         self.metric = metric
@@ -22,6 +23,7 @@ class PineconeDBConfig(BaseVectorDbConfig):
         self.index_name = index_name
         self.vector_dimension = vector_dimension
         self.extra_params = extra_params
+        self.hybrid_search = hybrid_search
         if pod_config is None and serverless_config is None:
             # If no config is provided, use the default pod spec config
             pod_environment = os.environ.get("PINECONE_ENV", "gcp-starter")
@@ -33,4 +35,9 @@ class PineconeDBConfig(BaseVectorDbConfig):
         if self.pod_config and self.serverless_config:
             raise ValueError("Only one of pod_config or serverless_config can be provided.")
 
+        if self.hybrid_search and self.metric != "dotproduct":
+            raise ValueError(
+                "Hybrid search is only supported with dotproduct metric in Pinecone. See full docs here: https://docs.pinecone.io/docs/hybrid-search#limitations"
+            )  # noqa:E501
+
         super().__init__(collection_name=self.index_name, dir=None)

+ 0 - 40
embedchain/embedchain.py

@@ -237,46 +237,6 @@ class EmbedChain(JSONSerializable):
 
         return source_hash
 
-    def add_local(
-        self,
-        source: Any,
-        data_type: Optional[DataType] = None,
-        metadata: Optional[dict[str, Any]] = None,
-        config: Optional[AddConfig] = None,
-        **kwargs: Optional[dict[str, Any]],
-    ):
-        """
-        Adds the data from the given URL to the vector db.
-        Loads the data, chunks it, create embedding for each chunk
-        and then stores the embedding to vector database.
-
-        Warning:
-            This method is deprecated and will be removed in future versions. Use `add` instead.
-
-        :param source: The data to embed, can be a URL, local file or raw content, depending on the data type.
-        :type source: Any
-        :param data_type: Automatically detected, but can be forced with this argument. The type of the data to add,
-        defaults to None
-        :type data_type: Optional[DataType], optional
-        :param metadata: Metadata associated with the data source., defaults to None
-        :type metadata: Optional[dict[str, Any]], optional
-        :param config: The `AddConfig` instance to use as configuration options., defaults to None
-        :type config: Optional[AddConfig], optional
-        :raises ValueError: Invalid data type
-        :return: source_hash, a md5-hash of the source, in hexadecimal representation.
-        :rtype: str
-        """
-        logging.warning(
-            "The `add_local` method is deprecated and will be removed in future versions. Please use the `add` method for both local and remote files."  # noqa: E501
-        )
-        return self.add(
-            source=source,
-            data_type=data_type,
-            metadata=metadata,
-            config=config,
-            **kwargs,
-        )
-
     def _get_existing_doc_id(self, chunker: BaseChunker, src: Any):
         """
         Get id of existing document for a given source, based on the data type

+ 28 - 7
embedchain/vectordb/pinecone.py

@@ -1,3 +1,4 @@
+import logging
 import os
 from typing import Optional, Union
 
@@ -8,6 +9,8 @@ except ImportError:
         "Pinecone requires extra dependencies. Install with `pip install --upgrade 'embedchain[pinecone]'`"
     ) from None
 
+from pinecone_text.sparse import BM25Encoder
+
 from embedchain.config.vectordb.pinecone import PineconeDBConfig
 from embedchain.helpers.json_serializable import register_deserializable
 from embedchain.utils.misc import chunks
@@ -42,6 +45,14 @@ class PineconeDB(BaseVectorDB):
                 )
             self.config = config
         self._setup_pinecone_index()
+
+        # Setup BM25Encoder if sparse vectors are to be used
+        self.bm25_encoder = None
+        if self.config.hybrid_search:
+            # TODO: Add support for fitting BM25Encoder on any corpus
+            logging.info("Initializing BM25Encoder for sparse vectors..")
+            self.bm25_encoder = BM25Encoder.default()
+
         # Call parent init here because embedder is needed
         super().__init__(config=self.config)
 
@@ -119,12 +130,17 @@ class PineconeDB(BaseVectorDB):
         docs = []
         embeddings = self.embedder.embedding_fn(documents)
         for id, text, metadata, embedding in zip(ids, documents, metadatas, embeddings):
+            # Insert sparse vectors as well if the user wants to do the hybrid search
+            sparse_vector_dict = (
+                {"sparse_values": self.bm25_encoder.encode_documents(text)} if self.bm25_encoder else {}
+            )
             docs.append(
                 {
                     "id": id,
                     "values": embedding,
                     "metadata": {**metadata, "text": text},
-                }
+                    **sparse_vector_dict,
+                },
             )
 
         for chunk in chunks(docs, self.BATCH_SIZE, desc="Adding chunks in batches"):
@@ -159,14 +175,19 @@ class PineconeDB(BaseVectorDB):
             query_filter["app_id"] = {"$eq": app_id}
 
         query_vector = self.embedder.embedding_fn([input_query])[0]
-        data = self.pinecone_index.query(
-            vector=query_vector,
-            filter=query_filter,
-            top_k=n_results,
-            include_metadata=True,
+        params = {
+            "vector": query_vector,
+            "filter": query_filter,
+            "top_k": n_results,
+            "include_metadata": True,
             **kwargs,
-        )
+        }
+
+        if self.bm25_encoder:
+            sparse_query_vector = self.bm25_encoder.encode_queries(input_query)
+            params["sparse_vector"] = sparse_query_vector
 
+        data = self.pinecone_index.query(**params)
         return [
             (metadata.get("text"), {**metadata, "score": doc.get("score")}) if citations else metadata.get("text")
             for doc in data.get("matches", [])

+ 85 - 5
poetry.lock

@@ -3561,6 +3561,50 @@ httpx = ">=0.25.2,<0.26.0"
 orjson = ">=3.9.10,<4.0.0"
 pydantic = ">=2.5.2,<3.0.0"
 
+[[package]]
+name = "mmh3"
+version = "3.1.0"
+description = "Python wrapper for MurmurHash (MurmurHash3), a set of fast and robust hash functions."
+optional = true
+python-versions = "*"
+files = [
+    {file = "mmh3-3.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:16ee043b1bac040b4324b8baee39df9fdca480a560a6d74f2eef66a5009a234e"},
+    {file = "mmh3-3.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:04ac865319e5b36148a4b6cdf27f8bda091c47c4ab7b355d7f353dfc2b8a3cce"},
+    {file = "mmh3-3.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9e751f5433417a21c2060b0efa1afc67cfbe29977c867336148c8edb086fae70"},
+    {file = "mmh3-3.1.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bdb863b89c1b34e3681d4a3b15d424734940eb8036f3457cb35ef34fb87a503c"},
+    {file = "mmh3-3.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1230930fbf2faec4ddf5b76d0768ae73c102de173c301962bdd468177275adf9"},
+    {file = "mmh3-3.1.0-cp310-cp310-win32.whl", hash = "sha256:b8ed7a2361718795a1b519a08d05f44947a20b27e202b53946561a00dde669c1"},
+    {file = "mmh3-3.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:29e878e7467a000f34ab68c218ad7ad81312c0a94bc10df3c50a48bcad39dd83"},
+    {file = "mmh3-3.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c271472325b70d64a4fbb1f2e964ca5b093ac10258e1390f8408890b065868fe"},
+    {file = "mmh3-3.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:0109320f7e0e262123ff4f1acd06acfbc8b3bf19cc13d98c0bc369264430aaeb"},
+    {file = "mmh3-3.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:524e29dfe66499695f9496edcfc96782d130aabd6ba12c50c72372163cc6f3ea"},
+    {file = "mmh3-3.1.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:66bdb06a03074e65e614da1aa199b1d16c90608bec9d8fc3faa81d887ffe93cc"},
+    {file = "mmh3-3.1.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2a4d471eb75df8320061ab3b8cbe11c970be9f116b01bc2222ebda9c0a777520"},
+    {file = "mmh3-3.1.0-cp311-cp311-win32.whl", hash = "sha256:a886d9ce995a4bdfd7a600ddf61b9015cccbc73c50b898f8ff3c78af24384710"},
+    {file = "mmh3-3.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:5edb5ac882c04aff8a2a18ae8b74a0c339ac9b83db9820d8456f518bb558e0d8"},
+    {file = "mmh3-3.1.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:190fd10981fbd6c67e10ce3b56bcc021562c0df0fee2e2864347d64e65b1783a"},
+    {file = "mmh3-3.1.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cd781b115cf649811cfde76368c33d2e553b6f88bb41131c314f30d8e65e9d24"},
+    {file = "mmh3-3.1.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f48bb0a867077acc1f548591ad49506389f36d18f36dccd10becf071e5cbdda4"},
+    {file = "mmh3-3.1.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7d0936a82438e340636a11b9a938378870fc1c7a139632dac09a9a9277351704"},
+    {file = "mmh3-3.1.0-cp37-cp37m-win32.whl", hash = "sha256:d196cc035c2238493248522ae4e54c3cb790549b1564f6dea4d88dfe4b326313"},
+    {file = "mmh3-3.1.0-cp37-cp37m-win_amd64.whl", hash = "sha256:731d37f089b6c212fab1beea24e673161146eb6c76baf9ac074a3424d1172d41"},
+    {file = "mmh3-3.1.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9977fb81f8c66f4eee8439734a18dba7826fe78723d15ab53f42db977005be0f"},
+    {file = "mmh3-3.1.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:bf4f3f20a8b8405c08b13bc9e4ac33bf55129b50b535cd07ce1891b7f96326ac"},
+    {file = "mmh3-3.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:87cdbc6e70099ad92f17a28b4054ffb1938657e8fb7c1e4e03b194a1b4683fd6"},
+    {file = "mmh3-3.1.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6dd81321d14f62aa3711f30533c85a74dc7596e0fee63c8eddd375bc92ab846c"},
+    {file = "mmh3-3.1.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2e6eba88e5c1a2778f3de00a9502e3c214ebb757337ece2a7d71e060d188ddfa"},
+    {file = "mmh3-3.1.0-cp38-cp38-win32.whl", hash = "sha256:d91e696925f208d28f3bb7bdf29815524ce955248276af256519bd3538c411ce"},
+    {file = "mmh3-3.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:cbc2917df568aeb86ec5aa863bfb20fa14e01039cbdce7650efbabc30960df49"},
+    {file = "mmh3-3.1.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3b22832d565128be83d69f5d49243bb567840a954df377c9f5b26646a6eec39b"},
+    {file = "mmh3-3.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ced92a0e285a9111413541c197b0c17d280cee96f7c564b258caf5de5ab8ee01"},
+    {file = "mmh3-3.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f906833753b4ddcb690c2c1b74e77725868bc3a8b762b7a77737d08be89ae41d"},
+    {file = "mmh3-3.1.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:72b5685832a7a87a55ebff481794bc410484d7bd4c5e80dae4d8ac50739138ef"},
+    {file = "mmh3-3.1.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8d2aa4d422c7c088bbc5d367b45431268ebe6742a0a64eade93fab708e25757c"},
+    {file = "mmh3-3.1.0-cp39-cp39-win32.whl", hash = "sha256:4459bec818f534dc8378568ad89ab310ff47cda3e00ab322edce48dd899bba32"},
+    {file = "mmh3-3.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:03e04b3480e71828f48d17653451a3286555f0534942cb6ba93065b10ad5f9dc"},
+    {file = "mmh3-3.1.0.tar.gz", hash = "sha256:9b0f2b2ab4a915333c9d1089572e290a021ebb5b900bb7f7114dccc03995d732"},
+]
+
 [[package]]
 name = "mock"
 version = "5.1.0"
@@ -4696,6 +4740,32 @@ urllib3 = ">=1.26.0"
 [package.extras]
 grpc = ["googleapis-common-protos (>=1.53.0)", "grpc-gateway-protoc-gen-openapiv2 (==0.1.0)", "grpcio (>=1.44.0)", "lz4 (>=3.1.3)", "protobuf (>=3.20.0,<3.21.0)"]
 
+[[package]]
+name = "pinecone-text"
+version = "0.8.0"
+description = "Text utilities library by Pinecone.io"
+optional = true
+python-versions = ">=3.8,<4.0"
+files = [
+    {file = "pinecone_text-0.8.0-py3-none-any.whl", hash = "sha256:cf099c903f6bc630a2b9858bab63e291ebb361ca545b5968cd71eb0dcfbee221"},
+    {file = "pinecone_text-0.8.0.tar.gz", hash = "sha256:9c386d43da7a0959452296217c3d77a6f431ff6602a06f4d413137f4ba3d82ee"},
+]
+
+[package.dependencies]
+mmh3 = ">=3.1.0,<4.0.0"
+nltk = ">=3.6.5,<4.0.0"
+numpy = {version = ">=1.21.5,<2.0", markers = "python_version < \"3.12\""}
+python-dotenv = ">=1.0.1,<2.0.0"
+requests = ">=2.25.0,<3.0.0"
+types-requests = ">=2.25.0,<3.0.0"
+wget = ">=3.2,<4.0"
+
+[package.extras]
+cohere = ["cohere (>=4.37,<5.0)"]
+dense = ["openai (>=1.2.3,<2.0.0)", "sentence-transformers (>=2.0.0)", "torch (>=1.13.1)", "transformers (>=4.26.1)"]
+openai = ["openai (>=1.2.3,<2.0.0)"]
+splade = ["sentence-transformers (>=2.0.0)", "torch (>=1.13.1)", "transformers (>=4.26.1)"]
+
 [[package]]
 name = "platformdirs"
 version = "3.11.0"
@@ -5575,13 +5645,13 @@ typing-extensions = "*"
 
 [[package]]
 name = "python-dotenv"
-version = "1.0.0"
+version = "1.0.1"
 description = "Read key-value pairs from a .env file and set them as environment variables"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "python-dotenv-1.0.0.tar.gz", hash = "sha256:a8df96034aae6d2d50a4ebe8216326c61c3eb64836776504fcca410e5937a3ba"},
-    {file = "python_dotenv-1.0.0-py3-none-any.whl", hash = "sha256:f5971a9226b701070a4bf2c38c89e5a3f0d64de8debda981d1db98583009122a"},
+    {file = "python-dotenv-1.0.1.tar.gz", hash = "sha256:e324ee90a023d808f1959c46bcbc04446a10ced277783dc6ee09987c37ec10ca"},
+    {file = "python_dotenv-1.0.1-py3-none-any.whl", hash = "sha256:f7b63ef50f1b690dddf550d03497b66d609393b40b564ed0d674909a68ebf16a"},
 ]
 
 [package.extras]
@@ -7985,6 +8055,16 @@ MarkupSafe = ">=2.1.1"
 [package.extras]
 watchdog = ["watchdog (>=2.3)"]
 
+[[package]]
+name = "wget"
+version = "3.2"
+description = "pure python download utility"
+optional = true
+python-versions = "*"
+files = [
+    {file = "wget-3.2.zip", hash = "sha256:35e630eca2aa50ce998b9b1a127bb26b30dfee573702782aa982f875e3f16061"},
+]
+
 [[package]]
 name = "wheel"
 version = "0.41.2"
@@ -8266,7 +8346,7 @@ modal = ["modal"]
 mysql = ["mysql-connector-python"]
 opensearch = ["opensearch-py"]
 opensource = ["gpt4all", "sentence-transformers", "torch"]
-pinecone = ["pinecone-client"]
+pinecone = ["pinecone-client", "pinecone-text"]
 poe = ["fastapi-poe"]
 postgres = ["psycopg", "psycopg-binary", "psycopg-pool"]
 qdrant = ["qdrant-client"]
@@ -8282,4 +8362,4 @@ youtube = ["youtube-transcript-api", "yt_dlp"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.9,<3.12"
-content-hash = "1dbb690590123f505675544aa0e1b3668f0d3819f4832f3f3464ff16b69e39e9"
+content-hash = "f613dc1a3e9b724c95b407d4d8b9e67518e718142c77ad4723b7cb1e43eec9db"

+ 3 - 2
pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "embedchain"
-version = "0.1.79"
+version = "0.1.80"
 description = "Simplest open source retrieval(RAG) framework"
 authors = [
     "Taranjeet Singh <taranjeet@embedchain.ai>",
@@ -124,6 +124,7 @@ together = { version = "^0.2.8", optional = true }
 weaviate-client = { version = "^3.24.1", optional = true }
 docx2txt = { version = "^0.8", optional = true }
 pinecone-client = { version = "^3.0.0", optional = true }
+pinecone-text = { version = "^0.8.0", optional = true }
 qdrant-client = { version = "1.6.3", optional = true }
 unstructured = {extras = ["local-inference", "all-docs"], version = "^0.10.18", optional = true}
 huggingface_hub = { version = "^0.17.3", optional = true }
@@ -178,7 +179,7 @@ discord = ["discord"]
 slack = ["slack-sdk", "flask"]
 whatsapp = ["twilio", "flask"]
 weaviate = ["weaviate-client"]
-pinecone = ["pinecone-client"]
+pinecone = ["pinecone-client", "pinecone-text"]
 qdrant = ["qdrant-client"]
 huggingface_hub=["huggingface_hub"]
 cohere = ["cohere"]