1 gadu atpakaļ · 48b24f6f12
--- a/docs/components/vector-databases/lancedb.mdx
+++ b/docs/components/vector-databases/lancedb.mdx
@@ -0,0 +1,48 @@
 
				+---
			
 
				+title: LanceDB
			
 
				+---
			
 
				+
			
 
				+## Install Embedchain with LanceDB
			
 
				+
			
 
				+Install Embedchain, LanceDB and related dependencies using the following command:
			
 
				+
			
 
				+```bash
			
 
				+pip install "embedchain[lancedb]"
			
 
				+```
			
 
				+
			
 
				+LanceDB is a developer-friendly, open source database for AI. It supports everything from hyper-scalable vector search and advanced retrieval for RAG to streaming training data and interactive exploration of large-scale AI datasets.
			
 
				+To use LanceDB as the vector database, you do not need to set any key for local use.
			
 
				+
			
 
				+<CodeGroup>
			
 
				+```python main.py
			
 
				+import os
			
 
				+from embedchain import App
			
 
				+
			
 
				+# set OPENAI_API_KEY as env variable
			
 
				+os.environ["OPENAI_API_KEY"] = "sk-xxx"
			
 
				+
			
 
				+# Create Embedchain App and set config
			
 
				+app = App.from_config(config={
			
 
				+    "vectordb": {
			
 
				+        "provider": "lancedb",
			
 
				+            "config": {
			
 
				+                "collection_name": "lancedb-index"
			
 
				+            }
			
 
				+        }
			
 
				+    }
			
 
				+)
			
 
				+
			
 
				+# Add data source and start querying
			
 
				+app.add("https://www.forbes.com/profile/elon-musk")
			
 
				+
			
 
				+# query continuously
			
 
				+while(True):
			
 
				+    question = input("Enter question: ")
			
 
				+    if question in ['q', 'exit', 'quit']:
			
 
				+        break
			
 
				+    answer = app.query(question)
			
 
				+    print(answer)
			
 
				+```
			
 
				+
			
 
				+</CodeGroup>
			
 
				+<Snippet file="missing-vector-db-tip.mdx" />
			
--- a/embedchain/config/vectordb/lancedb.py
+++ b/embedchain/config/vectordb/lancedb.py
@@ -0,0 +1,33 @@
 
				+from typing import Optional
			
 
				+
			
 
				+from embedchain.config.vectordb.base import BaseVectorDbConfig
			
 
				+from embedchain.helpers.json_serializable import register_deserializable
			
 
				+
			
 
				+
			
 
				+@register_deserializable
			
 
				+class LanceDBConfig(BaseVectorDbConfig):
			
 
				+    def __init__(
			
 
				+        self,
			
 
				+        collection_name: Optional[str] = None,
			
 
				+        dir: Optional[str] = None,
			
 
				+        host: Optional[str] = None,
			
 
				+        port: Optional[str] = None,
			
 
				+        allow_reset=True,
			
 
				+    ):
			
 
				+        """
			
 
				+        Initializes a configuration class instance for LanceDB.
			
 
				+
			
 
				+        :param collection_name: Default name for the collection, defaults to None
			
 
				+        :type collection_name: Optional[str], optional
			
 
				+        :param dir: Path to the database directory, where the database is stored, defaults to None
			
 
				+        :type dir: Optional[str], optional
			
 
				+        :param host: Database connection remote host. Use this if you run Embedchain as a client, defaults to None
			
 
				+        :type host: Optional[str], optional
			
 
				+        :param port: Database connection remote port. Use this if you run Embedchain as a client, defaults to None
			
 
				+        :type port: Optional[str], optional
			
 
				+        :param allow_reset: Whether resetting the database is allowed, defaults to True
			
 
				+        :type allow_reset: bool
			
 
				+        """
			
 
				+
			
 
				+        self.allow_reset = allow_reset
			
 
				+        super().__init__(collection_name=collection_name, dir=dir, host=host, port=port)
			
--- a/embedchain/embedchain.py
+++ b/embedchain/embedchain.py
@@ -6,7 +6,9 @@ from typing import Any, Optional, Union
 
				 from dotenv import load_dotenv
			
 
				 from langchain.docstore.document import Document
			
 
				 
			
 
				-from embedchain.cache import adapt, get_gptcache_session, gptcache_data_convert, gptcache_update_cache_callback
			
 
				+from embedchain.cache import (adapt, get_gptcache_session,
			
 
				+                              gptcache_data_convert,
			
 
				+                              gptcache_update_cache_callback)
			
 
				 from embedchain.chunkers.base_chunker import BaseChunker
			
 
				 from embedchain.config import AddConfig, BaseLlmConfig, ChunkerConfig
			
 
				 from embedchain.config.base_app_config import BaseAppConfig
			
@@ -16,7 +18,8 @@ from embedchain.embedder.base import BaseEmbedder
 
				 from embedchain.helpers.json_serializable import JSONSerializable
			
 
				 from embedchain.llm.base import BaseLlm
			
 
				 from embedchain.loaders.base_loader import BaseLoader
			
 
				-from embedchain.models.data_type import DataType, DirectDataType, IndirectDataType, SpecialDataType
			
 
				+from embedchain.models.data_type import (DataType, DirectDataType,
			
 
				+                                         IndirectDataType, SpecialDataType)
			
 
				 from embedchain.utils.misc import detect_datatype, is_valid_json_string
			
 
				 from embedchain.vectordb.base import BaseVectorDB
			
 
				 
			
--- a/embedchain/factory.py
+++ b/embedchain/factory.py
@@ -91,6 +91,7 @@ class VectorDBFactory:
 
				         "chroma": "embedchain.vectordb.chroma.ChromaDB",
			
 
				         "elasticsearch": "embedchain.vectordb.elasticsearch.ElasticsearchDB",
			
 
				         "opensearch": "embedchain.vectordb.opensearch.OpenSearchDB",
			
 
				+        "lancedb": "embedchain.vectordb.lancedb.LanceDB",
			
 
				         "pinecone": "embedchain.vectordb.pinecone.PineconeDB",
			
 
				         "qdrant": "embedchain.vectordb.qdrant.QdrantDB",
			
 
				         "weaviate": "embedchain.vectordb.weaviate.WeaviateDB",
			
@@ -100,6 +101,7 @@ class VectorDBFactory:
 
				         "chroma": "embedchain.config.vectordb.chroma.ChromaDbConfig",
			
 
				         "elasticsearch": "embedchain.config.vectordb.elasticsearch.ElasticsearchDBConfig",
			
 
				         "opensearch": "embedchain.config.vectordb.opensearch.OpenSearchDBConfig",
			
 
				+        "lancedb": "embedchain.config.vectordb.lancedb.LanceDBConfig",
			
 
				         "pinecone": "embedchain.config.vectordb.pinecone.PineconeDBConfig",
			
 
				         "qdrant": "embedchain.config.vectordb.qdrant.QdrantDBConfig",
			
 
				         "weaviate": "embedchain.config.vectordb.weaviate.WeaviateDBConfig",
			
--- a/embedchain/llm/base.py
+++ b/embedchain/llm/base.py
@@ -5,7 +5,9 @@ from typing import Any, Optional
 
				 from langchain.schema import BaseMessage as LCBaseMessage
			
 
				 
			
 
				 from embedchain.config import BaseLlmConfig
			
 
				-from embedchain.config.llm.base import DEFAULT_PROMPT, DEFAULT_PROMPT_WITH_HISTORY_TEMPLATE, DOCS_SITE_PROMPT_TEMPLATE
			
 
				+from embedchain.config.llm.base import (DEFAULT_PROMPT,
			
 
				+                                        DEFAULT_PROMPT_WITH_HISTORY_TEMPLATE,
			
 
				+                                        DOCS_SITE_PROMPT_TEMPLATE)
			
 
				 from embedchain.helpers.json_serializable import JSONSerializable
			
 
				 from embedchain.memory.base import ChatHistory
			
 
				 from embedchain.memory.message import ChatMessage
			
--- a/embedchain/llm/jina.py
+++ b/embedchain/llm/jina.py
@@ -35,7 +35,8 @@ class JinaLlm(BaseLlm):
 
				         if config.top_p:
			
 
				             kwargs["model_kwargs"]["top_p"] = config.top_p
			
 
				         if config.stream:
			
 
				-            from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
			
 
				+            from langchain.callbacks.streaming_stdout import \
			
 
				+                StreamingStdOutCallbackHandler
			
 
				 
			
 
				             chat = JinaChat(**kwargs, streaming=config.stream, callbacks=[StreamingStdOutCallbackHandler()])
			
 
				         else:
			
--- a/embedchain/loaders/audio.py
+++ b/embedchain/loaders/audio.py
@@ -1,6 +1,8 @@
 
				-import os
			
 
				 import hashlib
			
 
				+import os
			
 
				+
			
 
				 import validators
			
 
				+
			
 
				 from embedchain.helpers.json_serializable import register_deserializable
			
 
				 from embedchain.loaders.base_loader import BaseLoader
			
 
				 
			
--- a/embedchain/loaders/unstructured_file.py
+++ b/embedchain/loaders/unstructured_file.py
@@ -11,7 +11,8 @@ class UnstructuredLoader(BaseLoader):
 
				         """Load data from an Unstructured file."""
			
 
				         try:
			
 
				             import unstructured  # noqa: F401
			
 
				-            from langchain_community.document_loaders import UnstructuredFileLoader
			
 
				+            from langchain_community.document_loaders import \
			
 
				+                UnstructuredFileLoader
			
 
				         except ImportError:
			
 
				             raise ImportError(
			
 
				                 'Unstructured file requires extra dependencies. Install with `pip install "unstructured[local-inference, all-docs]"`'  # noqa: E501
			
--- a/embedchain/utils/misc.py
+++ b/embedchain/utils/misc.py
@@ -446,7 +446,7 @@ def validate_config(config_data):
 
				             },
			
 
				             Optional("vectordb"): {
			
 
				                 Optional("provider"): Or(
			
 
				-                    "chroma", "elasticsearch", "opensearch", "pinecone", "qdrant", "weaviate", "zilliz"
			
 
				+                    "chroma", "elasticsearch", "opensearch", "lancedb", "pinecone", "qdrant", "weaviate", "zilliz"
			
 
				                 ),
			
 
				                 Optional("config"): object,  # TODO: add particular config schema for each provider
			
 
				             },
			
--- a/embedchain/vectordb/lancedb.py
+++ b/embedchain/vectordb/lancedb.py
@@ -0,0 +1,307 @@
 
				+from typing import Any, Dict, List, Optional, Union
			
 
				+
			
 
				+import pyarrow as pa
			
 
				+
			
 
				+try:
			
 
				+    import lancedb
			
 
				+except ImportError:
			
 
				+    raise ImportError('LanceDB is required. Install with pip install "embedchain[lancedb]"') from None
			
 
				+
			
 
				+from embedchain.config.vectordb.lancedb import LanceDBConfig
			
 
				+from embedchain.helpers.json_serializable import register_deserializable
			
 
				+from embedchain.vectordb.base import BaseVectorDB
			
 
				+
			
 
				+
			
 
				+@register_deserializable
			
 
				+class LanceDB(BaseVectorDB):
			
 
				+    """
			
 
				+    LanceDB as vector database
			
 
				+    """
			
 
				+
			
 
				+    BATCH_SIZE = 100
			
 
				+
			
 
				+    def __init__(
			
 
				+        self,
			
 
				+        config: Optional[LanceDBConfig] = None,
			
 
				+    ):
			
 
				+        """LanceDB as vector database.
			
 
				+
			
 
				+        :param config: LanceDB database config, defaults to None
			
 
				+        :type config: LanceDBConfig, optional
			
 
				+        """
			
 
				+        if config:
			
 
				+            self.config = config
			
 
				+        else:
			
 
				+            self.config = LanceDBConfig()
			
 
				+
			
 
				+        self.client = lancedb.connect(self.config.dir or "~/.lancedb")
			
 
				+        self.embedder_check = True
			
 
				+
			
 
				+        super().__init__(config=self.config)
			
 
				+
			
 
				+    def _initialize(self):
			
 
				+        """
			
 
				+        This method is needed because `embedder` attribute needs to be set externally before it can be initialized.
			
 
				+        """
			
 
				+        if not self.embedder:
			
 
				+            raise ValueError(
			
 
				+                "Embedder not set. Please set an embedder with `_set_embedder()` function before initialization."
			
 
				+            )
			
 
				+        else:
			
 
				+            # check embedder function is working or not
			
 
				+            try:
			
 
				+                self.embedder.embedding_fn("Hello LanceDB")
			
 
				+            except Exception:
			
 
				+                self.embedder_check = False
			
 
				+
			
 
				+        self._get_or_create_collection(self.config.collection_name)
			
 
				+
			
 
				+    def _get_or_create_db(self):
			
 
				+        """
			
 
				+        Called during initialization
			
 
				+        """
			
 
				+        return self.client
			
 
				+
			
 
				+    def _generate_where_clause(self, where: Dict[str, any]) -> str:
			
 
				+        """
			
 
				+        This method generates a where clause from a dictionary containing attributes and their values
			
 
				+        """
			
 
				+
			
 
				+        where_filters = ""
			
 
				+
			
 
				+        if len(list(where.keys())) == 1:
			
 
				+            where_filters = f"{list(where.keys())[0]} = {list(where.values())[0]}"
			
 
				+            return where_filters
			
 
				+
			
 
				+        where_items = list(where.items())
			
 
				+        where_count = len(where_items)
			
 
				+
			
 
				+        for i, (key, value) in enumerate(where_items, start=1):
			
 
				+            condition = f"{key} = {value} AND "
			
 
				+            where_filters += condition
			
 
				+
			
 
				+            if i == where_count:
			
 
				+                condition = f"{key} = {value}"
			
 
				+                where_filters += condition
			
 
				+
			
 
				+        return where_filters
			
 
				+
			
 
				+    def _get_or_create_collection(self, table_name: str, reset=False):
			
 
				+        """
			
 
				+        Get or create a named collection.
			
 
				+
			
 
				+        :param table_name: Name of the collection
			
 
				+        :type table_name: str
			
 
				+        :return: Created collection
			
 
				+        :rtype: Collection
			
 
				+        """
			
 
				+        if not self.embedder_check:
			
 
				+            schema = pa.schema(
			
 
				+                [
			
 
				+                    pa.field("doc", pa.string()),
			
 
				+                    pa.field("metadata", pa.string()),
			
 
				+                    pa.field("id", pa.string()),
			
 
				+                ]
			
 
				+            )
			
 
				+
			
 
				+        else:
			
 
				+            schema = pa.schema(
			
 
				+                [
			
 
				+                    pa.field("vector", pa.list_(pa.float32(), list_size=self.embedder.vector_dimension)),
			
 
				+                    pa.field("doc", pa.string()),
			
 
				+                    pa.field("metadata", pa.string()),
			
 
				+                    pa.field("id", pa.string()),
			
 
				+                ]
			
 
				+            )
			
 
				+
			
 
				+        if not reset:
			
 
				+            if table_name not in self.client.table_names():
			
 
				+                self.collection = self.client.create_table(table_name, schema=schema)
			
 
				+
			
 
				+        else:
			
 
				+            self.client.drop_table(table_name)
			
 
				+            self.collection = self.client.create_table(table_name, schema=schema)
			
 
				+
			
 
				+        self.collection = self.client[table_name]
			
 
				+
			
 
				+        return self.collection
			
 
				+
			
 
				+    def get(self, ids: Optional[List[str]] = None, where: Optional[Dict[str, any]] = None, limit: Optional[int] = None):
			
 
				+        """
			
 
				+        Get existing doc ids present in vector database
			
 
				+
			
 
				+        :param ids: list of doc ids to check for existence
			
 
				+        :type ids: List[str]
			
 
				+        :param where: Optional. to filter data
			
 
				+        :type where: Dict[str, Any]
			
 
				+        :param limit: Optional. maximum number of documents
			
 
				+        :type limit: Optional[int]
			
 
				+        :return: Existing documents.
			
 
				+        :rtype: Dict[str, List]
			
 
				+        """
			
 
				+        if limit is not None:
			
 
				+            max_limit = limit
			
 
				+        else:
			
 
				+            max_limit = 3
			
 
				+        results = {"ids": [], "metadatas": []}
			
 
				+
			
 
				+        where_clause = {}
			
 
				+        if where:
			
 
				+            where_clause = self._generate_where_clause(where)
			
 
				+
			
 
				+        if ids is not None:
			
 
				+            records = (
			
 
				+                self.collection.to_lance().scanner(filter=f"id IN {tuple(ids)}", columns=["id"]).to_table().to_pydict()
			
 
				+            )
			
 
				+            for id in records["id"]:
			
 
				+                if where is not None:
			
 
				+                    result = (
			
 
				+                        self.collection.search(query=id, vector_column_name="id")
			
 
				+                        .where(where_clause)
			
 
				+                        .limit(max_limit)
			
 
				+                        .to_list()
			
 
				+                    )
			
 
				+                else:
			
 
				+                    result = self.collection.search(query=id, vector_column_name="id").limit(max_limit).to_list()
			
 
				+                results["ids"] = [r["id"] for r in result]
			
 
				+                results["metadatas"] = [r["metadata"] for r in result]
			
 
				+
			
 
				+        return results
			
 
				+
			
 
				+    def add(
			
 
				+        self,
			
 
				+        documents: List[str],
			
 
				+        metadatas: List[object],
			
 
				+        ids: List[str],
			
 
				+    ) -> Any:
			
 
				+        """
			
 
				+        Add vectors to lancedb database
			
 
				+
			
 
				+        :param documents: Documents
			
 
				+        :type documents: List[str]
			
 
				+        :param metadatas: Metadatas
			
 
				+        :type metadatas: List[object]
			
 
				+        :param ids: ids
			
 
				+        :type ids: List[str]
			
 
				+        """
			
 
				+        data = []
			
 
				+        to_ingest = list(zip(documents, metadatas, ids))
			
 
				+
			
 
				+        if not self.embedder_check:
			
 
				+            for doc, meta, id in to_ingest:
			
 
				+                temp = {}
			
 
				+                temp["doc"] = doc
			
 
				+                temp["metadata"] = str(meta)
			
 
				+                temp["id"] = id
			
 
				+                data.append(temp)
			
 
				+        else:
			
 
				+            for doc, meta, id in to_ingest:
			
 
				+                temp = {}
			
 
				+                temp["doc"] = doc
			
 
				+                temp["vector"] = self.embedder.embedding_fn([doc])[0]
			
 
				+                temp["metadata"] = str(meta)
			
 
				+                temp["id"] = id
			
 
				+                data.append(temp)
			
 
				+
			
 
				+        self.collection.add(data=data)
			
 
				+
			
 
				+    def _format_result(self, results) -> list:
			
 
				+        """
			
 
				+        Format LanceDB results
			
 
				+
			
 
				+        :param results: LanceDB query results to format.
			
 
				+        :type results: QueryResult
			
 
				+        :return: Formatted results
			
 
				+        :rtype: list[tuple[Document, float]]
			
 
				+        """
			
 
				+        return results.tolist()
			
 
				+
			
 
				+    def query(
			
 
				+        self,
			
 
				+        input_query: str,
			
 
				+        n_results: int = 3,
			
 
				+        where: Optional[dict[str, any]] = None,
			
 
				+        raw_filter: Optional[dict[str, any]] = None,
			
 
				+        citations: bool = False,
			
 
				+        **kwargs: Optional[dict[str, any]],
			
 
				+    ) -> Union[list[tuple[str, dict]], list[str]]:
			
 
				+        """
			
 
				+        Query contents from vector database based on vector similarity
			
 
				+
			
 
				+        :param input_query: query string
			
 
				+        :type input_query: str
			
 
				+        :param n_results: no of similar documents to fetch from database
			
 
				+        :type n_results: int
			
 
				+        :param where: to filter data
			
 
				+        :type where: dict[str, Any]
			
 
				+        :param raw_filter: Raw filter to apply
			
 
				+        :type raw_filter: dict[str, Any]
			
 
				+        :param citations: we use citations boolean param to return context along with the answer.
			
 
				+        :type citations: bool, default is False.
			
 
				+        :raises InvalidDimensionException: Dimensions do not match.
			
 
				+        :return: The content of the document that matched your query,
			
 
				+        along with its metadata (if citations flag is true)
			
 
				+        :rtype: list[str], if citations=False, otherwise list[tuple[str, str]]
			
 
				+        """
			
 
				+        if where and raw_filter:
			
 
				+            raise ValueError("Both `where` and `raw_filter` cannot be used together.")
			
 
				+        try:
			
 
				+            query_embedding = self.embedder.embedding_fn(input_query)[0]
			
 
				+            result = self.collection.search(query_embedding).limit(n_results).to_list()
			
 
				+        except Exception as e:
			
 
				+            e.message()
			
 
				+
			
 
				+        results_formatted = result
			
 
				+
			
 
				+        contexts = []
			
 
				+        for result in results_formatted:
			
 
				+            if citations:
			
 
				+                metadata = result["metadata"]
			
 
				+                contexts.append((result["doc"], metadata))
			
 
				+            else:
			
 
				+                contexts.append(result["doc"])
			
 
				+        return contexts
			
 
				+
			
 
				+    def set_collection_name(self, name: str):
			
 
				+        """
			
 
				+        Set the name of the collection. A collection is an isolated space for vectors.
			
 
				+
			
 
				+        :param name: Name of the collection.
			
 
				+        :type name: str
			
 
				+        """
			
 
				+        if not isinstance(name, str):
			
 
				+            raise TypeError("Collection name must be a string")
			
 
				+        self.config.collection_name = name
			
 
				+        self._get_or_create_collection(self.config.collection_name)
			
 
				+
			
 
				+    def count(self) -> int:
			
 
				+        """
			
 
				+        Count number of documents/chunks embedded in the database.
			
 
				+
			
 
				+        :return: number of documents
			
 
				+        :rtype: int
			
 
				+        """
			
 
				+        return self.collection.count_rows()
			
 
				+
			
 
				+    def delete(self, where):
			
 
				+        return self.collection.delete(where=where)
			
 
				+
			
 
				+    def reset(self):
			
 
				+        """
			
 
				+        Resets the database. Deletes all embeddings irreversibly.
			
 
				+        """
			
 
				+        # Delete all data from the collection and recreate collection
			
 
				+        if self.config.allow_reset:
			
 
				+            try:
			
 
				+                self._get_or_create_collection(self.config.collection_name, reset=True)
			
 
				+            except ValueError:
			
 
				+                raise ValueError(
			
 
				+                    "For safety reasons, resetting is disabled. "
			
 
				+                    "Please enable it by setting `allow_reset=True` in your LanceDbConfig"
			
 
				+                ) from None
			
 
				+        # Recreate
			
 
				+        else:
			
 
				+            print(
			
 
				+                "For safety reasons, resetting is disabled. "
			
 
				+                "Please enable it by setting `allow_reset=True` in your LanceDbConfig"
			
 
				+            )
			
--- a/notebooks/lancedb.ipynb
+++ b/notebooks/lancedb.ipynb
@@ -0,0 +1,146 @@
 
				+{
			
 
				+  "cells": [
			
 
				+    {
			
 
				+      "cell_type": "markdown",
			
 
				+      "metadata": {
			
 
				+        "id": "b02n_zJ_hl3d"
			
 
				+      },
			
 
				+      "source": [
			
 
				+        "## Cookbook for using LanceDB with Embedchain"
			
 
				+      ]
			
 
				+    },
			
 
				+    {
			
 
				+      "cell_type": "markdown",
			
 
				+      "metadata": {
			
 
				+        "id": "gyJ6ui2vhtMY"
			
 
				+      },
			
 
				+      "source": [
			
 
				+        "### Step-1: Install embedchain package"
			
 
				+      ]
			
 
				+    },
			
 
				+    {
			
 
				+      "cell_type": "code",
			
 
				+      "execution_count": null,
			
 
				+      "metadata": {
			
 
				+        "id": "-NbXjAdlh0vJ"
			
 
				+      },
			
 
				+      "outputs": [],
			
 
				+      "source": [
			
 
				+        "! pip install embedchain lancedb"
			
 
				+      ]
			
 
				+    },
			
 
				+    {
			
 
				+      "cell_type": "markdown",
			
 
				+      "metadata": {
			
 
				+        "id": "nGnpSYAAh2bQ"
			
 
				+      },
			
 
				+      "source": [
			
 
				+        "### Step-2: Set environment variables needed for LanceDB\n",
			
 
				+        "\n",
			
 
				+        "You can find this env variable on your [OpenAI dashboard](https://platform.openai.com/account/api-keys)."
			
 
				+      ]
			
 
				+    },
			
 
				+    {
			
 
				+      "cell_type": "code",
			
 
				+      "execution_count": null,
			
 
				+      "metadata": {
			
 
				+        "id": "0fBdQ9GAiRvK"
			
 
				+      },
			
 
				+      "outputs": [],
			
 
				+      "source": [
			
 
				+        "import os\n",
			
 
				+        "from embedchain import App\n",
			
 
				+        "\n",
			
 
				+        "os.environ[\"OPENAI_API_KEY\"] = \"sk-xxx\""
			
 
				+      ]
			
 
				+    },
			
 
				+    {
			
 
				+      "cell_type": "markdown",
			
 
				+      "metadata": {
			
 
				+        "id": "PGt6uPLIi1CS"
			
 
				+      },
			
 
				+      "source": [
			
 
				+        "### Step-3: Create embedchain app and define your config"
			
 
				+      ]
			
 
				+    },
			
 
				+    {
			
 
				+      "cell_type": "code",
			
 
				+      "execution_count": null,
			
 
				+      "metadata": {
			
 
				+        "id": "Amzxk3m-i3tD"
			
 
				+      },
			
 
				+      "outputs": [],
			
 
				+      "source": [
			
 
				+        "app = App.from_config(config={\n",
			
 
				+        "    \"vectordb\": {\n",
			
 
				+        "        \"provider\": \"lancedb\",\n",
			
 
				+        "            \"config\": {\n",
			
 
				+        "                \"collection_name\": \"lancedb-index\"\n",
			
 
				+        "            }\n",
			
 
				+        "        }\n",
			
 
				+        "    }\n",
			
 
				+        ")"
			
 
				+      ]
			
 
				+    },
			
 
				+    {
			
 
				+      "cell_type": "markdown",
			
 
				+      "metadata": {
			
 
				+        "id": "XNXv4yZwi7ef"
			
 
				+      },
			
 
				+      "source": [
			
 
				+        "### Step-4: Add data sources to your app"
			
 
				+      ]
			
 
				+    },
			
 
				+    {
			
 
				+      "cell_type": "code",
			
 
				+      "execution_count": null,
			
 
				+      "metadata": {
			
 
				+        "id": "Sn_0rx9QjIY9"
			
 
				+      },
			
 
				+      "outputs": [],
			
 
				+      "source": [
			
 
				+        "app.add(\"https://www.forbes.com/profile/elon-musk\")"
			
 
				+      ]
			
 
				+    },
			
 
				+    {
			
 
				+      "cell_type": "markdown",
			
 
				+      "metadata": {
			
 
				+        "id": "_7W6fDeAjMAP"
			
 
				+      },
			
 
				+      "source": [
			
 
				+        "### Step-5: All set. Now start asking questions related to your data"
			
 
				+      ]
			
 
				+    },
			
 
				+    {
			
 
				+      "cell_type": "code",
			
 
				+      "execution_count": null,
			
 
				+      "metadata": {
			
 
				+        "id": "cvIK7dWRjN_f"
			
 
				+      },
			
 
				+      "outputs": [],
			
 
				+      "source": [
			
 
				+        "while(True):\n",
			
 
				+        "    question = input(\"Enter question: \")\n",
			
 
				+        "    if question in ['q', 'exit', 'quit']:\n",
			
 
				+        "        break\n",
			
 
				+        "    answer = app.query(question)\n",
			
 
				+        "    print(answer)"
			
 
				+      ]
			
 
				+    }
			
 
				+  ],
			
 
				+  "metadata": {
			
 
				+    "colab": {
			
 
				+      "provenance": []
			
 
				+    },
			
 
				+    "kernelspec": {
			
 
				+      "display_name": "Python 3",
			
 
				+      "name": "python3"
			
 
				+    },
			
 
				+    "language_info": {
			
 
				+      "name": "python",
			
 
				+      "version": "3.11.4"
			
 
				+    }
			
 
				+  },
			
 
				+  "nbformat": 4,
			
 
				+  "nbformat_minor": 0
			
 
				+}
			
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
 
				-# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
			
 
				+# This file is automatically @generated by Poetry 1.7.0 and should not be changed by hand.
			
 
				 
			
 
				 [[package]]
			
 
				 name = "aiohttp"
			
@@ -1142,6 +1142,17 @@ files = [
 
				 marshmallow = ">=3.18.0,<4.0.0"
			
 
				 typing-inspect = ">=0.4.0,<1"
			
 
				 
			
 
				+[[package]]
			
 
				+name = "decorator"
			
 
				+version = "5.1.1"
			
 
				+description = "Decorators for Humans"
			
 
				+optional = true
			
 
				+python-versions = ">=3.5"
			
 
				+files = [
			
 
				+    {file = "decorator-5.1.1-py3-none-any.whl", hash = "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186"},
			
 
				+    {file = "decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330"},
			
 
				+]
			
 
				+
			
 
				 [[package]]
			
 
				 name = "deprecated"
			
 
				 version = "1.2.14"
			
@@ -1159,6 +1170,20 @@ wrapt = ">=1.10,<2"
 
				 [package.extras]
			
 
				 dev = ["PyTest", "PyTest-Cov", "bump2version (<1)", "sphinx (<2)", "tox"]
			
 
				 
			
 
				+[[package]]
			
 
				+name = "deprecation"
			
 
				+version = "2.1.0"
			
 
				+description = "A library to handle automated deprecations"
			
 
				+optional = true
			
 
				+python-versions = "*"
			
 
				+files = [
			
 
				+    {file = "deprecation-2.1.0-py2.py3-none-any.whl", hash = "sha256:a10811591210e1fb0e768a8c25517cabeabcba6f0bf96564f8ff45189f90b14a"},
			
 
				+    {file = "deprecation-2.1.0.tar.gz", hash = "sha256:72b3bde64e5d778694b0cf68178aed03d15e15477116add3fb773e581f9518ff"},
			
 
				+]
			
 
				+
			
 
				+[package.dependencies]
			
 
				+packaging = "*"
			
 
				+
			
 
				 [[package]]
			
 
				 name = "discord"
			
 
				 version = "2.3.2"
			
@@ -2785,6 +2810,42 @@ websocket-client = ">=0.32.0,<0.40.0 || >0.40.0,<0.41.dev0 || >=0.43.dev0"
 
				 [package.extras]
			
 
				 adal = ["adal (>=1.0.2)"]
			
 
				 
			
 
				+[[package]]
			
 
				+name = "lancedb"
			
 
				+version = "0.6.13"
			
 
				+description = "lancedb"
			
 
				+optional = true
			
 
				+python-versions = ">=3.8"
			
 
				+files = [
			
 
				+    {file = "lancedb-0.6.13-cp38-abi3-macosx_10_15_x86_64.whl", hash = "sha256:4667353ca7fa187e94cb0ca4c5f9577d65eb5160f6f3fe9e57902d86312c3869"},
			
 
				+    {file = "lancedb-0.6.13-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:2e22533fe6f6b2d7037dcdbbb4019a62402bbad4ce18395be68f4aa007bf8bc0"},
			
 
				+    {file = "lancedb-0.6.13-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:837eaceafb87e3ae4c261eef45c4f73715f892a36165572c3da621dbdb45afcf"},
			
 
				+    {file = "lancedb-0.6.13-cp38-abi3-manylinux_2_24_aarch64.whl", hash = "sha256:61af2d72b2a2f0ea419874c3f32760fe5e51530da3be2d65251a0e6ded74419b"},
			
 
				+    {file = "lancedb-0.6.13-cp38-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:31b24e57ee313f4ce6255e45d42e8bee19b90ddcd13a9e07030ac04f76e7dfde"},
			
 
				+    {file = "lancedb-0.6.13-cp38-abi3-win_amd64.whl", hash = "sha256:b851182d8492b1e5b57a441af64c95da65ca30b045d6618dc7d203c6d60d70fa"},
			
 
				+]
			
 
				+
			
 
				+[package.dependencies]
			
 
				+attrs = ">=21.3.0"
			
 
				+cachetools = "*"
			
 
				+deprecation = "*"
			
 
				+overrides = ">=0.7"
			
 
				+pydantic = ">=1.10"
			
 
				+pylance = "0.10.12"
			
 
				+ratelimiter = ">=1.0,<2.0"
			
 
				+requests = ">=2.31.0"
			
 
				+retry = ">=0.9.2"
			
 
				+semver = "*"
			
 
				+tqdm = ">=4.27.0"
			
 
				+
			
 
				+[package.extras]
			
 
				+azure = ["adlfs (>=2024.2.0)"]
			
 
				+clip = ["open-clip", "pillow", "torch"]
			
 
				+dev = ["pre-commit", "ruff"]
			
 
				+docs = ["mkdocs", "mkdocs-jupyter", "mkdocs-material", "mkdocstrings[python]"]
			
 
				+embeddings = ["awscli (>=1.29.57)", "boto3 (>=1.28.57)", "botocore (>=1.31.57)", "cohere", "google-generativeai", "huggingface-hub", "instructorembedding", "open-clip-torch", "openai (>=1.6.1)", "pillow", "sentence-transformers", "torch"]
			
 
				+tests = ["aiohttp", "boto3", "duckdb", "pandas (>=1.4)", "polars (>=0.19)", "pytest", "pytest-asyncio", "pytest-mock", "pytz", "tantivy"]
			
 
				+
			
 
				 [[package]]
			
 
				 name = "langchain"
			
 
				 version = "0.1.20"
			
@@ -4781,6 +4842,65 @@ all = ["apache-bookkeeper-client (>=4.16.1)", "fastavro (>=1.9.2)", "grpcio (>=1
 
				 avro = ["fastavro (>=1.9.2)"]
			
 
				 functions = ["apache-bookkeeper-client (>=4.16.1)", "grpcio (>=1.60.0)", "prometheus-client", "protobuf (>=3.6.1,<=3.20.3)", "ratelimit"]
			
 
				 
			
 
				+[[package]]
			
 
				+name = "py"
			
 
				+version = "1.11.0"
			
 
				+description = "library with cross-python path, ini-parsing, io, code, log facilities"
			
 
				+optional = true
			
 
				+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
			
 
				+files = [
			
 
				+    {file = "py-1.11.0-py2.py3-none-any.whl", hash = "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378"},
			
 
				+    {file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"},
			
 
				+]
			
 
				+
			
 
				+[[package]]
			
 
				+name = "pyarrow"
			
 
				+version = "15.0.0"
			
 
				+description = "Python library for Apache Arrow"
			
 
				+optional = true
			
 
				+python-versions = ">=3.8"
			
 
				+files = [
			
 
				+    {file = "pyarrow-15.0.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:0a524532fd6dd482edaa563b686d754c70417c2f72742a8c990b322d4c03a15d"},
			
 
				+    {file = "pyarrow-15.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:60a6bdb314affa9c2e0d5dddf3d9cbb9ef4a8dddaa68669975287d47ece67642"},
			
 
				+    {file = "pyarrow-15.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:66958fd1771a4d4b754cd385835e66a3ef6b12611e001d4e5edfcef5f30391e2"},
			
 
				+    {file = "pyarrow-15.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1f500956a49aadd907eaa21d4fff75f73954605eaa41f61cb94fb008cf2e00c6"},
			
 
				+    {file = "pyarrow-15.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:6f87d9c4f09e049c2cade559643424da84c43a35068f2a1c4653dc5b1408a929"},
			
 
				+    {file = "pyarrow-15.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:85239b9f93278e130d86c0e6bb455dcb66fc3fd891398b9d45ace8799a871a1e"},
			
 
				+    {file = "pyarrow-15.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:5b8d43e31ca16aa6e12402fcb1e14352d0d809de70edd185c7650fe80e0769e3"},
			
 
				+    {file = "pyarrow-15.0.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:fa7cd198280dbd0c988df525e50e35b5d16873e2cdae2aaaa6363cdb64e3eec5"},
			
 
				+    {file = "pyarrow-15.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8780b1a29d3c8b21ba6b191305a2a607de2e30dab399776ff0aa09131e266340"},
			
 
				+    {file = "pyarrow-15.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fe0ec198ccc680f6c92723fadcb97b74f07c45ff3fdec9dd765deb04955ccf19"},
			
 
				+    {file = "pyarrow-15.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:036a7209c235588c2f07477fe75c07e6caced9b7b61bb897c8d4e52c4b5f9555"},
			
 
				+    {file = "pyarrow-15.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:2bd8a0e5296797faf9a3294e9fa2dc67aa7f10ae2207920dbebb785c77e9dbe5"},
			
 
				+    {file = "pyarrow-15.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:e8ebed6053dbe76883a822d4e8da36860f479d55a762bd9e70d8494aed87113e"},
			
 
				+    {file = "pyarrow-15.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:17d53a9d1b2b5bd7d5e4cd84d018e2a45bc9baaa68f7e6e3ebed45649900ba99"},
			
 
				+    {file = "pyarrow-15.0.0-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:9950a9c9df24090d3d558b43b97753b8f5867fb8e521f29876aa021c52fda351"},
			
 
				+    {file = "pyarrow-15.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:003d680b5e422d0204e7287bb3fa775b332b3fce2996aa69e9adea23f5c8f970"},
			
 
				+    {file = "pyarrow-15.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f75fce89dad10c95f4bf590b765e3ae98bcc5ba9f6ce75adb828a334e26a3d40"},
			
 
				+    {file = "pyarrow-15.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0ca9cb0039923bec49b4fe23803807e4ef39576a2bec59c32b11296464623dc2"},
			
 
				+    {file = "pyarrow-15.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:9ed5a78ed29d171d0acc26a305a4b7f83c122d54ff5270810ac23c75813585e4"},
			
 
				+    {file = "pyarrow-15.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:6eda9e117f0402dfcd3cd6ec9bfee89ac5071c48fc83a84f3075b60efa96747f"},
			
 
				+    {file = "pyarrow-15.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:9a3a6180c0e8f2727e6f1b1c87c72d3254cac909e609f35f22532e4115461177"},
			
 
				+    {file = "pyarrow-15.0.0-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:19a8918045993349b207de72d4576af0191beef03ea655d8bdb13762f0cd6eac"},
			
 
				+    {file = "pyarrow-15.0.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:d0ec076b32bacb6666e8813a22e6e5a7ef1314c8069d4ff345efa6246bc38593"},
			
 
				+    {file = "pyarrow-15.0.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5db1769e5d0a77eb92344c7382d6543bea1164cca3704f84aa44e26c67e320fb"},
			
 
				+    {file = "pyarrow-15.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e2617e3bf9df2a00020dd1c1c6dce5cc343d979efe10bc401c0632b0eef6ef5b"},
			
 
				+    {file = "pyarrow-15.0.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:d31c1d45060180131caf10f0f698e3a782db333a422038bf7fe01dace18b3a31"},
			
 
				+    {file = "pyarrow-15.0.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:c8c287d1d479de8269398b34282e206844abb3208224dbdd7166d580804674b7"},
			
 
				+    {file = "pyarrow-15.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:07eb7f07dc9ecbb8dace0f58f009d3a29ee58682fcdc91337dfeb51ea618a75b"},
			
 
				+    {file = "pyarrow-15.0.0-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:47af7036f64fce990bb8a5948c04722e4e3ea3e13b1007ef52dfe0aa8f23cf7f"},
			
 
				+    {file = "pyarrow-15.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:93768ccfff85cf044c418bfeeafce9a8bb0cee091bd8fd19011aff91e58de540"},
			
 
				+    {file = "pyarrow-15.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f6ee87fd6892700960d90abb7b17a72a5abb3b64ee0fe8db6c782bcc2d0dc0b4"},
			
 
				+    {file = "pyarrow-15.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:001fca027738c5f6be0b7a3159cc7ba16a5c52486db18160909a0831b063c4e4"},
			
 
				+    {file = "pyarrow-15.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:d1c48648f64aec09accf44140dccb92f4f94394b8d79976c426a5b79b11d4fa7"},
			
 
				+    {file = "pyarrow-15.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:972a0141be402bb18e3201448c8ae62958c9c7923dfaa3b3d4530c835ac81aed"},
			
 
				+    {file = "pyarrow-15.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:f01fc5cf49081426429127aa2d427d9d98e1cb94a32cb961d583a70b7c4504e6"},
			
 
				+    {file = "pyarrow-15.0.0.tar.gz", hash = "sha256:876858f549d540898f927eba4ef77cd549ad8d24baa3207cf1b72e5788b50e83"},
			
 
				+]
			
 
				+
			
 
				+[package.dependencies]
			
 
				+numpy = ">=1.16.6,<2"
			
 
				+
			
 
				 [[package]]
			
 
				 name = "pyasn1"
			
 
				 version = "0.6.0"
			
@@ -5019,6 +5139,32 @@ dev = ["coverage[toml] (==5.0.4)", "cryptography (>=3.4.0)", "pre-commit", "pyte
 
				 docs = ["sphinx (>=4.5.0,<5.0.0)", "sphinx-rtd-theme", "zope.interface"]
			
 
				 tests = ["coverage[toml] (==5.0.4)", "pytest (>=6.0.0,<7.0.0)"]
			
 
				 
			
 
				+[[package]]
			
 
				+name = "pylance"
			
 
				+version = "0.10.12"
			
 
				+description = "python wrapper for Lance columnar format"
			
 
				+optional = true
			
 
				+python-versions = ">=3.8"
			
 
				+files = [
			
 
				+    {file = "pylance-0.10.12-cp38-abi3-macosx_10_15_x86_64.whl", hash = "sha256:30cbcca078edeb37e11ae86cf9287d81ce6c0c07ba77239284b369a4b361497b"},
			
 
				+    {file = "pylance-0.10.12-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:e558163ff6035d518706cc66848497219ccc755e2972b8f3b1706a3e1fd800fd"},
			
 
				+    {file = "pylance-0.10.12-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75afb39f71d7f12429f9b4d380eb6cf6aed179ae5a1c5d16cc768373a1521f87"},
			
 
				+    {file = "pylance-0.10.12-cp38-abi3-manylinux_2_24_aarch64.whl", hash = "sha256:3de391dfc3a99bdb245fd1e27ef242be769a94853f802ef57f246e9a21358d32"},
			
 
				+    {file = "pylance-0.10.12-cp38-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:34a5278b90f4cbcf21261353976127aa2ffbbd7d068810f0a2b0c1aa0334022a"},
			
 
				+    {file = "pylance-0.10.12-cp38-abi3-win_amd64.whl", hash = "sha256:6cef5975d513097fd2c22692296c9a5a138928f38d02cd34ab63a7369abc1463"},
			
 
				+]
			
 
				+
			
 
				+[package.dependencies]
			
 
				+numpy = ">=1.22"
			
 
				+pyarrow = ">=12,<15.0.1"
			
 
				+
			
 
				+[package.extras]
			
 
				+benchmarks = ["pytest-benchmark"]
			
 
				+dev = ["ruff (==0.2.2)"]
			
 
				+ray = ["ray[data]"]
			
 
				+tests = ["boto3", "datasets", "duckdb", "h5py (<3.11)", "ml-dtypes", "pandas", "pillow", "polars[pandas,pyarrow]", "pytest", "tensorflow", "tqdm"]
			
 
				+torch = ["torch"]
			
 
				+
			
 
				 [[package]]
			
 
				 name = "pymilvus"
			
 
				 version = "2.4.3"
			
@@ -5540,6 +5686,20 @@ urllib3 = ">=1.26.14,<3"
 
				 [package.extras]
			
 
				 fastembed = ["fastembed (==0.2.6)"]
			
 
				 
			
 
				+[[package]]
			
 
				+name = "ratelimiter"
			
 
				+version = "1.2.0.post0"
			
 
				+description = "Simple python rate limiting object"
			
 
				+optional = true
			
 
				+python-versions = "*"
			
 
				+files = [
			
 
				+    {file = "ratelimiter-1.2.0.post0-py3-none-any.whl", hash = "sha256:a52be07bc0bb0b3674b4b304550f10c769bbb00fead3072e035904474259809f"},
			
 
				+    {file = "ratelimiter-1.2.0.post0.tar.gz", hash = "sha256:5c395dcabdbbde2e5178ef3f89b568a3066454a6ddc223b76473dac22f89b4f7"},
			
 
				+]
			
 
				+
			
 
				+[package.extras]
			
 
				+test = ["pytest (>=3.0)", "pytest-asyncio"]
			
 
				+
			
 
				 [[package]]
			
 
				 name = "regex"
			
 
				 version = "2024.5.15"
			
@@ -5720,6 +5880,21 @@ urllib3 = ">=1.25.10,<3.0"
 
				 [package.extras]
			
 
				 tests = ["coverage (>=6.0.0)", "flake8", "mypy", "pytest (>=7.0.0)", "pytest-asyncio", "pytest-cov", "pytest-httpserver", "tomli", "tomli-w", "types-requests"]
			
 
				 
			
 
				+[[package]]
			
 
				+name = "retry"
			
 
				+version = "0.9.2"
			
 
				+description = "Easy to use retry decorator."
			
 
				+optional = true
			
 
				+python-versions = "*"
			
 
				+files = [
			
 
				+    {file = "retry-0.9.2-py2.py3-none-any.whl", hash = "sha256:ccddf89761fa2c726ab29391837d4327f819ea14d244c232a1d24c67a2f98606"},
			
 
				+    {file = "retry-0.9.2.tar.gz", hash = "sha256:f8bfa8b99b69c4506d6f5bd3b0aabf77f98cdb17f3c9fc3f5ca820033336fba4"},
			
 
				+]
			
 
				+
			
 
				+[package.dependencies]
			
 
				+decorator = ">=3.4.2"
			
 
				+py = ">=1.4.26,<2.0.0"
			
 
				+
			
 
				 [[package]]
			
 
				 name = "rich"
			
 
				 version = "13.7.1"
			
@@ -6018,6 +6193,17 @@ dev = ["cython-lint (>=0.12.2)", "doit (>=0.36.0)", "mypy", "pycodestyle", "pyde
 
				 doc = ["jupyterlite-pyodide-kernel", "jupyterlite-sphinx (>=0.12.0)", "jupytext", "matplotlib (>=3.5)", "myst-nb", "numpydoc", "pooch", "pydata-sphinx-theme (>=0.15.2)", "sphinx (>=5.0.0)", "sphinx-design (>=0.4.0)"]
			
 
				 test = ["array-api-strict", "asv", "gmpy2", "hypothesis (>=6.30)", "mpmath", "pooch", "pytest", "pytest-cov", "pytest-timeout", "pytest-xdist", "scikit-umfpack", "threadpoolctl"]
			
 
				 
			
 
				+[[package]]
			
 
				+name = "semver"
			
 
				+version = "3.0.2"
			
 
				+description = "Python helper for Semantic Versioning (https://semver.org)"
			
 
				+optional = true
			
 
				+python-versions = ">=3.7"
			
 
				+files = [
			
 
				+    {file = "semver-3.0.2-py3-none-any.whl", hash = "sha256:b1ea4686fe70b981f85359eda33199d60c53964284e0cfb4977d243e37cf4bf4"},
			
 
				+    {file = "semver-3.0.2.tar.gz", hash = "sha256:6253adb39c70f6e51afed2fa7152bcd414c411286088fb4b9effb133885ab4cc"},
			
 
				+]
			
 
				+
			
 
				 [[package]]
			
 
				 name = "sentence-transformers"
			
 
				 version = "2.7.0"
			
@@ -7716,6 +7902,7 @@ gmail = ["google-api-core", "google-api-python-client", "google-auth", "google-a
 
				 google = ["google-generativeai"]
			
 
				 googledrive = ["google-api-python-client", "google-auth-httplib2", "google-auth-oauthlib"]
			
 
				 huggingface-hub = ["huggingface_hub"]
			
 
				+lancedb = ["lancedb"]
			
 
				 llama2 = ["replicate"]
			
 
				 milvus = ["pymilvus"]
			
 
				 mistralai = ["langchain-mistralai"]
			
@@ -7738,4 +7925,4 @@ youtube = ["youtube-transcript-api", "yt_dlp"]
 
				 [metadata]
			
 
				 lock-version = "2.0"
			
 
				 python-versions = ">=3.9,<=3.13"
			
 
				-content-hash = "cb7d2794bc2f54e05b2f870843eccc4342f3f2a6531eaa50a3d0d77b358ac4d5"
			
 
				+content-hash = "f9e6357bd1b5f407368d3d52c3f728e12f41fe7d4836e321ec2e10413f58e8a1"
			
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -122,6 +122,7 @@ slack-sdk = { version = "3.21.3", optional = true }
 
				 clarifai = { version = "^10.0.1", optional = true }
			
 
				 cohere = { version = "^5.3", optional = true }
			
 
				 together = { version = "^0.2.8", optional = true }
			
 
				+lancedb = { version = "^0.6.2", optional = true }
			
 
				 weaviate-client = { version = "^3.24.1", optional = true }
			
 
				 docx2txt = { version = "^0.8", optional = true }
			
 
				 qdrant-client = { version = "^1.6.3", optional = true }
			
@@ -173,6 +174,7 @@ pytest-asyncio = "^0.21.1"
 
				 [tool.poetry.extras]
			
 
				 streamlit = ["streamlit"]
			
 
				 opensource = ["sentence-transformers", "torch", "gpt4all"]
			
 
				+lancedb = ["lancedb"]
			
 
				 elasticsearch = ["elasticsearch"]
			
 
				 opensearch = ["opensearch-py"]
			
 
				 poe = ["fastapi-poe"]
			
--- a/tests/chunkers/test_chunkers.py
+++ b/tests/chunkers/test_chunkers.py
@@ -1,3 +1,4 @@
 
				+from embedchain.chunkers.audio import AudioChunker
			
 
				 from embedchain.chunkers.common_chunker import CommonChunker
			
 
				 from embedchain.chunkers.discourse import DiscourseChunker
			
 
				 from embedchain.chunkers.docs_site import DocsSiteChunker
			
@@ -19,7 +20,6 @@ from embedchain.chunkers.text import TextChunker
 
				 from embedchain.chunkers.web_page import WebPageChunker
			
 
				 from embedchain.chunkers.xml import XmlChunker
			
 
				 from embedchain.chunkers.youtube_video import YoutubeVideoChunker
			
 
				-from embedchain.chunkers.audio import AudioChunker
			
 
				 from embedchain.config.add_config import ChunkerConfig
			
 
				 
			
 
				 chunker_config = ChunkerConfig(chunk_size=500, chunk_overlap=0, length_function=len)
			
--- a/tests/loaders/test_audio.py
+++ b/tests/loaders/test_audio.py
@@ -1,11 +1,13 @@
 
				+import hashlib
			
 
				 import os
			
 
				 import sys
			
 
				-import hashlib
			
 
				-import pytest
			
 
				 from unittest.mock import mock_open, patch
			
 
				 
			
 
				+import pytest
			
 
				+
			
 
				 if sys.version_info > (3, 10):  # as `match` statement was introduced in python 3.10
			
 
				     from deepgram import PrerecordedOptions
			
 
				+
			
 
				     from embedchain.loaders.audio import AudioLoader
			
 
				 
			
 
				 
			
--- a/tests/vectordb/test_lancedb.py
+++ b/tests/vectordb/test_lancedb.py
@@ -0,0 +1,215 @@
 
				+import os
			
 
				+import shutil
			
 
				+
			
 
				+import pytest
			
 
				+
			
 
				+from embedchain import App
			
 
				+from embedchain.config import AppConfig
			
 
				+from embedchain.config.vectordb.lancedb import LanceDBConfig
			
 
				+from embedchain.vectordb.lancedb import LanceDB
			
 
				+
			
 
				+os.environ["OPENAI_API_KEY"] = "test-api-key"
			
 
				+
			
 
				+
			
 
				+@pytest.fixture
			
 
				+def lancedb():
			
 
				+    return LanceDB(config=LanceDBConfig(dir="test-db", collection_name="test-coll"))
			
 
				+
			
 
				+
			
 
				+@pytest.fixture
			
 
				+def app_with_settings():
			
 
				+    lancedb_config = LanceDBConfig(allow_reset=True, dir="test-db-reset")
			
 
				+    lancedb = LanceDB(config=lancedb_config)
			
 
				+    app_config = AppConfig(collect_metrics=False)
			
 
				+    return App(config=app_config, db=lancedb)
			
 
				+
			
 
				+
			
 
				+@pytest.fixture(scope="session", autouse=True)
			
 
				+def cleanup_db():
			
 
				+    yield
			
 
				+    try:
			
 
				+        shutil.rmtree("test-db.lance")
			
 
				+        shutil.rmtree("test-db-reset.lance")
			
 
				+    except OSError as e:
			
 
				+        print("Error: %s - %s." % (e.filename, e.strerror))
			
 
				+
			
 
				+
			
 
				+def test_lancedb_duplicates_throw_warning(caplog):
			
 
				+    db = LanceDB(config=LanceDBConfig(allow_reset=True, dir="test-db"))
			
 
				+    app = App(config=AppConfig(collect_metrics=False), db=db)
			
 
				+    app.db.add(ids=["0"], documents=["doc1"], metadatas=["test"])
			
 
				+    app.db.add(ids=["0"], documents=["doc1"], metadatas=["test"])
			
 
				+    assert "Insert of existing doc ID: 0" not in caplog.text
			
 
				+    assert "Add of existing doc ID: 0" not in caplog.text
			
 
				+    app.db.reset()
			
 
				+
			
 
				+
			
 
				+def test_lancedb_duplicates_collections_no_warning(caplog):
			
 
				+    db = LanceDB(config=LanceDBConfig(allow_reset=True, dir="test-db"))
			
 
				+    app = App(config=AppConfig(collect_metrics=False), db=db)
			
 
				+    app.set_collection_name("test_collection_1")
			
 
				+    app.db.add(ids=["0"], documents=["doc1"], metadatas=["test"])
			
 
				+    app.set_collection_name("test_collection_2")
			
 
				+    app.db.add(ids=["0"], documents=["doc1"], metadatas=["test"])
			
 
				+    assert "Insert of existing doc ID: 0" not in caplog.text
			
 
				+    assert "Add of existing doc ID: 0" not in caplog.text
			
 
				+    app.db.reset()
			
 
				+    app.set_collection_name("test_collection_1")
			
 
				+    app.db.reset()
			
 
				+
			
 
				+
			
 
				+def test_lancedb_collection_init_with_default_collection():
			
 
				+    db = LanceDB(config=LanceDBConfig(allow_reset=True, dir="test-db"))
			
 
				+    app = App(config=AppConfig(collect_metrics=False), db=db)
			
 
				+    assert app.db.collection.name == "embedchain_store"
			
 
				+
			
 
				+
			
 
				+def test_lancedb_collection_init_with_custom_collection():
			
 
				+    db = LanceDB(config=LanceDBConfig(allow_reset=True, dir="test-db"))
			
 
				+    app = App(config=AppConfig(collect_metrics=False), db=db)
			
 
				+    app.set_collection_name(name="test_collection")
			
 
				+    assert app.db.collection.name == "test_collection"
			
 
				+
			
 
				+
			
 
				+def test_lancedb_collection_set_collection_name():
			
 
				+    db = LanceDB(config=LanceDBConfig(allow_reset=True, dir="test-db"))
			
 
				+    app = App(config=AppConfig(collect_metrics=False), db=db)
			
 
				+    app.set_collection_name("test_collection")
			
 
				+    assert app.db.collection.name == "test_collection"
			
 
				+
			
 
				+
			
 
				+def test_lancedb_collection_changes_encapsulated():
			
 
				+    db = LanceDB(config=LanceDBConfig(allow_reset=True, dir="test-db"))
			
 
				+    app = App(config=AppConfig(collect_metrics=False), db=db)
			
 
				+    app.set_collection_name("test_collection_1")
			
 
				+    assert app.db.count() == 0
			
 
				+    app.db.add(ids=["0"], documents=["doc1"], metadatas=["test"])
			
 
				+    assert app.db.count() == 1
			
 
				+
			
 
				+    app.set_collection_name("test_collection_2")
			
 
				+    assert app.db.count() == 0
			
 
				+
			
 
				+    app.db.add(ids=["0"], documents=["doc1"], metadatas=["test"])
			
 
				+    app.set_collection_name("test_collection_1")
			
 
				+    assert app.db.count() == 1
			
 
				+    app.db.reset()
			
 
				+    app.set_collection_name("test_collection_2")
			
 
				+    app.db.reset()
			
 
				+
			
 
				+
			
 
				+def test_lancedb_collection_collections_are_persistent():
			
 
				+    db = LanceDB(config=LanceDBConfig(allow_reset=True, dir="test-db"))
			
 
				+    app = App(config=AppConfig(collect_metrics=False), db=db)
			
 
				+    app.set_collection_name("test_collection_1")
			
 
				+    app.db.add(ids=["0"], documents=["doc1"], metadatas=["test"])
			
 
				+    del app
			
 
				+
			
 
				+    db = LanceDB(config=LanceDBConfig(allow_reset=True, dir="test-db"))
			
 
				+    app = App(config=AppConfig(collect_metrics=False), db=db)
			
 
				+    app.set_collection_name("test_collection_1")
			
 
				+    assert app.db.count() == 1
			
 
				+
			
 
				+    app.db.reset()
			
 
				+
			
 
				+
			
 
				+def test_lancedb_collection_parallel_collections():
			
 
				+    db1 = LanceDB(config=LanceDBConfig(allow_reset=True, dir="test-db", collection_name="test_collection_1"))
			
 
				+    app1 = App(
			
 
				+        config=AppConfig(collect_metrics=False),
			
 
				+        db=db1,
			
 
				+    )
			
 
				+    db2 = LanceDB(config=LanceDBConfig(allow_reset=True, dir="test-db", collection_name="test_collection_2"))
			
 
				+    app2 = App(
			
 
				+        config=AppConfig(collect_metrics=False),
			
 
				+        db=db2,
			
 
				+    )
			
 
				+
			
 
				+    # cleanup if any previous tests failed or were interrupted
			
 
				+    app1.db.reset()
			
 
				+    app2.db.reset()
			
 
				+
			
 
				+    app1.db.add(ids=["0"], documents=["doc1"], metadatas=["test"])
			
 
				+
			
 
				+    assert app1.db.count() == 1
			
 
				+    assert app2.db.count() == 0
			
 
				+
			
 
				+    app1.db.add(ids=["1", "2"], documents=["doc1", "doc2"], metadatas=["test", "test"])
			
 
				+    app2.db.add(ids=["0"], documents=["doc1"], metadatas=["test"])
			
 
				+
			
 
				+    app1.set_collection_name("test_collection_2")
			
 
				+    assert app1.db.count() == 1
			
 
				+    app2.set_collection_name("test_collection_1")
			
 
				+    assert app2.db.count() == 3
			
 
				+
			
 
				+    # cleanup
			
 
				+    app1.db.reset()
			
 
				+    app2.db.reset()
			
 
				+
			
 
				+
			
 
				+def test_lancedb_collection_ids_share_collections():
			
 
				+    db1 = LanceDB(config=LanceDBConfig(allow_reset=True, dir="test-db"))
			
 
				+    app1 = App(config=AppConfig(collect_metrics=False), db=db1)
			
 
				+    app1.set_collection_name("one_collection")
			
 
				+    db2 = LanceDB(config=LanceDBConfig(allow_reset=True, dir="test-db"))
			
 
				+    app2 = App(config=AppConfig(collect_metrics=False), db=db2)
			
 
				+    app2.set_collection_name("one_collection")
			
 
				+
			
 
				+    # cleanup
			
 
				+    app1.db.reset()
			
 
				+    app2.db.reset()
			
 
				+
			
 
				+    app1.db.add(ids=["0", "1"], documents=["doc1", "doc2"], metadatas=["test", "test"])
			
 
				+    app2.db.add(ids=["2"], documents=["doc3"], metadatas=["test"])
			
 
				+
			
 
				+    assert app1.db.count() == 2
			
 
				+    assert app2.db.count() == 3
			
 
				+
			
 
				+    # cleanup
			
 
				+    app1.db.reset()
			
 
				+    app2.db.reset()
			
 
				+
			
 
				+
			
 
				+def test_lancedb_collection_reset():
			
 
				+    db1 = LanceDB(config=LanceDBConfig(allow_reset=True, dir="test-db"))
			
 
				+    app1 = App(config=AppConfig(collect_metrics=False), db=db1)
			
 
				+    app1.set_collection_name("one_collection")
			
 
				+    db2 = LanceDB(config=LanceDBConfig(allow_reset=True, dir="test-db"))
			
 
				+    app2 = App(config=AppConfig(collect_metrics=False), db=db2)
			
 
				+    app2.set_collection_name("two_collection")
			
 
				+    db3 = LanceDB(config=LanceDBConfig(allow_reset=True, dir="test-db"))
			
 
				+    app3 = App(config=AppConfig(collect_metrics=False), db=db3)
			
 
				+    app3.set_collection_name("three_collection")
			
 
				+    db4 = LanceDB(config=LanceDBConfig(allow_reset=True, dir="test-db"))
			
 
				+    app4 = App(config=AppConfig(collect_metrics=False), db=db4)
			
 
				+    app4.set_collection_name("four_collection")
			
 
				+
			
 
				+    # cleanup if any previous tests failed or were interrupted
			
 
				+    app1.db.reset()
			
 
				+    app2.db.reset()
			
 
				+    app3.db.reset()
			
 
				+    app4.db.reset()
			
 
				+
			
 
				+    app1.db.add(ids=["1"], documents=["doc1"], metadatas=["test"])
			
 
				+    app2.db.add(ids=["2"], documents=["doc2"], metadatas=["test"])
			
 
				+    app3.db.add(ids=["3"], documents=["doc3"], metadatas=["test"])
			
 
				+    app4.db.add(ids=["4"], documents=["doc4"], metadatas=["test"])
			
 
				+
			
 
				+    app1.db.reset()
			
 
				+
			
 
				+    assert app1.db.count() == 0
			
 
				+    assert app2.db.count() == 1
			
 
				+    assert app3.db.count() == 1
			
 
				+    assert app4.db.count() == 1
			
 
				+
			
 
				+    # cleanup
			
 
				+    app2.db.reset()
			
 
				+    app3.db.reset()
			
 
				+    app4.db.reset()
			
 
				+
			
 
				+
			
 
				+def generate_embeddings(dummy_embed, embed_size):
			
 
				+    generated_embedding = []
			
 
				+    for i in range(embed_size):
			
 
				+        generated_embedding.append(dummy_embed)
			
 
				+
			
 
				+    return generated_embedding