xzc
/
mem0


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213
							import logging
from typing import Dict, List, Optional, Tuple

from embedchain.config import ZillizDBConfig
from embedchain.helper.json_serializable import register_deserializable
from embedchain.vectordb.base import BaseVectorDB

try:
    from pymilvus import (Collection, CollectionSchema, DataType, FieldSchema,
                          MilvusClient, connections, utility)
except ImportError:
    raise ImportError(
        "Zilliz requires extra dependencies. Install with `pip install --upgrade embedchain[milvus]`"
    ) from None


@register_deserializable
class ZillizVectorDB(BaseVectorDB):
    """Base class for vector database."""

    def __init__(self, config: ZillizDBConfig = None):
        """Initialize the database. Save the config and client as an attribute.

        :param config: Database configuration class instance.
        :type config: ZillizDBConfig
        """

        if config is None:
            self.config = ZillizDBConfig()
        else:
            self.config = config

        self.client = MilvusClient(
            uri=self.config.uri,
            token=self.config.token,
        )

        self.connection = connections.connect(
            uri=self.config.uri,
            token=self.config.token,
        )

        super().__init__(config=self.config)

    def _initialize(self):
        """
        This method is needed because `embedder` attribute needs to be set externally before it can be initialized.

        So it's can't be done in __init__ in one step.
        """
        self._get_or_create_collection(self.config.collection_name)

    def _get_or_create_db(self):
        """Get or create the database."""
        return self.client

    def _get_or_create_collection(self, name):
        """
        Get or create a named collection.

        :param name: Name of the collection
        :type name: str
        """
        if utility.has_collection(name):
            logging.info(f"[ZillizDB]: found an existing collection {name}, make sure the auto-id is disabled.")
            self.collection = Collection(name)
        else:
            fields = [
                FieldSchema(name="id", dtype=DataType.VARCHAR, is_primary=True, max_length=512),
                FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=2048),
                FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=self.embedder.vector_dimension),
            ]

            schema = CollectionSchema(fields, enable_dynamic_field=True)
            self.collection = Collection(name=name, schema=schema)

            index = {
                "index_type": "AUTOINDEX",
                "metric_type": self.config.metric_type,
            }
            self.collection.create_index("embeddings", index)
        return self.collection

    def get(self, ids: Optional[List[str]] = None, where: Optional[Dict[str, any]] = None, limit: Optional[int] = None):
        """
        Get existing doc ids present in vector database

        :param ids: list of doc ids to check for existence
        :type ids: List[str]
        :param where: Optional. to filter data
        :type where: Dict[str, Any]
        :param limit: Optional. maximum number of documents
        :type limit: Optional[int]
        :return: Existing documents.
        :rtype: Set[str]
        """
        if ids is None or len(ids) == 0 or self.collection.num_entities == 0:
            return {"ids": []}

        if not (self.collection.is_empty):
            filter = f"id in {ids}"
            results = self.client.query(
                collection_name=self.config.collection_name, filter=filter, output_fields=["id"]
            )
            results = [res["id"] for res in results]

        return {"ids": set(results)}

    def add(
        self,
        embeddings: List[List[float]],
        documents: List[str],
        metadatas: List[object],
        ids: List[str],
        skip_embedding: bool,
    ):
        """Add to database"""
        if not skip_embedding:
            embeddings = self.embedder.embedding_fn(documents)

        for id, doc, metadata, embedding in zip(ids, documents, metadatas, embeddings):
            data = {**metadata, "id": id, "text": doc, "embeddings": embedding}
            self.client.insert(collection_name=self.config.collection_name, data=data)

        self.collection.load()
        self.collection.flush()
        self.client.flush(self.config.collection_name)

    def query(
        self, input_query: List[str], n_results: int, where: Dict[str, any], skip_embedding: bool
    ) -> List[Tuple[str, str, str]]:
        """
        Query contents from vector data base based on vector similarity

        :param input_query: list of query string
        :type input_query: List[str]
        :param n_results: no of similar documents to fetch from database
        :type n_results: int
        :param where: to filter data
        :type where: str
        :raises InvalidDimensionException: Dimensions do not match.
        :return: The context of the document that matched your query, url of the source, doc_id
        :rtype: List[Tuple[str,str,str]]
        """

        if self.collection.is_empty:
            return []

        if not isinstance(where, str):
            where = None

        output_fields = ["text", "url", "doc_id"]
        if skip_embedding:
            query_vector = input_query
            query_result = self.client.search(
                collection_name=self.config.collection_name,
                data=query_vector,
                limit=n_results,
                output_fields=output_fields,
            )

        else:
            input_query_vector = self.embedder.embedding_fn([input_query])
            query_vector = input_query_vector[0]

            query_result = self.client.search(
                collection_name=self.config.collection_name,
                data=[query_vector],
                limit=n_results,
                output_fields=output_fields,
            )

        doc_list = []
        for query in query_result:
            data = query[0]["entity"]
            context = data["text"]
            source = data["url"]
            doc_id = data["doc_id"]
            doc_list.append(tuple((context, source, doc_id)))
        return doc_list

    def count(self) -> int:
        """
        Count number of documents/chunks embedded in the database.

        :return: number of documents
        :rtype: int
        """
        return self.collection.num_entities

    def reset(self, collection_names: List[str] = None):
        """
        Resets the database. Deletes all embeddings irreversibly.
        """
        if self.config.collection_name:
            if collection_names:
                for collection_name in collection_names:
                    if collection_name in self.client.list_collections():
                        self.client.drop_collection(collection_name=collection_name)
            else:
                self.client.drop_collection(collection_name=self.config.collection_name)
                self._get_or_create_collection(self.config.collection_name)

    def set_collection_name(self, name: str):
        """
        Set the name of the collection. A collection is an isolated space for vectors.

        :param name: Name of the collection.
        :type name: str
        """
        if not isinstance(name, str):
            raise TypeError("Collection name must be a string")
        self.config.collection_name = name