瀏覽代碼

feat: anonymous telemetry (#423)

cachho 2 年之前
父節點
當前提交
163f437582

+ 1 - 0
docs/advanced/query_configuration.mdx

@@ -10,6 +10,7 @@ title: '🔍 Query configurations'
 | embedding_fn| embedding function    | chromadb.utils.embedding_functions | \{text-embedding-ada-002\} |
 | db        | vector database (experimental) | BaseVectorDB               | ChromaDB               |
 | collection_name | initial collection name for the database | string             | embedchain_store |
+| collect_metrics | collect anonymous telemetry data to improve embedchain | boolean | true |
 
 
 ## AddConfig

+ 12 - 1
embedchain/config/apps/AppConfig.py

@@ -1,4 +1,5 @@
 import os
+from typing import Optional
 
 try:
     from chromadb.utils import embedding_functions
@@ -16,7 +17,15 @@ class AppConfig(BaseAppConfig):
     Config to initialize an embedchain custom `App` instance, with extra config options.
     """
 
-    def __init__(self, log_level=None, host=None, port=None, id=None, collection_name=None):
+    def __init__(
+        self,
+        log_level=None,
+        host=None,
+        port=None,
+        id=None,
+        collection_name=None,
+        collect_metrics: Optional[bool] = None,
+    ):
         """
         :param log_level: Optional. (String) Debug level
         ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'].
@@ -24,6 +33,7 @@ class AppConfig(BaseAppConfig):
         :param port: Optional. Port for the database server.
         :param id: Optional. ID of the app. Document metadata will have this id.
         :param collection_name: Optional. Collection name for the database.
+        :param collect_metrics: Defaults to True. Send anonymous telemetry to improve embedchain.
         """
         super().__init__(
             log_level=log_level,
@@ -32,6 +42,7 @@ class AppConfig(BaseAppConfig):
             port=port,
             id=id,
             collection_name=collection_name,
+            collect_metrics=collect_metrics,
         )
 
     @staticmethod

+ 3 - 0
embedchain/config/apps/BaseAppConfig.py

@@ -19,6 +19,7 @@ class BaseAppConfig(BaseConfig):
         port=None,
         id=None,
         collection_name=None,
+        collect_metrics: bool = True,
         db_type: VectorDatabases = None,
         vector_dim: VectorDimensions = None,
         es_config: ElasticsearchDBConfig = None,
@@ -32,6 +33,7 @@ class BaseAppConfig(BaseConfig):
         :param port: Optional. Port for the database server.
         :param id: Optional. ID of the app. Document metadata will have this id.
         :param collection_name: Optional. Collection name for the database.
+        :param collect_metrics: Defaults to True. Send anonymous telemetry to improve embedchain.
         :param db_type: Optional. type of Vector database to use
         :param vector_dim: Vector dimension generated by embedding fn
         :param es_config: Optional. elasticsearch database config to be used for connection
@@ -49,6 +51,7 @@ class BaseAppConfig(BaseConfig):
             es_config=es_config,
         )
         self.id = id
+        self.collect_metrics = True if (collect_metrics is True or collect_metrics is None) else False
         return
 
     @staticmethod

+ 4 - 1
embedchain/config/apps/CustomAppConfig.py

@@ -1,4 +1,4 @@
-from typing import Any
+from typing import Any, Optional
 
 from chromadb.api.types import Documents, Embeddings
 from dotenv import load_dotenv
@@ -30,6 +30,7 @@ class CustomAppConfig(BaseAppConfig):
         provider: Providers = None,
         open_source_app_config=None,
         deployment_name=None,
+        collect_metrics: Optional[bool] = None,
         db_type: VectorDatabases = None,
         es_config: ElasticsearchDBConfig = None,
     ):
@@ -45,6 +46,7 @@ class CustomAppConfig(BaseAppConfig):
         :param collection_name: Optional. Collection name for the database.
         :param provider: Optional. (Providers): LLM Provider to use.
         :param open_source_app_config: Optional. Config instance needed for open source apps.
+        :param collect_metrics: Defaults to True. Send anonymous telemetry to improve embedchain.
         :param db_type: Optional. type of Vector database to use.
         :param es_config: Optional. elasticsearch database config to be used for connection
         """
@@ -65,6 +67,7 @@ class CustomAppConfig(BaseAppConfig):
             port=port,
             id=id,
             collection_name=collection_name,
+            collect_metrics=collect_metrics,
             db_type=db_type,
             vector_dim=CustomAppConfig.get_vector_dimension(embedding_function=embedding_fn),
             es_config=es_config,

+ 14 - 1
embedchain/config/apps/OpenSourceAppConfig.py

@@ -1,3 +1,5 @@
+from typing import Optional
+
 from chromadb.utils import embedding_functions
 
 from .BaseAppConfig import BaseAppConfig
@@ -8,7 +10,16 @@ class OpenSourceAppConfig(BaseAppConfig):
     Config to initialize an embedchain custom `OpenSourceApp` instance, with extra config options.
     """
 
-    def __init__(self, log_level=None, host=None, port=None, id=None, collection_name=None, model=None):
+    def __init__(
+        self,
+        log_level=None,
+        host=None,
+        port=None,
+        id=None,
+        collection_name=None,
+        collect_metrics: Optional[bool] = None,
+        model=None,
+    ):
         """
         :param log_level: Optional. (String) Debug level
         ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'].
@@ -16,6 +27,7 @@ class OpenSourceAppConfig(BaseAppConfig):
         :param collection_name: Optional. Collection name for the database.
         :param host: Optional. Hostname for the database server.
         :param port: Optional. Port for the database server.
+        :param collect_metrics: Defaults to True. Send anonymous telemetry to improve embedchain.
         :param model: Optional. GPT4ALL uses the model to instantiate the class.
         So unlike `App`, it has to be provided before querying.
         """
@@ -28,6 +40,7 @@ class OpenSourceAppConfig(BaseAppConfig):
             port=port,
             id=id,
             collection_name=collection_name,
+            collect_metrics=collect_metrics,
         )
 
     @staticmethod

+ 60 - 5
embedchain/embedchain.py

@@ -1,9 +1,14 @@
+import importlib.metadata
 import logging
 import os
+import threading
+from typing import Optional
 
+import requests
 from dotenv import load_dotenv
 from langchain.docstore.document import Document
 from langchain.memory import ConversationBufferMemory
+from tenacity import retry, stop_after_attempt, wait_fixed
 
 from embedchain.chunkers.base_chunker import BaseChunker
 from embedchain.config import AddConfig, ChatConfig, QueryConfig
@@ -36,6 +41,10 @@ class EmbedChain:
         self.is_docs_site_instance = False
         self.online = False
 
+        # Send anonymous telemetry
+        thread_telemetry = threading.Thread(target=self._send_telemetry_event, args=("init",))
+        thread_telemetry.start()
+
     def add(self, data_type, url, metadata=None, config: AddConfig = None):
         """
         Adds the data from the given URL to the vector db.
@@ -53,10 +62,21 @@ class EmbedChain:
 
         data_formatter = DataFormatter(data_type, config)
         self.user_asks.append([data_type, url, metadata])
-        self.load_and_embed(data_formatter.loader, data_formatter.chunker, url, metadata)
+        documents, _metadatas, _ids, new_chunks = self.load_and_embed(
+            data_formatter.loader, data_formatter.chunker, url, metadata
+        )
         if data_type in ("docs_site",):
             self.is_docs_site_instance = True
 
+        # Send anonymous telemetry
+        if self.config.collect_metrics:
+            # it's quicker to check the variable twice than to count words when they won't be submitted.
+            word_count = sum([len(document.split(" ")) for document in documents])
+
+            extra_metadata = {"data_type": data_type, "word_count": word_count, "chunks_count": new_chunks}
+            thread_telemetry = threading.Thread(target=self._send_telemetry_event, args=("add", extra_metadata))
+            thread_telemetry.start()
+
     def add_local(self, data_type, content, metadata=None, config: AddConfig = None):
         """
         Adds the data you supply to the vector db.
@@ -90,6 +110,7 @@ class EmbedChain:
         :param src: The data to be handled by the loader. Can be a URL for
         remote sources or local content for local loaders.
         :param metadata: Optional. Metadata associated with the data source.
+        :return: (List) documents (embedded text), (List) metadata, (list) ids, (int) number of chunks
         """
         embeddings_data = chunker.create_chunks(loader, src)
         documents = embeddings_data["documents"]
@@ -109,7 +130,8 @@ class EmbedChain:
 
             if not data_dict:
                 print(f"All data from {src} already exists in the database.")
-                return
+                # Make sure to return a matching return type
+                return [], [], [], 0
 
             ids = list(data_dict.keys())
             documents, metadatas = zip(*data_dict.values())
@@ -126,8 +148,10 @@ class EmbedChain:
         # Add metadata to each document
         metadatas_with_metadata = [{**meta, **metadata} for meta in metadatas]
 
-        self.db.add(documents=documents, metadatas=list(metadatas_with_metadata), ids=ids)
-        print((f"Successfully saved {src}. New chunks count: " f"{self.count() - chunks_before_addition}"))
+        self.db.add(documents=documents, metadatas=metadatas_with_metadata, ids=ids)
+        count_new_chunks = self.count() - chunks_before_addition
+        print((f"Successfully saved {src}. New chunks count: {count_new_chunks}"))
+        return list(documents), metadatas_with_metadata, ids, count_new_chunks
 
     def _format_result(self, results):
         return [
@@ -240,6 +264,10 @@ class EmbedChain:
 
         answer = self.get_answer_from_llm(prompt, config)
 
+        # Send anonymous telemetry
+        thread_telemetry = threading.Thread(target=self._send_telemetry_event, args=("query",))
+        thread_telemetry.start()
+
         if isinstance(answer, str):
             logging.info(f"Answer: {answer}")
             return answer
@@ -297,6 +325,10 @@ class EmbedChain:
 
         memory.chat_memory.add_user_message(input_query)
 
+        # Send anonymous telemetry
+        thread_telemetry = threading.Thread(target=self._send_telemetry_event, args=("chat",))
+        thread_telemetry.start()
+
         if isinstance(answer, str):
             memory.chat_memory.add_ai_message(answer)
             logging.info(f"Answer: {answer}")
@@ -321,7 +353,7 @@ class EmbedChain:
         """
         self.collection = self.config.db._get_or_create_collection(collection_name)
 
-    def count(self):
+    def count(self) -> int:
         """
         Count the number of embeddings.
 
@@ -334,4 +366,27 @@ class EmbedChain:
         Resets the database. Deletes all embeddings irreversibly.
         `App` has to be reinitialized after using this method.
         """
+        # Send anonymous telemetry
+        thread_telemetry = threading.Thread(target=self._send_telemetry_event, args=("reset",))
+        thread_telemetry.start()
+
         self.db.reset()
+
+    @retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
+    def _send_telemetry_event(self, method: str, extra_metadata: Optional[dict] = None):
+        if not self.config.collect_metrics:
+            return
+
+        with threading.Lock():
+            url = "https://api.embedchain.ai/api/v1/telemetry/"
+            metadata = {
+                "app_id": self.config.id,
+                "version": importlib.metadata.version(__package__ or __name__),
+                "method": method,
+                "language": "py",
+            }
+            if extra_metadata:
+                metadata.update(extra_metadata)
+
+            response = requests.post(url, json={"metadata": metadata})
+            response.raise_for_status()

+ 2 - 1
tests/embedchain/test_add.py

@@ -3,13 +3,14 @@ import unittest
 from unittest.mock import MagicMock, patch
 
 from embedchain import App
+from embedchain.config import AppConfig
 
 
 class TestApp(unittest.TestCase):
     os.environ["OPENAI_API_KEY"] = "test_key"
 
     def setUp(self):
-        self.app = App()
+        self.app = App(config=AppConfig(collect_metrics=False))
 
     @patch("chromadb.api.models.Collection.Collection.add", MagicMock)
     def test_add(self):

+ 2 - 1
tests/embedchain/test_chat.py

@@ -3,13 +3,14 @@ import unittest
 from unittest.mock import patch
 
 from embedchain import App
+from embedchain.config import AppConfig
 
 
 class TestApp(unittest.TestCase):
     os.environ["OPENAI_API_KEY"] = "test_key"
 
     def setUp(self):
-        self.app = App()
+        self.app = App(config=AppConfig(collect_metrics=False))
 
     @patch("embedchain.embedchain.memory", autospec=True)
     @patch.object(App, "retrieve_from_database", return_value=["Test context"])

+ 1 - 1
tests/embedchain/test_embedchain.py

@@ -25,7 +25,7 @@ class TestChromaDbHostsLoglevel(unittest.TestCase):
         """
         Test if the `App` instance is initialized without a config that does not contain default hosts and ports.
         """
-        config = AppConfig(log_level="DEBUG")
+        config = AppConfig(log_level="DEBUG", collect_metrics=False)
 
         app = App(config)
 

+ 2 - 2
tests/embedchain/test_generate_prompt.py

@@ -2,12 +2,12 @@ import unittest
 from string import Template
 
 from embedchain import App
-from embedchain.embedchain import QueryConfig
+from embedchain.config import AppConfig, QueryConfig
 
 
 class TestGeneratePrompt(unittest.TestCase):
     def setUp(self):
-        self.app = App()
+        self.app = App(config=AppConfig(collect_metrics=False))
 
     def test_generate_prompt_with_template(self):
         """

+ 2 - 2
tests/embedchain/test_query.py

@@ -3,14 +3,14 @@ import unittest
 from unittest.mock import MagicMock, patch
 
 from embedchain import App
-from embedchain.embedchain import QueryConfig
+from embedchain.config import AppConfig, QueryConfig
 
 
 class TestApp(unittest.TestCase):
     os.environ["OPENAI_API_KEY"] = "test_key"
 
     def setUp(self):
-        self.app = App()
+        self.app = App(config=AppConfig(collect_metrics=False))
 
     @patch("chromadb.api.models.Collection.Collection.add", MagicMock)
     def test_query(self):

+ 23 - 23
tests/vectordb/test_chroma_db.py

@@ -39,7 +39,7 @@ class TestChromaDbHostsInit(unittest.TestCase):
         host = "test-host"
         port = "1234"
 
-        config = AppConfig(host=host, port=port)
+        config = AppConfig(host=host, port=port, collect_metrics=False)
 
         _app = App(config)
 
@@ -54,7 +54,7 @@ class TestChromaDbHostsNone(unittest.TestCase):
         Test if the `App` instance is initialized without default hosts and ports.
         """
 
-        _app = App()
+        _app = App(config=AppConfig(collect_metrics=False))
 
         self.assertEqual(mock_client.call_args[0][0].chroma_server_host, None)
         self.assertEqual(mock_client.call_args[0][0].chroma_server_http_port, None)
@@ -68,7 +68,7 @@ class TestChromaDbHostsLoglevel(unittest.TestCase):
         """
         config = AppConfig(log_level="DEBUG")
 
-        _app = App(config)
+        _app = App(config=AppConfig(collect_metrics=False))
 
         self.assertEqual(mock_client.call_args[0][0].chroma_server_host, None)
         self.assertEqual(mock_client.call_args[0][0].chroma_server_http_port, None)
@@ -82,7 +82,7 @@ class TestChromaDbDuplicateHandling:
         # Start with a clean app
         App().reset()
 
-        app = App()
+        app = App(config=AppConfig(collect_metrics=False))
         app.collection.add(embeddings=[[0, 0, 0]], ids=["0"])
         app.collection.add(embeddings=[[0, 0, 0]], ids=["0"])
         assert "Insert of existing embedding ID: 0" in caplog.text
@@ -97,7 +97,7 @@ class TestChromaDbDuplicateHandling:
         # Start with a clean app
         App().reset()
 
-        app = App()
+        app = App(config=AppConfig(collect_metrics=False))
         app.set_collection("test_collection_1")
         app.collection.add(embeddings=[[0, 0, 0]], ids=["0"])
         app.set_collection("test_collection_2")
@@ -111,7 +111,7 @@ class TestChromaDbCollection(unittest.TestCase):
         """
         Test if the `App` instance is initialized with the correct default collection name.
         """
-        app = App()
+        app = App(config=AppConfig(collect_metrics=False))
 
         self.assertEqual(app.collection.name, "embedchain_store")
 
@@ -119,7 +119,7 @@ class TestChromaDbCollection(unittest.TestCase):
         """
         Test if the `App` instance is initialized with the correct custom collection name.
         """
-        config = AppConfig(collection_name="test_collection")
+        config = AppConfig(collection_name="test_collection", collect_metrics=False)
         app = App(config)
 
         self.assertEqual(app.collection.name, "test_collection")
@@ -128,7 +128,7 @@ class TestChromaDbCollection(unittest.TestCase):
         """
         Test if the `App` collection is correctly switched using the `set_collection` method.
         """
-        app = App()
+        app = App(config=AppConfig(collect_metrics=False))
         app.set_collection("test_collection")
 
         self.assertEqual(app.collection.name, "test_collection")
@@ -140,7 +140,7 @@ class TestChromaDbCollection(unittest.TestCase):
         # Start with a clean app
         App().reset()
 
-        app = App()
+        app = App(config=AppConfig(collect_metrics=False))
         app.set_collection("test_collection_1")
         # Collection should be empty when created
         self.assertEqual(app.count(), 0)
@@ -166,12 +166,12 @@ class TestChromaDbCollection(unittest.TestCase):
         # Start with a clean app
         App().reset()
 
-        app = App()
+        app = App(config=AppConfig(collect_metrics=False))
         app.set_collection("test_collection_1")
         app.collection.add(embeddings=[[0, 0, 0]], ids=["0"])
         del app
 
-        app = App()
+        app = App(config=AppConfig(collect_metrics=False))
         app.set_collection("test_collection_1")
         self.assertEqual(app.count(), 1)
 
@@ -185,8 +185,8 @@ class TestChromaDbCollection(unittest.TestCase):
         App().reset()
 
         # Create two apps
-        app1 = App(AppConfig(collection_name="test_collection_1"))
-        app2 = App(AppConfig(collection_name="test_collection_2"))
+        app1 = App(AppConfig(collection_name="test_collection_1", collect_metrics=False))
+        app2 = App(AppConfig(collection_name="test_collection_2", collect_metrics=False))
 
         # app2 has been created last, but adding to app1 will still write to collection 1.
         app1.collection.add(embeddings=[0, 0, 0], ids=["0"])
@@ -211,8 +211,8 @@ class TestChromaDbCollection(unittest.TestCase):
         App().reset()
 
         # Create two apps
-        app1 = App(AppConfig(collection_name="one_collection", id="new_app_id_1"))
-        app2 = App(AppConfig(collection_name="one_collection", id="new_app_id_2"))
+        app1 = App(AppConfig(collection_name="one_collection", id="new_app_id_1", collect_metrics=False))
+        app2 = App(AppConfig(collection_name="one_collection", id="new_app_id_2", collect_metrics=False))
 
         # Add data
         app1.collection.add(embeddings=[[0, 0, 0], [1, 1, 1]], ids=["0", "1"])
@@ -231,10 +231,10 @@ class TestChromaDbCollection(unittest.TestCase):
 
         # Create four apps.
         # app1, which we are about to reset, shares a collection with one, an id with another, and neither with the last.
-        app1 = App(AppConfig(collection_name="one_collection", id="new_app_id_1"))
-        app2 = App(AppConfig(collection_name="one_collection", id="new_app_id_2"))
-        app3 = App(AppConfig(collection_name="three_collection", id="new_app_id_1"))
-        app4 = App(AppConfig(collection_name="four_collection", id="new_app_id_4"))
+        app1 = App(AppConfig(collection_name="one_collection", id="new_app_id_1", collect_metrics=False))
+        app2 = App(AppConfig(collection_name="one_collection", id="new_app_id_2", collect_metrics=False))
+        app3 = App(AppConfig(collection_name="three_collection", id="new_app_id_1", collect_metrics=False))
+        app4 = App(AppConfig(collection_name="four_collection", id="new_app_id_4", collect_metrics=False))
 
         # Each one of them get data
         app1.collection.add(embeddings=[0, 0, 0], ids=["1"])
@@ -246,10 +246,10 @@ class TestChromaDbCollection(unittest.TestCase):
         app1.reset()
 
         # Reinstantiate them
-        app1 = App(AppConfig(collection_name="one_collection", id="new_app_id_1"))
-        app2 = App(AppConfig(collection_name="one_collection", id="new_app_id_2"))
-        app3 = App(AppConfig(collection_name="three_collection", id="new_app_id_3"))
-        app4 = App(AppConfig(collection_name="four_collection", id="new_app_id_3"))
+        app1 = App(AppConfig(collection_name="one_collection", id="new_app_id_1", collect_metrics=False))
+        app2 = App(AppConfig(collection_name="one_collection", id="new_app_id_2", collect_metrics=False))
+        app3 = App(AppConfig(collection_name="three_collection", id="new_app_id_3", collect_metrics=False))
+        app4 = App(AppConfig(collection_name="four_collection", id="new_app_id_3", collect_metrics=False))
 
         # All should be empty
         self.assertEqual(app1.count(), 0)