Explorar el Código

Merge pull request #33 from embedchain/add-caching-support

Skip creating embedding if it already exists
Taranjeet Singh hace 2 años
padre
commit
d9a6077b6a
Se ha modificado 1 fichero con 18 adiciones y 0 borrados
  1. 18 0
      embedchain/embedchain.py

+ 18 - 0
embedchain/embedchain.py

@@ -97,6 +97,24 @@ class EmbedChain:
         documents = embeddings_data["documents"]
         metadatas = embeddings_data["metadatas"]
         ids = embeddings_data["ids"]
+        # Look up which of these ids are already stored and drop only those docs.
+        existing_docs = self.collection.get(
+            ids=ids,
+            # where={"url": url}
+        )
+        existing_ids = set(existing_docs["ids"])
+
+        if len(existing_ids):
+            data_dict = {id: (doc, meta) for id, doc, meta in zip(ids, documents, metadatas)}
+            data_dict = {id: value for id, value in data_dict.items() if id not in existing_ids}
+
+            if not data_dict:
+                print(f"All data from {url} already exists in the database.")
+                return
+
+            ids = list(data_dict.keys())
+            documents, metadatas = zip(*data_dict.values())
+
         self.collection.add(
             documents=documents,
             metadatas=metadatas,