Browse Source

bug: Prevent clashing chunk IDs (#160)

This commit inserts each repeated chunk only once,
preventing the Chroma duplicate-ID error.
Hao (Harin) Wu 2 years ago
parent
commit
996211e23e
1 changed file with 6 additions and 3 deletions
  1. 6 3
      embedchain/chunkers/base_chunker.py

+ 6 - 3
embedchain/chunkers/base_chunker.py

@@ -14,6 +14,7 @@ class BaseChunker:
         """
         documents = []
         ids = []
+        idMap = {}
         datas = loader.load_data(src)
         metadatas = []
         for data in datas:
@@ -25,9 +26,11 @@ class BaseChunker:
 
             for chunk in chunks:
                 chunk_id = hashlib.sha256((chunk + url).encode()).hexdigest()
-                ids.append(chunk_id)
-                documents.append(chunk)
-                metadatas.append(meta_data)
+                if (idMap.get(chunk_id) is None):
+                    idMap[chunk_id] = True
+                    ids.append(chunk_id)
+                    documents.append(chunk)
+                    metadatas.append(meta_data)
         return {
             "documents": documents,
             "ids": ids,