|
@@ -21,7 +21,8 @@ from embedchain.embedder.base import BaseEmbedder
|
|
|
from embedchain.helper.json_serializable import JSONSerializable
|
|
|
from embedchain.llm.base import BaseLlm
|
|
|
from embedchain.loaders.base_loader import BaseLoader
|
|
|
-from embedchain.models.data_type import DataType
|
|
|
+from embedchain.models.data_type import (DataType, DirectDataType,
|
|
|
+ IndirectDataType, SpecialDataType)
|
|
|
from embedchain.utils import detect_datatype
|
|
|
from embedchain.vectordb.base import BaseVectorDB
|
|
|
|
|
@@ -339,16 +340,53 @@ class EmbedChain(JSONSerializable):
|
|
|
:param source_id: Hexadecimal hash of the source.
|
|
|
:return: (List) documents (embedded text), (List) metadata, (list) ids, (int) number of chunks
|
|
|
"""
|
|
|
- existing_embeddings_data = self.db.get(
|
|
|
- where={
|
|
|
- "url": src,
|
|
|
- },
|
|
|
- limit=1,
|
|
|
- )
|
|
|
- try:
|
|
|
- existing_doc_id = existing_embeddings_data.get("metadatas", [])[0]["doc_id"]
|
|
|
- except Exception:
|
|
|
+ # Find existing embeddings for the source
|
|
|
+ # How existing embeddings are looked up depends on the data type.
|
|
|
+ if chunker.data_type.value in [item.value for item in DirectDataType]:
|
|
|
+ # DirectDataTypes can't be updated.
|
|
|
+ # Think of a text:
|
|
|
+ # Either it's the same, then it won't change, so it's not an update.
|
|
|
+ # Or it's different, then it will be added as a new text.
|
|
|
existing_doc_id = None
|
|
|
+ elif chunker.data_type.value in [item.value for item in IndirectDataType]:
|
|
|
+ # These types have an indirect source reference
|
|
|
+ # As long as the reference is the same, they can be updated.
|
|
|
+ existing_embeddings_data = self.db.get(
|
|
|
+ where={
|
|
|
+ "url": src,
|
|
|
+ },
|
|
|
+ limit=1,
|
|
|
+ )
|
|
|
+ try:
|
|
|
+ existing_doc_id = existing_embeddings_data.get("metadatas", [])[0]["doc_id"]
|
|
|
+ except Exception:
|
|
|
+ existing_doc_id = None
|
|
|
+ elif chunker.data_type.value in [item.value for item in SpecialDataType]:
|
|
|
+ # These types don't contain indirect references.
|
|
|
+ # Through custom logic, they can be attributed to a source and be updated.
|
|
|
+ if chunker.data_type == DataType.QNA_PAIR:
|
|
|
+ # QNA_PAIRs update the answer if the question already exists.
|
|
|
+ existing_embeddings_data = self.db.get(
|
|
|
+ where={
|
|
|
+ "question": src[0],
|
|
|
+ },
|
|
|
+ limit=1,
|
|
|
+ )
|
|
|
+ try:
|
|
|
+ existing_doc_id = existing_embeddings_data.get("metadatas", [])[0]["doc_id"]
|
|
|
+ except Exception:
|
|
|
+ existing_doc_id = None
|
|
|
+ else:
|
|
|
+ raise NotImplementedError(
|
|
|
+ f"SpecialDataType {chunker.data_type} must have a custom logic to check for existing data"
|
|
|
+ )
|
|
|
+ else:
|
|
|
+ raise TypeError(
|
|
|
+ f"{chunker.data_type} is type {type(chunker.data_type)}. "
|
|
|
+ "When it should be DirectDataType, IndirectDataType or SpecialDataType."
|
|
|
+ )
|
|
|
+
|
|
|
+ # Create chunks
|
|
|
embeddings_data = chunker.create_chunks(loader, src)
|
|
|
|
|
|
# spread chunking results
|