@@ -322,26 +322,10 @@ class EmbedChain(JSONSerializable):
         count_new_chunks = self.db.count() - chunks_before_addition
         print((f"Successfully saved {src} ({chunker.data_type}). New chunks count: {count_new_chunks}"))
         return list(documents), metadatas, ids, count_new_chunks
-
-    def load_and_embed_v2(
-        self,
-        loader: BaseLoader,
-        chunker: BaseChunker,
-        src: Any,
-        metadata: Optional[Dict[str, Any]] = None,
-        source_id: Optional[str] = None,
-        dry_run=False,
-    ):
+
+    def _get_existing_doc_id(self, chunker: BaseChunker, src: Any):
         """
-        Loads the data from the given URL, chunks it, and adds it to database.
-
-        :param loader: The loader to use to load the data.
-        :param chunker: The chunker to use to chunk the data.
-        :param src: The data to be handled by the loader. Can be a URL for
-        remote sources or local content for local loaders.
-        :param metadata: Optional. Metadata associated with the data source.
-        :param source_id: Hexadecimal hash of the source.
-        :return: (List) documents (embedded text), (List) metadata, (list) ids, (int) number of chunks
+        Get id of existing document for a given source, based on the data type
         """
         # Find existing embeddings for the source
         # Depending on the data type, existing embeddings are checked for.
@@ -350,7 +334,7 @@ class EmbedChain(JSONSerializable):
             # Think of a text:
             # Either it's the same, then it won't change, so it's not an update.
             # Or it's different, then it will be added as a new text.
-            existing_doc_id = None
+            return None
         elif chunker.data_type.value in [item.value for item in IndirectDataType]:
             # These types have a indirect source reference
             # As long as the reference is the same, they can be updated.
@@ -360,10 +344,10 @@ class EmbedChain(JSONSerializable):
                 },
                 limit=1,
             )
-            try:
-                existing_doc_id = existing_embeddings_data.get("metadatas", [])[0]["doc_id"]
-            except Exception:
-                existing_doc_id = None
+            if len(existing_embeddings_data.get("metadatas", [])) > 0:
+                return existing_embeddings_data["metadatas"][0]["doc_id"]
+            else:
+                return None
         elif chunker.data_type.value in [item.value for item in SpecialDataType]:
             # These types don't contain indirect references.
             # Through custom logic, they can be attributed to a source and be updated.
@@ -375,10 +359,10 @@ class EmbedChain(JSONSerializable):
                 },
                 limit=1,
             )
-            try:
-                existing_doc_id = existing_embeddings_data.get("metadatas", [])[0]["doc_id"]
-            except Exception:
-                existing_doc_id = None
+            if len(existing_embeddings_data.get("metadatas", [])) > 0:
+                return existing_embeddings_data["metadatas"][0]["doc_id"]
+            else:
+                return None
         else:
             raise NotImplementedError(
                 f"SpecialDataType {chunker.data_type} must have a custom logic to check for existing data"
@@ -389,6 +373,28 @@ class EmbedChain(JSONSerializable):
                 "When it should be DirectDataType, IndirectDataType or SpecialDataType."
             )
 
+    def load_and_embed_v2(
+        self,
+        loader: BaseLoader,
+        chunker: BaseChunker,
+        src: Any,
+        metadata: Optional[Dict[str, Any]] = None,
+        source_id: Optional[str] = None,
+        dry_run=False,
+    ):
+        """
+        Loads the data from the given URL, chunks it, and adds it to database.
+
+        :param loader: The loader to use to load the data.
+        :param chunker: The chunker to use to chunk the data.
+        :param src: The data to be handled by the loader. Can be a URL for
+        remote sources or local content for local loaders.
+        :param metadata: Optional. Metadata associated with the data source.
+        :param source_id: Hexadecimal hash of the source.
+        :return: (List) documents (embedded text), (List) metadata, (list) ids, (int) number of chunks
+        """
+        existing_doc_id = self._get_existing_doc_id(chunker=chunker, src=src)
+
         # Create chunks
         embeddings_data = chunker.create_chunks(loader, src)
 
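
Besides extracting _get_existing_doc_id, the diff replaces the broad try/except around the metadata lookup with an explicit emptiness check before indexing. A minimal standalone sketch of that pattern, assuming a hypothetical lookup_doc_id helper and hard-coded query results (not part of the embedchain API):

from typing import Any, Dict, List, Optional


def lookup_doc_id(query_result: Dict[str, List[Dict[str, Any]]]) -> Optional[str]:
    # Mirrors the new logic in _get_existing_doc_id: confirm at least one
    # metadata record came back before indexing into it, rather than catching
    # IndexError/KeyError from a blind [0] access.
    metadatas = query_result.get("metadatas", [])
    if len(metadatas) > 0:
        return metadatas[0]["doc_id"]
    return None


# Hypothetical query results: one hit, one miss.
print(lookup_doc_id({"metadatas": [{"doc_id": "abc123"}]}))  # abc123
print(lookup_doc_id({"metadatas": []}))                      # None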