Bläddra i källkod

feat: added data format to metadata internally (#314)

aaishikdutta 2 år sedan
förälder
incheckning
c12362486f
2 ändrade filer med 13 tillägg och 1 borttagning
  1. 10 0
      embedchain/chunkers/base_chunker.py
  2. 3 1
      embedchain/data_formatter/data_formatter.py

+ 10 - 0
embedchain/chunkers/base_chunker.py

@@ -5,6 +5,7 @@ class BaseChunker:
     def __init__(self, text_splitter):
         """Initialize the chunker."""
         self.text_splitter = text_splitter
+        self.data_type = None
 
     def create_chunks(self, loader, src):
         """
@@ -22,7 +23,10 @@ class BaseChunker:
         metadatas = []
         for data in datas:
             content = data["content"]
+
             meta_data = data["meta_data"]
+            # add data type to meta data to allow query using data type
+            meta_data["data_type"] = self.data_type
             url = meta_data["url"]
 
             chunks = self.get_chunks(content)
@@ -47,3 +51,9 @@ class BaseChunker:
         Override in child class if custom logic.
         """
         return self.text_splitter.split_text(content)
+
+    def set_data_type(self, data_type):
+        """
+        Store the data type that create_chunks copies into each item's meta_data under "data_type".
+        """
+        self.data_type = data_type

+ 3 - 1
embedchain/data_formatter/data_formatter.py

@@ -69,6 +69,8 @@ class DataFormatter:
             "docs_site": DocsSiteChunker(config),
         }
         if data_type in chunkers:
-            return chunkers[data_type]
+            chunker = chunkers[data_type]
+            chunker.set_data_type(data_type)
+            return chunker
         else:
             raise ValueError(f"Unsupported data type: {data_type}")