Jelajahi Sumber

feat: add local qna pair

cachho 2 tahun lalu
induk
melakukan
ff2d5ce7fa

+ 16 - 0
embedchain/chunkers/qna_pair.py

@@ -0,0 +1,16 @@
+from embedchain.chunkers.base_chunker import BaseChunker
+
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+
+TEXT_SPLITTER_CHUNK_PARAMS = {
+    "chunk_size": 300,
+    "chunk_overlap": 0,
+    "length_function": len,
+}
+
+
+class QnaPairChunker(BaseChunker):
+    def __init__(self):
+        text_splitter = RecursiveCharacterTextSplitter(**TEXT_SPLITTER_CHUNK_PARAMS)
+        super().__init__(text_splitter)

+ 20 - 2
embedchain/embedchain.py

@@ -8,9 +8,11 @@ from langchain.embeddings.openai import OpenAIEmbeddings
 from embedchain.loaders.youtube_video import YoutubeVideoLoader
 from embedchain.loaders.pdf_file import PdfFileLoader
 from embedchain.loaders.web_page import WebPageLoader
+from embedchain.loaders_local.qna_pair import QnaPairLoader
 from embedchain.chunkers.youtube_video import YoutubeVideoChunker
 from embedchain.chunkers.pdf_file import PdfFileChunker
 from embedchain.chunkers.web_page import WebPageChunker
+from embedchain.chunkers.qna_pair import QnaPairChunker
 from embedchain.vectordb.chroma_db import ChromaDB
 
 load_dotenv()
@@ -46,7 +48,8 @@ class EmbedChain:
         loaders = {
             'youtube_video': YoutubeVideoLoader(),
             'pdf_file': PdfFileLoader(),
-            'web_page': WebPageLoader()
+            'web_page': WebPageLoader(),
+            'qna_pair': QnaPairLoader()
         }
         if data_type in loaders:
             return loaders[data_type]
@@ -64,7 +67,8 @@ class EmbedChain:
         chunkers = {
             'youtube_video': YoutubeVideoChunker(),
             'pdf_file': PdfFileChunker(),
-            'web_page': WebPageChunker()
+            'web_page': WebPageChunker(),
+            'qna_pair': QnaPairChunker(),
         }
         if data_type in chunkers:
             return chunkers[data_type]
@@ -85,6 +89,20 @@ class EmbedChain:
         self.user_asks.append([data_type, url])
         self.load_and_embed(loader, chunker, url)
 
+    def add_local(self, data_type, content):
+        """
+        Adds the data you supply to the vector db.
+        Loads the data, chunks it, create embedding for each chunk
+        and then stores the embedding to vector database.
+
+        :param data_type: The type of the data to add.
+        :param content: The local data. Refer to the `README` for formatting.
+        """
+        loader = self._get_loader(data_type)
+        chunker = self._get_chunker(data_type)
+        self.user_asks.append([data_type, content])
+        self.load_and_embed(loader, chunker, content)
+
     def load_and_embed(self, loader, chunker, url):
         """
         Loads the data from the given URL, chunks it, and adds it to the database.

+ 0 - 0
embedchain/loaders_local/__init__.py


+ 16 - 0
embedchain/loaders_local/qna_pair.py

@@ -0,0 +1,16 @@
+from embedchain.utils import markdown_to_plaintext
+
+
+class QnaPairLoader:
+
+    def load_data(self, content):
+        question, answer = content
+        answer = markdown_to_plaintext(answer)
+        content = f"Q: {question}\nA: {answer}"
+        meta_data = {
+            "url": "local",
+        }
+        return [{
+            "content": content,
+            "meta_data": meta_data,
+        }]