
feat: Add new data type: code_docs_loader (#274)

Taranjeet Singh 2 years ago
commit 86e4146126

+ 12 - 2
README.md

@@ -29,6 +29,8 @@ embedchain is a framework to easily create LLM powered bots over any dataset. If
    - [Doc File](#doc-file)
    - [Text](#text)
    - [QnA Pair](#qna-pair)
+    - [Sitemap](#sitemap)
+    - [Code Docs Page](#code-docs-page)
    - [Reusing a Vector DB](#reusing-a-vector-db)
    - [More Formats coming soon](#more-formats-coming-soon)
  - [Testing](#testing)
@@ -302,6 +304,14 @@ To add a XML site map containing list of all urls, use the data_type as `sitemap
 app.add('sitemap', 'a_valid_sitemap_url/sitemap.xml')
 ```

+### Code Docs Page
+
+To add a code documentation page, use the data_type as `code_docs_page` and enter the url. Eg:
+
+```python
+app.add("code_docs_page", "https://python.langchain.com/docs/modules/data_connection/vectorstores/integrations/cassandra")
+```
+
 ### Reusing a Vector DB

 Default behavior is to create a persistent vector DB in the directory **./db**. You can split your application into two Python scripts: one to create a local vector DB and the other to reuse this local persistent vector DB. This is useful when you want to index hundreds of documents and separately implement a chat interface.
@@ -425,9 +435,9 @@ einstein_chat_template = Template("""
        You are Albert Einstein, a German-born theoretical physicist,
        widely ranked among the greatest and most influential scientists of all time.

-        Use the following information about Albert Einstein to respond to 
+        Use the following information about Albert Einstein to respond to
        the human's query acting as Albert Einstein.
-        Context: $context                                
+        Context: $context

        Keep the response brief. If you don't know the answer, just say that you don't know, don't try to make up an answer.


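The README snippet above only shows the `add` call. A minimal end-to-end sketch of the new data type might look like the following; it assumes the `App` class and `OPENAI_API_KEY` setup used elsewhere in the README, and the query string is purely illustrative.

```python
from embedchain import App

# Sketch only: index one documentation page, then ask about it.
# Assumes OPENAI_API_KEY is set, as with other embedchain examples.
docs_bot = App()
docs_bot.add(
    "code_docs_page",
    "https://python.langchain.com/docs/modules/data_connection/vectorstores/integrations/cassandra",
)

# query() now picks the code-docs prompt automatically because a
# code_docs_page source was added (see embedchain/embedchain.py below).
answer = docs_bot.query("How do I create a Cassandra vector store?")
print(answer)
```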
+ 9 - 1
embedchain/chunkers/base_chunker.py

@@ -25,7 +25,7 @@ class BaseChunker:
            meta_data = data["meta_data"]
            url = meta_data["url"]

-            chunks = self.text_splitter.split_text(content)
+            chunks = self.get_chunks(content)

            for chunk in chunks:
                chunk_id = hashlib.sha256((chunk + url).encode()).hexdigest()
@@ -39,3 +39,11 @@ class BaseChunker:
             "ids": ids,
             "ids": ids,
             "metadatas": metadatas,
             "metadatas": metadatas,
         }
         }
+
+    def get_chunks(self, content):
+        """
+        Returns chunks using text splitter instance.
+
+        Override in a child class to apply custom chunking logic.
+        """
+        return self.text_splitter.split_text(content)
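The new `get_chunks` hook lets a chunker swap in custom splitting without touching `create_chunks`. A hypothetical override (not part of this commit, names are illustrative) could look like this:

```python
from embedchain.chunkers.base_chunker import BaseChunker


class ParagraphChunker(BaseChunker):
    """Hypothetical chunker that splits on blank lines instead of
    delegating to a langchain text splitter."""

    def __init__(self):
        # No text splitter needed; get_chunks is overridden below.
        super().__init__(None)

    def get_chunks(self, content):
        # Split on blank lines and drop empty pieces.
        return [p.strip() for p in content.split("\n\n") if p.strip()]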

+ 22 - 0
embedchain/chunkers/code_docs_page.py

@@ -0,0 +1,22 @@
+from typing import Optional
+
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+from embedchain.chunkers.base_chunker import BaseChunker
+from embedchain.config.AddConfig import ChunkerConfig
+
+TEXT_SPLITTER_CHUNK_PARAMS = {
+    "chunk_size": 500,
+    "chunk_overlap": 50,
+    "length_function": len,
+}
+
+
+class CodeDocsPageChunker(BaseChunker):
+    """Chunker for code docs page."""
+
+    def __init__(self, config: Optional[ChunkerConfig] = None):
+        if config is None:
+            config = TEXT_SPLITTER_CHUNK_PARAMS
+        text_splitter = RecursiveCharacterTextSplitter(**config)
+        super().__init__(text_splitter)
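For reference, the splitter parameters above behave roughly like this when run standalone; this is only a sketch showing the chunk size and overlap, not code from the commit:

```python
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Same defaults as TEXT_SPLITTER_CHUNK_PARAMS above.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500, chunk_overlap=50, length_function=len
)
sample = "word " * 400  # ~2000 characters of dummy documentation text
chunks = splitter.split_text(sample)
print(len(chunks), "chunks, first chunk length:", len(chunks[0]))
```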

+ 13 - 1
embedchain/config/QueryConfig.py

@@ -17,7 +17,7 @@ DEFAULT_PROMPT = """
 DEFAULT_PROMPT_WITH_HISTORY = """
   Use the following pieces of context to answer the query at the end.
   If you don't know the answer, just say that you don't know, don't try to make up an answer.
-  I will provide you with our conversation history. 
+  I will provide you with our conversation history.

   $context

@@ -28,8 +28,20 @@ DEFAULT_PROMPT_WITH_HISTORY = """
   Helpful Answer:
 """  # noqa:E501

+CODE_DOCS_PAGE_DEFAULT_PROMPT = """
+  Use the following pieces of context to answer the query at the end.
+  If you don't know the answer, just say that you don't know, don't try to make up an answer. Wherever possible, give the complete code snippet. Don't make up any code snippet on your own.
+
+  $context
+
+  Query: $query
+
+  Helpful Answer:
+"""  # noqa:E501
+
 DEFAULT_PROMPT_TEMPLATE = Template(DEFAULT_PROMPT)
 DEFAULT_PROMPT_WITH_HISTORY_TEMPLATE = Template(DEFAULT_PROMPT_WITH_HISTORY)
+CODE_DOCS_PAGE_PROMPT_TEMPLATE = Template(CODE_DOCS_PAGE_DEFAULT_PROMPT)
 query_re = re.compile(r"\$\{*query\}*")
 context_re = re.compile(r"\$\{*context\}*")
 history_re = re.compile(r"\$\{*history\}*")

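The new template is a plain `string.Template` with `$context` and `$query` placeholders, the same pattern as the existing prompts. A quick sketch of how such a template gets filled (the real substitution happens inside embedchain's prompt generation; the values below are made up):

```python
from string import Template

# Same placeholder convention as CODE_DOCS_PAGE_DEFAULT_PROMPT above.
prompt_template = Template(
    "Use the following context.\n\n$context\n\nQuery: $query\n\nHelpful Answer:"
)
prompt = prompt_template.substitute(
    context="Cassandra vector store docs ...",
    query="How do I add documents to the store?",
)
print(prompt)
```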
+ 4 - 0
embedchain/data_formatter/data_formatter.py

@@ -1,3 +1,4 @@
+from embedchain.chunkers.code_docs_page import CodeDocsPageChunker
 from embedchain.chunkers.docx_file import DocxFileChunker
 from embedchain.chunkers.pdf_file import PdfFileChunker
 from embedchain.chunkers.qna_pair import QnaPairChunker
@@ -5,6 +6,7 @@ from embedchain.chunkers.text import TextChunker
 from embedchain.chunkers.web_page import WebPageChunker
 from embedchain.chunkers.youtube_video import YoutubeVideoChunker
 from embedchain.config import AddConfig
+from embedchain.loaders.code_docs_page import CodeDocsPageLoader
 from embedchain.loaders.docx_file import DocxFileLoader
 from embedchain.loaders.local_qna_pair import LocalQnaPairLoader
 from embedchain.loaders.local_text import LocalTextLoader
@@ -41,6 +43,7 @@ class DataFormatter:
             "text": LocalTextLoader(),
             "text": LocalTextLoader(),
             "docx": DocxFileLoader(),
             "docx": DocxFileLoader(),
             "sitemap": SitemapLoader(),
             "sitemap": SitemapLoader(),
+            "code_docs_page": CodeDocsPageLoader(),
         }
         }
         if data_type in loaders:
         if data_type in loaders:
             return loaders[data_type]
             return loaders[data_type]
@@ -63,6 +66,7 @@ class DataFormatter:
             "text": TextChunker(config),
             "text": TextChunker(config),
             "docx": DocxFileChunker(config),
             "docx": DocxFileChunker(config),
             "sitemap": WebPageChunker(config),
             "sitemap": WebPageChunker(config),
+            "code_docs_page": CodeDocsPageChunker(config)
         }
         }
         if data_type in chunkers:
         if data_type in chunkers:
             return chunkers[data_type]
             return chunkers[data_type]

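With these two entries in place, `DataFormatter` can resolve both halves of the new data type. A small sketch, assuming `DataFormatter` is constructed with a data type and an `AddConfig` the way `EmbedChain.add` uses it (that constructor signature is an assumption here, not shown in this diff):

```python
from embedchain.config import AddConfig
from embedchain.data_formatter import DataFormatter

# Assumption: DataFormatter(data_type, config), as used by EmbedChain.add.
formatter = DataFormatter("code_docs_page", AddConfig())
print(type(formatter.loader).__name__)   # expected: CodeDocsPageLoader
print(type(formatter.chunker).__name__)  # expected: CodeDocsPageChunker
```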
+ 10 - 2
embedchain/embedchain.py

@@ -9,7 +9,7 @@ from langchain.docstore.document import Document
 from langchain.memory import ConversationBufferMemory

 from embedchain.config import AddConfig, ChatConfig, InitConfig, QueryConfig
-from embedchain.config.QueryConfig import DEFAULT_PROMPT
+from embedchain.config.QueryConfig import DEFAULT_PROMPT, CODE_DOCS_PAGE_PROMPT_TEMPLATE
 from embedchain.data_formatter import DataFormatter

 gpt4all_model = None
@@ -35,6 +35,7 @@ class EmbedChain:
         self.db_client = self.config.db.client
         self.collection = self.config.db.collection
         self.user_asks = []
+        self.is_code_docs_instance = False

     def add(self, data_type, url, metadata=None, config: AddConfig = None):
         """
@@ -56,6 +57,8 @@ class EmbedChain:
         self.load_and_embed(
             data_formatter.loader, data_formatter.chunker, url, metadata
         )
+        if data_type in ("code_docs_page", ):
+            self.is_code_docs_instance = True

     def add_local(self, data_type, content, metadata=None, config: AddConfig = None):
         """
@@ -211,6 +214,9 @@ class EmbedChain:
         """
         """
         if config is None:
         if config is None:
             config = QueryConfig()
             config = QueryConfig()
+        if self.is_code_docs_instance:
+            config.template = CODE_DOCS_PAGE_PROMPT_TEMPLATE
+            config.number_documents = 5
         contexts = self.retrieve_from_database(input_query, config)
         contexts = self.retrieve_from_database(input_query, config)
         prompt = self.generate_prompt(input_query, contexts, config)
         prompt = self.generate_prompt(input_query, contexts, config)
         logging.info(f"Prompt: {prompt}")
         logging.info(f"Prompt: {prompt}")
@@ -244,7 +250,9 @@ class EmbedChain:
         """
         """
         if config is None:
         if config is None:
             config = ChatConfig()
             config = ChatConfig()
-
+        if self.is_code_docs_instance:
+            config.template = CODE_DOCS_PAGE_PROMPT_TEMPLATE
+            config.number_documents = 5
         contexts = self.retrieve_from_database(input_query, config)
         contexts = self.retrieve_from_database(input_query, config)
 
 
         global memory
         global memory

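Because `is_code_docs_instance` only flips defaults, the same behaviour can be spelled out explicitly with a `QueryConfig`. This is just a sketch of what `query()`/`chat()` now do internally, not a new API; the commented-out call assumes an already constructed `app`:

```python
from embedchain.config import QueryConfig
from embedchain.config.QueryConfig import CODE_DOCS_PAGE_PROMPT_TEMPLATE

# Mirror of the internal override: code-docs prompt plus 5 retrieved documents.
config = QueryConfig()
config.template = CODE_DOCS_PAGE_PROMPT_TEMPLATE
config.number_documents = 5
# app.query("How do I use the Cassandra vector store?", config)  # hypothetical call
```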
+ 63 - 0
embedchain/loaders/code_docs_page.py

@@ -0,0 +1,63 @@
+import requests
+from bs4 import BeautifulSoup
+
+from embedchain.utils import clean_string
+
+class CodeDocsPageLoader:
+    def load_data(self, url):
+        """Load data from a web page."""
+        response = requests.get(url)
+        data = response.content
+        soup = BeautifulSoup(data, "html.parser")
+        selectors = [
+            'article.bd-article',
+            'article[role="main"]',
+            'div.md-content',
+            'div[role="main"]',
+            'div.container',
+            'div.section',
+            'article',
+            'main',
+        ]
+        content = None
+        for selector in selectors:
+            element = soup.select_one(selector)
+            if element is not None:
+                content = element.prettify()
+                break
+        if not content:
+            content = soup.get_text()
+        soup = BeautifulSoup(content, "html.parser")
+        for tag in soup(
+            [
+                "nav",
+                "aside",
+                "form",
+                "header",
+                "noscript",
+                "svg",
+                "canvas",
+                "footer",
+                "script",
+                "style",
+            ]
+        ):
+            tag.string = " "
+        for div in soup.find_all("div", {'class': 'cell_output'}):
+            div.decompose()
+        for div in soup.find_all("div", {'class': 'output_wrapper'}):
+            div.decompose()
+        for div in soup.find_all("div", {'class': 'output'}):
+            div.decompose()
+        content = clean_string(soup.get_text())
+        output = []
+        meta_data = {
+            "url": url,
+        }
+        output.append(
+            {
+                "content": content,
+                "meta_data": meta_data,
+            }
+        )
+        return output
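Used on its own, the loader returns a one-element list containing the cleaned page text and the source URL, which is what the chunker above consumes. A rough usage sketch (requires `requests`, `beautifulsoup4`, and network access):

```python
from embedchain.loaders.code_docs_page import CodeDocsPageLoader

loader = CodeDocsPageLoader()
docs = loader.load_data(
    "https://python.langchain.com/docs/modules/data_connection/vectorstores/integrations/cassandra"
)
print(docs[0]["meta_data"]["url"])
print(docs[0]["content"][:200])  # first 200 characters of cleaned text
```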