
feat: add SitemapLoader (#68)

aaishikdutta 2 years ago
commit d2e8f796ca

+ 7 - 0
README.md

@@ -294,6 +294,13 @@ To supply your own QnA pair, use the data_type as `qna_pair` and enter a tuple.
 ```python
 app.add_local('qna_pair', ("Question", "Answer"))
 ```
+### Sitemap
+
+To add an XML sitemap containing a list of all URLs, use the data_type as `sitemap` and enter the sitemap URL. E.g.:
+
+```python
+app.add('sitemap', 'a_valid_sitemap_url/sitemap.xml')
+```
 
 ### Reusing a Vector DB
 

+ 1 - 0
embedchain/config/InitConfig.py

@@ -62,6 +62,7 @@ class InitConfig(BaseConfig):
         Sets database to default (`ChromaDb`).
         """
         from embedchain.vectordb.chroma_db import ChromaDB
+
         self.db = ChromaDB(ef=self.ef, host=self.host, port=self.port)
 
     def _setup_logging(self, debug_level):

+ 3 - 0
embedchain/data_formatter/data_formatter.py

@@ -9,6 +9,7 @@ from embedchain.loaders.docx_file import DocxFileLoader
 from embedchain.loaders.local_qna_pair import LocalQnaPairLoader
 from embedchain.loaders.local_text import LocalTextLoader
 from embedchain.loaders.pdf_file import PdfFileLoader
+from embedchain.loaders.sitemap import SitemapLoader
 from embedchain.loaders.web_page import WebPageLoader
 from embedchain.loaders.youtube_video import YoutubeVideoLoader
 
@@ -39,6 +40,7 @@ class DataFormatter:
             "qna_pair": LocalQnaPairLoader(),
             "text": LocalTextLoader(),
             "docx": DocxFileLoader(),
+            "sitemap": SitemapLoader(),
         }
         if data_type in loaders:
             return loaders[data_type]
@@ -60,6 +62,7 @@ class DataFormatter:
             "qna_pair": QnaPairChunker(config),
             "text": TextChunker(config),
             "docx": DocxFileChunker(config),
+            "sitemap": WebPageChunker(config),
         }
         if data_type in chunkers:
             return chunkers[data_type]

+ 24 - 0
embedchain/loaders/sitemap.py

@@ -0,0 +1,24 @@
+import requests
+from bs4 import BeautifulSoup
+
+from embedchain.loaders.web_page import WebPageLoader
+
+
+class SitemapLoader:
+    def load_data(self, sitemap_url):
+        """
+        This method takes a sitemap URL as input and retrieves
+        all the URLs to use the WebPageLoader to load content
+        of each page.
+        """
+        output = []
+        web_page_loader = WebPageLoader()
+        response = requests.get(sitemap_url)
+        response.raise_for_status()
+
+        soup = BeautifulSoup(response.text, "xml")
+        links = [link.text for link in soup.find_all("loc")]
+        for link in links:
+            each_load_data = web_page_loader.load_data(link)
+            output.append(each_load_data)
+        return [data[0] for data in output]
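
A note on the new loader above: `BeautifulSoup(response.text, "xml")` relies on the lxml XML parser, which is why `lxml` is added to `setup.py` further down in this commit. Below is a minimal sketch of calling the loader directly, assuming the package layout introduced here; the sitemap URL is a placeholder, not part of the commit.

```python
from embedchain.loaders.sitemap import SitemapLoader

loader = SitemapLoader()

# Fetches the sitemap, reads every <loc> entry, loads each page through
# WebPageLoader, and returns the first document produced for each URL.
# "https://example.com/sitemap.xml" is a hypothetical URL for illustration.
docs = loader.load_data("https://example.com/sitemap.xml")
```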

+ 1 - 1
embedchain/vectordb/chroma_db.py

@@ -1,5 +1,5 @@
-import os
 import logging
+import os
 
 import chromadb
 from chromadb.utils import embedding_functions

+ 1 - 0
setup.py

@@ -29,6 +29,7 @@ setuptools.setup(
         "beautifulsoup4",
         "pypdf",
         "pytube",
+        "lxml",
         "gpt4all",
         "sentence_transformers",
         "docx2txt",