
feat: add SitemapLoader (#68)

aaishikdutta 2 years ago
commit d2e8f796ca

+ 7 - 0
README.md

@@ -294,6 +294,13 @@ To supply your own QnA pair, use the data_type as `qna_pair` and enter a tuple.
 ```python
 app.add_local('qna_pair', ("Question", "Answer"))
 ```
+### Sitemap
+
+To add an XML sitemap containing a list of all URLs, use the data_type as `sitemap` and enter the sitemap URL. E.g.:
+
+```python
+app.add('sitemap', 'a_valid_sitemap_url/sitemap.xml')
+```
 
 ### Reusing a Vector DB
 

+ 1 - 0
embedchain/config/InitConfig.py

@@ -62,6 +62,7 @@ class InitConfig(BaseConfig):
         Sets database to default (`ChromaDb`).
         """
         from embedchain.vectordb.chroma_db import ChromaDB
+
         self.db = ChromaDB(ef=self.ef, host=self.host, port=self.port)
 
     def _setup_logging(self, debug_level):

+ 3 - 0
embedchain/data_formatter/data_formatter.py

@@ -9,6 +9,7 @@ from embedchain.loaders.docx_file import DocxFileLoader
 from embedchain.loaders.local_qna_pair import LocalQnaPairLoader
 from embedchain.loaders.local_text import LocalTextLoader
 from embedchain.loaders.pdf_file import PdfFileLoader
+from embedchain.loaders.sitemap import SitemapLoader
 from embedchain.loaders.web_page import WebPageLoader
 from embedchain.loaders.youtube_video import YoutubeVideoLoader
 
@@ -39,6 +40,7 @@ class DataFormatter:
             "qna_pair": LocalQnaPairLoader(),
             "text": LocalTextLoader(),
             "docx": DocxFileLoader(),
+            "sitemap": SitemapLoader(),
         }
         if data_type in loaders:
             return loaders[data_type]
@@ -60,6 +62,7 @@ class DataFormatter:
             "qna_pair": QnaPairChunker(config),
             "text": TextChunker(config),
             "docx": DocxFileChunker(config),
+            "sitemap": WebPageChunker(config),
         }
         if data_type in chunkers:
             return chunkers[data_type]

+ 24 - 0
embedchain/loaders/sitemap.py

@@ -0,0 +1,24 @@
+import requests
+from bs4 import BeautifulSoup
+
+from embedchain.loaders.web_page import WebPageLoader
+
+
+class SitemapLoader:
+    def load_data(self, sitemap_url):
+        """
+        This method takes a sitemap URL as input and retrieves
+        all the URLs to use the WebPageLoader to load content
+        of each page.
+        """
+        output = []
+        web_page_loader = WebPageLoader()
+        response = requests.get(sitemap_url)
+        response.raise_for_status()
+
+        soup = BeautifulSoup(response.text, "xml")
+        links = [link.text for link in soup.find_all("loc")]
+        for link in links:
+            each_load_data = web_page_loader.load_data(link)
+            output.append(each_load_data)
+        return [data[0] for data in output]
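
A note on the new loader above: `BeautifulSoup(response.text, "xml")` relies on the lxml XML parser, which is why `lxml` is added to `setup.py` further down in this commit. Below is a minimal sketch of calling the loader directly, assuming the package layout introduced here; the sitemap URL is a placeholder, not part of the commit.

```python
from embedchain.loaders.sitemap import SitemapLoader

loader = SitemapLoader()

# Fetches the sitemap, reads every <loc> entry, loads each page through
# WebPageLoader, and returns the first document produced for each URL.
# "https://example.com/sitemap.xml" is a hypothetical URL for illustration.
docs = loader.load_data("https://example.com/sitemap.xml")
```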

+ 1 - 1
embedchain/vectordb/chroma_db.py

@@ -1,5 +1,5 @@
-import os
 import logging
+import os
 
 import chromadb
 from chromadb.utils import embedding_functions

+ 1 - 0
setup.py

@@ -29,6 +29,7 @@ setuptools.setup(
         "beautifulsoup4",
         "pypdf",
         "pytube",
+        "lxml",
         "gpt4all",
         "sentence_transformers",
         "docx2txt",