Pārlūkot izejas kodu

[Bugfix] fix sitemap loader (#1000)

Co-authored-by: Deven Patel <deven298@yahoo.com>
Deven Patel 1 gadu atpakaļ
vecāks
revīzija
ff4a333be7

+ 2 - 1
embedchain/loaders/beehiiv.py

@@ -1,9 +1,10 @@
 import hashlib
 import logging
 import time
-import requests
 from xml.etree import ElementTree
 
+import requests
+
 from embedchain.helpers.json_serializable import register_deserializable
 from embedchain.loaders.base_loader import BaseLoader
 from embedchain.utils import is_readable

+ 2 - 1
embedchain/loaders/rss_feed.py

@@ -28,7 +28,8 @@ class RSSFeedLoader(BaseLoader):
     @staticmethod
     def get_rss_content(url: str):
         try:
-            from langchain.document_loaders import RSSFeedLoader as LangchainRSSFeedLoader
+            from langchain.document_loaders import \
+                RSSFeedLoader as LangchainRSSFeedLoader
         except ImportError:
             raise ImportError(
                 """RSSFeedLoader file requires extra dependencies.

+ 2 - 0
embedchain/loaders/sitemap.py

@@ -37,9 +37,11 @@ class SitemapLoader(BaseLoader):
         if urlparse(sitemap_url).scheme in ["http", "https"]:
             response = requests.get(sitemap_url)
             response.raise_for_status()
+            soup = BeautifulSoup(response.text, "xml")
         else:
             with open(sitemap_url, "r") as file:
                 soup = BeautifulSoup(file, "xml")
+
         links = [link.text for link in soup.find_all("loc") if link.parent.name == "url"]
         if len(links) == 0:
             links = [link.text for link in soup.find_all("loc")]

+ 2 - 1
embedchain/loaders/substack.py

@@ -1,9 +1,10 @@
 import hashlib
 import logging
 import time
+from xml.etree import ElementTree
 
 import requests
-from xml.etree import ElementTree
+
 from embedchain.helpers.json_serializable import register_deserializable
 from embedchain.loaders.base_loader import BaseLoader
 from embedchain.utils import is_readable

+ 2 - 1
embedchain/utils.py

@@ -196,7 +196,8 @@ def detect_datatype(source: Any) -> DataType:
     formatted_source = format_source(str(source), 30)
 
     if url:
-        from langchain.document_loaders.youtube import ALLOWED_NETLOCK as YOUTUBE_ALLOWED_NETLOCS
+        from langchain.document_loaders.youtube import \
+            ALLOWED_NETLOCK as YOUTUBE_ALLOWED_NETLOCS
 
         if url.netloc in YOUTUBE_ALLOWED_NETLOCS:
             logging.debug(f"Source of `{formatted_source}` detected as `youtube_video`.")

+ 1 - 1
pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "embedchain"
-version = "0.1.30"
+version = "0.1.31"
 description = "Data platform for LLMs - Load, index, retrieve and sync any unstructured data"
 authors = [
     "Taranjeet Singh <taranjeet@embedchain.ai>",