Преглед изворни кода

[Loaders] Improve web page and sitemap loader usability (#961)

Deshraj Yadav пре 1 године
родитељ
комит
e0b73e6a5a
3 измењена фајла са 14 додатих и 7 уклоњених линија
  1. embedchain/loaders/sitemap.py (+2 −1)
  2. embedchain/loaders/web_page.py (+11 −5)
  3. tests/loaders/test_web_page.py (+1 −1)

+ 2 - 1
embedchain/loaders/sitemap.py

@@ -3,6 +3,7 @@ import hashlib
 import logging
 
 import requests
+from tqdm import tqdm
 
 try:
     from bs4 import BeautifulSoup
@@ -52,7 +53,7 @@ class SitemapLoader(BaseLoader):
 
         with concurrent.futures.ThreadPoolExecutor() as executor:
             future_to_link = {executor.submit(load_link, link): link for link in links}
-            for future in concurrent.futures.as_completed(future_to_link):
+            for future in tqdm(concurrent.futures.as_completed(future_to_link), total=len(links)):
                 link = future_to_link[future]
                 try:
                     data = future.result()

+ 11 - 5
embedchain/loaders/web_page.py

@@ -17,15 +17,17 @@ from embedchain.utils import clean_string
 
 @register_deserializable
 class WebPageLoader(BaseLoader):
+    # Shared session for all instances
+    _session = requests.Session()
+
     def load_data(self, url):
-        """Load data from a web page."""
-        response = requests.get(url)
+        """Load data from a web page using a shared requests session."""
+        response = self._session.get(url, timeout=30)
+        response.raise_for_status()
         data = response.content
         content = self._get_clean_content(data, url)
 
-        meta_data = {
-            "url": url,
-        }
+        meta_data = {"url": url}
 
         doc_id = hashlib.sha256((content + url).encode()).hexdigest()
         return {
@@ -86,3 +88,7 @@ class WebPageLoader(BaseLoader):
             )
 
         return content
+
+    @classmethod
+    def close_session(cls):
+        cls._session.close()

+ 1 - 1
tests/loaders/test_web_page.py

@@ -27,7 +27,7 @@ def test_load_data(web_page_loader):
             </body>
         </html>
     """
-    with patch("embedchain.loaders.web_page.requests.get", return_value=mock_response):
+    with patch("embedchain.loaders.web_page.WebPageLoader._session.get", return_value=mock_response):
         result = web_page_loader.load_data(page_url)
 
     content = web_page_loader._get_clean_content(mock_response.content, page_url)