Ver Fonte

fix sitemap loader (#986)

Co-authored-by: Deven Patel <deven298@yahoo.com>
Deven Patel há 1 ano
pai
commit
c9fbc2e7d6
1 ficheiro alterado com 4 adições e 8 exclusões
  1. 4 8
      embedchain/loaders/sitemap.py

+ 4 - 8
embedchain/loaders/sitemap.py

@@ -16,7 +16,6 @@ except ImportError:
 from embedchain.helpers.json_serializable import register_deserializable
 from embedchain.loaders.base_loader import BaseLoader
 from embedchain.loaders.web_page import WebPageLoader
-from embedchain.utils import is_readable
 
 
 @register_deserializable
@@ -40,19 +39,16 @@ class SitemapLoader(BaseLoader):
 
         doc_id = hashlib.sha256((" ".join(links) + sitemap_url).encode()).hexdigest()
 
-        def load_link(link):
+        def load_web_page(link):
             try:
-                each_load_data = web_page_loader.load_data(link)
-                if is_readable(each_load_data.get("data")[0].get("content")):
-                    return each_load_data.get("data")
-                else:
-                    logging.warning(f"Page is not readable (too many invalid characters): {link}")
+                loader_data = web_page_loader.load_data(link)
+                return loader_data.get("data")
             except ParserRejectedMarkup as e:
                 logging.error(f"Failed to parse {link}: {e}")
             return None
 
         with concurrent.futures.ThreadPoolExecutor() as executor:
-            future_to_link = {executor.submit(load_link, link): link for link in links}
+            future_to_link = {executor.submit(load_web_page, link): link for link in links}
             for future in tqdm(concurrent.futures.as_completed(future_to_link), total=len(links), desc="Loading pages"):
                 link = future_to_link[future]
                 try: