Pārlūkot izejas kodu

Speed up docs site loader (#1266)

Saurabh Misra 1 gadu atpakaļ
vecāks
revīzija
940f0128d5
1 mainīts fails ar 9 papildinājumiem un 7 dzēšanām
  1. 9 7
      embedchain/loaders/docs_site_loader.py

+ 9 - 7
embedchain/loaders/docs_site_loader.py

@@ -24,6 +24,9 @@ class DocsSiteLoader(BaseLoader):
         self.visited_links = set()
 
     def _get_child_links_recursive(self, url):
+        if url in self.visited_links:
+            return
+
         parsed_url = urlparse(url)
         base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
         current_path = parsed_url.path
@@ -34,16 +37,15 @@ class DocsSiteLoader(BaseLoader):
             return
 
         soup = BeautifulSoup(response.text, "html.parser")
-        all_links = [link.get("href") for link in soup.find_all("a")]
+        all_links = (link.get("href") for link in soup.find_all("a", href=True))
+
+        child_links = (link for link in all_links if link.startswith(current_path) and link != current_path)
 
-        child_links = [link for link in all_links if link and link.startswith(current_path) and link != current_path]
+        absolute_paths = set(urljoin(base_url, link) for link in child_links)
 
-        absolute_paths = [urljoin(base_url, link) for link in child_links]
+        self.visited_links.update(absolute_paths)
 
-        for link in absolute_paths:
-            if link not in self.visited_links:
-                self.visited_links.add(link)
-                self._get_child_links_recursive(link)
+        [self._get_child_links_recursive(link) for link in absolute_paths if link not in self.visited_links]
 
     def _get_all_urls(self, url):
         self.visited_links = set()