
feat: exclude by class, id in web_page data type and add logging (#273)

cachho 2 years ago
commit addf1c0666
1 changed file with 47 additions and 19 deletions

embedchain/loaders/web_page.py  +47 -19

@@ -1,3 +1,5 @@
+import logging
+
 import requests
 from bs4 import BeautifulSoup
 
@@ -10,31 +12,57 @@ class WebPageLoader:
         response = requests.get(url)
         data = response.content
         soup = BeautifulSoup(data, "html.parser")
-        for tag in soup(
-            [
-                "nav",
-                "aside",
-                "form",
-                "header",
-                "noscript",
-                "svg",
-                "canvas",
-                "footer",
-                "script",
-                "style",
-            ]
-        ):
-            tag.string = " "
-        output = []
+        original_size = len(str(soup.get_text()))
+
+        tags_to_exclude = [
+            "nav",
+            "aside",
+            "form",
+            "header",
+            "noscript",
+            "svg",
+            "canvas",
+            "footer",
+            "script",
+            "style",
+        ]
+        for tag in soup(tags_to_exclude):
+            tag.decompose()
+
+        ids_to_exclude = ["sidebar", "main-navigation", "menu-main-menu"]
+        for id in ids_to_exclude:
+            tags = soup.find_all(id=id)
+            for tag in tags:
+                tag.decompose()
+
+        classes_to_exclude = [
+            "elementor-location-header",
+            "navbar-header",
+            "nav",
+            "header-sidebar-wrapper",
+            "blog-sidebar-wrapper",
+            "related-posts",
+        ]
+        for class_name in classes_to_exclude:
+            tags = soup.find_all(class_=class_name)
+            for tag in tags:
+                tag.decompose()
+
         content = soup.get_text()
         content = clean_string(content)
+
+        cleaned_size = len(content)
+        logging.info(
+            f"[{url}] Cleaned page size: {cleaned_size} characters, down from {original_size} (shrunk: {original_size-cleaned_size} chars, {round((1-(cleaned_size/original_size)) * 100, 2)}%)"  # noqa:E501
+        )
+
         meta_data = {
             "url": url,
         }
-        output.append(
+
+        return [
             {
                 "content": content,
                 "meta_data": meta_data,
             }
-        )
-        return output
+        ]
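
For reference, a minimal sketch of how the updated loader might be exercised; the method name load_data and the example URL are assumptions, since only the method body is visible in this hunk. The logging.basicConfig call is needed so the new INFO-level page-size log actually appears.

import logging

from embedchain.loaders.web_page import WebPageLoader

# Surface the new INFO-level "Cleaned page size" log added in this commit.
logging.basicConfig(level=logging.INFO)

# Assumption: the diffed body belongs to a load_data(url) method on WebPageLoader.
loader = WebPageLoader()
docs = loader.load_data("https://example.com")  # hypothetical URL

# The method returns a list with one dict per page: {"content": ..., "meta_data": {"url": ...}}.
for doc in docs:
    print(doc["meta_data"]["url"], len(doc["content"]))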