docs_site_loader.py 3.0 KB

import logging
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

from embedchain.helper_classes.json_serializable import register_deserializable
from embedchain.loaders.base_loader import BaseLoader


@register_deserializable
class DocsSiteLoader(BaseLoader):
    def __init__(self):
        self.visited_links = set()

    def _get_child_links_recursive(self, url):
        # Collect every not-yet-visited link that lives under the current URL's path,
        # then recurse into each of those child pages.
        parsed_url = urlparse(url)
        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
        current_path = parsed_url.path

        response = requests.get(url)
        if response.status_code != 200:
            logging.info(f"Failed to fetch the website: {response.status_code}")
            return

        soup = BeautifulSoup(response.text, "html.parser")
        all_links = [link.get("href") for link in soup.find_all("a")]
        child_links = [link for link in all_links if link and link.startswith(current_path) and link != current_path]
        absolute_paths = [urljoin(base_url, link) for link in child_links]

        for link in absolute_paths:
            if link not in self.visited_links:
                self.visited_links.add(link)
                self._get_child_links_recursive(link)

    def _get_all_urls(self, url):
        # Crawl from the root URL and keep only the links on the same domain.
        self.visited_links = set()
        self._get_child_links_recursive(url)
        urls = [link for link in self.visited_links if urlparse(link).netloc == urlparse(url).netloc]
        return urls

    def _load_data_from_url(self, url):
        response = requests.get(url)
        if response.status_code != 200:
            logging.info(f"Failed to fetch the website: {response.status_code}")
            return []

        soup = BeautifulSoup(response.content, "html.parser")
        # Try the main-content selectors from most to least specific;
        # fall back to the raw page text if none of them match.
        selectors = [
            "article.bd-article",
            'article[role="main"]',
            "div.md-content",
            'div[role="main"]',
            "div.container",
            "div.section",
            "article",
            "main",
        ]

        output = []
        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                content = element.prettify()
                break
        else:
            content = soup.get_text()

        # Re-parse the selected fragment and strip navigation/boilerplate tags
        # before extracting the plain text.
        soup = BeautifulSoup(content, "html.parser")
        ignored_tags = [
            "nav",
            "aside",
            "form",
            "header",
            "noscript",
            "svg",
            "canvas",
            "footer",
            "script",
            "style",
        ]
        for tag in soup(ignored_tags):
            tag.decompose()
        content = " ".join(soup.stripped_strings)

        output.append(
            {
                "content": content,
                "meta_data": {"url": url},
            }
        )
        return output

    def load_data(self, url):
        # Discover every same-domain page under `url` and extract its text content.
        all_urls = self._get_all_urls(url)

        output = []
        for u in all_urls:
            output.extend(self._load_data_from_url(u))

        return output
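
A minimal usage sketch, assuming this file lives at embedchain/loaders/docs_site_loader.py (consistent with the imports above); the docs URL below is a placeholder, not taken from the source:

from embedchain.loaders.docs_site_loader import DocsSiteLoader

loader = DocsSiteLoader()
# Crawls every same-domain page under the root URL and returns one dict per page,
# each with "content" (extracted text) and "meta_data" (the page URL).
docs = loader.load_data("https://docs.example.com/")  # placeholder URL
for doc in docs:
    print(doc["meta_data"]["url"], len(doc["content"]))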