# docs_site_loader.py

import hashlib
import logging
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

from embedchain.helper.json_serializable import register_deserializable
from embedchain.loaders.base_loader import BaseLoader


@register_deserializable
class DocsSiteLoader(BaseLoader):
    """Loader that crawls a documentation site and extracts the text of each page."""

    def __init__(self):
        self.visited_links = set()

    def _get_child_links_recursive(self, url):
        """Recursively collect links nested under the path of `url`."""
        parsed_url = urlparse(url)
        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
        current_path = parsed_url.path

        response = requests.get(url)
        if response.status_code != 200:
            logging.info(f"Failed to fetch the website: {response.status_code}")
            return

        soup = BeautifulSoup(response.text, "html.parser")
        all_links = [link.get("href") for link in soup.find_all("a")]
        # Keep only links that extend the current path, then resolve them to absolute URLs.
        child_links = [link for link in all_links if link and link.startswith(current_path) and link != current_path]
        absolute_paths = [urljoin(base_url, link) for link in child_links]

        for link in absolute_paths:
            if link not in self.visited_links:
                self.visited_links.add(link)
                self._get_child_links_recursive(link)

    def _get_all_urls(self, url):
        """Return every discovered URL that belongs to the same host as `url`."""
        self.visited_links = set()
        self._get_child_links_recursive(url)
        urls = [link for link in self.visited_links if urlparse(link).netloc == urlparse(url).netloc]
        return urls

    def _load_data_from_url(self, url):
        """Fetch a single page and return its cleaned text content."""
        response = requests.get(url)
        if response.status_code != 200:
            logging.info(f"Failed to fetch the website: {response.status_code}")
            return []

        soup = BeautifulSoup(response.content, "html.parser")
        # Try common "main content" containers first, from most to least specific.
        selectors = [
            "article.bd-article",
            'article[role="main"]',
            "div.md-content",
            'div[role="main"]',
            "div.container",
            "div.section",
            "article",
            "main",
        ]

        output = []
        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                content = element.prettify()
                break
        else:
            content = soup.get_text()

        # Re-parse the selected fragment and strip tags that carry no document text.
        soup = BeautifulSoup(content, "html.parser")
        ignored_tags = [
            "nav",
            "aside",
            "form",
            "header",
            "noscript",
            "svg",
            "canvas",
            "footer",
            "script",
            "style",
        ]
        for tag in soup(ignored_tags):
            tag.decompose()
        content = " ".join(soup.stripped_strings)
        output.append(
            {
                "content": content,
                "meta_data": {"url": url},
            }
        )

        return output

    def load_data(self, url):
        """Crawl the docs site rooted at `url` and return its pages as documents."""
        all_urls = self._get_all_urls(url)

        output = []
        for u in all_urls:
            output.extend(self._load_data_from_url(u))
        doc_id = hashlib.sha256((" ".join(all_urls) + url).encode()).hexdigest()
        return {
            "doc_id": doc_id,
            "data": output,
        }
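

# --- Usage sketch (illustrative, not part of the loader itself) ---
# A minimal example of how this loader might be driven, assuming the embedchain
# package is installed and the target is a reachable documentation site; the URL
# below is a placeholder chosen for illustration, not taken from this file.
if __name__ == "__main__":
    loader = DocsSiteLoader()
    result = loader.load_data("https://docs.example.com/")
    # `result` is a dict with a "doc_id" hash and a "data" list of
    # {"content", "meta_data"} entries, one per crawled page.
    print(result["doc_id"], len(result["data"]))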