docs_site_loader.py

import logging
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup


class DocsSiteLoader:
    """Crawls a documentation site and extracts the main content of each page."""

    def __init__(self):
        self.visited_links = set()

    def _get_child_links_recursive(self, url):
        # Crawl every link under the current path, depth-first, tracking
        # visited URLs so each page is fetched only once.
        parsed_url = urlparse(url)
        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
        current_path = parsed_url.path

        response = requests.get(url)
        if response.status_code != 200:
            logging.info(f"Failed to fetch the website: {response.status_code}")
            return

        soup = BeautifulSoup(response.text, "html.parser")
        all_links = [link.get("href") for link in soup.find_all("a")]
        # Keep only links that point below the current path (child pages).
        child_links = [link for link in all_links if link and link.startswith(current_path) and link != current_path]
        absolute_paths = [urljoin(base_url, link) for link in child_links]

        for link in absolute_paths:
            if link not in self.visited_links:
                self.visited_links.add(link)
                self._get_child_links_recursive(link)

    def _get_all_urls(self, url):
        # Reset the visited set, crawl from the root URL, and return only the
        # discovered links that belong to the same host.
        self.visited_links = set()
        self._get_child_links_recursive(url)
        urls = [link for link in self.visited_links if urlparse(link).netloc == urlparse(url).netloc]
        return urls

    def _load_data_from_url(self, url):
        response = requests.get(url)
        if response.status_code != 200:
            logging.info(f"Failed to fetch the website: {response.status_code}")
            return []

        soup = BeautifulSoup(response.content, "html.parser")
        # Selectors for the main content area of common documentation themes,
        # ordered from most to least specific.
        selectors = [
            "article.bd-article",
            'article[role="main"]',
            "div.md-content",
            'div[role="main"]',
            "div.container",
            "div.section",
            "article",
            "main",
        ]

        output = []
        # Use the first selector that matches; fall back to the full page text.
        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                content = element.prettify()
                break
        else:
            content = soup.get_text()

        # Re-parse the extracted fragment and drop navigation/layout tags
        # that carry no useful text.
        soup = BeautifulSoup(content, "html.parser")
        ignored_tags = [
            "nav",
            "aside",
            "form",
            "header",
            "noscript",
            "svg",
            "canvas",
            "footer",
            "script",
            "style",
        ]
        for tag in soup(ignored_tags):
            tag.decompose()

        content = " ".join(soup.stripped_strings)
        output.append(
            {
                "content": content,
                "meta_data": {"url": url},
            }
        )
        return output

    def load_data(self, url):
        # Crawl the site starting at `url` and return one entry per page.
        all_urls = self._get_all_urls(url)
        output = []
        for u in all_urls:
            output.extend(self._load_data_from_url(u))
        return output
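

# Usage sketch (not part of the loader itself): crawl a documentation site and
# print a summary of what was collected. The URL below is a placeholder; point
# it at the root of the docs you actually want to index.
if __name__ == "__main__":
    loader = DocsSiteLoader()
    docs = loader.load_data("https://docs.example.com/")  # placeholder URL
    for doc in docs:
        print(doc["meta_data"]["url"], len(doc["content"]))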