code_docs_page.py

import requests
from bs4 import BeautifulSoup

from embedchain.utils import clean_string


class CodeDocsPageLoader:
    def load_data(self, url):
        """Load data from a web page."""
        response = requests.get(url)
        data = response.content
        soup = BeautifulSoup(data, "html.parser")

        # Try progressively broader selectors to locate the main documentation
        # content; fall back to the whole page's text if none match.
        selectors = [
            "article.bd-article",
            'article[role="main"]',
            "div.md-content",
            'div[role="main"]',
            "div.container",
            "div.section",
            "article",
            "main",
        ]
        content = None
        for selector in selectors:
            element = soup.select_one(selector)
            if element is not None:
                content = element.prettify()
                break
        if not content:
            content = soup.get_text()

        # Re-parse the extracted fragment and blank out non-content tags
        # (navigation, forms, scripts, styling, etc.).
        soup = BeautifulSoup(content, "html.parser")
        for tag in soup(
            [
                "nav",
                "aside",
                "form",
                "header",
                "noscript",
                "svg",
                "canvas",
                "footer",
                "script",
                "style",
            ]
        ):
            tag.string = " "

        # Drop notebook-style output cells, which only add noise to the text.
        for div in soup.find_all("div", {"class": "cell_output"}):
            div.decompose()
        for div in soup.find_all("div", {"class": "output_wrapper"}):
            div.decompose()
        for div in soup.find_all("div", {"class": "output"}):
            div.decompose()

        content = clean_string(soup.get_text())

        output = []
        meta_data = {
            "url": url,
        }
        output.append(
            {
                "content": content,
                "meta_data": meta_data,
            }
        )
        return output
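

# Usage sketch: one way this loader might be invoked. The URL below is a
# hypothetical placeholder, not one referenced by the file itself.
if __name__ == "__main__":
    loader = CodeDocsPageLoader()
    docs = loader.load_data("https://docs.example.com/intro")  # hypothetical URL
    print(docs[0]["meta_data"]["url"])
    print(docs[0]["content"][:200])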