web_page.py 956 B

12345678910111213141516171819202122232425262728293031323334353637383940
  1. import requests
  2. from bs4 import BeautifulSoup
  3. from embedchain.utils import clean_string
  4. class WebPageLoader:
  5. def load_data(self, url):
  6. """Load data from a web page."""
  7. response = requests.get(url)
  8. data = response.content
  9. soup = BeautifulSoup(data, "html.parser")
  10. for tag in soup(
  11. [
  12. "nav",
  13. "aside",
  14. "form",
  15. "header",
  16. "noscript",
  17. "svg",
  18. "canvas",
  19. "footer",
  20. "script",
  21. "style",
  22. ]
  23. ):
  24. tag.string = " "
  25. output = []
  26. content = soup.get_text()
  27. content = clean_string(content)
  28. meta_data = {
  29. "url": url,
  30. }
  31. output.append(
  32. {
  33. "content": content,
  34. "meta_data": meta_data,
  35. }
  36. )
  37. return output