web_page.py 731 B

123456789101112131415161718192021222324252627282930
  1. import requests
  2. from bs4 import BeautifulSoup
  3. from embedchain.utils import clean_string
  4. class WebPageLoader:
  5. def load_data(self, url):
  6. response = requests.get(url)
  7. data = response.content
  8. soup = BeautifulSoup(data, 'html.parser')
  9. for tag in soup([
  10. "nav", "aside", "form", "header",
  11. "noscript", "svg", "canvas",
  12. "footer", "script", "style"
  13. ]):
  14. tag.string = " "
  15. output = []
  16. content = soup.get_text()
  17. content = clean_string(content)
  18. meta_data = {
  19. "url": url,
  20. }
  21. output.append({
  22. "content": content,
  23. "meta_data": meta_data,
  24. })
  25. return output