web_page.py 774 B

12345678910111213141516171819202122232425262728293031
  1. import requests
  2. from bs4 import BeautifulSoup
  3. from embedchain.utils import clean_string
  4. class WebPageLoader:
  5. def load_data(self, url):
  6. ''' Load data from a web page. '''
  7. response = requests.get(url)
  8. data = response.content
  9. soup = BeautifulSoup(data, 'html.parser')
  10. for tag in soup([
  11. "nav", "aside", "form", "header",
  12. "noscript", "svg", "canvas",
  13. "footer", "script", "style"
  14. ]):
  15. tag.string = " "
  16. output = []
  17. content = soup.get_text()
  18. content = clean_string(content)
  19. meta_data = {
  20. "url": url,
  21. }
  22. output.append({
  23. "content": content,
  24. "meta_data": meta_data,
  25. })
  26. return output