123456789101112131415161718192021222324252627282930 |
- import requests
- from bs4 import BeautifulSoup
- from embedchain.utils import clean_string
- class WebPageLoader:
- def load_data(self, url):
- response = requests.get(url)
- data = response.content
- soup = BeautifulSoup(data, 'html.parser')
- for tag in soup([
- "nav", "aside", "form", "header",
- "noscript", "svg", "canvas",
- "footer", "script", "style"
- ]):
- tag.string = " "
- output = []
- content = soup.get_text()
- content = clean_string(content)
- meta_data = {
- "url": url,
- }
- output.append({
- "content": content,
- "meta_data": meta_data,
- })
- return output
|