123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263 |
- import requests
- from bs4 import BeautifulSoup
- from embedchain.utils import clean_string
- class CodeDocsPageLoader:
- def load_data(self, url):
- """Load data from a web page."""
- response = requests.get(url)
- data = response.content
- soup = BeautifulSoup(data, "html.parser")
- selectors = [
- 'article.bd-article',
- 'article[role="main"]',
- 'div.md-content',
- 'div[role="main"]',
- 'div.container',
- 'div.section',
- 'article',
- 'main',
- ]
- content = None
- for selector in selectors:
- element = soup.select_one(selector)
- if element is not None:
- content = element.prettify()
- break
- if not content:
- content = soup.get_text()
- soup = BeautifulSoup(content, "html.parser")
- for tag in soup(
- [
- "nav",
- "aside",
- "form",
- "header",
- "noscript",
- "svg",
- "canvas",
- "footer",
- "script",
- "style",
- ]
- ):
- tag.string = " "
- for div in soup.find_all("div", {'class': 'cell_output'}):
- div.decompose()
- for div in soup.find_all("div", {'class': 'output_wrapper'}):
- div.decompose()
- for div in soup.find_all("div", {'class': 'output'}):
- div.decompose()
- content = clean_string(soup.get_text())
- output = []
- meta_data = {
- "url": url,
- }
- output.append(
- {
- "content": content,
- "meta_data": meta_data,
- }
- )
- return output
|