12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364 |
- import requests
- from bs4 import BeautifulSoup
- from embedchain.utils import clean_string
class CodeDocsPageLoader:
    """Loader that fetches a documentation web page and extracts its text."""

    def load_data(self, url):
        """Load data from a web page.

        The page at ``url`` is downloaded and parsed with BeautifulSoup. The
        first element matching one of a fixed list of "main content" CSS
        selectors is kept; if none matches, the text of the whole page is
        used instead. Navigation/boilerplate tags are blanked, notebook-style
        output ``div`` elements are removed, and the remaining text is passed
        through ``clean_string``.

        Args:
            url: Address of the page to fetch.

        Returns:
            A list containing a single dict with the keys ``"content"`` (the
            cleaned text) and ``"meta_data"`` (a dict holding the source
            ``"url"``).

        Raises:
            requests.exceptions.RequestException: Propagated from
                ``requests.get``; this includes a timeout when the server
                does not respond within the configured limit.
        """
        # An explicit timeout keeps an unresponsive host from blocking the
        # caller forever; requests applies no timeout by default.
        response = requests.get(url, timeout=30)
        soup = BeautifulSoup(response.content, "html.parser")

        # Selectors are tried in order; the first match wins.
        selectors = [
            "article.bd-article",
            'article[role="main"]',
            "div.md-content",
            'div[role="main"]',
            "div.container",
            "div.section",
            "article",
            "main",
        ]
        content = None
        for selector in selectors:
            element = soup.select_one(selector)
            if element is not None:
                content = element.prettify()
                break

        # Fall back to the full page text when no selector matched (or the
        # matched element rendered to an empty string).
        if not content:
            content = soup.get_text()

        # Re-parse the selected fragment so the clean-up below only touches
        # the content that was kept.
        soup = BeautifulSoup(content, "html.parser")

        boilerplate_tags = [
            "nav",
            "aside",
            "form",
            "header",
            "noscript",
            "svg",
            "canvas",
            "footer",
            "script",
            "style",
        ]
        for tag in soup.find_all(boilerplate_tags):
            # Replace the tag's contents with a single space rather than
            # removing the tag itself.
            tag.string = " "

        # Each class is handled in its own pass, in this order, so every
        # search runs against the tree as left by the previous removals.
        for css_class in ("cell_output", "output_wrapper", "output"):
            for div in soup.find_all("div", {"class": css_class}):
                div.decompose()

        content = clean_string(soup.get_text())

        meta_data = {
            "url": url,
        }
        return [
            {
                "content": content,
                "meta_data": meta_data,
            }
        ]
|