import requests
from bs4 import BeautifulSoup

from embedchain.utils import clean_string


class CodeDocsPageLoader:
    """Fetch a documentation web page and extract its main text content."""

    # Seconds passed to ``requests.get`` as ``timeout``. Without a timeout,
    # requests waits indefinitely on an unresponsive server. Subclasses or
    # instances may override this value.
    REQUEST_TIMEOUT = 30

    # CSS selectors tried in order; the first one that matches an element is
    # used as the page's main content.
    _CONTENT_SELECTORS = (
        "article.bd-article",
        'article[role="main"]',
        "div.md-content",
        'div[role="main"]',
        "div.container",
        "div.section",
        "article",
        "main",
    )

    # Tags whose contents are blanked out before text extraction.
    _BOILERPLATE_TAGS = [
        "nav",
        "aside",
        "form",
        "header",
        "noscript",
        "svg",
        "canvas",
        "footer",
        "script",
        "style",
    ]

    # ``div`` classes removed entirely from the content (presumably rendered
    # notebook/code-cell outputs -- confirm against the targeted doc sites).
    _OUTPUT_DIV_CLASSES = ("cell_output", "output_wrapper", "output")

    def load_data(self, url):
        """Load data from a web page.

        Args:
            url: Address of the page to download.

        Returns:
            A single-element list holding a dict with the keys ``"content"``
            (the extracted text after ``clean_string``) and ``"meta_data"``
            (a dict containing the source ``"url"``).

        Raises:
            requests.exceptions.RequestException: If the request fails,
                including when the server does not respond within
                ``REQUEST_TIMEOUT`` seconds.
        """
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        content = self._get_clean_content(response.content)
        return [
            {
                "content": content,
                "meta_data": {
                    "url": url,
                },
            }
        ]

    def _get_clean_content(self, html):
        """Return the cleaned text of the main content area of ``html``."""
        soup = BeautifulSoup(html, "html.parser")

        # Narrow the document to the first matching main-content container.
        content = None
        for selector in self._CONTENT_SELECTORS:
            element = soup.select_one(selector)
            if element is not None:
                content = element.prettify()
                break

        # No known container matched: fall back to the text of the whole page.
        if not content:
            content = soup.get_text()

        # Re-parse only the selected fragment so the clean-up below is scoped
        # to the main content.
        soup = BeautifulSoup(content, "html.parser")

        # Assigning ``.string`` replaces everything inside the tag, so its
        # text collapses to a single space in the extracted output.
        for tag in soup(self._BOILERPLATE_TAGS):
            tag.string = " "

        # Handle one class at a time, in the listed order.
        for class_name in self._OUTPUT_DIV_CLASSES:
            for div in soup.find_all("div", {"class": class_name}):
                div.decompose()

        return clean_string(soup.get_text())