@@ -16,7 +16,6 @@ except ImportError:
 from embedchain.helpers.json_serializable import register_deserializable
 from embedchain.loaders.base_loader import BaseLoader
 from embedchain.loaders.web_page import WebPageLoader
-from embedchain.utils import is_readable


 @register_deserializable
@@ -40,19 +39,16 @@ class SitemapLoader(BaseLoader):

         doc_id = hashlib.sha256((" ".join(links) + sitemap_url).encode()).hexdigest()

-        def load_link(link):
+        def load_web_page(link):
             try:
-                each_load_data = web_page_loader.load_data(link)
-                if is_readable(each_load_data.get("data")[0].get("content")):
-                    return each_load_data.get("data")
-                else:
-                    logging.warning(f"Page is not readable (too many invalid characters): {link}")
+                loader_data = web_page_loader.load_data(link)
+                return loader_data.get("data")
             except ParserRejectedMarkup as e:
                 logging.error(f"Failed to parse {link}: {e}")
             return None

         with concurrent.futures.ThreadPoolExecutor() as executor:
             future_to_link = {executor.submit(load_web_page, link): link for link in links}
             for future in tqdm(concurrent.futures.as_completed(future_to_link), total=len(links), desc="Loading pages"):
                 link = future_to_link[future]
                 try:
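
For reference, the second hunk fans each sitemap link out to a thread pool and collects results as they complete; the hunk is truncated just before the loop consumes each future's result. A minimal standalone sketch of that fan-out/fan-in pattern, assuming a hypothetical fetch() helper in place of web_page_loader.load_data() (fetch and load_all are illustrative names, not part of the codebase):

import concurrent.futures
import logging

from tqdm import tqdm


def fetch(url):
    # Hypothetical stand-in for web_page_loader.load_data(url).
    ...


def load_all(links):
    results = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Submit one task per URL and remember which future maps to which link.
        future_to_link = {executor.submit(fetch, link): link for link in links}
        for future in tqdm(concurrent.futures.as_completed(future_to_link), total=len(links), desc="Loading pages"):
            link = future_to_link[future]
            try:
                data = future.result()
            except Exception as e:
                logging.error(f"Failed to load {link}: {e}")
                continue
            # Skip links whose loader returned None (e.g. parser rejections).
            if data:
                results.append(data)
    return results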