import hashlib
from unittest.mock import Mock, patch

import pytest

from embedchain.loaders.web_page import WebPageLoader


@pytest.fixture
def web_page_loader():
    """Provide a fresh ``WebPageLoader`` instance to each test."""
    return WebPageLoader()


def test_load_data(web_page_loader):
    """Check the ``doc_id`` and ``data`` returned by ``load_data``.

    The expected ``doc_id`` is the SHA-256 hex digest of the cleaned page
    content concatenated with the page URL, and ``data`` is expected to hold
    a single entry carrying that content plus the URL as metadata.
    """
    page_url = "https://example.com/page"

    mock_response = Mock()
    mock_response.status_code = 200
    mock_response.content = (
        " Test Page\n"
        "\n"
        "This is some test content.\n"
        "\n"
    )

    # Patch the loader's session ``get`` so ``load_data`` receives the canned
    # response defined above.
    with patch(
        "embedchain.loaders.web_page.WebPageLoader._session.get",
        return_value=mock_response,
    ):
        result = web_page_loader.load_data(page_url)

    # Derive the expectation through the loader's own cleaning step so the
    # test does not depend on the exact cleaned text.
    content = web_page_loader._get_clean_content(mock_response.content, page_url)
    expected_doc_id = hashlib.sha256((content + page_url).encode()).hexdigest()
    assert result["doc_id"] == expected_doc_id

    expected_data = [
        {
            "content": content,
            "meta_data": {
                "url": page_url,
            },
        }
    ]
    assert result["data"] == expected_data


def test_get_clean_content_excludes_unnecessary_info(web_page_loader):
    """Check that cleaned content contains none of the excluded names.

    Every tag name, element id and class name listed below must be absent
    (as a case-sensitive substring) from the cleaned content, and the cleaned
    content must not be empty.
    """
    # NOTE(review): as it appears here, this fixture is plain text with no
    # tags, ids or classes, so the exclusion lists below are not exercised
    # against real markup -- confirm the HTML was not lost from this string.
    mock_html = (
        " Sample HTML\n"
        "Form Content\n"
        "Main Content\n"
        "SVG Content Canvas Content\n"
        "Header Sidebar Wrapper Content\n"
        "Blog Sidebar Wrapper Content\n"
    )

    tags_to_exclude = [
        "nav",
        "aside",
        "form",
        "header",
        "noscript",
        "svg",
        "canvas",
        "footer",
        "script",
        "style",
    ]
    ids_to_exclude = ["sidebar", "main-navigation", "menu-main-menu"]
    classes_to_exclude = [
        "elementor-location-header",
        "navbar-header",
        "nav",
        "header-sidebar-wrapper",
        "blog-sidebar-wrapper",
        "related-posts",
    ]

    content = web_page_loader._get_clean_content(mock_html, "https://example.com/page")

    for tag in tags_to_exclude:
        assert tag not in content, f"tag {tag!r} leaked into cleaned content"

    # ``element_id`` rather than ``id`` to avoid shadowing the builtin.
    for element_id in ids_to_exclude:
        assert element_id not in content, f"id {element_id!r} leaked into cleaned content"

    for class_name in classes_to_exclude:
        assert class_name not in content, f"class {class_name!r} leaked into cleaned content"

    assert content, "cleaned content should not be empty"