import hashlib
from unittest.mock import Mock, patch
import pytest
from embedchain.loaders.web_page import WebPageLoader
@pytest.fixture
def web_page_loader():
    """Provide a newly constructed ``WebPageLoader`` to each requesting test."""
    loader = WebPageLoader()
    return loader
def test_load_data(web_page_loader):
    """Check the ``doc_id`` and ``data`` returned by ``load_data``.

    ``WebPageLoader._session.get`` is patched to return a mock response with
    status code 200. The expected content is produced by the loader's own
    ``_get_clean_content``, and the expected ``doc_id`` is the SHA-256 hex
    digest of that content concatenated with the page URL.
    """
    url = "https://example.com/page"

    fake_response = Mock(status_code=200)
    # The literal is kept flush-left so the string value is unchanged.
    fake_response.content = """
This is some test content.
"""

    session_get_target = "embedchain.loaders.web_page.WebPageLoader._session.get"
    with patch(session_get_target, return_value=fake_response):
        loaded = web_page_loader.load_data(url)

    cleaned = web_page_loader._get_clean_content(fake_response.content, url)
    hasher = hashlib.sha256((cleaned + url).encode())
    assert loaded["doc_id"] == hasher.hexdigest()

    assert loaded["data"] == [{"content": cleaned, "meta_data": {"url": url}}]
def test_get_clean_content_excludes_unnecessary_info(web_page_loader):
mock_html = """