123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130 |
- import hashlib
- from unittest.mock import Mock, patch
- import pytest
- from requests import Response
- from embedchain.loaders.docs_site_loader import DocsSiteLoader
@pytest.fixture
def mock_requests_get():
    """Patch ``requests.get`` while a test runs and yield the replacement mock."""
    with patch("requests.get") as patched_get:
        yield patched_get
@pytest.fixture
def docs_site_loader():
    """Build the ``DocsSiteLoader`` instance used by the tests in this module."""
    loader = DocsSiteLoader()
    return loader
def test_get_child_links_recursive(mock_requests_get, docs_site_loader):
    """Check that crawling a page with two relative links marks both as visited."""
    page_html = (
        "\n"
        "<html>\n"
        '<a href="/page1">Page 1</a>\n'
        '<a href="/page2">Page 2</a>\n'
        "</html>\n"
    )
    # Every mocked GET returns this same 200 page.
    mock_requests_get.return_value = Mock(status_code=200, text=page_html)

    docs_site_loader._get_child_links_recursive("https://example.com")

    assert len(docs_site_loader.visited_links) == 2
    for expected_link in ("https://example.com/page1", "https://example.com/page2"):
        assert expected_link in docs_site_loader.visited_links
def test_get_child_links_recursive_status_not_200(mock_requests_get, docs_site_loader):
    """Check that a 404 response leaves ``visited_links`` empty."""
    mock_requests_get.return_value = Mock(status_code=404)

    docs_site_loader._get_child_links_recursive("https://example.com")

    assert len(docs_site_loader.visited_links) == 0
def test_get_all_urls(mock_requests_get, docs_site_loader):
    """Check that ``_get_all_urls`` returns all three links found on the mocked page."""
    page_html = (
        "\n"
        "<html>\n"
        '<a href="/page1">Page 1</a>\n'
        '<a href="/page2">Page 2</a>\n'
        '<a href="https://example.com/external">External</a>\n'
        "</html>\n"
    )
    # Two relative hrefs plus one absolute href on the same host.
    mock_requests_get.return_value = Mock(status_code=200, text=page_html)

    all_urls = docs_site_loader._get_all_urls("https://example.com")

    assert len(all_urls) == 3
    expected_urls = (
        "https://example.com/page1",
        "https://example.com/page2",
        "https://example.com/external",
    )
    for expected_url in expected_urls:
        assert expected_url in all_urls
def test_load_data_from_url(mock_requests_get, docs_site_loader):
    """Check that only the article text is returned, tagged with the source URL."""
    page_url = "https://example.com/page1"
    page_html = (
        "\n"
        "<html>\n"
        "<nav>\n"
        "<h1>Navigation</h1>\n"
        "</nav>\n"
        '<article class="bd-article">\n'
        "<p>Article Content</p>\n"
        "</article>\n"
        "</html>\n"
    )
    # ``content`` is supplied as bytes, matching the original ``.encode()`` call.
    mock_requests_get.return_value = Mock(status_code=200, content=page_html.encode())

    data = docs_site_loader._load_data_from_url(page_url)

    assert len(data) == 1
    entry = data[0]
    assert entry["content"] == "Article Content"
    assert entry["meta_data"]["url"] == "https://example.com/page1"
def test_load_data_from_url_status_not_200(mock_requests_get, docs_site_loader):
    """Check that a 404 response produces an empty result list."""
    mock_requests_get.return_value = Mock(status_code=404)

    data = docs_site_loader._load_data_from_url("https://example.com/page1")

    assert data == []
    assert len(data) == 0
def test_load_data(mock_requests_get, docs_site_loader):
    """Check that ``load_data`` returns two entries and the expected ``doc_id``."""
    url = "https://example.com"
    page_html = (
        "\n"
        "<html>\n"
        '<a href="/page1">Page 1</a>\n'
        '<a href="/page2">Page 2</a>\n'
    )
    # A real ``Response`` is used here, so the body goes in via ``_content``.
    fake_response = Response()
    fake_response.status_code = 200
    fake_response._content = page_html.encode()
    mock_requests_get.return_value = fake_response

    data = docs_site_loader.load_data(url)

    # Expected id: SHA-256 of the space-joined visited links followed by the URL.
    id_source = " ".join(docs_site_loader.visited_links) + url
    expected_doc_id = hashlib.sha256(id_source.encode()).hexdigest()
    assert len(data["data"]) == 2
    assert data["doc_id"] == expected_doc_id
def test_if_response_status_not_200(mock_requests_get, docs_site_loader):
    """Check that a 404 response gives no data entries but still a matching ``doc_id``."""
    url = "https://example.com"
    fake_response = Response()
    fake_response.status_code = 404
    mock_requests_get.return_value = fake_response

    data = docs_site_loader.load_data(url)

    # Expected id: SHA-256 of the space-joined visited links followed by the URL.
    id_source = " ".join(docs_site_loader.visited_links) + url
    expected_doc_id = hashlib.sha256(id_source.encode()).hexdigest()
    assert len(data["data"]) == 0
    assert data["doc_id"] == expected_doc_id
|