Ver Fonte

Feat: Improve test coverage of DocsSiteLoader (#758)

SerSamgy há 1 ano atrás
pai
commit
431f8c2c6a
2 ficheiros alterados com 219 adições e 0 exclusões
  1. 1 0
      pyproject.toml
  2. 218 0
      tests/loaders/test_docs_site_loader.py

+ 1 - 0
pyproject.toml

@@ -123,6 +123,7 @@ pytest-env = "^0.8.1"
 click = "^8.1.3"
 isort = "^5.12.0"
 pytest-cov = "^4.1.0"
+responses = "^0.23.3"
 
 [tool.poetry.extras]
 streamlit = ["streamlit"]

+ 218 - 0
tests/loaders/test_docs_site_loader.py

@@ -0,0 +1,218 @@
+import pytest
+import responses
+from bs4 import BeautifulSoup
+
+
+@pytest.mark.parametrize(
+    "ignored_tag",
+    [
+        "<nav>This is a navigation bar.</nav>",
+        "<aside>This is an aside.</aside>",
+        "<form>This is a form.</form>",
+        "<header>This is a header.</header>",
+        "<noscript>This is a noscript.</noscript>",
+        "<svg>This is an SVG.</svg>",
+        "<canvas>This is a canvas.</canvas>",
+        "<footer>This is a footer.</footer>",
+        "<script>This is a script.</script>",
+        "<style>This is a style.</style>",
+    ],
+    ids=["nav", "aside", "form", "header", "noscript", "svg", "canvas", "footer", "script", "style"],
+)
+@pytest.mark.parametrize(
+    "selectee",
+    [
+        """
+<article class="bd-article">
+    <h2>Article Title</h2>
+    <p>Article content goes here.</p>
+    {ignored_tag}
+</article>
+""",
+        """
+<article role="main">
+    <h2>Main Article Title</h2>
+    <p>Main article content goes here.</p>
+    {ignored_tag}
+</article>
+""",
+        """
+<div class="md-content">
+    <h2>Markdown Content</h2>
+    <p>Markdown content goes here.</p>
+    {ignored_tag}
+</div>
+""",
+        """
+<div role="main">
+    <h2>Main Content</h2>
+    <p>Main content goes here.</p>
+    {ignored_tag}
+</div>
+""",
+        """
+<div class="container">
+    <h2>Container</h2>
+    <p>Container content goes here.</p>
+    {ignored_tag}
+</div>
+        """,
+        """
+<div class="section">
+    <h2>Section</h2>
+    <p>Section content goes here.</p>
+    {ignored_tag}
+</div>
+        """,
+        """
+<article>
+    <h2>Generic Article</h2>
+    <p>Generic article content goes here.</p>
+    {ignored_tag}
+</article>
+        """,
+        """
+<main>
+    <h2>Main Content</h2>
+    <p>Main content goes here.</p>
+    {ignored_tag}
+</main>
+""",
+    ],
+    ids=[
+        "article.bd-article",
+        'article[role="main"]',
+        "div.md-content",
+        'div[role="main"]',
+        "div.container",
+        "div.section",
+        "article",
+        "main",
+    ],
+)
+def test_load_data_gets_by_selectors_and_ignored_tags(selectee, ignored_tag, loader, mocked_responses, mocker):
+    child_url = "https://docs.embedchain.ai/quickstart"
+    selectee = selectee.format(ignored_tag=ignored_tag)
+    html_body = """
+<!DOCTYPE html>
+<html lang="en">
+<body>
+    {selectee}
+</body>
+</html>
+"""
+    html_body = html_body.format(selectee=selectee)
+    mocked_responses.get(child_url, body=html_body, status=200, content_type="text/html")
+
+    url = "https://docs.embedchain.ai/"
+    html_body = """
+<!DOCTYPE html>
+<html lang="en">
+<body>
+    <li><a href="/quickstart">Quickstart</a></li>
+</body>
+</html>
+"""
+    mocked_responses.get(url, body=html_body, status=200, content_type="text/html")
+
+    mock_sha256 = mocker.patch("embedchain.loaders.docs_site_loader.hashlib.sha256")
+    doc_id = "mocked_hash"
+    mock_sha256.return_value.hexdigest.return_value = doc_id
+
+    result = loader.load_data(url)
+    selector_soup = BeautifulSoup(selectee, "html.parser")
+    expected_content = " ".join((selector_soup.select_one("h2").get_text(), selector_soup.select_one("p").get_text()))
+    assert result["doc_id"] == doc_id
+    assert result["data"] == [
+        {
+            "content": expected_content,
+            "meta_data": {"url": "https://docs.embedchain.ai/quickstart"},
+        }
+    ]
+
+
+def test_load_data_gets_child_links_recursively(loader, mocked_responses, mocker):
+    child_url = "https://docs.embedchain.ai/quickstart"
+    html_body = """
+<!DOCTYPE html>
+<html lang="en">
+<body>
+    <li><a href="/">..</a></li>
+    <li><a href="/quickstart">.</a></li>
+</body>
+</html>
+"""
+    mocked_responses.get(child_url, body=html_body, status=200, content_type="text/html")
+
+    child_url = "https://docs.embedchain.ai/introduction"
+    html_body = """
+<!DOCTYPE html>
+<html lang="en">
+<body>
+    <li><a href="/">..</a></li>
+    <li><a href="/introduction">.</a></li>
+</body>
+</html>
+"""
+    mocked_responses.get(child_url, body=html_body, status=200, content_type="text/html")
+
+    url = "https://docs.embedchain.ai/"
+    html_body = """
+<!DOCTYPE html>
+<html lang="en">
+<body>
+    <li><a href="/quickstart">Quickstart</a></li>
+    <li><a href="/introduction">Introduction</a></li>
+</body>
+</html>
+"""
+    mocked_responses.get(url, body=html_body, status=200, content_type="text/html")
+
+    mock_sha256 = mocker.patch("embedchain.loaders.docs_site_loader.hashlib.sha256")
+    doc_id = "mocked_hash"
+    mock_sha256.return_value.hexdigest.return_value = doc_id
+
+    result = loader.load_data(url)
+    assert result["doc_id"] == doc_id
+    expected_data = [
+        {"content": "..\n.", "meta_data": {"url": "https://docs.embedchain.ai/quickstart"}},
+        {"content": "..\n.", "meta_data": {"url": "https://docs.embedchain.ai/introduction"}},
+    ]
+    assert all(item in expected_data for item in result["data"])
+
+
+def test_load_data_fails_to_fetch_website(loader, mocked_responses, mocker):
+    child_url = "https://docs.embedchain.ai/introduction"
+    mocked_responses.get(child_url, status=404)
+
+    url = "https://docs.embedchain.ai/"
+    html_body = """
+<!DOCTYPE html>
+<html lang="en">
+<body>
+    <li><a href="/introduction">Introduction</a></li>
+</body>
+</html>
+"""
+    mocked_responses.get(url, body=html_body, status=200, content_type="text/html")
+
+    mock_sha256 = mocker.patch("embedchain.loaders.docs_site_loader.hashlib.sha256")
+    doc_id = "mocked_hash"
+    mock_sha256.return_value.hexdigest.return_value = doc_id
+
+    result = loader.load_data(url)
+    assert result["doc_id"] is doc_id
+    assert result["data"] == []
+
+
+@pytest.fixture
+def loader():
+    from embedchain.loaders.docs_site_loader import DocsSiteLoader
+
+    return DocsSiteLoader()
+
+
+@pytest.fixture
+def mocked_responses():
+    with responses.RequestsMock() as rsps:
+        yield rsps