há 2 anos atrás · 431f8c2c6a
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -123,6 +123,7 @@ pytest-env = "^0.8.1"
 
				 click = "^8.1.3"
			
 
				 isort = "^5.12.0"
			
 
				 pytest-cov = "^4.1.0"
			
 
				+responses = "^0.23.3"
			
 
				 
			
 
				 [tool.poetry.extras]
			
 
				 streamlit = ["streamlit"]
			
--- a/tests/loaders/test_docs_site_loader.py
+++ b/tests/loaders/test_docs_site_loader.py
@@ -0,0 +1,218 @@
 
				+import pytest
			
 
				+import responses
			
 
				+from bs4 import BeautifulSoup
			
 
				+
			
 
				+
			
 
				+@pytest.mark.parametrize(
			
 
				+    "ignored_tag",
			
 
				+    [
			
 
				+        "<nav>This is a navigation bar.</nav>",
			
 
				+        "<aside>This is an aside.</aside>",
			
 
				+        "<form>This is a form.</form>",
			
 
				+        "<header>This is a header.</header>",
			
 
				+        "<noscript>This is a noscript.</noscript>",
			
 
				+        "<svg>This is an SVG.</svg>",
			
 
				+        "<canvas>This is a canvas.</canvas>",
			
 
				+        "<footer>This is a footer.</footer>",
			
 
				+        "<script>This is a script.</script>",
			
 
				+        "<style>This is a style.</style>",
			
 
				+    ],
			
 
				+    ids=["nav", "aside", "form", "header", "noscript", "svg", "canvas", "footer", "script", "style"],
			
 
				+)
			
 
				+@pytest.mark.parametrize(
			
 
				+    "selectee",
			
 
				+    [
			
 
				+        """
			
 
				+<article class="bd-article">
			
 
				+    <h2>Article Title</h2>
			
 
				+    <p>Article content goes here.</p>
			
 
				+    {ignored_tag}
			
 
				+</article>
			
 
				+""",
			
 
				+        """
			
 
				+<article role="main">
			
 
				+    <h2>Main Article Title</h2>
			
 
				+    <p>Main article content goes here.</p>
			
 
				+    {ignored_tag}
			
 
				+</article>
			
 
				+""",
			
 
				+        """
			
 
				+<div class="md-content">
			
 
				+    <h2>Markdown Content</h2>
			
 
				+    <p>Markdown content goes here.</p>
			
 
				+    {ignored_tag}
			
 
				+</div>
			
 
				+""",
			
 
				+        """
			
 
				+<div role="main">
			
 
				+    <h2>Main Content</h2>
			
 
				+    <p>Main content goes here.</p>
			
 
				+    {ignored_tag}
			
 
				+</div>
			
 
				+""",
			
 
				+        """
			
 
				+<div class="container">
			
 
				+    <h2>Container</h2>
			
 
				+    <p>Container content goes here.</p>
			
 
				+    {ignored_tag}
			
 
				+</div>
			
 
				+        """,
			
 
				+        """
			
 
				+<div class="section">
			
 
				+    <h2>Section</h2>
			
 
				+    <p>Section content goes here.</p>
			
 
				+    {ignored_tag}
			
 
				+</div>
			
 
				+        """,
			
 
				+        """
			
 
				+<article>
			
 
				+    <h2>Generic Article</h2>
			
 
				+    <p>Generic article content goes here.</p>
			
 
				+    {ignored_tag}
			
 
				+</article>
			
 
				+        """,
			
 
				+        """
			
 
				+<main>
			
 
				+    <h2>Main Content</h2>
			
 
				+    <p>Main content goes here.</p>
			
 
				+    {ignored_tag}
			
 
				+</main>
			
 
				+""",
			
 
				+    ],
			
 
				+    ids=[
			
 
				+        "article.bd-article",
			
 
				+        'article[role="main"]',
			
 
				+        "div.md-content",
			
 
				+        'div[role="main"]',
			
 
				+        "div.container",
			
 
				+        "div.section",
			
 
				+        "article",
			
 
				+        "main",
			
 
				+    ],
			
 
				+)
			
 
				+def test_load_data_gets_by_selectors_and_ignored_tags(selectee, ignored_tag, loader, mocked_responses, mocker):
			
 
				+    child_url = "https://docs.embedchain.ai/quickstart"
			
 
				+    selectee = selectee.format(ignored_tag=ignored_tag)
			
 
				+    html_body = """
			
 
				+<!DOCTYPE html>
			
 
				+<html lang="en">
			
 
				+<body>
			
 
				+    {selectee}
			
 
				+</body>
			
 
				+</html>
			
 
				+"""
			
 
				+    html_body = html_body.format(selectee=selectee)
			
 
				+    mocked_responses.get(child_url, body=html_body, status=200, content_type="text/html")
			
 
				+
			
 
				+    url = "https://docs.embedchain.ai/"
			
 
				+    html_body = """
			
 
				+<!DOCTYPE html>
			
 
				+<html lang="en">
			
 
				+<body>
			
 
				+    <li><a href="/quickstart">Quickstart</a></li>
			
 
				+</body>
			
 
				+</html>
			
 
				+"""
			
 
				+    mocked_responses.get(url, body=html_body, status=200, content_type="text/html")
			
 
				+
			
 
				+    mock_sha256 = mocker.patch("embedchain.loaders.docs_site_loader.hashlib.sha256")
			
 
				+    doc_id = "mocked_hash"
			
 
				+    mock_sha256.return_value.hexdigest.return_value = doc_id
			
 
				+
			
 
				+    result = loader.load_data(url)
			
 
				+    selector_soup = BeautifulSoup(selectee, "html.parser")
			
 
				+    expected_content = " ".join((selector_soup.select_one("h2").get_text(), selector_soup.select_one("p").get_text()))
			
 
				+    assert result["doc_id"] == doc_id
			
 
				+    assert result["data"] == [
			
 
				+        {
			
 
				+            "content": expected_content,
			
 
				+            "meta_data": {"url": "https://docs.embedchain.ai/quickstart"},
			
 
				+        }
			
 
				+    ]
			
 
				+
			
 
				+
			
 
				+def test_load_data_gets_child_links_recursively(loader, mocked_responses, mocker):
			
 
				+    child_url = "https://docs.embedchain.ai/quickstart"
			
 
				+    html_body = """
			
 
				+<!DOCTYPE html>
			
 
				+<html lang="en">
			
 
				+<body>
			
 
				+    <li><a href="/">..</a></li>
			
 
				+    <li><a href="/quickstart">.</a></li>
			
 
				+</body>
			
 
				+</html>
			
 
				+"""
			
 
				+    mocked_responses.get(child_url, body=html_body, status=200, content_type="text/html")
			
 
				+
			
 
				+    child_url = "https://docs.embedchain.ai/introduction"
			
 
				+    html_body = """
			
 
				+<!DOCTYPE html>
			
 
				+<html lang="en">
			
 
				+<body>
			
 
				+    <li><a href="/">..</a></li>
			
 
				+    <li><a href="/introduction">.</a></li>
			
 
				+</body>
			
 
				+</html>
			
 
				+"""
			
 
				+    mocked_responses.get(child_url, body=html_body, status=200, content_type="text/html")
			
 
				+
			
 
				+    url = "https://docs.embedchain.ai/"
			
 
				+    html_body = """
			
 
				+<!DOCTYPE html>
			
 
				+<html lang="en">
			
 
				+<body>
			
 
				+    <li><a href="/quickstart">Quickstart</a></li>
			
 
				+    <li><a href="/introduction">Introduction</a></li>
			
 
				+</body>
			
 
				+</html>
			
 
				+"""
			
 
				+    mocked_responses.get(url, body=html_body, status=200, content_type="text/html")
			
 
				+
			
 
				+    mock_sha256 = mocker.patch("embedchain.loaders.docs_site_loader.hashlib.sha256")
			
 
				+    doc_id = "mocked_hash"
			
 
				+    mock_sha256.return_value.hexdigest.return_value = doc_id
			
 
				+
			
 
				+    result = loader.load_data(url)
			
 
				+    assert result["doc_id"] == doc_id
			
 
				+    expected_data = [
			
 
				+        {"content": "..\n.", "meta_data": {"url": "https://docs.embedchain.ai/quickstart"}},
			
 
				+        {"content": "..\n.", "meta_data": {"url": "https://docs.embedchain.ai/introduction"}},
			
 
				+    ]
			
 
				+    assert all(item in expected_data for item in result["data"])
			
 
				+
			
 
				+
			
 
				+def test_load_data_fails_to_fetch_website(loader, mocked_responses, mocker):
			
 
				+    child_url = "https://docs.embedchain.ai/introduction"
			
 
				+    mocked_responses.get(child_url, status=404)
			
 
				+
			
 
				+    url = "https://docs.embedchain.ai/"
			
 
				+    html_body = """
			
 
				+<!DOCTYPE html>
			
 
				+<html lang="en">
			
 
				+<body>
			
 
				+    <li><a href="/introduction">Introduction</a></li>
			
 
				+</body>
			
 
				+</html>
			
 
				+"""
			
 
				+    mocked_responses.get(url, body=html_body, status=200, content_type="text/html")
			
 
				+
			
 
				+    mock_sha256 = mocker.patch("embedchain.loaders.docs_site_loader.hashlib.sha256")
			
 
				+    doc_id = "mocked_hash"
			
 
				+    mock_sha256.return_value.hexdigest.return_value = doc_id
			
 
				+
			
 
				+    result = loader.load_data(url)
			
 
				+    assert result["doc_id"] is doc_id
			
 
				+    assert result["data"] == []
			
 
				+
			
 
				+
			
 
				+@pytest.fixture
			
 
				+def loader():
			
 
				+    from embedchain.loaders.docs_site_loader import DocsSiteLoader
			
 
				+
			
 
				+    return DocsSiteLoader()
			
 
				+
			
 
				+
			
 
				+@pytest.fixture
			
 
				+def mocked_responses():
			
 
				+    with responses.RequestsMock() as rsps:
			
 
				+        yield rsps