import pytest import responses from bs4 import BeautifulSoup @pytest.mark.parametrize( "ignored_tag", [ "", "", "", "

This is a header.

", "", "", "

", "", "", "", ], ids=["nav", "aside", "form", "header", "noscript", "svg", "canvas", "footer", "script", "style"], ) @pytest.mark.parametrize( "selectee", [ """

Article Title

Article content goes here.

{ignored_tag}

""", """

Main Article Title

Main article content goes here.

{ignored_tag}

""", """

Markdown Content

Markdown content goes here.

{ignored_tag}

""", """

Main Content

Main content goes here.

{ignored_tag}

""", """

Container

Container content goes here.

{ignored_tag}

""", """

Section

Section content goes here.

{ignored_tag}

""", """

Generic Article

Generic article content goes here.

{ignored_tag}

""", """

Main Content

Main content goes here.

{ignored_tag}

""", ], ids=[ "article.bd-article", 'article[role="main"]', "div.md-content", 'div[role="main"]', "div.container", "div.section", "article", "main", ], ) def test_load_data_gets_by_selectors_and_ignored_tags(selectee, ignored_tag, loader, mocked_responses, mocker): child_url = "https://docs.embedchain.ai/quickstart" selectee = selectee.format(ignored_tag=ignored_tag) html_body = """ {selectee} """ html_body = html_body.format(selectee=selectee) mocked_responses.get(child_url, body=html_body, status=200, content_type="text/html") url = "https://docs.embedchain.ai/" html_body = """

Quickstart

""" mocked_responses.get(url, body=html_body, status=200, content_type="text/html") mock_sha256 = mocker.patch("embedchain.loaders.docs_site_loader.hashlib.sha256") doc_id = "mocked_hash" mock_sha256.return_value.hexdigest.return_value = doc_id result = loader.load_data(url) selector_soup = BeautifulSoup(selectee, "html.parser") expected_content = " ".join((selector_soup.select_one("h2").get_text(), selector_soup.select_one("p").get_text())) assert result["doc_id"] == doc_id assert result["data"] == [ { "content": expected_content, "meta_data": {"url": "https://docs.embedchain.ai/quickstart"}, } ] def test_load_data_gets_child_links_recursively(loader, mocked_responses, mocker): child_url = "https://docs.embedchain.ai/quickstart" html_body = """

""" mocked_responses.get(child_url, body=html_body, status=200, content_type="text/html") child_url = "https://docs.embedchain.ai/introduction" html_body = """

""" mocked_responses.get(child_url, body=html_body, status=200, content_type="text/html") url = "https://docs.embedchain.ai/" html_body = """

Quickstart

Introduction

""" mocked_responses.get(url, body=html_body, status=200, content_type="text/html") mock_sha256 = mocker.patch("embedchain.loaders.docs_site_loader.hashlib.sha256") doc_id = "mocked_hash" mock_sha256.return_value.hexdigest.return_value = doc_id result = loader.load_data(url) assert result["doc_id"] == doc_id expected_data = [ {"content": "..\n.", "meta_data": {"url": "https://docs.embedchain.ai/quickstart"}}, {"content": "..\n.", "meta_data": {"url": "https://docs.embedchain.ai/introduction"}}, ] assert all(item in expected_data for item in result["data"]) def test_load_data_fails_to_fetch_website(loader, mocked_responses, mocker): child_url = "https://docs.embedchain.ai/introduction" mocked_responses.get(child_url, status=404) url = "https://docs.embedchain.ai/" html_body = """

Introduction