test_docs_site_loader.py 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218
  1. import pytest
  2. import responses
  3. from bs4 import BeautifulSoup
  4. @pytest.mark.parametrize(
  5. "ignored_tag",
  6. [
  7. "<nav>This is a navigation bar.</nav>",
  8. "<aside>This is an aside.</aside>",
  9. "<form>This is a form.</form>",
  10. "<header>This is a header.</header>",
  11. "<noscript>This is a noscript.</noscript>",
  12. "<svg>This is an SVG.</svg>",
  13. "<canvas>This is a canvas.</canvas>",
  14. "<footer>This is a footer.</footer>",
  15. "<script>This is a script.</script>",
  16. "<style>This is a style.</style>",
  17. ],
  18. ids=["nav", "aside", "form", "header", "noscript", "svg", "canvas", "footer", "script", "style"],
  19. )
  20. @pytest.mark.parametrize(
  21. "selectee",
  22. [
  23. """
  24. <article class="bd-article">
  25. <h2>Article Title</h2>
  26. <p>Article content goes here.</p>
  27. {ignored_tag}
  28. </article>
  29. """,
  30. """
  31. <article role="main">
  32. <h2>Main Article Title</h2>
  33. <p>Main article content goes here.</p>
  34. {ignored_tag}
  35. </article>
  36. """,
  37. """
  38. <div class="md-content">
  39. <h2>Markdown Content</h2>
  40. <p>Markdown content goes here.</p>
  41. {ignored_tag}
  42. </div>
  43. """,
  44. """
  45. <div role="main">
  46. <h2>Main Content</h2>
  47. <p>Main content goes here.</p>
  48. {ignored_tag}
  49. </div>
  50. """,
  51. """
  52. <div class="container">
  53. <h2>Container</h2>
  54. <p>Container content goes here.</p>
  55. {ignored_tag}
  56. </div>
  57. """,
  58. """
  59. <div class="section">
  60. <h2>Section</h2>
  61. <p>Section content goes here.</p>
  62. {ignored_tag}
  63. </div>
  64. """,
  65. """
  66. <article>
  67. <h2>Generic Article</h2>
  68. <p>Generic article content goes here.</p>
  69. {ignored_tag}
  70. </article>
  71. """,
  72. """
  73. <main>
  74. <h2>Main Content</h2>
  75. <p>Main content goes here.</p>
  76. {ignored_tag}
  77. </main>
  78. """,
  79. ],
  80. ids=[
  81. "article.bd-article",
  82. 'article[role="main"]',
  83. "div.md-content",
  84. 'div[role="main"]',
  85. "div.container",
  86. "div.section",
  87. "article",
  88. "main",
  89. ],
  90. )
  91. def test_load_data_gets_by_selectors_and_ignored_tags(selectee, ignored_tag, loader, mocked_responses, mocker):
  92. child_url = "https://docs.embedchain.ai/quickstart"
  93. selectee = selectee.format(ignored_tag=ignored_tag)
  94. html_body = """
  95. <!DOCTYPE html>
  96. <html lang="en">
  97. <body>
  98. {selectee}
  99. </body>
  100. </html>
  101. """
  102. html_body = html_body.format(selectee=selectee)
  103. mocked_responses.get(child_url, body=html_body, status=200, content_type="text/html")
  104. url = "https://docs.embedchain.ai/"
  105. html_body = """
  106. <!DOCTYPE html>
  107. <html lang="en">
  108. <body>
  109. <li><a href="/quickstart">Quickstart</a></li>
  110. </body>
  111. </html>
  112. """
  113. mocked_responses.get(url, body=html_body, status=200, content_type="text/html")
  114. mock_sha256 = mocker.patch("embedchain.loaders.docs_site_loader.hashlib.sha256")
  115. doc_id = "mocked_hash"
  116. mock_sha256.return_value.hexdigest.return_value = doc_id
  117. result = loader.load_data(url)
  118. selector_soup = BeautifulSoup(selectee, "html.parser")
  119. expected_content = " ".join((selector_soup.select_one("h2").get_text(), selector_soup.select_one("p").get_text()))
  120. assert result["doc_id"] == doc_id
  121. assert result["data"] == [
  122. {
  123. "content": expected_content,
  124. "meta_data": {"url": "https://docs.embedchain.ai/quickstart"},
  125. }
  126. ]
  127. def test_load_data_gets_child_links_recursively(loader, mocked_responses, mocker):
  128. child_url = "https://docs.embedchain.ai/quickstart"
  129. html_body = """
  130. <!DOCTYPE html>
  131. <html lang="en">
  132. <body>
  133. <li><a href="/">..</a></li>
  134. <li><a href="/quickstart">.</a></li>
  135. </body>
  136. </html>
  137. """
  138. mocked_responses.get(child_url, body=html_body, status=200, content_type="text/html")
  139. child_url = "https://docs.embedchain.ai/introduction"
  140. html_body = """
  141. <!DOCTYPE html>
  142. <html lang="en">
  143. <body>
  144. <li><a href="/">..</a></li>
  145. <li><a href="/introduction">.</a></li>
  146. </body>
  147. </html>
  148. """
  149. mocked_responses.get(child_url, body=html_body, status=200, content_type="text/html")
  150. url = "https://docs.embedchain.ai/"
  151. html_body = """
  152. <!DOCTYPE html>
  153. <html lang="en">
  154. <body>
  155. <li><a href="/quickstart">Quickstart</a></li>
  156. <li><a href="/introduction">Introduction</a></li>
  157. </body>
  158. </html>
  159. """
  160. mocked_responses.get(url, body=html_body, status=200, content_type="text/html")
  161. mock_sha256 = mocker.patch("embedchain.loaders.docs_site_loader.hashlib.sha256")
  162. doc_id = "mocked_hash"
  163. mock_sha256.return_value.hexdigest.return_value = doc_id
  164. result = loader.load_data(url)
  165. assert result["doc_id"] == doc_id
  166. expected_data = [
  167. {"content": "..\n.", "meta_data": {"url": "https://docs.embedchain.ai/quickstart"}},
  168. {"content": "..\n.", "meta_data": {"url": "https://docs.embedchain.ai/introduction"}},
  169. ]
  170. assert all(item in expected_data for item in result["data"])
  171. def test_load_data_fails_to_fetch_website(loader, mocked_responses, mocker):
  172. child_url = "https://docs.embedchain.ai/introduction"
  173. mocked_responses.get(child_url, status=404)
  174. url = "https://docs.embedchain.ai/"
  175. html_body = """
  176. <!DOCTYPE html>
  177. <html lang="en">
  178. <body>
  179. <li><a href="/introduction">Introduction</a></li>
  180. </body>
  181. </html>
  182. """
  183. mocked_responses.get(url, body=html_body, status=200, content_type="text/html")
  184. mock_sha256 = mocker.patch("embedchain.loaders.docs_site_loader.hashlib.sha256")
  185. doc_id = "mocked_hash"
  186. mock_sha256.return_value.hexdigest.return_value = doc_id
  187. result = loader.load_data(url)
  188. assert result["doc_id"] is doc_id
  189. assert result["data"] == []
  190. @pytest.fixture
  191. def loader():
  192. from embedchain.loaders.docs_site_loader import DocsSiteLoader
  193. return DocsSiteLoader()
  194. @pytest.fixture
  195. def mocked_responses():
  196. with responses.RequestsMock() as rsps:
  197. yield rsps