# test_web_page.py (3.3 KB)
  1. import hashlib
  2. from unittest.mock import Mock, patch
  3. import pytest
  4. from embedchain.loaders.web_page import WebPageLoader
  5. @pytest.fixture
  6. def web_page_loader():
  7. return WebPageLoader()
  8. def test_load_data(web_page_loader):
  9. page_url = "https://example.com/page"
  10. mock_response = Mock()
  11. mock_response.status_code = 200
  12. mock_response.content = """
  13. <html>
  14. <head>
  15. <title>Test Page</title>
  16. </head>
  17. <body>
  18. <div id="content">
  19. <p>This is some test content.</p>
  20. </div>
  21. </body>
  22. </html>
  23. """
  24. with patch("embedchain.loaders.web_page.WebPageLoader._session.get", return_value=mock_response):
  25. result = web_page_loader.load_data(page_url)
  26. content = web_page_loader._get_clean_content(mock_response.content, page_url)
  27. expected_doc_id = hashlib.sha256((content + page_url).encode()).hexdigest()
  28. assert result["doc_id"] == expected_doc_id
  29. expected_data = [
  30. {
  31. "content": content,
  32. "meta_data": {
  33. "url": page_url,
  34. },
  35. }
  36. ]
  37. assert result["data"] == expected_data
  38. def test_get_clean_content_excludes_unnecessary_info(web_page_loader):
  39. mock_html = """
  40. <html>
  41. <head>
  42. <title>Sample HTML</title>
  43. <style>
  44. /* Stylesheet to be excluded */
  45. .elementor-location-header {
  46. background-color: #f0f0f0;
  47. }
  48. </style>
  49. </head>
  50. <body>
  51. <header id="header">Header Content</header>
  52. <nav class="nav">Nav Content</nav>
  53. <aside>Aside Content</aside>
  54. <form>Form Content</form>
  55. <main>Main Content</main>
  56. <footer class="footer">Footer Content</footer>
  57. <script>Some Script</script>
  58. <noscript>NoScript Content</noscript>
  59. <svg>SVG Content</svg>
  60. <canvas>Canvas Content</canvas>
  61. <div id="sidebar">Sidebar Content</div>
  62. <div id="main-navigation">Main Navigation Content</div>
  63. <div id="menu-main-menu">Menu Main Menu Content</div>
  64. <div class="header-sidebar-wrapper">Header Sidebar Wrapper Content</div>
  65. <div class="blog-sidebar-wrapper">Blog Sidebar Wrapper Content</div>
  66. <div class="related-posts">Related Posts Content</div>
  67. </body>
  68. </html>
  69. """
  70. tags_to_exclude = [
  71. "nav",
  72. "aside",
  73. "form",
  74. "header",
  75. "noscript",
  76. "svg",
  77. "canvas",
  78. "footer",
  79. "script",
  80. "style",
  81. ]
  82. ids_to_exclude = ["sidebar", "main-navigation", "menu-main-menu"]
  83. classes_to_exclude = [
  84. "elementor-location-header",
  85. "navbar-header",
  86. "nav",
  87. "header-sidebar-wrapper",
  88. "blog-sidebar-wrapper",
  89. "related-posts",
  90. ]
  91. content = web_page_loader._get_clean_content(mock_html, "https://example.com/page")
  92. for tag in tags_to_exclude:
  93. assert tag not in content
  94. for id in ids_to_exclude:
  95. assert id not in content
  96. for class_name in classes_to_exclude:
  97. assert class_name not in content
  98. assert len(content) > 0