// page-analyzer.js (5.0 KB)

  // Keep the class definition at the top level of the file.
  /**
   * Extracts a summary of the current page (title, URL, main content, ...).
   *
   * Uses the global `Readability` constructor when it is available and falls
   * back to a simple DOM-based extraction when it is missing, returns no
   * article, or throws.
   */
  export default class PageAnalyzer {
    constructor() {
      // Most recent Readability instance created by analyzePage(), or null.
      this.readability = null;
    }

    /**
     * Analyze the current page.
     *
     * @param {boolean} [iframe=false] Forwarded to the fallback extraction;
     *   when truthy the fallback returns the markup of the first <form>
     *   instead of the cleaned body.
     * @returns {{title: string, url: string, mainContent: string,
     *   excerpt: string, siteName: string, wordCount: number}} Analysis result.
     */
    analyzePage(iframe = false) {
      try {
        // Readability is an optional global; degrade gracefully without it.
        if (typeof Readability === "undefined") {
          console.warn(
            "Readability not loaded, falling back to basic extraction"
          );
          return this.fallbackAnalysis(iframe);
        }

        // Parse a copy of the document so the live DOM is never modified.
        const documentClone = document.cloneNode(true);
        this.readability = new Readability(documentClone, {
          debug: false,
          charThreshold: 20,
        });

        const article = this.readability.parse();
        // parse() yields null when no article could be extracted; handle it
        // explicitly instead of relying on a TypeError reaching the catch.
        if (!article) {
          console.warn("Readability returned no article, using fallback");
          return this.fallbackAnalysis(iframe);
        }

        return {
          title: article.title || document.title,
          url: window.location.href,
          mainContent: article.textContent || article.excerpt || "",
          excerpt: article.excerpt || "",
          siteName: article.siteName || new URL(window.location.href).hostname,
          wordCount: article.length || 0,
        };
      } catch (error) {
        console.warn("Readability failed, using fallback:", error);
        // Forward `iframe` so the fallback honours the caller's mode.
        return this.fallbackAnalysis(iframe);
      }
    }

    /**
     * Basic extraction used when Readability cannot be used.
     *
     * @param {boolean} [iframe] See extractMainContent().
     * @returns {{title: string, url: string, mainContent: string,
     *   excerpt: string, siteName: string, wordCount: number}} Result with an
     *   empty excerpt and a word count of 0.
     */
    fallbackAnalysis(iframe) {
      return {
        title: document.title,
        url: window.location.href,
        mainContent: this.extractMainContent(iframe),
        excerpt: "",
        siteName: new URL(window.location.href).hostname,
        wordCount: 0,
      };
    }

    /**
     * Basic content extraction.
     *
     * @param {boolean} [iframe] When truthy, return the outer HTML of the
     *   first <form> in the document ("" when there is none). Otherwise
     *   return a cleaned, whitespace-collapsed HTML string built from a copy
     *   of document.body.
     * @returns {string} Extracted markup.
     */
    extractMainContent(iframe) {
      if (iframe) {
        // Only the page's form is of interest in this mode.
        const form = document.querySelector("form");
        return form ? form.outerHTML : "";
      }

      // Work on a copy of <body>; cleanPage (defined elsewhere in this file)
      // strips styles, comments, scripts and similar nodes from it in place.
      const content = document.body.cloneNode(true);
      cleanPage(content);

      // Attributes that are kept. id, class, style, href, src and target are
      // deliberately excluded; aria-* attributes are kept via the prefix
      // check further down.
      const standardAttributes = new Set([
        "alt", "title", "type", "value", "name", "placeholder", "disabled",
        "checked", "selected", "readonly", "required", "maxlength", "min",
        "max", "step", "pattern", "autocomplete", "autofocus", "multiple",
        "rows", "cols", "rel",
      ]);

      // Re-parse the cleaned markup inside a detached container.
      const temp = document.createElement("div");
      temp.innerHTML = content.outerHTML;

      // Depth-first removal of elements whose markup is empty; children go
      // first so a parent left empty by their removal is dropped as well.
      // NOTE(review): void elements (<input>, <img>, <br>, ...) always have
      // an empty innerHTML and are therefore removed too — confirm that this
      // is intended, given that the whitelist keeps form-control attributes.
      const removeEmpty = (element) => {
        for (const child of Array.from(element.children)) {
          removeEmpty(child);
        }
        if (!element.innerHTML.trim()) {
          element.remove();
        }
      };
      removeEmpty(temp);

      // Drop <link> elements.
      temp.querySelectorAll("link").forEach((link) => link.remove());

      // Strip every attribute that is neither whitelisted nor aria-*.
      temp.querySelectorAll("*").forEach((element) => {
        for (const attr of Array.from(element.attributes)) {
          const isKept =
            standardAttributes.has(attr.name) || attr.name.startsWith("aria-");
          if (!isKept) {
            element.removeAttribute(attr.name);
          }
        }
      });

      // Serialize and collapse every whitespace run to a single space.
      return temp.innerHTML.trim().replace(/\s+/g, " ");
    }
  }
  /**
   * Strip presentation and non-content nodes from a DOM subtree, in place.
   *
   * In order: removes every `style` attribute, every comment node, every
   * script/style/iframe/nav/header/footer/svg element, the `src` attribute of
   * every <img>, and every element matching the ad-related class selectors.
   * Logs a completion message when done.
   *
   * @param {Element} body Root of the subtree to clean (mutated).
   * @returns {void}
   */
  function cleanPage(body) {
    // Inline styles.
    for (const styled of body.querySelectorAll('[style]')) {
      styled.removeAttribute('style');
    }

    // Comment nodes: gather them first, detach afterwards, so the walker is
    // never advanced over a tree that is being modified.
    const commentWalker = document.createTreeWalker(body, NodeFilter.SHOW_COMMENT, null, false);
    const commentNodes = [];
    while (commentWalker.nextNode()) {
      commentNodes.push(commentWalker.currentNode);
    }
    for (const node of commentNodes) {
      node.remove();
    }

    // Scripts, styles, embedded frames, page chrome and SVG graphics.
    for (const unwanted of body.querySelectorAll("script, style, iframe, nav, header, footer,svg")) {
      unwanted.remove();
    }

    // Images stay in the tree but lose their source URL.
    for (const image of body.querySelectorAll('img')) {
      image.removeAttribute('src');
    }

    // Elements carrying one of the ad-related class names.
    for (const adElement of body.querySelectorAll('.ad, .advertisement, .ads')) {
      adElement.remove();
    }

    console.log('页面清理完成!');
  }