// Keep the class definition at module top level.

/**
 * Attribute names that survive the fallback HTML clean-up. Everything else
 * (id, class, style, href, src, target, data-*, event handlers, ...) is
 * stripped. `aria-*` attributes are kept as well; they are matched by prefix
 * in `stripNonStandardAttributes` rather than listed here.
 */
const STANDARD_ATTRIBUTES = new Set([
  "alt",
  "title",
  "type",
  "value",
  "name",
  "placeholder",
  "disabled",
  "checked",
  "selected",
  "readonly",
  "required",
  "maxlength",
  "min",
  "max",
  "step",
  "pattern",
  "autocomplete",
  "autofocus",
  "multiple",
  "rows",
  "cols",
  "rel",
]);

/**
 * Recursively removes elements whose markup is empty or whitespace-only.
 * Children are processed first, so a parent that becomes empty once its
 * children are gone is removed too.
 *
 * NOTE(review): void elements (input, img, br, ...) always have an empty
 * innerHTML and are therefore removed as well, which means whitelisted
 * attributes such as `value`/`placeholder`/`alt` rarely survive. Behavior is
 * kept as in the original; confirm whether this is intended.
 *
 * @param {Element} element - Root of the subtree to prune.
 */
function removeEmptyElements(element) {
  // Snapshot the live collection because children may be removed while iterating.
  for (const child of Array.from(element.children)) {
    removeEmptyElements(child);
  }
  if (!element.innerHTML.trim()) {
    element.remove();
  }
}

/**
 * Removes every attribute that is neither in STANDARD_ATTRIBUTES nor an
 * `aria-*` attribute from all descendants of `root`.
 *
 * @param {Element} root - Container whose descendants are cleaned in place.
 */
function stripNonStandardAttributes(root) {
  root.querySelectorAll("*").forEach((element) => {
    // Snapshot the live NamedNodeMap before mutating it.
    Array.from(element.attributes).forEach((attr) => {
      const isAllowed =
        STANDARD_ATTRIBUTES.has(attr.name) || attr.name.startsWith("aria-");
      if (!isAllowed) {
        element.removeAttribute(attr.name);
      }
    });
  });
}

export default class PageAnalyzer {
  constructor() {
    // Holds the most recent Readability instance created by analyzePage().
    this.readability = null;
  }

  /**
   * Analyzes the current page.
   *
   * Uses the global `Readability` when it is available and yields an article;
   * otherwise falls back to the basic DOM-based extraction.
   *
   * @param {boolean} [iframe=false] - Forwarded to the fallback extraction;
   *   when true the fallback returns the first form's HTML instead of the
   *   cleaned page body.
   * @returns {{title: string, url: string, mainContent: string,
   *   excerpt: string, siteName: string, wordCount: number}} Analysis result.
   */
  analyzePage(iframe = false) {
    try {
      // Check that Readability has been loaded.
      if (typeof Readability === "undefined") {
        console.warn(
          "Readability not loaded, falling back to basic extraction"
        );
        return this.fallbackAnalysis(iframe);
      }

      // Work on a copy so the live DOM is not modified.
      const documentClone = document.cloneNode(true);

      this.readability = new Readability(documentClone, {
        debug: false,
        charThreshold: 20,
      });

      const article = this.readability.parse();
      if (!article) {
        // parse() returned nothing usable; handle it explicitly instead of
        // relying on a TypeError from the property reads below.
        console.warn("Readability returned no article, using fallback");
        return this.fallbackAnalysis(iframe);
      }

      return {
        title: article.title || document.title,
        url: window.location.href,
        mainContent: article.textContent || article.excerpt || "",
        excerpt: article.excerpt || "",
        siteName: article.siteName || new URL(window.location.href).hostname,
        // NOTE(review): Readability documents `length` as a character count,
        // not a word count - confirm what consumers of `wordCount` expect.
        wordCount: article.length || 0,
      };
    } catch (error) {
      console.warn("Readability failed, using fallback:", error);
      // Forward `iframe` so the fallback honours the caller's mode.
      return this.fallbackAnalysis(iframe);
    }
  }

  /**
   * Basic extraction used when Readability is unavailable or fails.
   *
   * @param {boolean} [iframe] - See extractMainContent().
   * @returns {{title: string, url: string, mainContent: string,
   *   excerpt: string, siteName: string, wordCount: number}} Analysis result
   *   with an empty excerpt and a word count of 0.
   */
  fallbackAnalysis(iframe) {
    return {
      title: document.title,
      url: window.location.href,
      mainContent: this.extractMainContent(iframe),
      excerpt: "",
      siteName: new URL(window.location.href).hostname,
      wordCount: 0,
    };
  }

  /**
   * Basic content extraction.
   *
   * @param {boolean} [iframe] - When truthy, returns the outer HTML of the
   *   first `<form>` in the document, or an empty string if there is none.
   *   Otherwise returns the cleaned, whitespace-collapsed HTML of a copy of
   *   `document.body`.
   * @returns {string} Extracted HTML.
   */
  extractMainContent(iframe) {
    if (iframe) {
      // Extract the page's form.
      const form = document.querySelector("form");
      return form ? form.outerHTML : "";
    }

    const content = document.body.cloneNode(true);
    cleanPage(content);

    // Re-parse the cleaned markup inside a detached container.
    const temp = document.createElement("div");
    temp.innerHTML = content.outerHTML;

    removeEmptyElements(temp);

    // Drop <link> elements.
    temp.querySelectorAll("link").forEach((link) => {
      link.remove();
    });

    stripNonStandardAttributes(temp);

    // Serialize and collapse every whitespace run into a single space.
    const cleanedHtml = temp.innerHTML.trim().replace(/\s+/g, " ");
    console.log(cleanedHtml);
    return cleanedHtml;
  }
}

/**
 * Strips presentation and noise from a DOM subtree in place: inline styles,
 * comment nodes, script/style/iframe/nav/header/footer/svg elements, image
 * `src` attributes and elements carrying common ad class names.
 *
 * @param {Element} body - Root element to clean (mutated).
 */
function cleanPage(body) {
  // Remove all inline styles.
  body.querySelectorAll("[style]").forEach((element) => {
    element.removeAttribute("style");
  });

  // Remove all comment nodes. Collect first, then remove, so the walker is
  // never advanced from a node that has already been detached.
  const comments = [];
  const walker = document.createTreeWalker(body, NodeFilter.SHOW_COMMENT);
  while (walker.nextNode()) {
    comments.push(walker.currentNode);
  }
  comments.forEach((comment) => comment.remove());

  // Remove scripts, styles, iframes, page chrome (nav/header/footer) and SVGs.
  body
    .querySelectorAll("script, style, iframe, nav, header, footer,svg")
    .forEach((element) => element.remove());

  // Remove the src attribute of every image.
  body.querySelectorAll("img").forEach((img) => {
    img.removeAttribute("src");
  });

  // Remove elements marked with common advertisement class names.
  body
    .querySelectorAll(".ad, .advertisement, .ads")
    .forEach((ad) => ad.remove());

  console.log('页面清理完成!');
}