123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173 |
/**
 * Builds a summary of the current page: title, URL, main content, excerpt,
 * site name and a length figure.
 *
 * Uses the global `Readability` constructor when one is loaded and otherwise
 * falls back to a plain DOM-based extraction. The class is declared at the top
 * level of the module and exported as its default.
 */
export default class PageAnalyzer {
  constructor() {
    // Most recent Readability instance created by analyzePage(); null until then.
    this.readability = null;
  }

  /**
   * Analyze the page content.
   *
   * @param {boolean} [iframe=false] - Forwarded to the fallback extraction;
   *   when truthy the fallback returns the markup of the first <form> instead
   *   of the cleaned page body.
   * @returns {Object} Analysis result with the keys `title`, `url`,
   *   `mainContent`, `excerpt`, `siteName` and `wordCount`.
   */
  analyzePage(iframe = false) {
    let article = null;

    try {
      if (typeof Readability === "undefined") {
        console.warn(
          "Readability not loaded, falling back to basic extraction"
        );
      } else {
        // Parse a copy of the document so the live DOM is never handed to Readability.
        const documentClone = document.cloneNode(true);
        this.readability = new Readability(documentClone, {
          debug: false,
          charThreshold: 20,
        });
        article = this.readability.parse();
        if (!article) {
          // parse() produced nothing usable; handle it explicitly rather than
          // relying on a TypeError from dereferencing a null article.
          console.warn("Readability returned no article, using fallback");
        }
      }
    } catch (error) {
      console.warn("Readability failed, using fallback:", error);
      article = null;
    }

    if (!article) {
      // Every fallback path honours the caller's `iframe` flag.
      return this.fallbackAnalysis(iframe);
    }

    return {
      title: article.title || document.title,
      url: window.location.href,
      mainContent: article.textContent || article.excerpt || "",
      excerpt: article.excerpt || "",
      siteName: article.siteName || new URL(window.location.href).hostname,
      wordCount: article.length || 0,
    };
  }

  /**
   * Basic extraction used when Readability is unavailable or fails.
   *
   * @param {boolean} [iframe] - Passed through to extractMainContent().
   * @returns {Object} Same shape as analyzePage(); `excerpt` is always ""
   *   and `wordCount` is always 0.
   */
  fallbackAnalysis(iframe) {
    return {
      title: document.title,
      url: window.location.href,
      mainContent: this.extractMainContent(iframe),
      excerpt: "",
      siteName: new URL(window.location.href).hostname,
      wordCount: 0,
    };
  }

  /**
   * Basic content extraction.
   *
   * @param {boolean} [iframe] - When truthy, return the outer HTML of the first
   *   <form> in the document ("" when there is none). Otherwise return a
   *   cleaned, whitespace-collapsed HTML string built from a copy of
   *   `document.body`.
   * @returns {string} Extracted markup.
   */
  extractMainContent(iframe) {
    if (iframe) {
      // Only the page's form is of interest in this mode.
      const form = document.querySelector("form");
      return form ? form.outerHTML : "";
    }

    // Work on a copy so the live page is not modified.
    const content = document.body.cloneNode(true);
    cleanPage(content);

    // Attributes that survive the cleanup. id, class, style, href, src and
    // target are deliberately absent; aria-* attributes are kept by the prefix
    // check further down.
    const standardAttributes = new Set([
      "alt",
      "title",
      "type",
      "value",
      "name",
      "placeholder",
      "disabled",
      "checked",
      "selected",
      "readonly",
      "required",
      "maxlength",
      "min",
      "max",
      "step",
      "pattern",
      "autocomplete",
      "autofocus",
      "multiple",
      "rows",
      "cols",
      "rel",
    ]);

    // Detached container that receives the cleaned markup for further pruning.
    const temp = document.createElement("div");
    temp.innerHTML = content.outerHTML;

    // Depth-first removal of elements whose inner markup is blank. Children are
    // processed first so a parent that becomes empty is removed as well.
    // NOTE(review): elements that can never contain markup (e.g. <input>,
    // <img>, <br>) also satisfy this test and are removed - confirm that is
    // intended, given that form-field attributes are whitelisted above.
    const removeEmpty = (element) => {
      Array.from(element.children).forEach((child) => removeEmpty(child));
      if (!element.innerHTML.trim()) {
        element.remove();
      }
    };
    removeEmpty(temp);

    // Drop <link> tags.
    temp.querySelectorAll("link").forEach((link) => link.remove());

    // Strip every attribute that is neither whitelisted nor an aria-* attribute.
    temp.querySelectorAll("*").forEach((element) => {
      Array.from(element.attributes).forEach((attr) => {
        if (!standardAttributes.has(attr.name) && !attr.name.startsWith("aria-")) {
          element.removeAttribute(attr.name);
        }
      });
    });

    // Collapse every run of whitespace into a single space.
    return temp.innerHTML.trim().replace(/\s+/g, " ");
  }
}
/**
 * Strip presentation details and boilerplate from a DOM subtree, in place.
 *
 * In order: removes every inline `style` attribute, detaches all comment
 * nodes, removes script/style/iframe/nav/header/footer/svg elements, removes
 * the `src` attribute from every <img>, removes elements matching the
 * `.ad`, `.advertisement` or `.ads` classes, and finally logs a completion
 * message.
 *
 * @param {Element} body - Root of the subtree to clean; it is mutated.
 * @returns {void}
 */
function cleanPage(body) {
  // Inline styles.
  for (const styled of body.querySelectorAll('[style]')) {
    styled.removeAttribute('style');
  }

  // Comment nodes: gather them all before detaching any, so no node is
  // removed while the walker is still traversing the tree.
  const commentWalker = document.createTreeWalker(body, NodeFilter.SHOW_COMMENT, null, false);
  const pendingComments = [];
  for (let node = commentWalker.nextNode(); node; node = commentWalker.nextNode()) {
    pendingComments.push(node);
  }
  for (const commentNode of pendingComments) {
    commentNode.remove();
  }

  // Scripts, stylesheets, embedded frames, page chrome and SVG icons.
  for (const unwanted of body.querySelectorAll("script, style, iframe, nav, header, footer,svg")) {
    unwanted.remove();
  }

  // Image sources (the <img> elements themselves stay).
  for (const image of body.querySelectorAll('img')) {
    image.removeAttribute('src');
  }

  // Elements carrying common advertisement class names.
  for (const advert of body.querySelectorAll('.ad, .advertisement, .ads')) {
    advert.remove();
  }

  console.log('页面清理完成!');
}
|