|
@@ -1,123 +1,173 @@
|
|
|
// 确保类定义在全局作用域
|
|
|
// Attribute names preserved in the fallback HTML extract. Everything else
// (id, class, style, href, src, target, data-*, ...) is stripped. Attributes
// starting with "aria-" are also preserved; they are matched by prefix in
// stripNonStandardAttributes rather than listed here.
const STANDARD_ATTRIBUTES = new Set([
  "alt",
  "title",
  "type",
  "value",
  "name",
  "placeholder",
  "disabled",
  "checked",
  "selected",
  "readonly",
  "required",
  "maxlength",
  "min",
  "max",
  "step",
  "pattern",
  "autocomplete",
  "autofocus",
  "multiple",
  "rows",
  "cols",
  "rel",
]);

/**
 * Extracts a summary of the current page, preferring the `Readability`
 * global when it is available and falling back to a plain DOM-based
 * extraction otherwise.
 */
export default class PageAnalyzer {
  constructor() {
    // Most recent Readability instance created by analyzePage, or null.
    this.readability = null;
  }

  /**
   * Analyzes the current document.
   *
   * @param {boolean} [iframe=false] - Forwarded to the fallback extraction;
   *   when truthy the fallback returns the first form's markup instead of
   *   the cleaned page body.
   * @returns {{title: string, url: string, mainContent: string,
   *   excerpt: string, siteName: string, wordCount: number}} Analysis result.
   */
  analyzePage(iframe = false) {
    try {
      if (typeof Readability === "undefined") {
        console.warn(
          "Readability not loaded, falling back to basic extraction"
        );
        return this.fallbackAnalysis(iframe);
      }

      // Parse a copy so the live DOM is not modified by Readability.
      const documentClone = document.cloneNode(true);
      this.readability = new Readability(documentClone, {
        debug: false,
        charThreshold: 20,
      });

      const article = this.readability.parse();
      if (!article) {
        // parse() yields null when no article could be extracted.
        console.warn("Readability found no article, using fallback");
        return this.fallbackAnalysis(iframe);
      }

      return {
        title: article.title || document.title,
        url: window.location.href,
        mainContent: article.textContent || article.excerpt || "",
        excerpt: article.excerpt || "",
        siteName: article.siteName || new URL(window.location.href).hostname,
        wordCount: article.length || 0,
      };
    } catch (error) {
      console.warn("Readability failed, using fallback:", error);
      // Keep the caller's iframe mode when falling back after a failure.
      return this.fallbackAnalysis(iframe);
    }
  }

  /**
   * Basic extraction used when Readability is unavailable or fails.
   *
   * @param {boolean} [iframe] - See extractMainContent.
   * @returns {{title: string, url: string, mainContent: string,
   *   excerpt: string, siteName: string, wordCount: number}} Result with an
   *   empty excerpt and a word count of 0.
   */
  fallbackAnalysis(iframe) {
    return {
      title: document.title,
      url: window.location.href,
      mainContent: this.extractMainContent(iframe),
      excerpt: "",
      siteName: new URL(window.location.href).hostname,
      wordCount: 0,
    };
  }

  /**
   * Produces an HTML string representing the page content.
   *
   * @param {boolean} [iframe] - When truthy, returns the outerHTML of the
   *   first <form> in the document (or "" if there is none). Otherwise
   *   returns the cleaned, whitespace-collapsed markup of the page body.
   * @returns {string} Extracted markup.
   */
  extractMainContent(iframe) {
    if (iframe) {
      const form = document.querySelector("form");
      return form ? form.outerHTML : "";
    }

    const content = document.body.cloneNode(true);
    cleanPage(content);

    // Load the cleaned markup into a detached container; the container's
    // innerHTML is what gets returned.
    const temp = document.createElement("div");
    temp.innerHTML = content.outerHTML;

    temp.querySelectorAll("link").forEach((link) => link.remove());

    // Strip attributes before pruning so that an element is judged empty
    // by what will actually remain in the output.
    this.stripNonStandardAttributes(temp);
    this.removeEmptyElements(temp);

    return temp.innerHTML.trim().replace(/\s+/g, " ");
  }

  /**
   * Removes every attribute that is neither in STANDARD_ATTRIBUTES nor
   * prefixed with "aria-" from all descendants of `root`.
   *
   * @param {Element} root - Container whose descendants are processed.
   */
  stripNonStandardAttributes(root) {
    root.querySelectorAll("*").forEach((element) => {
      // Snapshot first: element.attributes changes as attributes are removed.
      Array.from(element.attributes).forEach((attr) => {
        const keep =
          STANDARD_ATTRIBUTES.has(attr.name) || attr.name.startsWith("aria-");
        if (!keep) {
          element.removeAttribute(attr.name);
        }
      });
    });
  }

  /**
   * Recursively removes descendants of `root` that have no non-whitespace
   * markup and no remaining attributes. Elements that still carry an
   * attribute (e.g. an <input> with a name or placeholder) are kept even
   * though their innerHTML is empty. `root` itself is never removed.
   *
   * @param {Element} root - Container whose descendants are pruned.
   */
  removeEmptyElements(root) {
    // Snapshot first: root.children changes as children are removed.
    Array.from(root.children).forEach((child) => {
      this.removeEmptyElements(child);
      if (!child.innerHTML.trim() && child.attributes.length === 0) {
        child.remove();
      }
    });
  }
}
|
|
|
|
|
|
/**
 * Strips presentation and non-content nodes from a DOM subtree, in place.
 *
 * Removes inline `style` attributes, comment nodes, script/style/iframe/
 * nav/header/footer/svg elements, the `src` attribute of images, and
 * elements whose class is `ad`, `advertisement` or `ads`.
 *
 * @param {Element} body - Root of the subtree to clean. A null or undefined
 *   value is ignored.
 * @returns {void}
 */
function cleanPage(body) {
  if (!body) {
    return;
  }

  // Drop inline styles.
  for (const element of body.querySelectorAll('[style]')) {
    element.removeAttribute('style');
  }

  // Collect comment nodes first, then remove them, so the walker is not
  // advanced from a node that has already been detached.
  const comments = [];
  const walker = document.createTreeWalker(body, NodeFilter.SHOW_COMMENT);
  while (walker.nextNode()) {
    comments.push(walker.currentNode);
  }
  for (const comment of comments) {
    comment.remove();
  }

  // Drop scripts, stylesheets, embedded frames, page chrome and SVG.
  for (const node of body.querySelectorAll("script, style, iframe, nav, header, footer,svg")) {
    node.remove();
  }

  // Images keep their element (and e.g. alt text) but lose their source URL.
  for (const img of body.querySelectorAll('img')) {
    img.removeAttribute('src');
  }

  // Drop elements marked as advertisements by class name.
  for (const ad of body.querySelectorAll('.ad, .advertisement, .ads')) {
    ad.remove();
  }

  console.log('页面清理完成!');
}
|
|
|
|
|
|
|