// page-analyzer.js
  // Attach the class to `window` so it is defined in the global scope.
  window.PageAnalyzer = class PageAnalyzer {
    constructor() {
      // Holds the Readability instance created by the most recent
      // analyzePage() call that got as far as constructing one; null before.
      this.readability = null;
    }

    /**
     * Analyze the current page.
     *
     * Uses the global `Readability` constructor when it is defined. Falls
     * back to fallbackAnalysis() when `Readability` is not loaded, when its
     * parse() returns nothing, or when anything in the Readability path
     * throws.
     *
     * @returns {{title: string, url: string, mainContent: string,
     *            excerpt: string, siteName: string, wordCount: number}}
     *   Page analysis result.
     */
    analyzePage() {
      try {
        // Check whether Readability is available.
        if (typeof Readability === "undefined") {
          console.warn(
            "Readability not loaded, falling back to basic extraction"
          );
          return this.fallbackAnalysis();
        }

        // Parse a deep copy of the document so the original DOM is not modified.
        const documentClone = document.cloneNode(true);
        this.readability = new Readability(documentClone, {
          debug: false,
          charThreshold: 20,
        });

        const article = this.readability.parse();
        if (!article) {
          // parse() produced no article object; handle that explicitly
          // instead of tripping a TypeError on `article.title` below.
          console.warn("Readability returned no article, using fallback");
          return this.fallbackAnalysis();
        }

        return {
          title: article.title || document.title,
          url: window.location.href,
          mainContent: article.textContent || article.excerpt || "",
          excerpt: article.excerpt || "",
          siteName: article.siteName || new URL(window.location.href).hostname,
          // NOTE(review): this is Readability's `length` field, which its
          // documentation describes as a character count rather than a word
          // count - confirm what callers expect.
          wordCount: article.length || 0,
        };
      } catch (error) {
        console.warn("Readability failed, using fallback:", error);
        return this.fallbackAnalysis();
      }
    }

    /**
     * Basic extraction used when Readability cannot be used.
     *
     * @returns {{title: string, url: string, mainContent: string,
     *            excerpt: string, siteName: string, wordCount: number}}
     *   Same shape as analyzePage(); `excerpt` is always "" and `wordCount`
     *   is always 0 here.
     */
    fallbackAnalysis() {
      return {
        title: document.title,
        url: window.location.href,
        mainContent: this.extractMainContent(),
        excerpt: "",
        siteName: new URL(window.location.href).hostname,
        wordCount: 0,
      };
    }

    /**
     * Basic content extraction.
     *
     * If the page contains a <form>, the markup (outerHTML) of the first one
     * is returned; otherwise the text content of <body> is returned. No
     * elements are stripped, so text inside <script>/<style> elements is part
     * of the body text. The result is trimmed and every whitespace run is
     * collapsed to a single space.
     *
     * @returns {string} Normalized text, or "" when the page has neither a
     *   form nor a body.
     */
    extractMainContent() {
      const form = document.querySelector("form");
      const body = document.body;

      let rawText = "";
      if (form) {
        rawText = form.outerHTML;
      } else if (body) {
        // Reading textContent does not mutate the DOM, so no clone is needed.
        rawText = body.textContent || "";
      }

      return rawText.trim().replace(/\s+/g, " ");
    }
  };
  // Create the shared global instance: constructs a window.PageAnalyzer and
  // stores it on `window` as `pageAnalyzer`.
  window.pageAnalyzer = new window.PageAnalyzer();