WebPage.ts 1.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051
  1. import axios from 'axios';
  2. import { JSDOM } from 'jsdom';
  3. import { cleanString } from '../utils';
  4. import { BaseLoader } from './BaseLoader';
  /**
   * Loader that downloads a web page and extracts its text content.
   *
   * The page is fetched with `axios`, parsed with `JSDOM`, and the text of
   * boilerplate elements (navigation, scripts, styles, ...) is blanked out
   * before the body text is read and passed through `cleanString`.
   */
  class WebPageLoader extends BaseLoader {
    /**
     * Fetches `url` and returns the text of the page body.
     *
     * @param url - Address of the page to download.
     * @returns A single-element array holding the cleaned text (`content`)
     *   and the originating `url` (`metaData`).
     * @throws Error when the body has no text left once the unwanted
     *   elements have been blanked (empty or whitespace-only). Errors raised
     *   by the HTTP request itself are not caught here and propagate.
     */
    // eslint-disable-next-line class-methods-use-this
    async loadData(url: string) {
      const response = await axios.get(url);
      const dom = new JSDOM(response.data);
      const { document } = dom.window;

      // Elements whose text is boilerplate rather than page content.
      const unwantedTags = [
        'nav',
        'aside',
        'form',
        'header',
        'noscript',
        'svg',
        'canvas',
        'footer',
        'script',
        'style',
      ];

      unwantedTags.forEach((tagName) => {
        // Array.from snapshots the live collection so that mutating the
        // document while iterating cannot skip elements.
        Array.from(document.getElementsByTagName(tagName)).forEach((element) => {
          // Replace with a single space (not '') so the text on either side
          // of the element does not get glued together.
          // eslint-disable-next-line no-param-reassign
          element.textContent = ' ';
        });
      });

      const rawText = document.body.textContent;
      // The blanking above leaves a ' ' per stripped element, so a page made
      // only of boilerplate yields a truthy, whitespace-only string. Treat
      // that as empty too instead of returning a blank document.
      if (!rawText || rawText.trim() === '') {
        throw new Error('Web page content is empty.');
      }

      const content = cleanString(rawText);
      return [
        {
          content,
          metaData: { url },
        },
      ];
    }
  }

  export { WebPageLoader };