123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051 |
- import axios from 'axios';
- import { JSDOM } from 'jsdom';
- import { cleanString } from '../utils';
- import { BaseLoader } from './BaseLoader';
/**
 * Loader that downloads a web page and extracts the text of its `<body>`.
 */
class WebPageLoader extends BaseLoader {
  /**
   * Fetches `url`, blanks out elements that do not carry page content
   * (navigation, scripts, styles, ...) and returns the remaining body text
   * after passing it through `cleanString`.
   *
   * @param url - Address of the page to download; also echoed back in the metadata.
   * @returns A single-element array holding the cleaned text and `{ url }` metadata.
   * @throws Error when the body has no text, or only whitespace, once the
   *   unwanted elements have been blanked.
   */
  // eslint-disable-next-line class-methods-use-this
  async loadData(
    url: string,
  ): Promise<{ content: string; metaData: { url: string } }[]> {
    const response = await axios.get(url);
    const { document } = new JSDOM(response.data).window;

    const unwantedTags = [
      'nav',
      'aside',
      'form',
      'header',
      'noscript',
      'svg',
      'canvas',
      'footer',
      'script',
      'style',
    ];

    unwantedTags.forEach((tagName) => {
      // Array.from snapshots the live collection before its members are mutated.
      Array.from(document.getElementsByTagName(tagName)).forEach((element) => {
        // The element is overwritten with a single space rather than removed,
        // presumably so that the text on either side does not run together.
        // eslint-disable-next-line no-param-reassign
        element.textContent = ' ';
      });
    });

    const rawText = document.body.textContent;
    // Blanking above leaves a ' ' behind for every unwanted element, so a page
    // made up solely of such elements is whitespace-only rather than empty.
    // Treat that case as empty too, otherwise this guard could never fire for it.
    if (!rawText || !rawText.trim()) {
      throw new Error('Web page content is empty.');
    }

    return [
      {
        content: cleanString(rawText),
        metaData: { url },
      },
    ];
  }
}

export { WebPageLoader };
|