PdfFile.ts 1.5 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758
  1. import type { TextContent } from 'pdfjs-dist/types/src/display/api';
  2. import type { LoaderResult, Metadata } from '../models';
  3. import { cleanString } from '../utils';
  4. import { BaseLoader } from './BaseLoader';
  5. const pdfjsLib = require('pdfjs-dist');
  /**
   * Text extracted from a single PDF page.
   */
  interface Page {
    /** All text items of the page joined into one string. */
    page_content: string;
  }
  /**
   * Loader that extracts the text of a PDF document and returns one result
   * entry per page.
   */
  class PdfFileLoader extends BaseLoader {
    /**
     * Opens the PDF at `url` with pdf.js and collects the text of every page.
     *
     * Pages are fetched concurrently; the returned array keeps page order
     * because `Promise.all` preserves the order of its input.
     *
     * @param url - Location of the PDF, passed unchanged to `getDocument`.
     * @returns One `Page` per page of the document, in document order.
     * @throws Whatever pdf.js rejects with when the document or one of its
     * pages cannot be loaded.
     */
    static async getPagesFromPdf(url: string): Promise<Page[]> {
      const loadingTask = pdfjsLib.getDocument(url);
      try {
        const pdf = await loadingTask.promise;
        const pagePromises = Array.from(
          { length: pdf.numPages },
          async (_, index): Promise<Page> => {
            // The callback index starts at 0, pages are requested starting at 1.
            const page = await pdf.getPage(index + 1);
            const textContent: TextContent = await page.getTextContent();
            // Items without a `str` property contribute an empty string.
            const pageContent = textContent.items
              .map((item) => ('str' in item ? item.str : ''))
              .join(' ');
            return { page_content: pageContent };
          },
        );
        return await Promise.all(pagePromises);
      } finally {
        // Release the document and its worker on success and on failure;
        // only plain strings are returned, so nothing is needed afterwards.
        await loadingTask.destroy();
      }
    }

    /**
     * Loads the PDF at `url` and converts each page into a loader result
     * entry holding the cleaned page text and the source URL as metadata.
     *
     * @param url - Location of the PDF to load.
     * @returns One entry per page, in document order.
     * @throws Error with message `No data found` when the document yields no
     * pages.
     */
    // eslint-disable-next-line class-methods-use-this
    async loadData(url: string): Promise<LoaderResult> {
      const pages = await PdfFileLoader.getPagesFromPdf(url);
      if (!pages.length) {
        throw new Error('No data found');
      }
      return pages.map((page) => {
        // Each entry gets its own metadata object, as before.
        const metaData: Metadata = { url };
        return { content: cleanString(page.page_content), metaData };
      });
    }
  }
  47. export { PdfFileLoader };