BaseChunker.ts

import { createHash } from 'crypto';
import type { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
import type { BaseLoader } from '../loaders';
import type { Input, LoaderResult } from '../models';
import type { ChunkResult } from '../models/ChunkResult';

class BaseChunker {
    textSplitter: RecursiveCharacterTextSplitter;

    constructor(textSplitter: RecursiveCharacterTextSplitter) {
        this.textSplitter = textSplitter;
    }

    async createChunks(loader: BaseLoader, url: Input): Promise<ChunkResult> {
        const documents: ChunkResult['documents'] = [];
        const ids: ChunkResult['ids'] = [];
        const datas: LoaderResult = await loader.loadData(url);
        const metadatas: ChunkResult['metadatas'] = [];

        const dataPromises = datas.map(async (data) => {
            const { content, metaData } = data;

            // Split the loaded content into smaller text chunks.
            const chunks: string[] = await this.textSplitter.splitText(content);

            chunks.forEach((chunk) => {
                // Deterministic chunk id: SHA-256 of the chunk text plus its source URL,
                // so re-ingesting the same source yields the same ids.
                const chunkId = createHash('sha256')
                    .update(chunk + metaData.url)
                    .digest('hex');

                ids.push(chunkId);
                documents.push(chunk);
                metadatas.push(metaData);
            });
        });

        await Promise.all(dataPromises);

        // Parallel arrays: documents[i], ids[i] and metadatas[i] describe the same chunk.
        return {
            documents,
            ids,
            metadatas,
        };
    }
}

export { BaseChunker };
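
// --------------------------------------------------------------------------
// Usage sketch (not part of BaseChunker.ts): one way the class above might be
// wired up. The stub loader, the splitter options, and passing a plain URL
// string as `Input` are illustrative assumptions, not library guarantees.
// --------------------------------------------------------------------------
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
import type { BaseLoader } from '../loaders';
import type { Input } from '../models';
import { BaseChunker } from './BaseChunker';

// Hypothetical loader: anything whose loadData(url) resolves to entries with
// `content` and `metaData.url`, matching what createChunks destructures.
const stubLoader = {
    loadData: async (url: Input) => [
        { content: 'Some page text to be chunked...', metaData: { url: String(url) } },
    ],
} as unknown as BaseLoader;

const chunker = new BaseChunker(
    new RecursiveCharacterTextSplitter({ chunkSize: 500, chunkOverlap: 50 }),
);

// createChunks returns parallel arrays of chunk texts, SHA-256 ids and metadata.
chunker
    .createChunks(stubLoader, 'https://example.com' as unknown as Input)
    .then((result) => console.log(`${result.ids.length} chunks created`));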