# Source: pdf_file.py
from typing import Optional

from langchain.text_splitter import RecursiveCharacterTextSplitter

from embedchain.chunkers.base_chunker import BaseChunker
from embedchain.config.AddConfig import ChunkerConfig
  5. TEXT_SPLITTER_CHUNK_PARAMS = {
  6. "chunk_size": 1000,
  7. "chunk_overlap": 0,
  8. "length_function": len,
  9. }
  10. class PdfFileChunker(BaseChunker):
  11. ''' Chunker for PDF file. '''
  12. def __init__(self, config: Optional[ChunkerConfig] = None):
  13. if config is None:
  14. config = TEXT_SPLITTER_CHUNK_PARAMS
  15. text_splitter = RecursiveCharacterTextSplitter(**config)
  16. super().__init__(text_splitter)