doc_file.py 421 B

12345678910111213141516
  1. from embedchain.chunkers.base_chunker import BaseChunker
  2. from langchain.text_splitter import RecursiveCharacterTextSplitter
  # Keyword arguments unpacked into RecursiveCharacterTextSplitter by
  # DocFileChunker.__init__: a chunk size of 500, an overlap of 0, and the
  # builtin ``len`` as the length function.
  TEXT_SPLITTER_CHUNK_PARAMS = {
      "chunk_size": 500,
      "chunk_overlap": 0,
      "length_function": len,
  }
  class DocFileChunker(BaseChunker):
      """Chunker that hands ``BaseChunker`` a ``RecursiveCharacterTextSplitter``.

      The splitter is configured from the module-level
      ``TEXT_SPLITTER_CHUNK_PARAMS`` mapping. Presumably intended for
      doc-file content, judging by the class name -- confirm against callers.
      """

      def __init__(self) -> None:
          """Build the text splitter and pass it to ``BaseChunker.__init__``."""
          text_splitter = RecursiveCharacterTextSplitter(**TEXT_SPLITTER_CHUNK_PARAMS)
          super().__init__(text_splitter)