docx_file.py 601 B

123456789101112131415161718192021
  1. from typing import Optional
  2. from embedchain.chunkers.base_chunker import BaseChunker
  3. from embedchain.config.AddConfig import ChunkerConfig
  4. from langchain.text_splitter import RecursiveCharacterTextSplitter
  5. TEXT_SPLITTER_CHUNK_PARAMS = {
  6. "chunk_size": 1000,
  7. "chunk_overlap": 0,
  8. "length_function": len,
  9. }
  10. class DocxFileChunker(BaseChunker):
  11. def __init__(self, config: Optional[ChunkerConfig] = None):
  12. if config is None:
  13. config = TEXT_SPLITTER_CHUNK_PARAMS
  14. text_splitter = RecursiveCharacterTextSplitter(**config)
  15. super().__init__(text_splitter)