|
@@ -5,18 +5,16 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
|
from embedchain.chunkers.base_chunker import BaseChunker
|
|
|
from embedchain.config.AddConfig import ChunkerConfig
|
|
|
|
|
|
-TEXT_SPLITTER_CHUNK_PARAMS = {
|
|
|
- "chunk_size": 500,
|
|
|
- "chunk_overlap": 50,
|
|
|
- "length_function": len,
|
|
|
-}
|
|
|
-
|
|
|
|
|
|
class DocsSiteChunker(BaseChunker):
|
|
|
"""Chunker for code docs site."""
|
|
|
|
|
|
def __init__(self, config: Optional[ChunkerConfig] = None):
|
|
|
if config is None:
|
|
|
- config = TEXT_SPLITTER_CHUNK_PARAMS
|
|
|
- text_splitter = RecursiveCharacterTextSplitter(**config)
|
|
|
+ config = ChunkerConfig(chunk_size=500, chunk_overlap=50, length_function=len)
|
|
|
+ text_splitter = RecursiveCharacterTextSplitter(
|
|
|
+ chunk_size=config.chunk_size,
|
|
|
+ chunk_overlap=config.chunk_overlap,
|
|
|
+ length_function=config.length_function,
|
|
|
+ )
|
|
|
super().__init__(text_splitter)
|