# docs_site.py

  1. from typing import Optional
  2. from langchain.text_splitter import RecursiveCharacterTextSplitter
  3. from embedchain.chunkers.base_chunker import BaseChunker
  4. from embedchain.config.AddConfig import ChunkerConfig
  5. TEXT_SPLITTER_CHUNK_PARAMS = {
  6. "chunk_size": 500,
  7. "chunk_overlap": 50,
  8. "length_function": len,
  9. }
  10. class DocsSiteChunker(BaseChunker):
  11. """Chunker for code docs site."""
  12. def __init__(self, config: Optional[ChunkerConfig] = None):
  13. if config is None:
  14. config = TEXT_SPLITTER_CHUNK_PARAMS
  15. text_splitter = RecursiveCharacterTextSplitter(**config)
  16. super().__init__(text_splitter)