test_chunkers.py 2.5 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546
  1. from embedchain.chunkers.docs_site import DocsSiteChunker
  2. from embedchain.chunkers.docx_file import DocxFileChunker
  3. from embedchain.chunkers.mdx import MdxChunker
  4. from embedchain.chunkers.notion import NotionChunker
  5. from embedchain.chunkers.pdf_file import PdfFileChunker
  6. from embedchain.chunkers.qna_pair import QnaPairChunker
  7. from embedchain.chunkers.sitemap import SitemapChunker
  8. from embedchain.chunkers.table import TableChunker
  9. from embedchain.chunkers.text import TextChunker
  10. from embedchain.chunkers.web_page import WebPageChunker
  11. from embedchain.chunkers.xml import XmlChunker
  12. from embedchain.chunkers.youtube_video import YoutubeVideoChunker
  13. from embedchain.config.add_config import ChunkerConfig
  14. chunker_config = ChunkerConfig(chunk_size=500, chunk_overlap=0, length_function=len)
  15. chunker_common_config = {
  16. DocsSiteChunker: {"chunk_size": 500, "chunk_overlap": 50, "length_function": len},
  17. DocxFileChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  18. PdfFileChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  19. TextChunker: {"chunk_size": 300, "chunk_overlap": 0, "length_function": len},
  20. MdxChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  21. NotionChunker: {"chunk_size": 300, "chunk_overlap": 0, "length_function": len},
  22. QnaPairChunker: {"chunk_size": 300, "chunk_overlap": 0, "length_function": len},
  23. TableChunker: {"chunk_size": 300, "chunk_overlap": 0, "length_function": len},
  24. SitemapChunker: {"chunk_size": 500, "chunk_overlap": 0, "length_function": len},
  25. WebPageChunker: {"chunk_size": 500, "chunk_overlap": 0, "length_function": len},
  26. XmlChunker: {"chunk_size": 500, "chunk_overlap": 50, "length_function": len},
  27. YoutubeVideoChunker: {"chunk_size": 2000, "chunk_overlap": 0, "length_function": len},
  28. }
  29. def test_default_config_values():
  30. for chunker_class, config in chunker_common_config.items():
  31. chunker = chunker_class()
  32. assert chunker.text_splitter._chunk_size == config["chunk_size"]
  33. assert chunker.text_splitter._chunk_overlap == config["chunk_overlap"]
  34. assert chunker.text_splitter._length_function == config["length_function"]
  35. def test_custom_config_values():
  36. for chunker_class, _ in chunker_common_config.items():
  37. chunker = chunker_class(config=chunker_config)
  38. assert chunker.text_splitter._chunk_size == 500
  39. assert chunker.text_splitter._chunk_overlap == 0
  40. assert chunker.text_splitter._length_function == len