test_chunkers.py 2.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748
  1. from embedchain.chunkers.docs_site import DocsSiteChunker
  2. from embedchain.chunkers.docx_file import DocxFileChunker
  3. from embedchain.chunkers.json import JSONChunker
  4. from embedchain.chunkers.mdx import MdxChunker
  5. from embedchain.chunkers.notion import NotionChunker
  6. from embedchain.chunkers.pdf_file import PdfFileChunker
  7. from embedchain.chunkers.qna_pair import QnaPairChunker
  8. from embedchain.chunkers.sitemap import SitemapChunker
  9. from embedchain.chunkers.table import TableChunker
  10. from embedchain.chunkers.text import TextChunker
  11. from embedchain.chunkers.web_page import WebPageChunker
  12. from embedchain.chunkers.xml import XmlChunker
  13. from embedchain.chunkers.youtube_video import YoutubeVideoChunker
  14. from embedchain.config.add_config import ChunkerConfig
  15. chunker_config = ChunkerConfig(chunk_size=500, chunk_overlap=0, length_function=len)
  16. chunker_common_config = {
  17. DocsSiteChunker: {"chunk_size": 500, "chunk_overlap": 50, "length_function": len},
  18. DocxFileChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  19. PdfFileChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  20. TextChunker: {"chunk_size": 300, "chunk_overlap": 0, "length_function": len},
  21. MdxChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  22. NotionChunker: {"chunk_size": 300, "chunk_overlap": 0, "length_function": len},
  23. QnaPairChunker: {"chunk_size": 300, "chunk_overlap": 0, "length_function": len},
  24. TableChunker: {"chunk_size": 300, "chunk_overlap": 0, "length_function": len},
  25. SitemapChunker: {"chunk_size": 500, "chunk_overlap": 0, "length_function": len},
  26. WebPageChunker: {"chunk_size": 500, "chunk_overlap": 0, "length_function": len},
  27. XmlChunker: {"chunk_size": 500, "chunk_overlap": 50, "length_function": len},
  28. YoutubeVideoChunker: {"chunk_size": 2000, "chunk_overlap": 0, "length_function": len},
  29. JSONChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  30. }
  31. def test_default_config_values():
  32. for chunker_class, config in chunker_common_config.items():
  33. chunker = chunker_class()
  34. assert chunker.text_splitter._chunk_size == config["chunk_size"]
  35. assert chunker.text_splitter._chunk_overlap == config["chunk_overlap"]
  36. assert chunker.text_splitter._length_function == config["length_function"]
  37. def test_custom_config_values():
  38. for chunker_class, _ in chunker_common_config.items():
  39. chunker = chunker_class(config=chunker_config)
  40. assert chunker.text_splitter._chunk_size == 500
  41. assert chunker.text_splitter._chunk_overlap == 0
  42. assert chunker.text_splitter._length_function == len