test_chunkers.py 2.7 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950
  1. from embedchain.chunkers.docs_site import DocsSiteChunker
  2. from embedchain.chunkers.docx_file import DocxFileChunker
  3. from embedchain.chunkers.json import JSONChunker
  4. from embedchain.chunkers.mdx import MdxChunker
  5. from embedchain.chunkers.notion import NotionChunker
  6. from embedchain.chunkers.openapi import OpenAPIChunker
  7. from embedchain.chunkers.pdf_file import PdfFileChunker
  8. from embedchain.chunkers.qna_pair import QnaPairChunker
  9. from embedchain.chunkers.sitemap import SitemapChunker
  10. from embedchain.chunkers.table import TableChunker
  11. from embedchain.chunkers.text import TextChunker
  12. from embedchain.chunkers.web_page import WebPageChunker
  13. from embedchain.chunkers.xml import XmlChunker
  14. from embedchain.chunkers.youtube_video import YoutubeVideoChunker
  15. from embedchain.config.add_config import ChunkerConfig
  def _splitter_defaults(chunk_size, chunk_overlap=0):
      """Build one expected-settings entry; the length function is always ``len``."""
      return {
          "chunk_size": chunk_size,
          "chunk_overlap": chunk_overlap,
          "length_function": len,
      }


  # Explicit configuration handed to every chunker in test_custom_config_values.
  chunker_config = ChunkerConfig(chunk_size=500, chunk_overlap=0, length_function=len)

  # Splitter settings each chunker class is expected to expose when it is
  # constructed without arguments (checked by test_default_config_values).
  chunker_common_config = {
      DocsSiteChunker: _splitter_defaults(500, chunk_overlap=50),
      DocxFileChunker: _splitter_defaults(1000),
      PdfFileChunker: _splitter_defaults(1000),
      TextChunker: _splitter_defaults(300),
      MdxChunker: _splitter_defaults(1000),
      NotionChunker: _splitter_defaults(300),
      QnaPairChunker: _splitter_defaults(300),
      TableChunker: _splitter_defaults(300),
      SitemapChunker: _splitter_defaults(500),
      WebPageChunker: _splitter_defaults(500),
      XmlChunker: _splitter_defaults(500, chunk_overlap=50),
      YoutubeVideoChunker: _splitter_defaults(2000),
      JSONChunker: _splitter_defaults(1000),
      OpenAPIChunker: _splitter_defaults(1000),
  }
  def test_default_config_values():
      """Check the splitter settings of every chunker built with no arguments.

      For each class listed in ``chunker_common_config``, an instance created
      without a config must expose the chunk size, chunk overlap and length
      function recorded for that class.
      """
      for cls, expected in chunker_common_config.items():
          splitter = cls().text_splitter
          assert splitter._chunk_size == expected["chunk_size"]
          assert splitter._chunk_overlap == expected["chunk_overlap"]
          assert splitter._length_function == expected["length_function"]
  def test_custom_config_values():
      """Check that an explicit config is reflected in every chunker's splitter.

      Each class listed in ``chunker_common_config`` is instantiated with the
      shared ``chunker_config`` (chunk_size=500, chunk_overlap=0,
      length_function=len); the resulting text splitter must carry exactly
      those values.
      """
      # Only the classes are needed here, so iterate the mapping's keys
      # directly instead of unpacking and discarding the values.
      for chunker_class in chunker_common_config:
          splitter = chunker_class(config=chunker_config).text_splitter
          # The messages name the class so a failure inside the loop is
          # immediately attributable to one chunker.
          label = chunker_class.__name__
          assert splitter._chunk_size == 500, f"{label}: unexpected chunk_size"
          assert splitter._chunk_overlap == 0, f"{label}: unexpected chunk_overlap"
          assert splitter._length_function == len, f"{label}: unexpected length_function"