test_chunkers.py 2.9 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152
  1. from embedchain.chunkers.docs_site import DocsSiteChunker
  2. from embedchain.chunkers.docx_file import DocxFileChunker
  3. from embedchain.chunkers.gmail import GmailChunker
  4. from embedchain.chunkers.json import JSONChunker
  5. from embedchain.chunkers.mdx import MdxChunker
  6. from embedchain.chunkers.notion import NotionChunker
  7. from embedchain.chunkers.openapi import OpenAPIChunker
  8. from embedchain.chunkers.pdf_file import PdfFileChunker
  9. from embedchain.chunkers.qna_pair import QnaPairChunker
  10. from embedchain.chunkers.sitemap import SitemapChunker
  11. from embedchain.chunkers.table import TableChunker
  12. from embedchain.chunkers.text import TextChunker
  13. from embedchain.chunkers.web_page import WebPageChunker
  14. from embedchain.chunkers.xml import XmlChunker
  15. from embedchain.chunkers.youtube_video import YoutubeVideoChunker
  16. from embedchain.config.add_config import ChunkerConfig
  17. chunker_config = ChunkerConfig(chunk_size=500, chunk_overlap=0, length_function=len)
  18. chunker_common_config = {
  19. DocsSiteChunker: {"chunk_size": 500, "chunk_overlap": 50, "length_function": len},
  20. DocxFileChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  21. PdfFileChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  22. TextChunker: {"chunk_size": 300, "chunk_overlap": 0, "length_function": len},
  23. MdxChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  24. NotionChunker: {"chunk_size": 300, "chunk_overlap": 0, "length_function": len},
  25. QnaPairChunker: {"chunk_size": 300, "chunk_overlap": 0, "length_function": len},
  26. TableChunker: {"chunk_size": 300, "chunk_overlap": 0, "length_function": len},
  27. SitemapChunker: {"chunk_size": 500, "chunk_overlap": 0, "length_function": len},
  28. WebPageChunker: {"chunk_size": 500, "chunk_overlap": 0, "length_function": len},
  29. XmlChunker: {"chunk_size": 500, "chunk_overlap": 50, "length_function": len},
  30. YoutubeVideoChunker: {"chunk_size": 2000, "chunk_overlap": 0, "length_function": len},
  31. JSONChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  32. OpenAPIChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  33. GmailChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  34. }
  35. def test_default_config_values():
  36. for chunker_class, config in chunker_common_config.items():
  37. chunker = chunker_class()
  38. assert chunker.text_splitter._chunk_size == config["chunk_size"]
  39. assert chunker.text_splitter._chunk_overlap == config["chunk_overlap"]
  40. assert chunker.text_splitter._length_function == config["length_function"]
  41. def test_custom_config_values():
  42. for chunker_class, _ in chunker_common_config.items():
  43. chunker = chunker_class(config=chunker_config)
  44. assert chunker.text_splitter._chunk_size == 500
  45. assert chunker.text_splitter._chunk_overlap == 0
  46. assert chunker.text_splitter._length_function == len