test_chunkers.py 3.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354
  1. from embedchain.chunkers.docs_site import DocsSiteChunker
  2. from embedchain.chunkers.docx_file import DocxFileChunker
  3. from embedchain.chunkers.gmail import GmailChunker
  4. from embedchain.chunkers.json import JSONChunker
  5. from embedchain.chunkers.mdx import MdxChunker
  6. from embedchain.chunkers.notion import NotionChunker
  7. from embedchain.chunkers.openapi import OpenAPIChunker
  8. from embedchain.chunkers.pdf_file import PdfFileChunker
  9. from embedchain.chunkers.postgres import PostgresChunker
  10. from embedchain.chunkers.qna_pair import QnaPairChunker
  11. from embedchain.chunkers.sitemap import SitemapChunker
  12. from embedchain.chunkers.table import TableChunker
  13. from embedchain.chunkers.text import TextChunker
  14. from embedchain.chunkers.web_page import WebPageChunker
  15. from embedchain.chunkers.xml import XmlChunker
  16. from embedchain.chunkers.youtube_video import YoutubeVideoChunker
  17. from embedchain.config.add_config import ChunkerConfig
  18. chunker_config = ChunkerConfig(chunk_size=500, chunk_overlap=0, length_function=len)
  19. chunker_common_config = {
  20. DocsSiteChunker: {"chunk_size": 500, "chunk_overlap": 50, "length_function": len},
  21. DocxFileChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  22. PdfFileChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  23. TextChunker: {"chunk_size": 300, "chunk_overlap": 0, "length_function": len},
  24. MdxChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  25. NotionChunker: {"chunk_size": 300, "chunk_overlap": 0, "length_function": len},
  26. QnaPairChunker: {"chunk_size": 300, "chunk_overlap": 0, "length_function": len},
  27. TableChunker: {"chunk_size": 300, "chunk_overlap": 0, "length_function": len},
  28. SitemapChunker: {"chunk_size": 500, "chunk_overlap": 0, "length_function": len},
  29. WebPageChunker: {"chunk_size": 500, "chunk_overlap": 0, "length_function": len},
  30. XmlChunker: {"chunk_size": 500, "chunk_overlap": 50, "length_function": len},
  31. YoutubeVideoChunker: {"chunk_size": 2000, "chunk_overlap": 0, "length_function": len},
  32. JSONChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  33. OpenAPIChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  34. GmailChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  35. PostgresChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  36. }
  37. def test_default_config_values():
  38. for chunker_class, config in chunker_common_config.items():
  39. chunker = chunker_class()
  40. assert chunker.text_splitter._chunk_size == config["chunk_size"]
  41. assert chunker.text_splitter._chunk_overlap == config["chunk_overlap"]
  42. assert chunker.text_splitter._length_function == config["length_function"]
  43. def test_custom_config_values():
  44. for chunker_class, _ in chunker_common_config.items():
  45. chunker = chunker_class(config=chunker_config)
  46. assert chunker.text_splitter._chunk_size == 500
  47. assert chunker.text_splitter._chunk_overlap == 0
  48. assert chunker.text_splitter._length_function == len