test_chunkers.py 3.3 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758
  1. from embedchain.chunkers.discourse import DiscourseChunker
  2. from embedchain.chunkers.docs_site import DocsSiteChunker
  3. from embedchain.chunkers.docx_file import DocxFileChunker
  4. from embedchain.chunkers.gmail import GmailChunker
  5. from embedchain.chunkers.json import JSONChunker
  6. from embedchain.chunkers.mdx import MdxChunker
  7. from embedchain.chunkers.notion import NotionChunker
  8. from embedchain.chunkers.openapi import OpenAPIChunker
  9. from embedchain.chunkers.pdf_file import PdfFileChunker
  10. from embedchain.chunkers.postgres import PostgresChunker
  11. from embedchain.chunkers.qna_pair import QnaPairChunker
  12. from embedchain.chunkers.sitemap import SitemapChunker
  13. from embedchain.chunkers.slack import SlackChunker
  14. from embedchain.chunkers.table import TableChunker
  15. from embedchain.chunkers.text import TextChunker
  16. from embedchain.chunkers.web_page import WebPageChunker
  17. from embedchain.chunkers.xml import XmlChunker
  18. from embedchain.chunkers.youtube_video import YoutubeVideoChunker
  19. from embedchain.config.add_config import ChunkerConfig
  20. chunker_config = ChunkerConfig(chunk_size=500, chunk_overlap=0, length_function=len)
  21. chunker_common_config = {
  22. DocsSiteChunker: {"chunk_size": 500, "chunk_overlap": 50, "length_function": len},
  23. DocxFileChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  24. PdfFileChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  25. TextChunker: {"chunk_size": 300, "chunk_overlap": 0, "length_function": len},
  26. MdxChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  27. NotionChunker: {"chunk_size": 300, "chunk_overlap": 0, "length_function": len},
  28. QnaPairChunker: {"chunk_size": 300, "chunk_overlap": 0, "length_function": len},
  29. TableChunker: {"chunk_size": 300, "chunk_overlap": 0, "length_function": len},
  30. SitemapChunker: {"chunk_size": 500, "chunk_overlap": 0, "length_function": len},
  31. WebPageChunker: {"chunk_size": 500, "chunk_overlap": 0, "length_function": len},
  32. XmlChunker: {"chunk_size": 500, "chunk_overlap": 50, "length_function": len},
  33. YoutubeVideoChunker: {"chunk_size": 2000, "chunk_overlap": 0, "length_function": len},
  34. JSONChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  35. OpenAPIChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  36. GmailChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  37. PostgresChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  38. SlackChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  39. DiscourseChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  40. }
  41. def test_default_config_values():
  42. for chunker_class, config in chunker_common_config.items():
  43. chunker = chunker_class()
  44. assert chunker.text_splitter._chunk_size == config["chunk_size"]
  45. assert chunker.text_splitter._chunk_overlap == config["chunk_overlap"]
  46. assert chunker.text_splitter._length_function == config["length_function"]
  47. def test_custom_config_values():
  48. for chunker_class, _ in chunker_common_config.items():
  49. chunker = chunker_class(config=chunker_config)
  50. assert chunker.text_splitter._chunk_size == 500
  51. assert chunker.text_splitter._chunk_overlap == 0
  52. assert chunker.text_splitter._length_function == len