# test_chunkers.py (3.4 KB)
  1. from embedchain.chunkers.common_chunker import CommonChunker
  2. from embedchain.chunkers.discourse import DiscourseChunker
  3. from embedchain.chunkers.docs_site import DocsSiteChunker
  4. from embedchain.chunkers.docx_file import DocxFileChunker
  5. from embedchain.chunkers.gmail import GmailChunker
  6. from embedchain.chunkers.json import JSONChunker
  7. from embedchain.chunkers.mdx import MdxChunker
  8. from embedchain.chunkers.notion import NotionChunker
  9. from embedchain.chunkers.openapi import OpenAPIChunker
  10. from embedchain.chunkers.pdf_file import PdfFileChunker
  11. from embedchain.chunkers.postgres import PostgresChunker
  12. from embedchain.chunkers.qna_pair import QnaPairChunker
  13. from embedchain.chunkers.sitemap import SitemapChunker
  14. from embedchain.chunkers.slack import SlackChunker
  15. from embedchain.chunkers.table import TableChunker
  16. from embedchain.chunkers.text import TextChunker
  17. from embedchain.chunkers.web_page import WebPageChunker
  18. from embedchain.chunkers.xml import XmlChunker
  19. from embedchain.chunkers.youtube_video import YoutubeVideoChunker
  20. from embedchain.config.add_config import ChunkerConfig
  21. chunker_config = ChunkerConfig(chunk_size=500, chunk_overlap=0, length_function=len)
  22. chunker_common_config = {
  23. DocsSiteChunker: {"chunk_size": 500, "chunk_overlap": 50, "length_function": len},
  24. DocxFileChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  25. PdfFileChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  26. TextChunker: {"chunk_size": 300, "chunk_overlap": 0, "length_function": len},
  27. MdxChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  28. NotionChunker: {"chunk_size": 300, "chunk_overlap": 0, "length_function": len},
  29. QnaPairChunker: {"chunk_size": 300, "chunk_overlap": 0, "length_function": len},
  30. TableChunker: {"chunk_size": 300, "chunk_overlap": 0, "length_function": len},
  31. SitemapChunker: {"chunk_size": 500, "chunk_overlap": 0, "length_function": len},
  32. WebPageChunker: {"chunk_size": 2000, "chunk_overlap": 0, "length_function": len},
  33. XmlChunker: {"chunk_size": 500, "chunk_overlap": 50, "length_function": len},
  34. YoutubeVideoChunker: {"chunk_size": 2000, "chunk_overlap": 0, "length_function": len},
  35. JSONChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  36. OpenAPIChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  37. GmailChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  38. PostgresChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  39. SlackChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  40. DiscourseChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  41. CommonChunker: {"chunk_size": 2000, "chunk_overlap": 0, "length_function": len},
  42. }
  43. def test_default_config_values():
  44. for chunker_class, config in chunker_common_config.items():
  45. chunker = chunker_class()
  46. assert chunker.text_splitter._chunk_size == config["chunk_size"]
  47. assert chunker.text_splitter._chunk_overlap == config["chunk_overlap"]
  48. assert chunker.text_splitter._length_function == config["length_function"]
  49. def test_custom_config_values():
  50. for chunker_class, _ in chunker_common_config.items():
  51. chunker = chunker_class(config=chunker_config)
  52. assert chunker.text_splitter._chunk_size == 500
  53. assert chunker.text_splitter._chunk_overlap == 0
  54. assert chunker.text_splitter._length_function == len