# test_chunkers.py

  1. from embedchain.chunkers.audio import AudioChunker
  2. from embedchain.chunkers.common_chunker import CommonChunker
  3. from embedchain.chunkers.discourse import DiscourseChunker
  4. from embedchain.chunkers.docs_site import DocsSiteChunker
  5. from embedchain.chunkers.docx_file import DocxFileChunker
  6. from embedchain.chunkers.excel_file import ExcelFileChunker
  7. from embedchain.chunkers.gmail import GmailChunker
  8. from embedchain.chunkers.google_drive import GoogleDriveChunker
  9. from embedchain.chunkers.json import JSONChunker
  10. from embedchain.chunkers.mdx import MdxChunker
  11. from embedchain.chunkers.notion import NotionChunker
  12. from embedchain.chunkers.openapi import OpenAPIChunker
  13. from embedchain.chunkers.pdf_file import PdfFileChunker
  14. from embedchain.chunkers.postgres import PostgresChunker
  15. from embedchain.chunkers.qna_pair import QnaPairChunker
  16. from embedchain.chunkers.sitemap import SitemapChunker
  17. from embedchain.chunkers.slack import SlackChunker
  18. from embedchain.chunkers.table import TableChunker
  19. from embedchain.chunkers.text import TextChunker
  20. from embedchain.chunkers.web_page import WebPageChunker
  21. from embedchain.chunkers.xml import XmlChunker
  22. from embedchain.chunkers.youtube_video import YoutubeVideoChunker
  23. from embedchain.config.add_config import ChunkerConfig
  24. chunker_config = ChunkerConfig(chunk_size=500, chunk_overlap=0, length_function=len)
  25. chunker_common_config = {
  26. DocsSiteChunker: {"chunk_size": 500, "chunk_overlap": 50, "length_function": len},
  27. DocxFileChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  28. PdfFileChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  29. TextChunker: {"chunk_size": 300, "chunk_overlap": 0, "length_function": len},
  30. MdxChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  31. NotionChunker: {"chunk_size": 300, "chunk_overlap": 0, "length_function": len},
  32. QnaPairChunker: {"chunk_size": 300, "chunk_overlap": 0, "length_function": len},
  33. TableChunker: {"chunk_size": 300, "chunk_overlap": 0, "length_function": len},
  34. SitemapChunker: {"chunk_size": 500, "chunk_overlap": 0, "length_function": len},
  35. WebPageChunker: {"chunk_size": 2000, "chunk_overlap": 0, "length_function": len},
  36. XmlChunker: {"chunk_size": 500, "chunk_overlap": 50, "length_function": len},
  37. YoutubeVideoChunker: {"chunk_size": 2000, "chunk_overlap": 0, "length_function": len},
  38. JSONChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  39. OpenAPIChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  40. GmailChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  41. PostgresChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  42. SlackChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  43. DiscourseChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  44. CommonChunker: {"chunk_size": 2000, "chunk_overlap": 0, "length_function": len},
  45. GoogleDriveChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  46. ExcelFileChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  47. AudioChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  48. }
  49. def test_default_config_values():
  50. for chunker_class, config in chunker_common_config.items():
  51. chunker = chunker_class()
  52. assert chunker.text_splitter._chunk_size == config["chunk_size"]
  53. assert chunker.text_splitter._chunk_overlap == config["chunk_overlap"]
  54. assert chunker.text_splitter._length_function == config["length_function"]
  55. def test_custom_config_values():
  56. for chunker_class, _ in chunker_common_config.items():
  57. chunker = chunker_class(config=chunker_config)
  58. assert chunker.text_splitter._chunk_size == 500
  59. assert chunker.text_splitter._chunk_overlap == 0
  60. assert chunker.text_splitter._length_function == len