test_chunkers.py 3.7 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364
  1. from embedchain.chunkers.common_chunker import CommonChunker
  2. from embedchain.chunkers.discourse import DiscourseChunker
  3. from embedchain.chunkers.docs_site import DocsSiteChunker
  4. from embedchain.chunkers.docx_file import DocxFileChunker
  5. from embedchain.chunkers.excel_file import ExcelFileChunker
  6. from embedchain.chunkers.gmail import GmailChunker
  7. from embedchain.chunkers.google_drive import GoogleDriveChunker
  8. from embedchain.chunkers.json import JSONChunker
  9. from embedchain.chunkers.mdx import MdxChunker
  10. from embedchain.chunkers.notion import NotionChunker
  11. from embedchain.chunkers.openapi import OpenAPIChunker
  12. from embedchain.chunkers.pdf_file import PdfFileChunker
  13. from embedchain.chunkers.postgres import PostgresChunker
  14. from embedchain.chunkers.qna_pair import QnaPairChunker
  15. from embedchain.chunkers.sitemap import SitemapChunker
  16. from embedchain.chunkers.slack import SlackChunker
  17. from embedchain.chunkers.table import TableChunker
  18. from embedchain.chunkers.text import TextChunker
  19. from embedchain.chunkers.web_page import WebPageChunker
  20. from embedchain.chunkers.xml import XmlChunker
  21. from embedchain.chunkers.youtube_video import YoutubeVideoChunker
  22. from embedchain.config.add_config import ChunkerConfig
  23. chunker_config = ChunkerConfig(chunk_size=500, chunk_overlap=0, length_function=len)
  24. chunker_common_config = {
  25. DocsSiteChunker: {"chunk_size": 500, "chunk_overlap": 50, "length_function": len},
  26. DocxFileChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  27. PdfFileChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  28. TextChunker: {"chunk_size": 300, "chunk_overlap": 0, "length_function": len},
  29. MdxChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  30. NotionChunker: {"chunk_size": 300, "chunk_overlap": 0, "length_function": len},
  31. QnaPairChunker: {"chunk_size": 300, "chunk_overlap": 0, "length_function": len},
  32. TableChunker: {"chunk_size": 300, "chunk_overlap": 0, "length_function": len},
  33. SitemapChunker: {"chunk_size": 500, "chunk_overlap": 0, "length_function": len},
  34. WebPageChunker: {"chunk_size": 2000, "chunk_overlap": 0, "length_function": len},
  35. XmlChunker: {"chunk_size": 500, "chunk_overlap": 50, "length_function": len},
  36. YoutubeVideoChunker: {"chunk_size": 2000, "chunk_overlap": 0, "length_function": len},
  37. JSONChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  38. OpenAPIChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  39. GmailChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  40. PostgresChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  41. SlackChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  42. DiscourseChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  43. CommonChunker: {"chunk_size": 2000, "chunk_overlap": 0, "length_function": len},
  44. GoogleDriveChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  45. ExcelFileChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  46. }
  47. def test_default_config_values():
  48. for chunker_class, config in chunker_common_config.items():
  49. chunker = chunker_class()
  50. assert chunker.text_splitter._chunk_size == config["chunk_size"]
  51. assert chunker.text_splitter._chunk_overlap == config["chunk_overlap"]
  52. assert chunker.text_splitter._length_function == config["length_function"]
  53. def test_custom_config_values():
  54. for chunker_class, _ in chunker_common_config.items():
  55. chunker = chunker_class(config=chunker_config)
  56. assert chunker.text_splitter._chunk_size == 500
  57. assert chunker.text_splitter._chunk_overlap == 0
  58. assert chunker.text_splitter._length_function == len