test_chunkers.py 3.6 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162
  1. from embedchain.chunkers.common_chunker import CommonChunker
  2. from embedchain.chunkers.discourse import DiscourseChunker
  3. from embedchain.chunkers.docs_site import DocsSiteChunker
  4. from embedchain.chunkers.docx_file import DocxFileChunker
  5. from embedchain.chunkers.gmail import GmailChunker
  6. from embedchain.chunkers.google_drive import GoogleDriveChunker
  7. from embedchain.chunkers.json import JSONChunker
  8. from embedchain.chunkers.mdx import MdxChunker
  9. from embedchain.chunkers.notion import NotionChunker
  10. from embedchain.chunkers.openapi import OpenAPIChunker
  11. from embedchain.chunkers.pdf_file import PdfFileChunker
  12. from embedchain.chunkers.postgres import PostgresChunker
  13. from embedchain.chunkers.qna_pair import QnaPairChunker
  14. from embedchain.chunkers.sitemap import SitemapChunker
  15. from embedchain.chunkers.slack import SlackChunker
  16. from embedchain.chunkers.table import TableChunker
  17. from embedchain.chunkers.text import TextChunker
  18. from embedchain.chunkers.web_page import WebPageChunker
  19. from embedchain.chunkers.xml import XmlChunker
  20. from embedchain.chunkers.youtube_video import YoutubeVideoChunker
  21. from embedchain.config.add_config import ChunkerConfig
  22. chunker_config = ChunkerConfig(chunk_size=500, chunk_overlap=0, length_function=len)
  23. chunker_common_config = {
  24. DocsSiteChunker: {"chunk_size": 500, "chunk_overlap": 50, "length_function": len},
  25. DocxFileChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  26. PdfFileChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  27. TextChunker: {"chunk_size": 300, "chunk_overlap": 0, "length_function": len},
  28. MdxChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  29. NotionChunker: {"chunk_size": 300, "chunk_overlap": 0, "length_function": len},
  30. QnaPairChunker: {"chunk_size": 300, "chunk_overlap": 0, "length_function": len},
  31. TableChunker: {"chunk_size": 300, "chunk_overlap": 0, "length_function": len},
  32. SitemapChunker: {"chunk_size": 500, "chunk_overlap": 0, "length_function": len},
  33. WebPageChunker: {"chunk_size": 2000, "chunk_overlap": 0, "length_function": len},
  34. XmlChunker: {"chunk_size": 500, "chunk_overlap": 50, "length_function": len},
  35. YoutubeVideoChunker: {"chunk_size": 2000, "chunk_overlap": 0, "length_function": len},
  36. JSONChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  37. OpenAPIChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  38. GmailChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  39. PostgresChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  40. SlackChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  41. DiscourseChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  42. CommonChunker: {"chunk_size": 2000, "chunk_overlap": 0, "length_function": len},
  43. GoogleDriveChunker: {"chunk_size": 1000, "chunk_overlap": 0, "length_function": len},
  44. }
  45. def test_default_config_values():
  46. for chunker_class, config in chunker_common_config.items():
  47. chunker = chunker_class()
  48. assert chunker.text_splitter._chunk_size == config["chunk_size"]
  49. assert chunker.text_splitter._chunk_overlap == config["chunk_overlap"]
  50. assert chunker.text_splitter._length_function == config["length_function"]
  51. def test_custom_config_values():
  52. for chunker_class, _ in chunker_common_config.items():
  53. chunker = chunker_class(config=chunker_config)
  54. assert chunker.text_splitter._chunk_size == 500
  55. assert chunker.text_splitter._chunk_overlap == 0
  56. assert chunker.text_splitter._length_function == len