test_text.py 3.5 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586
  1. # ruff: noqa: E501
  2. from embedchain.chunkers.text import TextChunker
  3. from embedchain.config import ChunkerConfig
  4. from embedchain.models.data_type import DataType
  5. class TestTextChunker:
  6. def test_chunks_without_app_id(self):
  7. """
  8. Test the chunks generated by TextChunker.
  9. """
  10. chunker_config = ChunkerConfig(chunk_size=10, chunk_overlap=0, length_function=len, min_chunk_size=0)
  11. chunker = TextChunker(config=chunker_config)
  12. text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
  13. # Data type must be set manually in the test
  14. chunker.set_data_type(DataType.TEXT)
  15. result = chunker.create_chunks(MockLoader(), text, chunker_config)
  16. documents = result["documents"]
  17. assert len(documents) > 5
  18. def test_chunks_with_app_id(self):
  19. """
  20. Test the chunks generated by TextChunker with app_id
  21. """
  22. chunker_config = ChunkerConfig(chunk_size=10, chunk_overlap=0, length_function=len, min_chunk_size=0)
  23. chunker = TextChunker(config=chunker_config)
  24. text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
  25. chunker.set_data_type(DataType.TEXT)
  26. result = chunker.create_chunks(MockLoader(), text, chunker_config)
  27. documents = result["documents"]
  28. assert len(documents) > 5
  29. def test_big_chunksize(self):
  30. """
  31. Test that if an infinitely high chunk size is used, only one chunk is returned.
  32. """
  33. chunker_config = ChunkerConfig(chunk_size=9999999999, chunk_overlap=0, length_function=len, min_chunk_size=0)
  34. chunker = TextChunker(config=chunker_config)
  35. text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
  36. # Data type must be set manually in the test
  37. chunker.set_data_type(DataType.TEXT)
  38. result = chunker.create_chunks(MockLoader(), text, chunker_config)
  39. documents = result["documents"]
  40. assert len(documents) == 1
  41. def test_small_chunksize(self):
  42. """
  43. Test that if a chunk size of one is used, every character is a chunk.
  44. """
  45. chunker_config = ChunkerConfig(chunk_size=1, chunk_overlap=0, length_function=len, min_chunk_size=0)
  46. chunker = TextChunker(config=chunker_config)
  47. # We can't test with lorem ipsum because chunks are deduped, so would be recurring characters.
  48. text = """0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c"""
  49. # Data type must be set manually in the test
  50. chunker.set_data_type(DataType.TEXT)
  51. result = chunker.create_chunks(MockLoader(), text, chunker_config)
  52. documents = result["documents"]
  53. assert len(documents) == len(text)
  54. def test_word_count(self):
  55. chunker_config = ChunkerConfig(chunk_size=1, chunk_overlap=0, length_function=len, min_chunk_size=0)
  56. chunker = TextChunker(config=chunker_config)
  57. chunker.set_data_type(DataType.TEXT)
  58. document = ["ab cd", "ef gh"]
  59. result = chunker.get_word_count(document)
  60. assert result == 4
  61. class MockLoader:
  62. @staticmethod
  63. def load_data(src) -> dict:
  64. """
  65. Mock loader that returns a list of data dictionaries.
  66. Adjust this method to return different data for testing.
  67. """
  68. return {
  69. "doc_id": "123",
  70. "data": [
  71. {
  72. "content": src,
  73. "meta_data": {"url": "none"},
  74. }
  75. ],
  76. }