test_text.py 3.0 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889
  1. # ruff: noqa: E501
  2. import unittest
  3. from embedchain.chunkers.text import TextChunker
  4. from embedchain.config import ChunkerConfig
  5. from embedchain.models.data_type import DataType
  6. class TestTextChunker(unittest.TestCase):
  7. def test_chunks(self):
  8. """
  9. Test the chunks generated by TextChunker.
  10. # TODO: Not a very precise test.
  11. """
  12. chunker_config = ChunkerConfig(chunk_size=10, chunk_overlap=0, length_function=len)
  13. chunker = TextChunker(config=chunker_config)
  14. text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
  15. # Data type must be set manually in the test
  16. chunker.set_data_type(DataType.TEXT)
  17. result = chunker.create_chunks(MockLoader(), text)
  18. documents = result["documents"]
  19. self.assertGreaterEqual(len(documents), 5)
  20. # Additional test cases can be added to cover different scenarios
  21. def test_big_chunksize(self):
  22. """
  23. Test that if an infinitely high chunk size is used, only one chunk is returned.
  24. """
  25. chunker_config = ChunkerConfig(chunk_size=9999999999, chunk_overlap=0, length_function=len)
  26. chunker = TextChunker(config=chunker_config)
  27. text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
  28. # Data type must be set manually in the test
  29. chunker.set_data_type(DataType.TEXT)
  30. result = chunker.create_chunks(MockLoader(), text)
  31. documents = result["documents"]
  32. self.assertEqual(len(documents), 1)
  33. def test_small_chunksize(self):
  34. """
  35. Test that if a chunk size of one is used, every character is a chunk.
  36. """
  37. chunker_config = ChunkerConfig(chunk_size=1, chunk_overlap=0, length_function=len)
  38. chunker = TextChunker(config=chunker_config)
  39. # We can't test with lorem ipsum because chunks are deduped, so would be recurring characters.
  40. text = """0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c"""
  41. # Data type must be set manually in the test
  42. chunker.set_data_type(DataType.TEXT)
  43. result = chunker.create_chunks(MockLoader(), text)
  44. documents = result["documents"]
  45. print(documents)
  46. self.assertEqual(len(documents), len(text))
  47. def test_word_count(self):
  48. chunker_config = ChunkerConfig(chunk_size=1, chunk_overlap=0, length_function=len)
  49. chunker = TextChunker(config=chunker_config)
  50. chunker.set_data_type(DataType.TEXT)
  51. document = ["ab cd", "ef gh"]
  52. result = chunker.get_word_count(document)
  53. self.assertEqual(result, 4)
  54. class MockLoader:
  55. def load_data(self, src):
  56. """
  57. Mock loader that returns a list of data dictionaries.
  58. Adjust this method to return different data for testing.
  59. """
  60. return {
  61. "doc_id": "123",
  62. "data": [
  63. {
  64. "content": src,
  65. "meta_data": {"url": "none"},
  66. }
  67. ],
  68. }