test_text.py 2.7 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980
  1. # ruff: noqa: E501
  2. import unittest
  3. from embedchain.chunkers.text import TextChunker
  4. from embedchain.config import ChunkerConfig
  5. from embedchain.models.data_type import DataType
  6. class TestTextChunker(unittest.TestCase):
  7. def test_chunks(self):
  8. """
  9. Test the chunks generated by TextChunker.
  10. # TODO: Not a very precise test.
  11. """
  12. chunker_config = ChunkerConfig(chunk_size=10, chunk_overlap=0, length_function=len)
  13. chunker = TextChunker(config=chunker_config)
  14. text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
  15. # Data type must be set manually in the test
  16. chunker.set_data_type(DataType.TEXT)
  17. result = chunker.create_chunks(MockLoader(), text)
  18. documents = result["documents"]
  19. self.assertGreaterEqual(len(documents), 5)
  20. # Additional test cases can be added to cover different scenarios
  21. def test_big_chunksize(self):
  22. """
  23. Test that if an infinitely high chunk size is used, only one chunk is returned.
  24. """
  25. chunker_config = ChunkerConfig(chunk_size=9999999999, chunk_overlap=0, length_function=len)
  26. chunker = TextChunker(config=chunker_config)
  27. text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
  28. # Data type must be set manually in the test
  29. chunker.set_data_type(DataType.TEXT)
  30. result = chunker.create_chunks(MockLoader(), text)
  31. documents = result["documents"]
  32. self.assertEqual(len(documents), 1)
  33. def test_small_chunksize(self):
  34. """
  35. Test that if a chunk size of one is used, every character is a chunk.
  36. """
  37. chunker_config = ChunkerConfig(chunk_size=1, chunk_overlap=0, length_function=len)
  38. chunker = TextChunker(config=chunker_config)
  39. # We can't test with lorem ipsum because chunks are deduped, so would be recurring characters.
  40. text = """0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c"""
  41. # Data type must be set manually in the test
  42. chunker.set_data_type(DataType.TEXT)
  43. result = chunker.create_chunks(MockLoader(), text)
  44. documents = result["documents"]
  45. print(documents)
  46. self.assertEqual(len(documents), len(text))
  47. class MockLoader:
  48. def load_data(self, src):
  49. """
  50. Mock loader that returns a list of data dictionaries.
  51. Adjust this method to return different data for testing.
  52. """
  53. return {
  54. "doc_id": "123",
  55. "data": [
  56. {
  57. "content": src,
  58. "meta_data": {"url": "none"},
  59. }
  60. ],
  61. }