test_text.py 2.3 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970
  1. # ruff: noqa: E501
  2. import unittest
  3. from embedchain.chunkers.text import TextChunker
  4. from embedchain.config import ChunkerConfig
  5. class TestTextChunker(unittest.TestCase):
  6. def test_chunks(self):
  7. """
  8. Test the chunks generated by TextChunker.
  9. # TODO: Not a very precise test.
  10. """
  11. chunker_config = ChunkerConfig(chunk_size=10, chunk_overlap=0, length_function=len)
  12. chunker = TextChunker(config=chunker_config)
  13. text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
  14. result = chunker.create_chunks(MockLoader(), text)
  15. documents = result["documents"]
  16. self.assertGreaterEqual(len(documents), 5)
  17. # Additional test cases can be added to cover different scenarios
  18. def test_big_chunksize(self):
  19. """
  20. Test that if an infinitely high chunk size is used, only one chunk is returned.
  21. """
  22. chunker_config = ChunkerConfig(chunk_size=9999999999, chunk_overlap=0, length_function=len)
  23. chunker = TextChunker(config=chunker_config)
  24. text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
  25. result = chunker.create_chunks(MockLoader(), text)
  26. documents = result["documents"]
  27. self.assertEqual(len(documents), 1)
  28. def test_small_chunksize(self):
  29. """
  30. Test that if a chunk size of one is used, every character is a chunk.
  31. """
  32. chunker_config = ChunkerConfig(chunk_size=1, chunk_overlap=0, length_function=len)
  33. chunker = TextChunker(config=chunker_config)
  34. # We can't test with lorem ipsum because chunks are deduped, so would be recurring characters.
  35. text = """0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c"""
  36. result = chunker.create_chunks(MockLoader(), text)
  37. documents = result["documents"]
  38. print(documents)
  39. self.assertEqual(len(documents), len(text))
  40. class MockLoader:
  41. def load_data(self, src):
  42. """
  43. Mock loader that returns a list of data dictionaries.
  44. Adjust this method to return different data for testing.
  45. """
  46. return [
  47. {
  48. "content": src,
  49. "meta_data": {"url": "none"},
  50. }
  51. ]