# Source listing: test_text.py (3.5 KB)
# ruff: noqa: E501
from embedchain.chunkers.text import TextChunker
from embedchain.config import ChunkerConfig
from embedchain.models.data_type import DataType
  5. class TestTextChunker:
  6. def test_chunks_without_app_id(self):
  7. """
  8. Test the chunks generated by TextChunker.
  9. """
  10. chunker_config = ChunkerConfig(chunk_size=10, chunk_overlap=0, length_function=len, min_chunk_size=0)
  11. chunker = TextChunker(config=chunker_config)
  12. text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
  13. # Data type must be set manually in the test
  14. chunker.set_data_type(DataType.TEXT)
  15. result = chunker.create_chunks(MockLoader(), text, chunker_config)
  16. documents = result["documents"]
  17. assert len(documents) > 5
  18. def test_chunks_with_app_id(self):
  19. """
  20. Test the chunks generated by TextChunker with app_id
  21. """
  22. chunker_config = ChunkerConfig(chunk_size=10, chunk_overlap=0, length_function=len, min_chunk_size=0)
  23. chunker = TextChunker(config=chunker_config)
  24. text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
  25. chunker.set_data_type(DataType.TEXT)
  26. result = chunker.create_chunks(MockLoader(), text, chunker_config)
  27. documents = result["documents"]
  28. assert len(documents) > 5
  29. def test_big_chunksize(self):
  30. """
  31. Test that if an infinitely high chunk size is used, only one chunk is returned.
  32. """
  33. chunker_config = ChunkerConfig(chunk_size=9999999999, chunk_overlap=0, length_function=len, min_chunk_size=0)
  34. chunker = TextChunker(config=chunker_config)
  35. text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
  36. # Data type must be set manually in the test
  37. chunker.set_data_type(DataType.TEXT)
  38. result = chunker.create_chunks(MockLoader(), text, chunker_config)
  39. documents = result["documents"]
  40. assert len(documents) == 1
  41. def test_small_chunksize(self):
  42. """
  43. Test that if a chunk size of one is used, every character is a chunk.
  44. """
  45. chunker_config = ChunkerConfig(chunk_size=1, chunk_overlap=0, length_function=len, min_chunk_size=0)
  46. chunker = TextChunker(config=chunker_config)
  47. # We can't test with lorem ipsum because chunks are deduped, so would be recurring characters.
  48. text = """0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c"""
  49. # Data type must be set manually in the test
  50. chunker.set_data_type(DataType.TEXT)
  51. result = chunker.create_chunks(MockLoader(), text, chunker_config)
  52. documents = result["documents"]
  53. assert len(documents) == len(text)
  54. def test_word_count(self):
  55. chunker_config = ChunkerConfig(chunk_size=1, chunk_overlap=0, length_function=len, min_chunk_size=0)
  56. chunker = TextChunker(config=chunker_config)
  57. chunker.set_data_type(DataType.TEXT)
  58. document = ["ab cd", "ef gh"]
  59. result = chunker.get_word_count(document)
  60. assert result == 4
  61. class MockLoader:
  62. def load_data(self, src):
  63. """
  64. Mock loader that returns a list of data dictionaries.
  65. Adjust this method to return different data for testing.
  66. """
  67. return {
  68. "doc_id": "123",
  69. "data": [
  70. {
  71. "content": src,
  72. "meta_data": {"url": "none"},
  73. }
  74. ],
  75. }