Selaa lähdekoodia

test: added chunker unit tests (#325)

cachho 2 vuotta sitten
vanhempi
commit
fdf5d1928d
1 muutettua tiedostoa jossa 31 lisäystä ja 0 poistoa
  1. 31 0
      tests/chunkers/test_text.py

+ 31 - 0
tests/chunkers/test_text.py

@@ -24,6 +24,37 @@ class TestTextChunker(unittest.TestCase):
 
     # Additional test cases can be added to cover different scenarios
 
+    def test_big_chunksize(self):
+        """
+        A chunk size far larger than the input must yield exactly one chunk.
+        """
+        sample = "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
+        # 9999999999 stands in for "unbounded": it dwarfs the sample's length.
+        huge_config = ChunkerConfig(chunk_size=9999999999, chunk_overlap=0, length_function=len)
+        text_chunker = TextChunker(config=huge_config)
+
+        produced = text_chunker.create_chunks(MockLoader(), sample)["documents"]
+
+        # Nothing forces a split, so the text should come back as one document.
+        self.assertEqual(len(produced), 1)
+
+    def test_small_chunksize(self):
+        """
+        Test that if a chunk size of one is used, every character is a chunk.
+
+        The input consists of unique characters only, so the expected count is len(text).
+        """
+        chunker_config = ChunkerConfig(chunk_size=1, chunk_overlap=0, length_function=len)
+        chunker = TextChunker(config=chunker_config)
+        # Lorem ipsum cannot be used here: chunks are deduplicated, so its
+        # recurring characters would collapse and lower the count.
+        text = """0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c"""
+
+        result = chunker.create_chunks(MockLoader(), text)
+        documents = result["documents"]
+
+        self.assertEqual(len(documents), len(text))
+
 
 class MockLoader:
     def load_data(self, src):