test_image_chunker.py 2.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172
  1. import unittest
  2. from embedchain.chunkers.images import ImagesChunker
  3. from embedchain.config import ChunkerConfig
  4. from embedchain.models.data_type import DataType
  5. class TestImageChunker(unittest.TestCase):
  6. def test_chunks(self):
  7. """
  8. Test the chunks generated by TextChunker.
  9. # TODO: Not a very precise test.
  10. """
  11. chunker_config = ChunkerConfig(chunk_size=1, chunk_overlap=0, length_function=len)
  12. chunker = ImagesChunker(config=chunker_config)
  13. # Data type must be set manually in the test
  14. chunker.set_data_type(DataType.IMAGES)
  15. image_path = "./tmp/image.jpeg"
  16. result = chunker.create_chunks(MockLoader(), image_path)
  17. expected_chunks = {'doc_id': '123',
  18. 'documents': [image_path],
  19. 'embeddings': ['embedding'],
  20. 'ids': ['140bedbf9c3f6d56a9846d2ba7088798683f4da0c248231336e6a05679e4fdfe'],
  21. 'metadatas': [{'data_type': 'images', 'doc_id': '123', 'url': 'none'}]}
  22. self.assertEqual(expected_chunks, result)
  23. def test_chunks_with_default_config(self):
  24. """
  25. Test the chunks generated by ImageChunker with default config.
  26. """
  27. chunker = ImagesChunker()
  28. # Data type must be set manually in the test
  29. chunker.set_data_type(DataType.IMAGES)
  30. image_path = "./tmp/image.jpeg"
  31. result = chunker.create_chunks(MockLoader(), image_path)
  32. expected_chunks = {'doc_id': '123',
  33. 'documents': [image_path],
  34. 'embeddings': ['embedding'],
  35. 'ids': ['140bedbf9c3f6d56a9846d2ba7088798683f4da0c248231336e6a05679e4fdfe'],
  36. 'metadatas': [{'data_type': 'images', 'doc_id': '123', 'url': 'none'}]}
  37. self.assertEqual(expected_chunks, result)
  38. def test_word_count(self):
  39. chunker_config = ChunkerConfig(chunk_size=1, chunk_overlap=0, length_function=len)
  40. chunker = ImagesChunker(config=chunker_config)
  41. chunker.set_data_type(DataType.IMAGES)
  42. document = [["ab cd", "ef gh"], ["ij kl", "mn op"]]
  43. result = chunker.get_word_count(document)
  44. self.assertEqual(result, 1)
  45. class MockLoader:
  46. def load_data(self, src):
  47. """
  48. Mock loader that returns a list of data dictionaries.
  49. Adjust this method to return different data for testing.
  50. """
  51. return {
  52. "doc_id": "123",
  53. "data": [
  54. {
  55. "content": src,
  56. "embedding": "embedding",
  57. "meta_data": {"url": "none"},
  58. }
  59. ],
  60. }