# test_image_chunker.py
import unittest

from embedchain.chunkers.images import ImagesChunker
from embedchain.config import ChunkerConfig
from embedchain.models.data_type import DataType
  5. class TestImageChunker(unittest.TestCase):
  6. def test_chunks(self):
  7. """
  8. Test the chunks generated by TextChunker.
  9. # TODO: Not a very precise test.
  10. """
  11. chunker_config = ChunkerConfig(chunk_size=1, chunk_overlap=0, length_function=len, min_chunk_size=0)
  12. chunker = ImagesChunker(config=chunker_config)
  13. # Data type must be set manually in the test
  14. chunker.set_data_type(DataType.IMAGES)
  15. image_path = "./tmp/image.jpeg"
  16. app_id = "app1"
  17. result = chunker.create_chunks(MockLoader(), image_path, app_id=app_id)
  18. expected_chunks = {
  19. "doc_id": f"{app_id}--123",
  20. "documents": [image_path],
  21. "embeddings": ["embedding"],
  22. "ids": ["140bedbf9c3f6d56a9846d2ba7088798683f4da0c248231336e6a05679e4fdfe"],
  23. "metadatas": [{"data_type": "images", "doc_id": f"{app_id}--123", "url": "none"}],
  24. }
  25. self.assertEqual(expected_chunks, result)
  26. def test_chunks_with_default_config(self):
  27. """
  28. Test the chunks generated by ImageChunker with default config.
  29. """
  30. chunker = ImagesChunker()
  31. # Data type must be set manually in the test
  32. chunker.set_data_type(DataType.IMAGES)
  33. image_path = "./tmp/image.jpeg"
  34. app_id = "app1"
  35. result = chunker.create_chunks(MockLoader(), image_path, app_id=app_id)
  36. expected_chunks = {
  37. "doc_id": f"{app_id}--123",
  38. "documents": [image_path],
  39. "embeddings": ["embedding"],
  40. "ids": ["140bedbf9c3f6d56a9846d2ba7088798683f4da0c248231336e6a05679e4fdfe"],
  41. "metadatas": [{"data_type": "images", "doc_id": f"{app_id}--123", "url": "none"}],
  42. }
  43. self.assertEqual(expected_chunks, result)
  44. def test_word_count(self):
  45. chunker_config = ChunkerConfig(chunk_size=1, chunk_overlap=0, length_function=len, min_chunk_size=0)
  46. chunker = ImagesChunker(config=chunker_config)
  47. chunker.set_data_type(DataType.IMAGES)
  48. document = [["ab cd", "ef gh"], ["ij kl", "mn op"]]
  49. result = chunker.get_word_count(document)
  50. self.assertEqual(result, 1)
  51. class MockLoader:
  52. def load_data(self, src):
  53. """
  54. Mock loader that returns a list of data dictionaries.
  55. Adjust this method to return different data for testing.
  56. """
  57. return {
  58. "doc_id": "123",
  59. "data": [
  60. {
  61. "content": src,
  62. "embedding": "embedding",
  63. "meta_data": {"url": "none"},
  64. }
  65. ],
  66. }