# test_image_chunker.py (2.6 KB)
  1. import unittest
  2. from embedchain.chunkers.images import ImagesChunker
  3. from embedchain.config import ChunkerConfig
  4. from embedchain.models.data_type import DataType
  5. class TestImageChunker(unittest.TestCase):
  6. def test_chunks(self):
  7. """
  8. Test the chunks generated by TextChunker.
  9. # TODO: Not a very precise test.
  10. """
  11. chunker_config = ChunkerConfig(chunk_size=1, chunk_overlap=0, length_function=len)
  12. chunker = ImagesChunker(config=chunker_config)
  13. # Data type must be set manually in the test
  14. chunker.set_data_type(DataType.IMAGES)
  15. image_path = "./tmp/image.jpeg"
  16. result = chunker.create_chunks(MockLoader(), image_path)
  17. expected_chunks = {
  18. "doc_id": "123",
  19. "documents": [image_path],
  20. "embeddings": ["embedding"],
  21. "ids": ["140bedbf9c3f6d56a9846d2ba7088798683f4da0c248231336e6a05679e4fdfe"],
  22. "metadatas": [{"data_type": "images", "doc_id": "123", "url": "none"}],
  23. }
  24. self.assertEqual(expected_chunks, result)
  25. def test_chunks_with_default_config(self):
  26. """
  27. Test the chunks generated by ImageChunker with default config.
  28. """
  29. chunker = ImagesChunker()
  30. # Data type must be set manually in the test
  31. chunker.set_data_type(DataType.IMAGES)
  32. image_path = "./tmp/image.jpeg"
  33. result = chunker.create_chunks(MockLoader(), image_path)
  34. expected_chunks = {
  35. "doc_id": "123",
  36. "documents": [image_path],
  37. "embeddings": ["embedding"],
  38. "ids": ["140bedbf9c3f6d56a9846d2ba7088798683f4da0c248231336e6a05679e4fdfe"],
  39. "metadatas": [{"data_type": "images", "doc_id": "123", "url": "none"}],
  40. }
  41. self.assertEqual(expected_chunks, result)
  42. def test_word_count(self):
  43. chunker_config = ChunkerConfig(chunk_size=1, chunk_overlap=0, length_function=len)
  44. chunker = ImagesChunker(config=chunker_config)
  45. chunker.set_data_type(DataType.IMAGES)
  46. document = [["ab cd", "ef gh"], ["ij kl", "mn op"]]
  47. result = chunker.get_word_count(document)
  48. self.assertEqual(result, 1)
  49. class MockLoader:
  50. def load_data(self, src):
  51. """
  52. Mock loader that returns a list of data dictionaries.
  53. Adjust this method to return different data for testing.
  54. """
  55. return {
  56. "doc_id": "123",
  57. "data": [
  58. {
  59. "content": src,
  60. "embedding": "embedding",
  61. "meta_data": {"url": "none"},
  62. }
  63. ],
  64. }