images.py 2.3 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364
  1. import hashlib
  2. from typing import Optional
  3. from langchain.text_splitter import RecursiveCharacterTextSplitter
  4. from embedchain.chunkers.base_chunker import BaseChunker
  5. from embedchain.config.add_config import ChunkerConfig
  6. class ImagesChunker(BaseChunker):
  7. """Chunker for an Image."""
  8. def __init__(self, config: Optional[ChunkerConfig] = None):
  9. if config is None:
  10. config = ChunkerConfig(chunk_size=300, chunk_overlap=0, length_function=len)
  11. image_splitter = RecursiveCharacterTextSplitter(
  12. chunk_size=config.chunk_size,
  13. chunk_overlap=config.chunk_overlap,
  14. length_function=config.length_function,
  15. )
  16. super().__init__(image_splitter)
  17. def create_chunks(self, loader, src, app_id=None):
  18. """
  19. Loads the image(s), and creates their corresponding embedding. This creates one chunk for each image
  20. :param loader: The loader whose `load_data` method is used to create
  21. the raw data.
  22. :param src: The data to be handled by the loader. Can be a URL for
  23. remote sources or local content for local loaders.
  24. """
  25. documents = []
  26. embeddings = []
  27. ids = []
  28. data_result = loader.load_data(src)
  29. data_records = data_result["data"]
  30. doc_id = data_result["doc_id"]
  31. doc_id = f"{app_id}--{doc_id}" if app_id is not None else doc_id
  32. metadatas = []
  33. for data in data_records:
  34. meta_data = data["meta_data"]
  35. # add data type to meta data to allow query using data type
  36. meta_data["data_type"] = self.data_type.value
  37. chunk_id = hashlib.sha256(meta_data["url"].encode()).hexdigest()
  38. ids.append(chunk_id)
  39. documents.append(data["content"])
  40. embeddings.append(data["embedding"])
  41. meta_data["doc_id"] = doc_id
  42. metadatas.append(meta_data)
  43. return {
  44. "documents": documents,
  45. "embeddings": embeddings,
  46. "ids": ids,
  47. "metadatas": metadatas,
  48. "doc_id": doc_id,
  49. }
  50. def get_word_count(self, documents):
  51. """
  52. The number of chunks and the corresponding word count for an image is fixed to 1, as 1 embedding is created for
  53. each image
  54. """
  55. return 1