images.py 1.5 KB

1234567891011121314151617181920212223242526272829303132333435363738394041
  1. import hashlib
  2. import logging
  3. import os
  4. from embedchain.loaders.base_loader import BaseLoader
  5. class ImagesLoader(BaseLoader):
  6. def load_data(self, image_url):
  7. """
  8. Loads images from the supplied directory/file and applies CLIP model transformation to represent these images
  9. in vector form
  10. :param image_url: The URL from which the images are to be loaded
  11. """
  12. # load model and image preprocessing
  13. from embedchain.models.clip_processor import ClipProcessor
  14. model = ClipProcessor.load_model()
  15. if os.path.isfile(image_url):
  16. data = [ClipProcessor.get_image_features(image_url, model)]
  17. else:
  18. data = []
  19. for filename in os.listdir(image_url):
  20. filepath = os.path.join(image_url, filename)
  21. try:
  22. data.append(ClipProcessor.get_image_features(filepath, model))
  23. except Exception as e:
  24. # Log the file that was not loaded
  25. logging.exception("Failed to load the file {}. Exception {}".format(filepath, e))
  26. # Get the metadata like Size, Last Modified and Last Created timestamps
  27. image_path_metadata = [
  28. str(os.path.getsize(image_url)),
  29. str(os.path.getmtime(image_url)),
  30. str(os.path.getctime(image_url)),
  31. ]
  32. doc_id = hashlib.sha256((" ".join(image_path_metadata) + image_url).encode()).hexdigest()
  33. return {
  34. "doc_id": doc_id,
  35. "data": data,
  36. }