# dropbox.py

import hashlib
import os
from typing import List

from dropbox.files import FileMetadata

from embedchain.helpers.json_serializable import register_deserializable
from embedchain.loaders.base_loader import BaseLoader
from embedchain.loaders.directory_loader import DirectoryLoader


@register_deserializable
class DropboxLoader(BaseLoader):
    def __init__(self):
        access_token = os.environ.get("DROPBOX_ACCESS_TOKEN")
        if not access_token:
            raise ValueError("Please set the `DROPBOX_ACCESS_TOKEN` environment variable.")

        try:
            from dropbox import Dropbox, exceptions
        except ImportError:
            raise ImportError(
                'Dropbox requires extra dependencies. Install with `pip install --upgrade "embedchain[dropbox]"`'
            )

        try:
            dbx = Dropbox(access_token)
            # Verify the token by fetching the current account before storing the client.
            dbx.users_get_current_account()
            self.dbx = dbx
        except exceptions.AuthError as ex:
            raise ValueError("Invalid Dropbox access token. Please verify your token and try again.") from ex

    def _download_folder(self, path: str, local_root: str) -> List[FileMetadata]:
        """Download a folder from Dropbox and save it preserving the directory structure."""
        entries = self.dbx.files_list_folder(path).entries
        for entry in entries:
            local_path = os.path.join(local_root, entry.name)
            if isinstance(entry, FileMetadata):
                self.dbx.files_download_to_file(local_path, f"{path}/{entry.name}")
            else:
                os.makedirs(local_path, exist_ok=True)
                self._download_folder(f"{path}/{entry.name}", local_path)
        return entries

    def _generate_dir_id_from_all_paths(self, path: str) -> str:
        """Generate a unique ID for a directory based on the paths of its entries."""
        entries = self.dbx.files_list_folder(path).entries
        paths = [f"{path}/{entry.name}" for entry in entries]
        # Hash the concatenated entry paths so the local staging directory name is stable for this folder.
        return hashlib.sha256("".join(paths).encode()).hexdigest()

    def load_data(self, path: str):
        """Load data from a Dropbox folder path, preserving the folder structure."""
        root_dir = f"dropbox_{self._generate_dir_id_from_all_paths(path)}"
        os.makedirs(root_dir, exist_ok=True)
        for entry in self.dbx.files_list_folder(path).entries:
            local_path = os.path.join(root_dir, entry.name)
            if isinstance(entry, FileMetadata):
                self.dbx.files_download_to_file(local_path, f"{path}/{entry.name}")
            else:
                os.makedirs(local_path, exist_ok=True)
                self._download_folder(f"{path}/{entry.name}", local_path)

        # Chunk the downloaded files with the generic directory loader.
        dir_loader = DirectoryLoader()
        data = dir_loader.load_data(root_dir)["data"]

        # Clean up the temporary local copy.
        self._clean_directory(root_dir)

        return {
            "doc_id": hashlib.sha256(path.encode()).hexdigest(),
            "data": data,
        }

    def _clean_directory(self, dir_path):
        """Recursively delete a directory and its contents."""
        for item in os.listdir(dir_path):
            item_path = os.path.join(dir_path, item)
            if os.path.isdir(item_path):
                self._clean_directory(item_path)
            else:
                os.remove(item_path)
        os.rmdir(dir_path)
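

# A minimal usage sketch (not part of the loader itself). It assumes a valid
# token is exported as DROPBOX_ACCESS_TOKEN and that "/my-folder" exists in the
# linked Dropbox account; both values are placeholders:
#
#     os.environ["DROPBOX_ACCESS_TOKEN"] = "sl.your-token"
#     loader = DropboxLoader()
#     result = loader.load_data("/my-folder")
#     print(result["doc_id"], len(result["data"]))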