浏览代码

[Feature] Add Dropbox loader (#1073)

Co-authored-by: Deshraj Yadav <deshrajdry@gmail.com>
Sidharth Mohanty 1 年之前
父节点
当前提交
404e73af77

+ 27 - 0
docs/components/data-sources/dropbox.mdx

@@ -0,0 +1,27 @@
+---
+title: '💾 Dropbox'
+---
+
+To load folders or files from your Dropbox account, configure the `data_type` parameter as `dropbox` and specify the path to the desired file or folder, starting from the root directory of your Dropbox account.
+
+For Dropbox access, an **access token** is required. Obtain this token by visiting [Dropbox Developer Apps](https://www.dropbox.com/developers/apps). There, create a new app and generate an access token for it.
+
+Ensure your app has the following settings activated:
+
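+- In the Permissions section, enable `files.content.read` and `files.metadata.read`. Enable these scopes **before** generating the access token; a token generated earlier does not pick up the new permissions and must be regenerated.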
+
+```python
+import os
+from embedchain import Pipeline as App
+
+os.environ["DROPBOX_ACCESS_TOKEN"] = "sl.xxx"
+os.environ["OPENAI_API_KEY"] = "sk-xxx"
+
+app = App()
+
+# Any path relative to the root of your Dropbox account; use "" for the root folder.
+app.add("/test", data_type="dropbox")
+
+print(app.query("Which two celebrities are mentioned here?"))
+# The two celebrities mentioned in the given context are Elon Musk and Jeff Bezos.
+```

+ 1 - 0
docs/components/data-sources/overview.mdx

@@ -31,6 +31,7 @@ Embedchain comes with built-in support for various data sources. We handle the c
   <Card title="📝 Substack" href="/components/data-sources/substack"></Card>
   <Card title="🐝 Beehiiv" href="/components/data-sources/beehiiv"></Card>
   <Card title="📁 Directory" href="/components/data-sources/directory"></Card>
+  <Card title="💾 Dropbox" href="/components/data-sources/dropbox"></Card>
 </CardGroup>
 
 <br/ >

+ 2 - 1
docs/mint.json

@@ -131,7 +131,8 @@
                 "components/data-sources/substack",
                 "components/data-sources/discord",
                 "components/data-sources/beehiiv",
-                "components/data-sources/directory"
+                "components/data-sources/directory",
+                "components/data-sources/dropbox"
               ]
             },
             "components/data-sources/data-type-handling"

+ 2 - 0
embedchain/data_formatter/data_formatter.py

@@ -76,6 +76,7 @@ class DataFormatter(JSONSerializable):
             DataType.BEEHIIV: "embedchain.loaders.beehiiv.BeehiivLoader",
             DataType.DIRECTORY: "embedchain.loaders.directory_loader.DirectoryLoader",
             DataType.SLACK: "embedchain.loaders.slack.SlackLoader",
+            DataType.DROPBOX: "embedchain.loaders.dropbox.DropboxLoader",
             DataType.TEXT_FILE: "embedchain.loaders.text_file.TextFileLoader",
         }
 
@@ -121,6 +122,7 @@ class DataFormatter(JSONSerializable):
             DataType.BEEHIIV: "embedchain.chunkers.beehiiv.BeehiivChunker",
             DataType.DIRECTORY: "embedchain.chunkers.common_chunker.CommonChunker",
             DataType.SLACK: "embedchain.chunkers.common_chunker.CommonChunker",
+            DataType.DROPBOX: "embedchain.chunkers.common_chunker.CommonChunker",
             DataType.TEXT_FILE: "embedchain.chunkers.common_chunker.CommonChunker",
         }
 

+ 82 - 0
embedchain/loaders/dropbox.py

@@ -0,0 +1,82 @@
+import hashlib
+import os
+from typing import List
+
+from dropbox.files import FileMetadata
+
+from embedchain.helpers.json_serializable import register_deserializable
+from embedchain.loaders.base_loader import BaseLoader
+from embedchain.loaders.directory_loader import DirectoryLoader
+
+
+@register_deserializable
+class DropboxLoader(BaseLoader):
+    """Loader that mirrors a Dropbox folder locally and reads it with `DirectoryLoader`."""
+
+    def __init__(self):
+        """Build an authenticated Dropbox client from `DROPBOX_ACCESS_TOKEN`.
+
+        Raises:
+            ValueError: If the token is missing or Dropbox rejects it.
+            ImportError: If the `dropbox` package cannot be imported.
+        """
+        access_token = os.environ.get("DROPBOX_ACCESS_TOKEN")
+        if not access_token:
+            raise ValueError("Please set the `DROPBOX_ACCESS_TOKEN` environment variable.")
+        try:
+            from dropbox import Dropbox, exceptions
+        except ImportError:
+            raise ImportError(
+                'Dropbox requires extra dependencies. Install with `pip install --upgrade "embedchain[dropbox]"`'
+            )
+
+        try:
+            dbx = Dropbox(access_token)
+            # Authenticated call made up front so a bad token fails here rather than mid-download.
+            dbx.users_get_current_account()
+            self.dbx = dbx
+        except exceptions.AuthError as ex:
+            raise ValueError("Invalid Dropbox access token. Please verify your token and try again.") from ex
+
+    def _list_entries(self, path: str) -> list:
+        """Return every entry directly under `path`, following pagination cursors."""
+        result = self.dbx.files_list_folder(path)
+        entries = list(result.entries)
+        # A single `files_list_folder` call may return only part of a large folder; keep
+        # fetching while the SDK reports more pages. The comparison is against the literal
+        # `True` because the SDK reports a real bool, and a strict check cannot spin forever
+        # on a stand-in result object whose attributes are all truthy.
+        while result.has_more is True:
+            result = self.dbx.files_list_folder_continue(result.cursor)
+            entries.extend(result.entries)
+        return entries
+
+    def _download_folder(self, path: str, local_root: str) -> List[FileMetadata]:
+        """Download the Dropbox folder `path` into `local_root`, preserving the directory structure.
+
+        Args:
+            path: Dropbox folder path to mirror.
+            local_root: Existing local directory that receives the folder's contents.
+
+        Returns:
+            The entries found directly under `path` (files and sub-folders).
+        """
+        entries = self._list_entries(path)
+        for entry in entries:
+            remote_path = f"{path}/{entry.name}"
+            local_path = os.path.join(local_root, entry.name)
+            if isinstance(entry, FileMetadata):
+                self.dbx.files_download_to_file(local_path, remote_path)
+            else:
+                # Anything that is not a file is treated as a sub-folder and mirrored recursively.
+                os.makedirs(local_path, exist_ok=True)
+                self._download_folder(remote_path, local_path)
+        return entries
+
+    def _generate_dir_id_from_all_paths(self, path: str) -> str:
+        """Return a SHA-256 hex digest derived from the paths of the entries directly under `path`."""
+        paths = [f"{path}/{entry.name}" for entry in self._list_entries(path)]
+        return hashlib.sha256("".join(paths).encode()).hexdigest()
+
+    def load_data(self, path: str):
+        """Load data from a Dropbox folder path, preserving the folder structure.
+
+        The folder is mirrored into a local `dropbox_<id>` scratch directory, read with
+        `DirectoryLoader`, and the scratch directory is removed afterwards.
+
+        Args:
+            path: Dropbox folder path, starting from the account root.
+
+        Returns:
+            A dict with `doc_id` (SHA-256 hex digest of `path`) and `data` (the `data`
+            value produced by `DirectoryLoader` for the mirrored folder).
+        """
+        root_dir = f"dropbox_{self._generate_dir_id_from_all_paths(path)}"
+        os.makedirs(root_dir, exist_ok=True)
+
+        try:
+            self._download_folder(path, root_dir)
+            data = DirectoryLoader().load_data(root_dir)["data"]
+        finally:
+            # Remove the scratch copy even when downloading or parsing fails.
+            self._clean_directory(root_dir)
+
+        return {
+            "doc_id": hashlib.sha256(path.encode()).hexdigest(),
+            "data": data,
+        }
+
+    def _clean_directory(self, dir_path):
+        """Recursively delete the directory `dir_path` and everything inside it."""
+        for item in os.listdir(dir_path):
+            item_path = os.path.join(dir_path, item)
+            if os.path.isdir(item_path):
+                self._clean_directory(item_path)
+            else:
+                os.remove(item_path)
+        # The directory is empty at this point, so a plain rmdir is sufficient.
+        os.rmdir(dir_path)

+ 2 - 0
embedchain/models/data_type.py

@@ -37,6 +37,7 @@ class IndirectDataType(Enum):
     BEEHIIV = "beehiiv"
     DIRECTORY = "directory"
     SLACK = "slack"
+    DROPBOX = "dropbox"
     TEXT_FILE = "text_file"
 
 
@@ -74,4 +75,5 @@ class DataType(Enum):
     BEEHIIV = IndirectDataType.BEEHIIV.value
     DIRECTORY = IndirectDataType.DIRECTORY.value
     SLACK = IndirectDataType.SLACK.value
+    DROPBOX = IndirectDataType.DROPBOX.value
     TEXT_FILE = IndirectDataType.TEXT_FILE.value

+ 48 - 3
poetry.lock

@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
 
 [[package]]
 name = "aiofiles"
@@ -1269,6 +1269,23 @@ files = [
     {file = "docx2txt-0.8.tar.gz", hash = "sha256:2c06d98d7cfe2d3947e5760a57d924e3ff07745b379c8737723922e7009236e5"},
 ]
 
+[[package]]
+name = "dropbox"
+version = "11.36.2"
+description = "Official Dropbox API Client"
+optional = true
+python-versions = "*"
+files = [
+    {file = "dropbox-11.36.2-py2-none-any.whl", hash = "sha256:afbfce2589b777ade1deaa2c186f3650c41e41cea0f1fac497a75112a171f8e2"},
+    {file = "dropbox-11.36.2-py3-none-any.whl", hash = "sha256:a21e4d2bcbeb1d8067ff87969aea48792c9a8266182491153feff2be9c1b9c8f"},
+    {file = "dropbox-11.36.2.tar.gz", hash = "sha256:d48d3d16d486c78b11c14a1c4a28a2611fbf5a0d0a358b861bfd9482e603c500"},
+]
+
+[package.dependencies]
+requests = ">=2.16.2"
+six = ">=1.12.0"
+stone = ">=2"
+
 [[package]]
 name = "duckduckgo-search"
 version = "3.9.3"
@@ -4165,10 +4182,10 @@ files = [
 
 [package.dependencies]
 numpy = [
+    {version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""},
     {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
     {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
     {version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""},
-    {version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""},
     {version = ">=1.23.5", markers = "python_version >= \"3.11\""},
 ]
 
@@ -4577,6 +4594,17 @@ files = [
 dev = ["pre-commit", "tox"]
 testing = ["pytest", "pytest-benchmark"]
 
+[[package]]
+name = "ply"
+version = "3.11"
+description = "Python Lex & Yacc"
+optional = true
+python-versions = "*"
+files = [
+    {file = "ply-3.11-py2.py3-none-any.whl", hash = "sha256:096f9b8350b65ebd2fd1346b12452efe5b9607f7482813ffca50c22722a807ce"},
+    {file = "ply-3.11.tar.gz", hash = "sha256:00c7c1aaa88358b9c765b6d3000c6eec0ba42abca5351b095321aef446081da3"},
+]
+
 [[package]]
 name = "portalocker"
 version = "2.8.2"
@@ -6625,6 +6653,22 @@ typing-extensions = {version = ">=3.10.0", markers = "python_version < \"3.10\""
 [package.extras]
 full = ["httpx (>=0.22.0)", "itsdangerous", "jinja2", "python-multipart", "pyyaml"]
 
+[[package]]
+name = "stone"
+version = "3.3.1"
+description = "Stone is an interface description language (IDL) for APIs."
+optional = true
+python-versions = "*"
+files = [
+    {file = "stone-3.3.1-py2-none-any.whl", hash = "sha256:cd2f7f9056fc39b16c8fd46a26971dc5ccd30b5c2c246566cd2c0dd27ff96609"},
+    {file = "stone-3.3.1-py3-none-any.whl", hash = "sha256:e15866fad249c11a963cce3bdbed37758f2e88c8ff4898616bc0caeb1e216047"},
+    {file = "stone-3.3.1.tar.gz", hash = "sha256:4ef0397512f609757975f7ec09b35639d72ba7e3e17ce4ddf399578346b4cb50"},
+]
+
+[package.dependencies]
+ply = ">=3.4"
+six = ">=1.12.0"
+
 [[package]]
 name = "sympy"
 version = "1.12"
@@ -8115,6 +8159,7 @@ cohere = ["cohere"]
 community = ["llama-hub"]
 dataloaders = ["docx2txt", "duckduckgo-search", "pytube", "sentence-transformers", "unstructured", "youtube-transcript-api"]
 discord = ["discord"]
+dropbox = ["dropbox"]
 elasticsearch = ["elasticsearch"]
 github = ["PyGithub", "gitpython"]
 gmail = ["llama-hub", "requests"]
@@ -8144,4 +8189,4 @@ youtube = ["youtube-transcript-api", "yt_dlp"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.9,<3.12"
-content-hash = "bc763595ae5e903a5a819a2d3f31f045fac52555f72e44ead9df0e5e191955aa"
+content-hash = "335c42c91a2b5e4a1c3d8a7c39dee8665fd1eee0410e1bc6cb6cb1d6f6722445"

+ 2 - 0
pyproject.toml

@@ -147,6 +147,7 @@ newspaper3k = { version = "^0.2.8", optional = true }
 listparser = { version = "^0.19", optional = true }
 google-generativeai = { version = "^0.3.0", optional = true }
 modal = { version = "^0.56.4329", optional = true }
+dropbox = { version = "^11.36.2", optional = true }
 
 [tool.poetry.group.dev.dependencies]
 black = "^23.3.0"
@@ -214,6 +215,7 @@ rss_feed = [
 ]
 google = ["google-generativeai"]
 modal = ["modal"]
+dropbox = ["dropbox"]
 
 [tool.poetry.group.docs.dependencies]
 

+ 85 - 0
tests/loaders/test_dropbox.py

@@ -0,0 +1,85 @@
+import os
+from unittest.mock import MagicMock
+
+import pytest
+from dropbox.files import FileMetadata
+
+from embedchain.loaders.dropbox import DropboxLoader
+
+
+@pytest.fixture
+def setup_dropbox_loader(mocker):
+    """Provide a `(loader, mock_dbx)` pair: a `DropboxLoader` wired to a mocked Dropbox client."""
+    mock_dbx = mocker.MagicMock()
+    mocker.patch("dropbox.Dropbox", return_value=mock_dbx)
+    # patch.dict restores the previous environment on teardown, so a token that was
+    # already set before the test is neither overwritten for good nor deleted.
+    mocker.patch.dict(os.environ, {"DROPBOX_ACCESS_TOKEN": "test_token"})
+
+    return DropboxLoader(), mock_dbx
+
+
+def test_initialization(setup_dropbox_loader):
+    """The loader validates the token once and keeps the client it validated."""
+    loader, mock_dbx = setup_dropbox_loader
+    assert loader.dbx is mock_dbx
+    mock_dbx.users_get_current_account.assert_called_once_with()
+
+
+def test_download_folder(setup_dropbox_loader, mocker):
+    """A file entry is downloaded to its local path and the listed entries are returned."""
+    loader, mock_dbx = setup_dropbox_loader
+    mock_makedirs = mocker.patch("os.makedirs")
+    mocker.patch("os.path.join", return_value="mock/path")
+
+    mock_file_metadata = mocker.MagicMock(spec=FileMetadata)
+    # `name=` in the MagicMock constructor names the mock itself; the attribute must be set afterwards.
+    mock_file_metadata.name = "file.txt"
+    mock_dbx.files_list_folder.return_value.entries = [mock_file_metadata]
+
+    entries = loader._download_folder("path/to/folder", "local_root")
+
+    assert entries == [mock_file_metadata]
+    mock_dbx.files_download_to_file.assert_called_once_with("mock/path", "path/to/folder/file.txt")
+    # Only sub-folders trigger directory creation; a lone file must not.
+    mock_makedirs.assert_not_called()
+
+
+def test_generate_dir_id_from_all_paths(setup_dropbox_loader, mocker):
+    """The directory ID is the SHA-256 hex digest of the joined entry paths."""
+    import hashlib
+
+    loader, mock_dbx = setup_dropbox_loader
+    mock_file_metadata = mocker.MagicMock(spec=FileMetadata)
+    # `name=` in the MagicMock constructor names the mock itself; the attribute must be set afterwards.
+    mock_file_metadata.name = "file.txt"
+    mock_dbx.files_list_folder.return_value.entries = [mock_file_metadata]
+
+    dir_id = loader._generate_dir_id_from_all_paths("path/to/folder")
+
+    assert dir_id == hashlib.sha256("path/to/folder/file.txt".encode()).hexdigest()
+
+
+def test_clean_directory(setup_dropbox_loader, mocker):
+    """Every listed file is removed and then the directory itself is deleted."""
+    loader, _ = setup_dropbox_loader
+    mocker.patch("os.listdir", return_value=["file1", "file2"])
+    # Keep the test independent of whatever happens to exist on the real filesystem.
+    mocker.patch("os.path.isdir", return_value=False)
+    mock_remove = mocker.patch("os.remove")
+    mock_rmdir = mocker.patch("os.rmdir")
+
+    loader._clean_directory("path/to/folder")
+
+    expected_removals = [mocker.call(os.path.join("path/to/folder", name)) for name in ("file1", "file2")]
+    assert mock_remove.call_args_list == expected_removals
+    mock_rmdir.assert_called_once_with("path/to/folder")
+
+
+def test_load_data(mocker, setup_dropbox_loader, tmp_path, monkeypatch):
+    """`load_data` downloads the folder, returns the parsed data and removes its scratch directory."""
+    import hashlib
+
+    loader = setup_dropbox_loader[0]
+    # The loader creates its scratch directory relative to the current working directory;
+    # run inside tmp_path so nothing is written to (or left behind in) the repository.
+    monkeypatch.chdir(tmp_path)
+
+    mock_file_metadata = MagicMock(spec=FileMetadata)
+    # `name=` in the MagicMock constructor names the mock itself; the attribute must be set afterwards.
+    mock_file_metadata.name = "file.txt"
+    mocker.patch.object(loader.dbx, "files_list_folder", return_value=MagicMock(entries=[mock_file_metadata]))
+    mocker.patch.object(loader.dbx, "files_download_to_file")
+
+    # Mock DirectoryLoader
+    mock_data = {"data": "test_data"}
+    mocker.patch("embedchain.loaders.directory_loader.DirectoryLoader.load_data", return_value=mock_data)
+    mocker.patch.object(loader, "_generate_dir_id_from_all_paths", return_value="test")
+
+    result = loader.load_data("path/to/folder")
+
+    assert result == {
+        "doc_id": hashlib.sha256("path/to/folder".encode()).hexdigest(),
+        "data": "test_data",
+    }
+    loader.dbx.files_list_folder.assert_called_once_with("path/to/folder")
+    loader.dbx.files_download_to_file.assert_called_once_with(
+        os.path.join("dropbox_test", "file.txt"), "path/to/folder/file.txt"
+    )
+    assert not (tmp_path / "dropbox_test").exists()