123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354 |
- import hashlib
- import re
- try:
- from googleapiclient.errors import HttpError
- except ImportError:
- raise ImportError(
- "Google Drive requires extra dependencies. Install with `pip install embedchain[googledrive]`"
- ) from None
- from langchain.document_loaders import GoogleDriveLoader as Loader, UnstructuredFileIOLoader
- from embedchain.helpers.json_serializable import register_deserializable
- from embedchain.loaders.base_loader import BaseLoader
@register_deserializable
class GoogleDriveLoader(BaseLoader):
    """Loader that pulls every document out of a Google Drive folder.

    The folder is addressed by its browser URL; the folder ID is extracted
    from that URL and handed to langchain's ``GoogleDriveLoader``.
    """

    @staticmethod
    def _get_drive_id_from_url(url: str) -> str:
        """Extract the folder ID from a Google Drive folder URL.

        Accepted forms (the ``u/<n>/`` account segment and a trailing query
        string such as ``?usp=sharing`` are both optional)::

            https://drive.google.com/drive/folders/<id>
            https://drive.google.com/drive/u/0/folders/<id>
            https://drive.google.com/drive/folders/<id>?usp=sharing

        Args:
            url: Browser URL of a Google Drive folder.

        Returns:
            The folder ID captured from the URL.

        Raises:
            ValueError: If ``url`` is not a recognised Drive folder URL.
        """
        regex = (
            r"^https:\/\/drive\.google\.com\/drive\/(?:u\/\d+\/)?folders\/([a-zA-Z0-9_-]+)(?:\?[^\s#]*)?$"
        )
        match = re.match(regex, url)
        if match:
            # Use the captured group rather than splitting the URL so that a
            # query string or trailing newline never leaks into the ID.
            return match.group(1)
        raise ValueError(
            f"The url provided {url} does not match a google drive folder url. Example drive url: "
            f"https://drive.google.com/drive/u/0/folders/xxxx"
        )

    def load_data(self, url: str):
        """Load data from a Google drive folder.

        Args:
            url: Browser URL of the Google Drive folder to load.

        Returns:
            A dict with two keys: ``"doc_id"``, a SHA-256 hex digest of the
            space-joined page contents followed by ``url``; and ``"data"``,
            a list of ``{"content": ..., "meta_data": ...}`` dicts, one per
            loaded document.

        Raises:
            ValueError: If ``url`` is not a recognised Drive folder URL.
            FileNotFoundError: If the Drive API reports an ``HttpError``
                while the folder is being loaded.
        """
        folder_id: str = self._get_drive_id_from_url(url)

        # Only the Drive calls can raise HttpError, so keep the try body to them.
        try:
            loader = Loader(
                folder_id=folder_id,
                recursive=True,
                file_loader_cls=UnstructuredFileIOLoader,
            )
            docs = loader.load()
        except HttpError as err:
            # Chain the API error so the real cause stays visible in tracebacks.
            raise FileNotFoundError(
                "Unable to locate folder or files, check provided drive URL and try again"
            ) from err

        data = []
        all_content = []
        for doc in docs:
            all_content.append(doc.page_content)
            # renames source to url for later use.
            doc.metadata["url"] = doc.metadata.pop("source")
            data.append({"content": doc.page_content, "meta_data": doc.metadata})

        doc_id = hashlib.sha256((" ".join(all_content) + url).encode()).hexdigest()
        return {"doc_id": doc_id, "data": data}
|