google_drive.py 1.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354
  1. import hashlib
  2. import re
  3. try:
  4. from googleapiclient.errors import HttpError
  5. except ImportError:
  6. raise ImportError(
  7. "Google Drive requires extra dependencies. Install with `pip install embedchain[googledrive]`"
  8. ) from None
  9. from langchain.document_loaders import GoogleDriveLoader as Loader, UnstructuredFileIOLoader
  10. from embedchain.helpers.json_serializable import register_deserializable
  11. from embedchain.loaders.base_loader import BaseLoader
  12. @register_deserializable
  13. class GoogleDriveLoader(BaseLoader):
  14. @staticmethod
  15. def _get_drive_id_from_url(url: str):
  16. regex = r"^https:\/\/drive\.google\.com\/drive\/(?:u\/\d+\/)folders\/([a-zA-Z0-9_-]+)$"
  17. if re.match(regex, url):
  18. return url.split("/")[-1]
  19. raise ValueError(
  20. f"The url provided {url} does not match a google drive folder url. Example drive url: "
  21. f"https://drive.google.com/drive/u/0/folders/xxxx"
  22. )
  23. def load_data(self, url: str):
  24. """Load data from a Google drive folder."""
  25. folder_id: str = self._get_drive_id_from_url(url)
  26. try:
  27. loader = Loader(
  28. folder_id=folder_id,
  29. recursive=True,
  30. file_loader_cls=UnstructuredFileIOLoader,
  31. )
  32. data = []
  33. all_content = []
  34. docs = loader.load()
  35. for doc in docs:
  36. all_content.append(doc.page_content)
  37. # renames source to url for later use.
  38. doc.metadata["url"] = doc.metadata.pop("source")
  39. data.append({"content": doc.page_content, "meta_data": doc.metadata})
  40. doc_id = hashlib.sha256((" ".join(all_content) + url).encode()).hexdigest()
  41. return {"doc_id": doc_id, "data": data}
  42. except HttpError:
  43. raise FileNotFoundError("Unable to locate folder or files, check provided drive URL and try again")