google_drive.py 1.9 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455
  1. import hashlib
  2. import re
  3. try:
  4. from googleapiclient.errors import HttpError
  5. except ImportError:
  6. raise ImportError(
  7. "Google Drive requires extra dependencies. Install with `pip install embedchain[googledrive]`"
  8. ) from None
  9. from langchain_community.document_loaders import GoogleDriveLoader as Loader
  10. from langchain_community.document_loaders import UnstructuredFileIOLoader
  11. from embedchain.helpers.json_serializable import register_deserializable
  12. from embedchain.loaders.base_loader import BaseLoader
  13. @register_deserializable
  14. class GoogleDriveLoader(BaseLoader):
  15. @staticmethod
  16. def _get_drive_id_from_url(url: str):
  17. regex = r"^https:\/\/drive\.google\.com\/drive\/(?:u\/\d+\/)folders\/([a-zA-Z0-9_-]+)$"
  18. if re.match(regex, url):
  19. return url.split("/")[-1]
  20. raise ValueError(
  21. f"The url provided {url} does not match a google drive folder url. Example drive url: "
  22. f"https://drive.google.com/drive/u/0/folders/xxxx"
  23. )
  24. def load_data(self, url: str):
  25. """Load data from a Google drive folder."""
  26. folder_id: str = self._get_drive_id_from_url(url)
  27. try:
  28. loader = Loader(
  29. folder_id=folder_id,
  30. recursive=True,
  31. file_loader_cls=UnstructuredFileIOLoader,
  32. )
  33. data = []
  34. all_content = []
  35. docs = loader.load()
  36. for doc in docs:
  37. all_content.append(doc.page_content)
  38. # renames source to url for later use.
  39. doc.metadata["url"] = doc.metadata.pop("source")
  40. data.append({"content": doc.page_content, "meta_data": doc.metadata})
  41. doc_id = hashlib.sha256((" ".join(all_content) + url).encode()).hexdigest()
  42. return {"doc_id": doc_id, "data": data}
  43. except HttpError:
  44. raise FileNotFoundError("Unable to locate folder or files, check provided drive URL and try again")