google_drive.py 2.2 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162
  1. import hashlib
  2. import re
  3. try:
  4. from googleapiclient.errors import HttpError
  5. except ImportError:
  6. raise ImportError(
  7. "Google Drive requires extra dependencies. Install with `pip install embedchain[googledrive]`"
  8. ) from None
  9. from langchain_community.document_loaders import GoogleDriveLoader as Loader
  10. try:
  11. import unstructured # noqa: F401
  12. from langchain_community.document_loaders import UnstructuredFileIOLoader
  13. except ImportError:
  14. raise ImportError(
  15. 'Unstructured file requires extra dependencies. Install with `pip install "unstructured[local-inference, all-docs]"`' # noqa: E501
  16. ) from None
  17. from embedchain.helpers.json_serializable import register_deserializable
  18. from embedchain.loaders.base_loader import BaseLoader
  19. @register_deserializable
  20. class GoogleDriveLoader(BaseLoader):
  21. @staticmethod
  22. def _get_drive_id_from_url(url: str):
  23. regex = r"^https:\/\/drive\.google\.com\/drive\/(?:u\/\d+\/)folders\/([a-zA-Z0-9_-]+)$"
  24. if re.match(regex, url):
  25. return url.split("/")[-1]
  26. raise ValueError(
  27. f"The url provided {url} does not match a google drive folder url. Example drive url: "
  28. f"https://drive.google.com/drive/u/0/folders/xxxx"
  29. )
  30. def load_data(self, url: str):
  31. """Load data from a Google drive folder."""
  32. folder_id: str = self._get_drive_id_from_url(url)
  33. try:
  34. loader = Loader(
  35. folder_id=folder_id,
  36. recursive=True,
  37. file_loader_cls=UnstructuredFileIOLoader,
  38. )
  39. data = []
  40. all_content = []
  41. docs = loader.load()
  42. for doc in docs:
  43. all_content.append(doc.page_content)
  44. # renames source to url for later use.
  45. doc.metadata["url"] = doc.metadata.pop("source")
  46. data.append({"content": doc.page_content, "meta_data": doc.metadata})
  47. doc_id = hashlib.sha256((" ".join(all_content) + url).encode()).hexdigest()
  48. return {"doc_id": doc_id, "data": data}
  49. except HttpError:
  50. raise FileNotFoundError("Unable to locate folder or files, check provided drive URL and try again")