Source code for langchain_community.document_loaders.googledrive

# Prerequisites:
# 1. Create a Google Cloud project
# 2. Enable the Google Drive API:
#   https://console.cloud.google.com/flows/enableapi?apiid=drive.googleapis.com
# 3. Authorize credentials for desktop app:
#   https://developers.google.com/drive/api/quickstart/python#authorize_credentials_for_a_desktop_application # noqa: E501
# 4. For service accounts visit
#   https://cloud.google.com/iam/docs/service-accounts-create

from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence, Union

from langchain_core._api.deprecation import deprecated
from langchain_core.documents import Document
from langchain_core.pydantic_v1 import BaseModel, root_validator, validator

from langchain_community.document_loaders.base import BaseLoader

SCOPES = ["https://www.googleapis.com/auth/drive.readonly"]


[docs]@deprecated( since="0.0.32", removal="0.3.0", alternative_import="langchain_google_community.GoogleDriveLoader", ) class GoogleDriveLoader(BaseLoader, BaseModel): """Load Google Docs from `Google Drive`.""" service_account_key: Path = Path.home() / ".credentials" / "keys.json" """Path to the service account key file.""" credentials_path: Path = Path.home() / ".credentials" / "credentials.json" """Path to the credentials file.""" token_path: Path = Path.home() / ".credentials" / "token.json" """Path to the token file.""" folder_id: Optional[str] = None """The folder id to load from.""" document_ids: Optional[List[str]] = None """The document ids to load from.""" file_ids: Optional[List[str]] = None """The file ids to load from.""" recursive: bool = False """Whether to load recursively. Only applies when folder_id is given.""" file_types: Optional[Sequence[str]] = None """The file types to load. Only applies when folder_id is given.""" load_trashed_files: bool = False """Whether to load trashed files. Only applies when folder_id is given.""" # NOTE(MthwRobinson) - changing the file_loader_cls to type here currently # results in pydantic validation errors file_loader_cls: Any = None """The file loader class to use.""" file_loader_kwargs: Dict["str", Any] = {} """The file loader kwargs to use.""" @root_validator(pre=True) def validate_inputs(cls, values: Dict[str, Any]) -> Dict[str, Any]: """Validate that either folder_id or document_ids is set, but not both.""" if values.get("folder_id") and ( values.get("document_ids") or values.get("file_ids") ): raise ValueError( "Cannot specify both folder_id and document_ids nor " "folder_id and file_ids" ) if ( not values.get("folder_id") and not values.get("document_ids") and not values.get("file_ids") ): raise ValueError("Must specify either folder_id, document_ids, or file_ids") file_types = values.get("file_types") if file_types: if values.get("document_ids") or values.get("file_ids"): raise ValueError( "file_types can only be given when folder_id is given," " (not when document_ids or file_ids are given)." ) type_mapping = { "document": "application/vnd.google-apps.document", "sheet": "application/vnd.google-apps.spreadsheet", "pdf": "application/pdf", } allowed_types = list(type_mapping.keys()) + list(type_mapping.values()) short_names = ", ".join([f"'{x}'" for x in type_mapping.keys()]) full_names = ", ".join([f"'{x}'" for x in type_mapping.values()]) for file_type in file_types: if file_type not in allowed_types: raise ValueError( f"Given file type {file_type} is not supported. " f"Supported values are: {short_names}; and " f"their full-form names: {full_names}" ) # replace short-form file types by full-form file types def full_form(x: str) -> str: return type_mapping[x] if x in type_mapping else x values["file_types"] = [full_form(file_type) for file_type in file_types] return values @validator("credentials_path") def validate_credentials_path(cls, v: Any, **kwargs: Any) -> Any: """Validate that credentials_path exists.""" if not v.exists(): raise ValueError(f"credentials_path {v} does not exist") return v def _load_credentials(self) -> Any: """Load credentials. The order of loading credentials: 1. Service account key if file exists 2. Token path (for OAuth Client) if file exists 3. Credentials path (for OAuth Client) if file exists 4. Default credentials. if no credentials found, raise DefaultCredentialsError """ # Adapted from https://developers.google.com/drive/api/v3/quickstart/python try: from google.auth import default from google.auth.transport.requests import Request from google.oauth2 import service_account from google.oauth2.credentials import Credentials from google_auth_oauthlib.flow import InstalledAppFlow except ImportError: raise ImportError( "You must run " "`pip install --upgrade " "google-api-python-client google-auth-httplib2 " "google-auth-oauthlib` " "to use the Google Drive loader." ) creds = None # From service account if self.service_account_key.exists(): return service_account.Credentials.from_service_account_file( str(self.service_account_key), scopes=SCOPES ) # From Oauth Client if self.token_path.exists(): creds = Credentials.from_authorized_user_file(str(self.token_path), SCOPES) if not creds or not creds.valid: if creds and creds.expired and creds.refresh_token: creds.refresh(Request()) elif self.credentials_path.exists(): flow = InstalledAppFlow.from_client_secrets_file( str(self.credentials_path), SCOPES ) creds = flow.run_local_server(port=0) if creds: with open(self.token_path, "w") as token: token.write(creds.to_json()) # From Application Default Credentials if not creds: creds, _ = default(scopes=SCOPES) return creds def _load_sheet_from_id(self, id: str) -> List[Document]: """Load a sheet and all tabs from an ID.""" from googleapiclient.discovery import build creds = self._load_credentials() sheets_service = build("sheets", "v4", credentials=creds) spreadsheet = sheets_service.spreadsheets().get(spreadsheetId=id).execute() sheets = spreadsheet.get("sheets", []) documents = [] for sheet in sheets: sheet_name = sheet["properties"]["title"] result = ( sheets_service.spreadsheets() .values() .get(spreadsheetId=id, range=sheet_name) .execute() ) values = result.get("values", []) if not values: continue # empty sheet header = values[0] for i, row in enumerate(values[1:], start=1): metadata = { "source": ( f"https://docs.google.com/spreadsheets/d/{id}/" f"edit?gid={sheet['properties']['sheetId']}" ), "title": f"{spreadsheet['properties']['title']} - {sheet_name}", "row": i, } content = [] for j, v in enumerate(row): title = header[j].strip() if len(header) > j else "" content.append(f"{title}: {v.strip()}") page_content = "\n".join(content) documents.append(Document(page_content=page_content, metadata=metadata)) return documents def _load_document_from_id(self, id: str) -> Document: """Load a document from an ID.""" from io import BytesIO from googleapiclient.discovery import build from googleapiclient.errors import HttpError from googleapiclient.http import MediaIoBaseDownload creds = self._load_credentials() service = build("drive", "v3", credentials=creds) file = ( service.files() .get(fileId=id, supportsAllDrives=True, fields="modifiedTime,name") .execute() ) request = service.files().export_media(fileId=id, mimeType="text/plain") fh = BytesIO() downloader = MediaIoBaseDownload(fh, request) done = False try: while done is False: status, done = downloader.next_chunk() except HttpError as e: if e.resp.status == 404: print("File not found: {}".format(id)) # noqa: T201 else: print("An error occurred: {}".format(e)) # noqa: T201 text = fh.getvalue().decode("utf-8") metadata = { "source": f"https://docs.google.com/document/d/{id}/edit", "title": f"{file.get('name')}", "when": f"{file.get('modifiedTime')}", } return Document(page_content=text, metadata=metadata) def _load_documents_from_folder( self, folder_id: str, *, file_types: Optional[Sequence[str]] = None ) -> List[Document]: """Load documents from a folder.""" from googleapiclient.discovery import build creds = self._load_credentials() service = build("drive", "v3", credentials=creds) files = self._fetch_files_recursive(service, folder_id) # If file types filter is provided, we'll filter by the file type. if file_types: _files = [f for f in files if f["mimeType"] in file_types] # type: ignore else: _files = files returns = [] for file in _files: if file["trashed"] and not self.load_trashed_files: continue elif file["mimeType"] == "application/vnd.google-apps.document": returns.append(self._load_document_from_id(file["id"])) # type: ignore elif file["mimeType"] == "application/vnd.google-apps.spreadsheet": returns.extend(self._load_sheet_from_id(file["id"])) # type: ignore elif ( file["mimeType"] == "application/pdf" or self.file_loader_cls is not None ): returns.extend(self._load_file_from_id(file["id"])) # type: ignore else: pass return returns def _fetch_files_recursive( self, service: Any, folder_id: str ) -> List[Dict[str, Union[str, List[str]]]]: """Fetch all files and subfolders recursively.""" results = ( service.files() .list( q=f"'{folder_id}' in parents", pageSize=1000, includeItemsFromAllDrives=True, supportsAllDrives=True, fields="nextPageToken, files(id, name, mimeType, parents, trashed)", ) .execute() ) files = results.get("files", []) returns = [] for file in files: if file["mimeType"] == "application/vnd.google-apps.folder": if self.recursive: returns.extend(self._fetch_files_recursive(service, file["id"])) else: returns.append(file) return returns def _load_documents_from_ids(self) -> List[Document]: """Load documents from a list of IDs.""" if not self.document_ids: raise ValueError("document_ids must be set") return [self._load_document_from_id(doc_id) for doc_id in self.document_ids] def _load_file_from_id(self, id: str) -> List[Document]: """Load a file from an ID.""" from io import BytesIO from googleapiclient.discovery import build from googleapiclient.http import MediaIoBaseDownload creds = self._load_credentials() service = build("drive", "v3", credentials=creds) file = service.files().get(fileId=id, supportsAllDrives=True).execute() request = service.files().get_media(fileId=id) fh = BytesIO() downloader = MediaIoBaseDownload(fh, request) done = False while done is False: status, done = downloader.next_chunk() if self.file_loader_cls is not None: fh.seek(0) loader = self.file_loader_cls(file=fh, **self.file_loader_kwargs) docs = loader.load() for doc in docs: doc.metadata["source"] = f"https://drive.google.com/file/d/{id}/view" if "title" not in doc.metadata: doc.metadata["title"] = f"{file.get('name')}" return docs else: from PyPDF2 import PdfReader content = fh.getvalue() pdf_reader = PdfReader(BytesIO(content)) return [ Document( page_content=page.extract_text(), metadata={ "source": f"https://drive.google.com/file/d/{id}/view", "title": f"{file.get('name')}", "page": i, }, ) for i, page in enumerate(pdf_reader.pages) ] def _load_file_from_ids(self) -> List[Document]: """Load files from a list of IDs.""" if not self.file_ids: raise ValueError("file_ids must be set") docs = [] for file_id in self.file_ids: docs.extend(self._load_file_from_id(file_id)) return docs
[docs] def load(self) -> List[Document]: """Load documents.""" if self.folder_id: return self._load_documents_from_folder( self.folder_id, file_types=self.file_types ) elif self.document_ids: return self._load_documents_from_ids() else: return self._load_file_from_ids()