gobblin.source.extractor.extract.google.GoogleDriveFileSystem.java Source code

Java tutorial

Introduction

Here is the source code for gobblin.source.extractor.extract.google.GoogleDriveFileSystem.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package gobblin.source.extractor.extract.google;

import java.io.BufferedInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.util.Progressable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.api.client.auth.oauth2.Credential;
import com.google.api.client.googleapis.json.GoogleJsonError;
import com.google.api.client.googleapis.json.GoogleJsonResponseException;
import com.google.api.services.drive.Drive;
import com.google.api.services.drive.model.File;
import com.google.api.services.drive.model.FileList;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.io.Closer;

import static gobblin.configuration.ConfigurationKeys.*;
import static gobblin.source.extractor.extract.google.GoogleCommonKeys.*;
import gobblin.configuration.State;
import gobblin.util.HadoopUtils;
import gobblin.util.io.SeekableFSInputStream;

/**
 * A {@link FileSystem} implementation that provides the {@link FileSystem} interface for an Google Drive server.
 * <ul>
 * <li>Note that {@link GoogleDriveFileSystem} currently only supports list, get, delete use cases.
 * <li>Google drive has two different identifier. File ID and File name -- where folder is just different mime-type of File.
 * As File name can be duplicate under same folder, all path that GoogleDriveFileSystem takes assumes that it's a File ID.
 * <li>It is the caller's responsibility to call {@link #close()} on this {@link FileSystem} to disconnect the session.
 *
 * {@link GoogleDriveFileSystem} does not cache instance and {@link FileSystem#get(Configuration)} creates a new {@link GoogleDriveFileSystem} everytime
 * instead of cached copy.
 * </ul>
 */
public class GoogleDriveFileSystem extends FileSystem {
    private static final Logger LOG = LoggerFactory.getLogger(GoogleDriveFileSystem.class);

    static final String PAGE_SIZE = GOOGLE_SOURCE_PREFIX + "fs_helper.page_size"; //for paginated API
    static final String FOLDER_MIME_TYPE = "application/vnd.google-apps.folder";
    static final int DEFAULT_PAGE_SIZE = 50;

    private Drive client;
    private final Closer closer;
    private int pageSize = DEFAULT_PAGE_SIZE;

    public GoogleDriveFileSystem(Drive client) {
        this();
        this.client = client;
    }

    public GoogleDriveFileSystem(Drive client, int pageSize) {
        this(client);
        Preconditions.checkArgument(pageSize > 0, "pageSize should be positive number");
        this.pageSize = pageSize;
    }

    public GoogleDriveFileSystem() {
        super();
        this.closer = Closer.create();
    }

    @Override
    public synchronized void initialize(URI uri, Configuration conf) throws IOException {
        if (this.client == null) {
            super.initialize(uri, conf);
            State state = HadoopUtils.getStateFromConf(conf);
            Credential credential = new GoogleCommon.CredentialBuilder(state.getProp(SOURCE_CONN_PRIVATE_KEY),
                    state.getPropAsList(API_SCOPES)).fileSystemUri(state.getProp(PRIVATE_KEY_FILESYSTEM_URI))
                            .proxyUrl(state.getProp(SOURCE_CONN_USE_PROXY_URL))
                            .port(state.getProp(SOURCE_CONN_USE_PROXY_PORT))
                            .serviceAccountId(state.getProp(SOURCE_CONN_USERNAME)).build();

            this.client = new Drive.Builder(credential.getTransport(), GoogleCommon.getJsonFactory(), credential)
                    .setApplicationName(Preconditions.checkNotNull(state.getProp(APPLICATION_NAME),
                            "ApplicationName is required"))
                    .build();
            this.pageSize = state.getPropAsInt(PAGE_SIZE, DEFAULT_PAGE_SIZE);
        }
    }

    @Override
    public FSDataInputStream open(Path path, int bufferSize) throws IOException {
        return closer.register(new FSDataInputStream(new SeekableFSInputStream(new BufferedInputStream(
                client.files().get(toFileId(path)).executeMediaAsInputStream(), bufferSize))));
    }

    @Override
    public FSDataInputStream open(Path path) throws IOException {
        return closer.register(new FSDataInputStream(new SeekableFSInputStream(
                new BufferedInputStream(client.files().get(toFileId(path)).executeMediaAsInputStream()))));
    }

    @Override
    public boolean delete(Path path, boolean recursive) throws IOException {
        Preconditions.checkArgument(recursive, "Non-recursive is not supported.");
        String fileId = toFileId(path);
        LOG.debug("Deleting file: " + fileId);
        try {
            client.files().delete(fileId).execute();
        } catch (GoogleJsonResponseException e) {
            GoogleJsonError error = e.getDetails();
            if (404 == error.getCode()) { //Non-existing file id
                return false;
            }
            throw e;
        }
        return true;
    }

    @Override
    public FileStatus[] listStatus(Path path) throws FileNotFoundException, IOException {
        String folderId = toFileId(path);

        List<File> fileMetadata = lsFileMetadata(folderId, null);
        if (fileMetadata.isEmpty()) {
            throw new FileNotFoundException();
        }
        FileStatus[] statusArr = new FileStatus[fileMetadata.size()];
        int idx = 0;
        for (File metadata : fileMetadata) {
            FileStatus status = toFileStatus(metadata);
            statusArr[idx++] = status;
        }
        return statusArr;
    }

    private List<File> lsFileMetadata(String folderId, String fileId) throws IOException {
        String pageToken = null;
        List<File> result = new ArrayList<>();
        Optional<String> query = buildQuery(folderId, fileId);

        do {
            Drive.Files.List request = client.files().list()
                    .setFields("files/id,files/mimeType,files/modifiedTime,files/size,files/permissions")
                    .setPageSize(pageSize);
            if (query.isPresent()) {
                request = request.setQ(query.get());
            }

            if (pageToken != null) {
                request = request.setPageToken(pageToken);
            }

            LOG.info("Google drive List request: " + request);
            if (LOG.isDebugEnabled()) {
                LOG.debug("Google drive List request: " + request);
            }

            FileList fileList = null;
            try {
                fileList = request.execute();
            } catch (GoogleJsonResponseException e) {
                GoogleJsonError error = e.getDetails();
                if (404 == error.getCode()) {
                    throw new FileNotFoundException("File not found. Request: " + request);
                }
                throw e;
            }

            pageToken = fileList.getNextPageToken();

            List<File> files = fileList.getFiles();
            if (files == null || files.isEmpty()) {
                return result;
            }
            result.addAll(files);
        } while (pageToken != null);
        return result;
    }

    /**
     * Build query for Google drive.
     * @see https://developers.google.com/drive/v3/web/search-parameters
     *
     * @param folderId
     * @param fileName
     * @return Query
     */
    @VisibleForTesting
    Optional<String> buildQuery(String folderId, String fileName) {
        if (StringUtils.isEmpty(folderId) && StringUtils.isEmpty(fileName)) {
            return Optional.absent();
        }

        StringBuilder query = new StringBuilder();
        if (StringUtils.isNotEmpty(folderId)) {
            query.append("'").append(folderId).append("'").append(" in parents");
        }
        if (StringUtils.isNotEmpty(fileName)) {
            if (query.length() > 0) {
                query.append(" and ");
            }
            query.append("name contains ").append("'").append(fileName).append("'");
        }
        return Optional.of(query.toString());
    }

    /**
     * org.apache.hadoop.fs.Path assumes that there separator in file system naming and "/" is the separator.
     * When org.apache.hadoop.fs.Path sees "/" in path String, it splits into parent and name. As fileID is a random
     * String determined by Google and it can contain "/" itself, this method check if parent and name is separated and
     * restore "/" back to file ID.
     *
     * @param p
     * @return
     */
    public static String toFileId(Path p) {
        if (p.isRoot()) {
            return "";
        }
        final String format = "%s" + Path.SEPARATOR + "%s";
        if (p.getParent() != null && StringUtils.isEmpty(p.getParent().getName())) {
            return p.getName();
        }
        return String.format(format, toFileId(p.getParent()), p.getName());
    }

    @Override
    public void close() throws IOException {
        super.close();
        closer.close();
    }

    @Override
    public FileStatus getFileStatus(Path p) throws IOException {
        Preconditions.checkNotNull(p);
        String fileId = toFileId(p);
        File metadata = client.files().get(fileId).setFields("id,mimeType,modifiedTime,size,permissions").execute();
        return toFileStatus(metadata);
    }

    private FileStatus toFileStatus(File metadata) {
        return new FileStatus(metadata.getSize() == null ? 0L : metadata.getSize(),
                FOLDER_MIME_TYPE.equals(metadata.getMimeType()), -1, -1, metadata.getModifiedTime().getValue(),
                new Path(metadata.getId()));
    }

    //Below are unsupported methods

    @Override
    public void setWorkingDirectory(Path new_dir) {
        throw new UnsupportedOperationException();
    }

    @Override
    public Path getWorkingDirectory() {
        throw new UnsupportedOperationException();
    }

    @Override
    public boolean mkdirs(Path f, FsPermission permission) throws IOException {
        throw new UnsupportedOperationException();
    }

    @Override
    public FSDataOutputStream create(Path f, FsPermission permission, boolean overwrite, int bufferSize,
            short replication, long blockSize, Progressable progress) throws IOException {
        throw new UnsupportedOperationException();
    }

    @Override
    public FSDataOutputStream append(Path f, int bufferSize, Progressable progress) throws IOException {
        throw new UnsupportedOperationException();
    }

    @Override
    public boolean rename(Path src, Path dst) throws IOException {
        throw new UnsupportedOperationException();
    }

    @Override
    public URI getUri() {
        throw new UnsupportedOperationException();
    }
}