io.druid.storage.s3.S3DataSegmentPuller.java Source code

Java tutorial

Introduction

Here is the source code for io.druid.storage.s3.S3DataSegmentPuller.java

Source

/*
 * Druid - a distributed column store.
 * Copyright 2012 - 2015 Metamarkets Group Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.druid.storage.s3;

import com.google.common.base.Predicate;
import com.google.common.base.Strings;
import com.google.common.base.Throwables;
import com.google.common.io.ByteSource;
import com.google.common.io.Files;
import com.google.inject.Inject;
import com.metamx.common.CompressionUtils;
import com.metamx.common.FileUtils;
import com.metamx.common.IAE;
import com.metamx.common.ISE;
import com.metamx.common.MapUtils;
import com.metamx.common.UOE;
import com.metamx.common.logger.Logger;
import io.druid.segment.loading.DataSegmentPuller;
import io.druid.segment.loading.SegmentLoadingException;
import io.druid.segment.loading.URIDataPuller;
import io.druid.timeline.DataSegment;
import org.jets3t.service.S3ServiceException;
import org.jets3t.service.ServiceException;
import org.jets3t.service.impl.rest.httpclient.RestS3Service;
import org.jets3t.service.model.S3Object;

import javax.tools.FileObject;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.Reader;
import java.io.Writer;
import java.net.URI;
import java.util.Map;
import java.util.concurrent.Callable;

/**
 * A data segment puller that also hanldes URI data pulls.
 */
public class S3DataSegmentPuller implements DataSegmentPuller, URIDataPuller {
    public static final int DEFAULT_RETRY_COUNT = 3;

    public static FileObject buildFileObject(final URI uri, final RestS3Service s3Client)
            throws S3ServiceException {
        final S3Coords coords = new S3Coords(checkURI(uri));
        final S3Object s3Obj = s3Client.getObject(coords.bucket, coords.path);
        final String path = uri.getPath();

        return new FileObject() {
            volatile boolean streamAcquired = false;

            @Override
            public URI toUri() {
                return uri;
            }

            @Override
            public String getName() {
                final String ext = Files.getFileExtension(path);
                return Files.getNameWithoutExtension(path) + (Strings.isNullOrEmpty(ext) ? "" : ("." + ext));
            }

            @Override
            public InputStream openInputStream() throws IOException {
                try {
                    streamAcquired = true;
                    return s3Obj.getDataInputStream();
                } catch (ServiceException e) {
                    throw new IOException(String.format("Could not load S3 URI [%s]", uri), e);
                }
            }

            @Override
            public OutputStream openOutputStream() throws IOException {
                throw new UOE("Cannot stream S3 output");
            }

            @Override
            public Reader openReader(boolean ignoreEncodingErrors) throws IOException {
                throw new UOE("Cannot open reader");
            }

            @Override
            public CharSequence getCharContent(boolean ignoreEncodingErrors) throws IOException {
                throw new UOE("Cannot open character sequence");
            }

            @Override
            public Writer openWriter() throws IOException {
                throw new UOE("Cannot open writer");
            }

            @Override
            public long getLastModified() {
                return s3Obj.getLastModifiedDate().getTime();
            }

            @Override
            public boolean delete() {
                throw new UOE(
                        "Cannot delete S3 items anonymously. jetS3t doesn't support authenticated deletes easily.");
            }

            @Override
            public void finalize() throws Throwable {
                try {
                    if (!streamAcquired) {
                        s3Obj.closeDataInputStream();
                    }
                } finally {
                    super.finalize();
                }
            }
        };
    }

    public static final String scheme = S3StorageDruidModule.SCHEME;

    private static final Logger log = new Logger(S3DataSegmentPuller.class);

    protected static final String BUCKET = "bucket";
    protected static final String KEY = "key";

    protected final RestS3Service s3Client;

    @Inject
    public S3DataSegmentPuller(RestS3Service s3Client) {
        this.s3Client = s3Client;
    }

    @Override
    public void getSegmentFiles(final DataSegment segment, final File outDir) throws SegmentLoadingException {
        getSegmentFiles(new S3Coords(segment), outDir);
    }

    public FileUtils.FileCopyResult getSegmentFiles(final S3Coords s3Coords, final File outDir)
            throws SegmentLoadingException {

        log.info("Pulling index at path[%s] to outDir[%s]", s3Coords, outDir);

        if (!isObjectInBucket(s3Coords)) {
            throw new SegmentLoadingException("IndexFile[%s] does not exist.", s3Coords);
        }

        if (!outDir.exists()) {
            outDir.mkdirs();
        }

        if (!outDir.isDirectory()) {
            throw new ISE("outDir[%s] must be a directory.", outDir);
        }

        try {
            final URI uri = URI.create(String.format("s3://%s/%s", s3Coords.bucket, s3Coords.path));
            final ByteSource byteSource = new ByteSource() {
                @Override
                public InputStream openStream() throws IOException {
                    try {
                        return buildFileObject(uri, s3Client).openInputStream();
                    } catch (ServiceException e) {
                        if (e.getCause() != null) {
                            if (S3Utils.S3RETRY.apply(e)) {
                                throw new IOException("Recoverable exception", e);
                            }
                        }
                        throw Throwables.propagate(e);
                    }
                }
            };
            if (CompressionUtils.isZip(s3Coords.path)) {
                final FileUtils.FileCopyResult result = CompressionUtils.unzip(byteSource, outDir, S3Utils.S3RETRY,
                        true);
                log.info("Loaded %d bytes from [%s] to [%s]", result.size(), s3Coords.toString(),
                        outDir.getAbsolutePath());
                return result;
            }
            if (CompressionUtils.isGz(s3Coords.path)) {
                final String fname = Files.getNameWithoutExtension(uri.getPath());
                final File outFile = new File(outDir, fname);

                final FileUtils.FileCopyResult result = CompressionUtils.gunzip(byteSource, outFile);
                log.info("Loaded %d bytes from [%s] to [%s]", result.size(), s3Coords.toString(),
                        outFile.getAbsolutePath());
                return result;
            }
            throw new IAE("Do not know how to load file type at [%s]", uri.toString());
        } catch (Exception e) {
            try {
                org.apache.commons.io.FileUtils.deleteDirectory(outDir);
            } catch (IOException ioe) {
                log.warn(ioe, "Failed to remove output directory [%s] for segment pulled from [%s]",
                        outDir.getAbsolutePath(), s3Coords.toString());
            }
            throw new SegmentLoadingException(e, e.getMessage());
        }
    }

    public static URI checkURI(URI uri) {
        if (uri.getScheme().equalsIgnoreCase(scheme)) {
            uri = URI.create("s3" + uri.toString().substring(scheme.length()));
        } else if (!uri.getScheme().equalsIgnoreCase("s3")) {
            throw new IAE("Don't know how to load scheme for URI [%s]", uri.toString());
        }
        return uri;
    }

    @Override
    public InputStream getInputStream(URI uri) throws IOException {
        try {
            return buildFileObject(uri, s3Client).openInputStream();
        } catch (ServiceException e) {
            throw new IOException(String.format("Could not load URI [%s]", uri.toString()), e);
        }
    }

    @Override
    public Predicate<Throwable> shouldRetryPredicate() {
        // Yay! smart retries!
        return new Predicate<Throwable>() {
            @Override
            public boolean apply(Throwable e) {
                if (e == null) {
                    return false;
                }
                if (e instanceof ServiceException) {
                    return S3Utils.isServiceExceptionRecoverable((ServiceException) e);
                }
                if (S3Utils.S3RETRY.apply(e)) {
                    return true;
                }
                // Look all the way down the cause chain, just in case something wraps it deep.
                return apply(e.getCause());
            }
        };
    }

    /**
     * Returns the "version" (aka last modified timestamp) of the URI
     *
     * @param uri The URI to check the last timestamp
     *
     * @return The time in ms of the last modification of the URI in String format
     *
     * @throws IOException
     */
    @Override
    public String getVersion(URI uri) throws IOException {
        try {
            final FileObject object = buildFileObject(uri, s3Client);
            // buildFileObject has a hidden input stream that gets open deep in jets3t. This helps prevent resource leaks
            try (InputStream nullStream = object.openInputStream()) {
                return String.format("%d", object.getLastModified());
            }
        } catch (S3ServiceException e) {
            if (S3Utils.isServiceExceptionRecoverable(e)) {
                // The recoverable logic is always true for IOException, so we want to only pass IOException if it is recoverable
                throw new IOException(
                        String.format("Could not fetch last modified timestamp from URI [%s]", uri.toString()), e);
            } else {
                throw Throwables.propagate(e);
            }
        }
    }

    private String toFilename(String key, final String suffix) {
        String filename = key.substring(key.lastIndexOf("/") + 1); // characters after last '/'
        filename = filename.substring(0, filename.length() - suffix.length()); // remove the suffix from the end
        return filename;
    }

    private boolean isObjectInBucket(final S3Coords coords) throws SegmentLoadingException {
        try {
            return S3Utils.retryS3Operation(new Callable<Boolean>() {
                @Override
                public Boolean call() throws Exception {
                    return S3Utils.isObjectInBucket(s3Client, coords.bucket, coords.path);
                }
            });
        } catch (S3ServiceException | IOException e) {
            throw new SegmentLoadingException(e, "S3 fail! Key[%s]", coords);
        } catch (Exception e) {
            throw Throwables.propagate(e);
        }
    }

    protected static class S3Coords {
        String bucket;
        String path;

        public S3Coords(URI uri) {
            if (!"s3".equalsIgnoreCase(uri.getScheme())) {
                throw new IAE("Unsupported scheme: [%s]", uri.getScheme());
            }
            bucket = uri.getHost();
            String path = uri.getPath();
            if (path.startsWith("/")) {
                path = path.substring(1);
            }
            this.path = path;
        }

        public S3Coords(DataSegment segment) {
            Map<String, Object> loadSpec = segment.getLoadSpec();
            bucket = MapUtils.getString(loadSpec, BUCKET);
            path = MapUtils.getString(loadSpec, KEY);
            if (path.startsWith("/")) {
                path = path.substring(1);
            }
        }

        public S3Coords(String bucket, String key) {
            this.bucket = bucket;
            this.path = key;
        }

        public String toString() {
            return String.format("s3://%s/%s", bucket, path);
        }
    }
}