com.conductor.s3.S3OptimizedFileInputFormatMRV1.java Source code

Introduction

Here is the source code for com.conductor.s3.S3OptimizedFileInputFormatMRV1.java, an abstract Hadoop MRv1 FileInputFormat that lists S3 input files with the AWS SDK client instead of the Hadoop S3 filesystem. A short usage sketch follows the listing.

Source

/**
 * Copyright 2015 Conductor, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 *
 */
package com.conductor.s3;

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.s3native.S3NativeFileSystemConfigKeys;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.mapreduce.Job;

import com.amazonaws.services.s3.AmazonS3;
import com.google.common.collect.Lists;

/**
 * A {@link FileInputFormat} (MRV1 API) that is optimized for S3-based input, and supports recursive discovery of input
 * files given a single parent directory.
 * <p>
 * Job start-up time is much faster because this input format uses the {@link com.amazonaws.services.s3.AmazonS3} client
 * to discover job input files rather than the {@link org.apache.hadoop.fs.s3.S3FileSystem}.
 * <p>
 * This {@link FileInputFormat} supports adding just the top-level "directory" (i.e. a single S3 prefix) as file input;
 * it will recursively discover all files under the given prefix. This is <em>much</em> faster than adding individual
 * files to the job.
 * 
 * @author cgreen
 * @see S3SequenceFileInputFormatMRV1
 * @see S3TextFileInputFormatMRV1
 */
public abstract class S3OptimizedFileInputFormatMRV1<K, V> extends FileInputFormat<K, V> {

    /**
     * Lists the job's input files by querying S3 directly through the {@link AmazonS3} client rather than the
     * Hadoop {@code FileSystem} API; all files under each input path are discovered recursively.
     */
    @Override
    protected FileStatus[] listStatus(final JobConf job) throws IOException {
        final Path[] dirs = getInputPaths(job);
        if (dirs.length == 0) {
            throw new IOException("No input paths specified in job");
        }
        // Use the configured native-S3 block size when constructing FileStatus entries for the discovered keys.
        final long blockSize = job.getLong(S3NativeFileSystemConfigKeys.S3_NATIVE_BLOCK_SIZE_KEY,
                S3NativeFileSystemConfigKeys.S3_NATIVE_BLOCK_SIZE_DEFAULT);
        final AmazonS3 s3Client = S3HadoopUtils.getS3Client(job);
        final List<FileStatus> result = S3InputFormatUtils.getFileStatuses(s3Client, blockSize, dirs);
        return result.toArray(new FileStatus[result.size()]);
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public InputSplit[] getSplits(final JobConf job, final int numSplits) throws IOException {
        long minSize = getMinSplitSize(job);
        long maxSize = getMaxSplitSize(job);
        final List<FileStatus> fileStatuses = Lists.newArrayList(listStatus(job));
        final List<InputSplit> splits = S3InputFormatUtils.convertToInputSplitsMRV1(fileStatuses, minSize, maxSize);

        // Save the number of input files in the job-conf
        job.setLong("mapreduce.input.num.files", fileStatuses.size());

        return splits.toArray(new InputSplit[splits.size()]);
    }

    /**
     * Set the minimum input split size
     *
     * @param job
     *            the job to modify
     * @param size
     *            the minimum size
     */
    public static void setMinInputSplitSize(Job job, long size) {
        job.getConfiguration().setLong("mapred.min.split.size", size);
    }

    /**
     * Get the minimum split size
     *
     * @param job
     *            the job
     * @return the minimum number of bytes that can be in a split
     */
    public static long getMinSplitSize(JobConf job) {
        return job.getLong("mapred.min.split.size", 1L);
    }

    /**
     * Set the maximum split size
     *
     * @param job
     *            the job to modify
     * @param size
     *            the maximum split size
     */
    public static void setMaxInputSplitSize(Job job, long size) {
        job.getConfiguration().setLong("mapred.max.split.size", size);
    }

    /**
     * Get the maximum split size.
     *
     * @param job
     *            the job
     * @return the maximum number of bytes a split can include
     */
    public static long getMaxSplitSize(JobConf job) {
        return job.getLong("mapred.max.split.size", Long.MAX_VALUE);
    }
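
    // Usage note (illustrative, not part of the original source): getSplits() reads these two properties
    // from the MRv1 JobConf, so a driver that holds a JobConf can set them directly, e.g.:
    //
    //     conf.setLong("mapred.min.split.size", 64L * 1024 * 1024);   // 64 MB floor
    //     conf.setLong("mapred.max.split.size", 256L * 1024 * 1024);  // 256 MB ceiling
    //
    // The static setters above instead take the new-API org.apache.hadoop.mapreduce.Job.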

}
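
Usage

The class is abstract: a job supplies only the record-reading half, while file listing and split generation come from the code above. Below is a minimal sketch, not part of the Conductor source; the subclass name MyS3TextInputFormat, the bucket s3n://example-bucket/logs, and the output path are illustrative assumptions, and AWS credentials are expected to be resolvable by S3HadoopUtils.getS3Client (configuration not shown).

package com.example;

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;

import com.conductor.s3.S3OptimizedFileInputFormatMRV1;

/** Hypothetical concrete subclass: only the record reader needs to be supplied. */
public class MyS3TextInputFormat extends S3OptimizedFileInputFormatMRV1<LongWritable, Text> {

    @Override
    public RecordReader<LongWritable, Text> getRecordReader(final InputSplit split, final JobConf job,
            final Reporter reporter) throws IOException {
        reporter.setStatus(split.toString());
        // Splits produced by getSplits() above are FileSplits, so the stock line reader applies.
        return new LineRecordReader(job, (FileSplit) split);
    }

    /** Illustrative driver: a single parent prefix is enough; files beneath it are found recursively. */
    public static void main(final String[] args) throws IOException {
        final JobConf conf = new JobConf(MyS3TextInputFormat.class);
        conf.setJobName("s3-optimized-input-example");
        conf.setInputFormat(MyS3TextInputFormat.class);
        FileInputFormat.setInputPaths(conf, new Path("s3n://example-bucket/logs")); // hypothetical bucket
        FileOutputFormat.setOutputPath(conf, new Path(args[0]));
        JobClient.runJob(conf); // identity mapper/reducer by default; set real ones for a real job
    }
}

Note the design split this makes visible: the parent class owns everything S3-specific (listing via the AmazonS3 client, FileStatus construction, split sizing), so concrete subclasses differ only in how records are decoded from each split.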