Java tutorial: Cascading's MultiInputFormat (Hadoop mapred API)
/*
 * Copyright (c) 2007-2015 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cascading.tap.hadoop.io;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;

import cascading.CascadingException;
import cascading.flow.hadoop.util.HadoopUtil;
import cascading.util.Util;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.jets3t.service.S3ServiceException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Class MultiInputFormat accepts multiple InputFormat class declarations allowing a single MR job
 * to read data from incompatible file types.
 */
public class MultiInputFormat implements InputFormat {
  /** Field LOG */
  private static final Logger LOG = LoggerFactory.getLogger(MultiInputFormat.class);

  /**
   * Used to set the current JobConf with all sub jobs configurations.
   *
   * @param toJob
   * @param fromJobs
   */
  public static void addInputFormat(JobConf toJob, JobConf... fromJobs) {
    toJob.setInputFormat(MultiInputFormat.class);
    List<Map<String, String>> configs = new ArrayList<Map<String, String>>();
    List<Path> allPaths = new ArrayList<Path>();

    boolean isLocal = false;

    for (JobConf fromJob : fromJobs) {
      if (fromJob.get("mapred.input.format.class") == null)
        throw new CascadingException("mapred.input.format.class is required, should be set in source Scheme#sourceConfInit");

      configs.add(HadoopUtil.getConfig(toJob, fromJob));
      Collections.addAll(allPaths, FileInputFormat.getInputPaths(fromJob));

      if (!isLocal)
        isLocal = HadoopUtil.isLocal(fromJob);
    }

    if (!allPaths.isEmpty()) // it's possible there aren't any
      FileInputFormat.setInputPaths(toJob, (Path[]) allPaths.toArray(new Path[allPaths.size()]));

    try {
      toJob.set("cascading.multiinputformats", HadoopUtil.serializeBase64(configs, toJob, true));
    } catch (IOException exception) {
      throw new CascadingException("unable to pack input formats", exception);
    }

    if (isLocal)
      HadoopUtil.setLocal(toJob);
  }

  static InputFormat[] getInputFormats(JobConf[] jobConfs) {
    InputFormat[] inputFormats = new InputFormat[jobConfs.length];

    for (int i = 0; i < jobConfs.length; i++)
      inputFormats[i] = jobConfs[i].getInputFormat();

    return inputFormats;
  }

  private List<Map<String, String>> getConfigs(JobConf job) throws IOException {
    return (List<Map<String, String>>) HadoopUtil.deserializeBase64(job.get("cascading.multiinputformats"), job, ArrayList.class, true);
  }

  public void validateInput(JobConf job) throws IOException {
    // do nothing, is deprecated
  }
  /**
   * Method getSplits delegates to the appropriate InputFormat.
   *
   * @param job       of type JobConf
   * @param numSplits of type int
   * @return InputSplit[]
   * @throws IOException when
   */
  public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    numSplits = numSplits == 0 ? 1 : numSplits;

    List<Map<String, String>> configs = getConfigs(job);
    JobConf[] jobConfs = HadoopUtil.getJobConfs(job, configs);
    InputFormat[] inputFormats = getInputFormats(jobConfs);

    // if only one InputFormat, just return whatever it suggests
    if (inputFormats.length == 1)
      return collapse(getSplits(inputFormats, jobConfs, new int[]{numSplits}), configs);

    int[] indexedSplits = new int[inputFormats.length];

    // if we need only a few, then return one for each
    if (numSplits <= inputFormats.length) {
      Arrays.fill(indexedSplits, 1);
      return collapse(getSplits(inputFormats, jobConfs, indexedSplits), configs);
    }

    // attempt to get splits proportionally sized per input format
    long[] inputSplitSizes = getInputSplitSizes(inputFormats, jobConfs, numSplits);
    long totalSplitSize = sum(inputSplitSizes);

    if (totalSplitSize == 0) {
      Arrays.fill(indexedSplits, 1);
      return collapse(getSplits(inputFormats, jobConfs, indexedSplits), configs);
    }

    for (int i = 0; i < inputSplitSizes.length; i++) {
      int useSplits = (int) Math.ceil((double) numSplits * inputSplitSizes[i] / (double) totalSplitSize);
      indexedSplits[i] = useSplits == 0 ? 1 : useSplits;
    }

    return collapse(getSplits(inputFormats, jobConfs, indexedSplits), configs);
  }

  private long sum(long[] inputSizes) {
    long size = 0;

    for (long inputSize : inputSizes)
      size += inputSize;

    return size;
  }

  private InputSplit[] collapse(InputSplit[][] splits, List<Map<String, String>> configs) {
    List<InputSplit> splitsList = new ArrayList<InputSplit>();

    for (int i = 0; i < splits.length; i++) {
      Map<String, String> config = configs.get(i);

      config.remove("mapred.input.dir"); // this is a redundant value, will show up cluster side
      config.remove("mapreduce.input.fileinputformat.inputdir"); // hadoop2

      InputSplit[] split = splits[i];

      for (int j = 0; j < split.length; j++)
        splitsList.add(new MultiInputSplit(split[j], config));
    }

    return splitsList.toArray(new InputSplit[splitsList.size()]);
  }

  private InputSplit[][] getSplits(InputFormat[] inputFormats, JobConf[] jobConfs, int[] numSplits) throws IOException {
    InputSplit[][] inputSplits = new InputSplit[inputFormats.length][];

    for (int i = 0; i < inputFormats.length; i++) {
      inputSplits[i] = inputFormats[i].getSplits(jobConfs[i], numSplits[i]);

      // it's reasonable the split array is empty, but really shouldn't be null
      if (inputSplits[i] == null)
        inputSplits[i] = new InputSplit[0];

      for (int j = 0; j < inputSplits[i].length; j++) {
        if (inputSplits[i][j] == null)
          throw new IllegalStateException("input format: " + inputFormats[i].getClass().getName() + ", returned a split array with nulls");
      }
    }

    return inputSplits;
  }

  private long[] getInputSplitSizes(InputFormat[] inputFormats, JobConf[] jobConfs, int numSplits) throws IOException {
    long[] inputSizes = new long[inputFormats.length];

    for (int i = 0; i < inputFormats.length; i++) {
      InputFormat inputFormat = inputFormats[i];
      InputSplit[] splits = inputFormat.getSplits(jobConfs[i], numSplits);

      inputSizes[i] = splits.length;
    }

    return inputSizes;
  }
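  /*
   * Illustrative arithmetic for the proportional allocation in getSplits above.
   * The numbers are assumptions, not from the original source: if numSplits is 10
   * and getInputSplitSizes reports [6, 3, 1] splits per format, totalSplitSize is 10,
   * so the three formats are asked for ceil(10 * 6 / 10) = 6, ceil(10 * 3 / 10) = 3
   * and ceil(10 * 1 / 10) = 1 splits respectively; a format whose share works out
   * to 0 is still asked for at least 1 split.
   */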
  /**
   * Method getRecordReader delegates to the appropriate InputFormat.
   *
   * @param split    of type InputSplit
   * @param job      of type JobConf
   * @param reporter of type Reporter
   * @return RecordReader
   * @throws IOException when
   */
  public RecordReader getRecordReader(InputSplit split, JobConf job, final Reporter reporter) throws IOException {
    final MultiInputSplit multiSplit = (MultiInputSplit) split;
    final JobConf currentConf = HadoopUtil.mergeConf(job, multiSplit.config, true);

    try {
      return Util.retry(LOG, 3, 20, "unable to get record reader", new Util.RetryOperator<RecordReader>() {
        @Override
        public RecordReader operate() throws Exception {
          return currentConf.getInputFormat().getRecordReader(multiSplit.inputSplit, currentConf, reporter);
        }

        @Override
        public boolean rethrow(Exception exception) {
          return !(exception.getCause() instanceof S3ServiceException);
        }
      });
    } catch (Exception exception) {
      if (exception instanceof RuntimeException)
        throw (RuntimeException) exception;
      else
        throw (IOException) exception;
    }
  }
}
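The sketch below shows how the class above might be wired into a job by hand, assuming two sources with incompatible file types (a TextInputFormat directory and a SequenceFileInputFormat directory). The MultiInputFormatUsage class, the input paths, and the choice of formats are illustrative assumptions; in Cascading this wiring is normally performed by the source Scheme and the flow planner rather than by application code.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.TextInputFormat;

import cascading.tap.hadoop.io.MultiInputFormat;

public class MultiInputFormatUsage {
  public static void main(String[] args) {
    // The job that will actually run; addInputFormat makes MultiInputFormat its InputFormat.
    JobConf mainJob = new JobConf();

    // One "sub job" per source; each must declare its own InputFormat,
    // otherwise addInputFormat throws (mapred.input.format.class is required).
    JobConf textJob = new JobConf(mainJob);
    textJob.setInputFormat(TextInputFormat.class);
    FileInputFormat.setInputPaths(textJob, new Path("/data/plain")); // hypothetical path

    JobConf seqJob = new JobConf(mainJob);
    seqJob.setInputFormat(SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(seqJob, new Path("/data/sequence")); // hypothetical path

    // Pack both configurations into the main job; at runtime MultiInputFormat
    // deserializes them and delegates getSplits/getRecordReader to each source's format.
    MultiInputFormat.addInputFormat(mainJob, textJob, seqJob);

    // ... configure mapper/reducer/output and submit mainJob as usual ...
  }
}

Each resulting MultiInputSplit carries only the configuration diff for its own source, which is how a single map task ends up instantiating the correct underlying InputFormat and RecordReader for the data it reads.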