com.ebay.erl.mobius.core.mapred.MobiusMultiInputs.java Source code

Java tutorial

Introduction

Here is the source code for com.ebay.erl.mobius.core.mapred.MobiusMultiInputs.java

Source

package com.ebay.erl.mobius.core.mapred;

import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.lib.MobiusDelegatingInputFormat;
import org.apache.hadoop.mapred.lib.MultipleInputs;

import com.ebay.erl.mobius.core.ConfigureConstants;

/**
 * {@link MultipleInputs} uses the input path to find the corresponding 
 * Map class.  Mobius needs to find the {@link com.ebay.erl.mobius.core.criterion.TupleCriterion}, 
 * schema, or projection columns for a given {@link com.ebay.erl.mobius.core.builder.Dataset}, 
 * and two different {@link com.ebay.erl.mobius.core.builder.Dataset} elements in a Mobius job 
 * might be processed by the same type of {@link AbstractMobiusMapper}. For example, when joining 
 * two different CSV datasets, both {@link com.ebay.erl.mobius.core.builder.Dataset} elements are 
 * processed by {@link com.ebay.erl.mobius.core.mapred.TSVMapper}.
 * <p>
 * 
 * This class is used to build the path to dataset mapping, or vice versa, so
 * the mapper can use {@link com.ebay.erl.mobius.core.ConfigureConstants#CURRENT_DATASET_ID} to 
 * determine the projection columns, filters, and so on.
 * <p>
 * 
 * This class is used along with {@link MobiusDelegatingInputFormat}
 * 
 * <p>
 * This product is licensed under the Apache License,  Version 2.0, 
 * available at http://www.apache.org/licenses/LICENSE-2.0.
 * 
 * This product contains portions derived from Apache Hadoop which is 
 * licensed under the Apache License, Version 2.0, available at 
 * http://hadoop.apache.org.
 * 
 * © 2007 – 2012 eBay Inc., Evan Chiu, Woody Zhou, Neel Sundaresan
 * 
 * @see MobiusDelegatingInputFormat
 */
@SuppressWarnings({ "unchecked", "deprecation" })
public class MobiusMultiInputs {
    /**
     * Registers an input path with {@link MultipleInputs} and records which
     * dataset the path belongs to in the job configuration.
     * <p>
     * The mapping is stored under
     * {@link ConfigureConstants#INPUT_TO_DATASET_MAPPING} as a comma-separated
     * list of <code>datasetID;path</code> entries; the entry added by this call
     * is placed in front of any entries already present.
     *
     * @param conf             job configuration to update
     * @param anInput          input path to register
     * @param inputFormatClass {@link InputFormat} used to read <code>anInput</code>;
     *                         also selects the {@link MultiInputsHelper} that
     *                         produces the path recorded in the mapping
     * @param mapperClass      mapper class handed to {@link MultipleInputs} for this input
     * @param datasetID        ID of the dataset that <code>anInput</code> belongs to
     * @param fs               not used by this method
     * @throws IOException              declared for callers; may be raised by the
     *                                  helper lookup or path resolution
     * @throws IllegalArgumentException if the path produced by the helper contains
     *                                  a semicolon, which is the separator between
     *                                  dataset ID and path in the mapping
     */
    public static void addInputPath(JobConf conf, Path anInput, Class<? extends InputFormat> inputFormatClass,
            Class<? extends AbstractMobiusMapper> mapperClass, byte datasetID, FileSystem fs) throws IOException {
        MultipleInputs.addInputPath(conf, anInput, inputFormatClass, mapperClass);

        // MultipleInputs installs its own InputFormat; replace it, because
        // Mobius needs to set the current dataset ID per input split.
        conf.setInputFormat(MobiusDelegatingInputFormat.class);

        // Ask the helper registered for this input format for the path that
        // goes into the mapping.
        MultiInputsHelper helper = MultiInputsHelpersRepository.getInstance(conf).getHelper(inputFormatClass);
        String uniquePath = helper.getUniquePathByInputFormat(conf, anInput).toString();

        // ';' separates the dataset ID from the path inside an entry, so it
        // must not appear in the path itself.
        if (uniquePath.contains(";")) {
            throw new IllegalArgumentException(uniquePath + " cannot contains semicolon");
        }

        // Prepend the new entry to whatever mapping is already configured.
        String newEntry = datasetID + ";" + uniquePath;
        String existingMapping = conf.get(ConfigureConstants.INPUT_TO_DATASET_MAPPING, "");
        String updatedMapping = existingMapping.isEmpty() ? newEntry : newEntry + "," + existingMapping;
        conf.set(ConfigureConstants.INPUT_TO_DATASET_MAPPING, updatedMapping);
    }
}