org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigInputSplitFormat.java Source code


Introduction

Here is the source code for org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigInputSplitFormat.java. The class extends Hadoop's FileInputFormat to produce PigSplit instances for a Pig job's inputs; its getSplits() implementation delegates to a sampling method, getSplitsSample(), which selects a subset of splits according to a configured sampling rate and a set of candidate split sizes.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pig.backend.hadoop.executionengine.mapReduceLayer;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Random;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.pig.CollectableLoadFunc;
import org.apache.pig.FuncSpec;
import org.apache.pig.IndexableLoadFunc;
import org.apache.pig.LoadFunc;
import org.apache.pig.OrderedLoadFunc;
import org.apache.pig.PigConfiguration;
import org.apache.pig.PigException;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.datastorage.ConfigurationUtil;
import org.apache.pig.backend.hadoop.executionengine.shims.HadoopShims;
import org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.PigContext;
import org.apache.pig.impl.io.FileSpec;
import org.apache.pig.impl.plan.OperatorKey;
import org.apache.pig.impl.util.ObjectSerializer;
import org.apache.pig.impl.util.UDFContext;
import org.apache.pig.impl.util.Utils;

public class PigInputSplitFormat extends FileInputFormat<Text, Tuple> {

    public static final Log log = LogFactory.getLog(PigInputSplitFormat.class);

    public static final String PIG_INPUTS = "pig.inputs";

    /**
     * @deprecated Use {@link UDFContext} instead in the following way to get
     *             the job's {@link Configuration}:
     * 
     *             <pre>
 * UDFContext.getUDFContext().getJobConf()
     * </pre>
     */
    @Deprecated
    public static Configuration sJob;

    /*
     * (non-Javadoc)
     * 
     * @see
     * org.apache.hadoop.mapreduce.InputFormat#createRecordReader(org.apache
     * .hadoop.mapreduce.InputSplit,
     * org.apache.hadoop.mapreduce.TaskAttemptContext)
     */
    @SuppressWarnings({ "rawtypes", "unchecked" })
    @Override
    public org.apache.hadoop.mapreduce.RecordReader<Text, Tuple> createRecordReader(
            org.apache.hadoop.mapreduce.InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        // We need to create a TaskAttemptContext based on the Configuration
        // which was used in the getSplits() call that produced the split
        // supplied here. For this, find the input of the script which produced
        // the split, get the corresponding Configuration, set up a
        // TaskAttemptContext based on it, and then call the real InputFormat's
        // createRecordReader() method.

        PigSplit pigSplit = (PigSplit) split;
        activeSplit = pigSplit;
        // XXX hadoop 20 new API integration: get around a hadoop 20 bug by
        // passing total # of splits to each split so it can be retrieved
        // here and set it to the configuration object. This number is needed
        // by PoissonSampleLoader to compute the number of samples
        int n = pigSplit.getTotalSplits();
        context.getConfiguration().setInt("pig.mapsplits.count", n);
        Configuration conf = context.getConfiguration();
        LoadFunc loadFunc = getLoadFunc(pigSplit.getInputIndex(), conf);
        // Pass loader signature to LoadFunc and to InputFormat through
        // the conf
        passLoadSignature(loadFunc, pigSplit.getInputIndex(), conf);

        // merge entries from split specific conf into the conf we got
        PigInputSplitFormat.mergeSplitSpecificConf(loadFunc, pigSplit, conf);

        // for backward compatibility
        PigInputSplitFormat.sJob = conf;

        InputFormat inputFormat = loadFunc.getInputFormat();

        List<Long> inpLimitLists = (ArrayList<Long>) ObjectSerializer.deserialize(conf.get("pig.inpLimits"));

        return new PigRecordReader(inputFormat, pigSplit, loadFunc, context,
                inpLimitLists.get(pigSplit.getInputIndex()));
    }

    /**
     * Get the configuration corresponding to the input on which the split is
     * based and merge it with the Configuration supplied.
     *
     * Package-level access so that this is not used publicly elsewhere.
     *
     * @throws IOException
     */
    static void mergeSplitSpecificConf(LoadFunc loadFunc, PigSplit pigSplit, Configuration originalConf)
            throws IOException {
        // set up conf with entries from input specific conf
        Job job = new Job(originalConf);
        loadFunc.setLocation(getLoadLocation(pigSplit.getInputIndex(), originalConf), job);
        // The above setLocation call could write to the conf within
        // the job - merge that updated conf with original conf
        ConfigurationUtil.mergeConf(originalConf, job.getConfiguration());

    }

    /**
     * @param inputIndex index of the input whose LoadFunc is required
     * @param conf the Configuration carrying the serialized pig.inputs list
     * @return the LoadFunc instantiated from that input's FuncSpec
     * @throws IOException if the inputs cannot be deserialized
     */
    @SuppressWarnings("unchecked")
    private static LoadFunc getLoadFunc(int inputIndex, Configuration conf) throws IOException {
        ArrayList<FileSpec> inputs = (ArrayList<FileSpec>) ObjectSerializer.deserialize(conf.get(PIG_INPUTS));
        FuncSpec loadFuncSpec = inputs.get(inputIndex).getFuncSpec();
        return (LoadFunc) PigContext.instantiateFuncFromSpec(loadFuncSpec);
    }

    @SuppressWarnings("unchecked")
    private static String getLoadLocation(int inputIndex, Configuration conf) throws IOException {
        ArrayList<FileSpec> inputs = (ArrayList<FileSpec>) ObjectSerializer.deserialize(conf.get(PIG_INPUTS));
        return inputs.get(inputIndex).getFileName();
    }

    /**
     * Pass loader signature to LoadFunc and to InputFormat through the conf
     * 
     * @param loadFunc
     *            the Loadfunc to set the signature on
     * @param inputIndex
     *            the index of the input corresponding to the loadfunc
     * @param conf
     *            the Configuration object into which the signature should be
     *            set
     * @throws IOException
     *             on failure
     */
    @SuppressWarnings("unchecked")
    static void passLoadSignature(LoadFunc loadFunc, int inputIndex, Configuration conf) throws IOException {
        List<String> inpSignatureLists = (ArrayList<String>) ObjectSerializer
                .deserialize(conf.get("pig.inpSignatures"));
        // signature can be null for intermediate jobs where it will not
        // be required to be passed down
        if (inpSignatureLists.get(inputIndex) != null) {
            loadFunc.setUDFContextSignature(inpSignatureLists.get(inputIndex));
            conf.set("pig.loader.signature", inpSignatureLists.get(inputIndex));
        }

        MapRedUtil.setupUDFContext(conf);
    }

    private static final double SPLIT_SLOP = 1.1;

    protected Random random = new Random();

    protected void addSplit(List<InputSplit> splits, FileSplit inputSplit) {
        // if (random.nextInt(limr) <= threshold) {
        splits.add(inputSplit);
        // }
    }

    /**
     * Returns a sampled subset of the input splits, built from the configured
     * split sizes and the sampling rate.
     * @param job the job context
     * @return the sampled list of input splits
     * @throws IOException
     */
    public List<InputSplit> getSplitsSample(JobContext job) throws IOException {
        long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
        long maxSize = getMaxSplitSize(job);
        float rate = job.getConfiguration().getFloat(PigConfiguration.PIG_H2IRG_ROLLUP_RATE, 0);
        double maximumSamplingSize = 0;
        String inputFile = job.getConfiguration().get("pig.input.dirs", "");
        if (!inputFile.isEmpty()) {
            Path pPivot = new Path(inputFile);
            FileSystem fs = FileSystem.get(job.getConfiguration());
            FileStatus stt = fs.getFileStatus(pPivot);
            long fileLength = stt.getLen();
            maximumSamplingSize = fileLength * rate;
        }

        ArrayList<ArrayList<InputSplit>> splitArray = null;

        int noOfSizes = 0;
        String[] splitSizes = null;
        long[] preVariableSizes = null;
        String variableSplit = job.getConfiguration().get("pig.h2irg.rollup.variablesplit", "");
        if (variableSplit.equals("")) {
            noOfSizes = 4;
            preVariableSizes = new long[4];
            preVariableSizes[0] = 256;
            preVariableSizes[1] = 512;
            preVariableSizes[2] = 1024;
            preVariableSizes[3] = 2048;
        } else {
            splitSizes = variableSplit.split(",");
            noOfSizes = splitSizes.length;
            preVariableSizes = new long[noOfSizes];
            for (int i = 0; i < noOfSizes; i++) {
                preVariableSizes[i] = Long.parseLong(splitSizes[i]);
            }
        }
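        // The sizes above, whether the defaults or those parsed from
        // pig.h2irg.rollup.variablesplit, are expressed in KB; the loop below converts them to bytes.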

        long[] postVariableSizes = new long[noOfSizes];
        long oneKB = 1024;
        for (int i = 0; i < noOfSizes; i++) {
            postVariableSizes[i] = preVariableSizes[i] * oneKB;
        }

        splitArray = new ArrayList<ArrayList<InputSplit>>();
        for (int i = 0; i < noOfSizes + 1; i++) {
            ArrayList<InputSplit> single = new ArrayList<InputSplit>();
            splitArray.add(single);
        }
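        // splitArray holds one list of splits per configured size, plus a final list that
        // collects remainder, non-splittable, and empty-file splits (that final list is not
        // sampled in the loops further below).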

        for (FileStatus file : listStatus(job)) {
            Path path = file.getPath();
            FileSystem fs = path.getFileSystem(job.getConfiguration());
            long length = file.getLen();
            BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
            if ((length != 0) && isSplitable(job, path)) {
                long bytesRemaining = length;
                long totalBytes = 0;
                int count = noOfSizes;
                for (int i = 0; i < noOfSizes; i++) {
                    while (totalBytes < length / noOfSizes) {
                        int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                        addSplit(splitArray.get(i), new FileSplit(path, length - bytesRemaining,
                                postVariableSizes[i], blkLocations[blkIndex].getHosts()));
                        bytesRemaining -= postVariableSizes[i];
                        totalBytes += postVariableSizes[i];
                    }
                    count--;
                    bytesRemaining = count * (length / noOfSizes);
                    totalBytes = 0;
                }

                if (bytesRemaining != 0) {
                    addSplit(splitArray.get(splitArray.size() - 1), new FileSplit(path, length - bytesRemaining,
                            bytesRemaining, blkLocations[blkLocations.length - 1].getHosts()));
                }
            } else if (length != 0) {
                addSplit(splitArray.get(splitArray.size() - 1),
                        new FileSplit(path, 0, length, blkLocations[0].getHosts()));
            } else {
                addSplit(splitArray.get(splitArray.size() - 1), new FileSplit(path, 0, length, new String[0]));
            }
        }

        for (int i = 0; i < noOfSizes; i++) {
            log.info("Total # of " + postVariableSizes[i] + " splits: " + splitArray.get(i).size());
            Collections.shuffle(splitArray.get(i));
        }

        List<InputSplit> splitsReturn = new ArrayList<InputSplit>();
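        // From each shuffled size bucket, keep the first ceil(rate * bucketSize) splits,
        // with a minimum of one split per bucket.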

        for (int i = 0; i < noOfSizes; i++) {
            int noSampleSplit = (int) Math.ceil(rate * splitArray.get(i).size());
            if (noSampleSplit == 0)
                noSampleSplit = 1;
            for (int j = 0; j < noSampleSplit; j++) {
                splitsReturn.add(splitArray.get(i).get(j));
            }
        }

        log.info("Total # of sampling splits: " + splitsReturn.size());

        return splitsReturn;
    }

    /*
     * (non-Javadoc)
     * 
     * @see
     * org.apache.hadoop.mapreduce.InputFormat#getSplits(org.apache.hadoop.mapreduce
     * .JobContext)
     */
    @SuppressWarnings({ "unchecked", "rawtypes" })
    @Override
    public List<InputSplit> getSplits(JobContext jobcontext) throws IOException {

        Configuration conf = jobcontext.getConfiguration();

        ArrayList<FileSpec> inputs;
        ArrayList<ArrayList<OperatorKey>> inpTargets;
        PigContext pigContext;
        try {
            inputs = (ArrayList<FileSpec>) ObjectSerializer.deserialize(conf.get("pig.inputs"));
            inpTargets = (ArrayList<ArrayList<OperatorKey>>) ObjectSerializer
                    .deserialize(conf.get("pig.inpTargets"));
            pigContext = (PigContext) ObjectSerializer.deserialize(conf.get("pig.pigContext"));
            PigContext.setPackageImportList(
                    (ArrayList<String>) ObjectSerializer.deserialize(conf.get("udf.import.list")));
            MapRedUtil.setupUDFContext(conf);
        } catch (Exception e) {
            int errCode = 2094;
            String msg = "Unable to deserialize object.";
            throw new ExecException(msg, errCode, PigException.BUG, e);
        }

        ArrayList<InputSplit> splits = new ArrayList<InputSplit>();
        for (int i = 0; i < inputs.size(); i++) {
            try {
                Path path = new Path(inputs.get(i).getFileName());

                FileSystem fs;
                boolean isFsPath = true;
                try {
                    fs = path.getFileSystem(conf);
                } catch (Exception e) {
                    // If an application specific
                    // scheme was used
                    // (e.g.: "hbase://table") we will fail
                    // getting the file system. That's
                    // ok, we just use the dfs in that case.
                    fs = new Path("/").getFileSystem(conf);
                    isFsPath = false;
                }

                // if the execution is against Mapred DFS, set
                // working dir to /user/<userid>
                if (!Utils.isLocal(pigContext, conf)) {
                    fs.setWorkingDirectory(jobcontext.getWorkingDirectory());
                }

                // first pass input location to the loader - for this send a
                // clone of the configuration we have - this is so that if the
                // loader (or the inputformat of the loader) decide to store the
                // input location into the configuration (for example,
                // FileInputFormat stores this in mapred.input.dir in the conf),
                // then for different inputs, the loaders don't end up
                // overwriting the same conf.
                FuncSpec loadFuncSpec = inputs.get(i).getFuncSpec();
                LoadFunc loadFunc = (LoadFunc) PigContext.instantiateFuncFromSpec(loadFuncSpec);
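                // Splits are combinable unless the loader is used for merge-join indexing, is an
                // indexable loader, or is both collectable and ordered; split combination can also
                // be disabled explicitly via pig.noSplitCombination.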
                boolean combinable = !(loadFunc instanceof MergeJoinIndexer || loadFunc instanceof IndexableLoadFunc
                        || (loadFunc instanceof CollectableLoadFunc && loadFunc instanceof OrderedLoadFunc));
                if (combinable)
                    combinable = !conf.getBoolean("pig.noSplitCombination", false);
                Configuration confClone = new Configuration(conf);
                Job inputSpecificJob = new Job(confClone);
                // Pass loader signature to LoadFunc and to InputFormat through
                // the conf
                passLoadSignature(loadFunc, i, inputSpecificJob.getConfiguration());
                loadFunc.setLocation(inputs.get(i).getFileName(), inputSpecificJob);
                // The above setLocation call could write to the conf within
                // the inputSpecificJob - use this updated conf

                // get the InputFormat from it and ask for splits
                InputFormat inpFormat = loadFunc.getInputFormat();
                // List<InputSplit> oneInputSplits = inpFormat.getSplits(
                // HadoopShims.createJobContext(inputSpecificJob.getConfiguration(),
                // jobcontext.getJobID()));

                List<InputSplit> oneInputSplits = getSplitsSample(jobcontext);

                List<InputSplit> oneInputPigSplits = getPigSplits(oneInputSplits, i, inpTargets.get(i),
                        HadoopShims.getDefaultBlockSize(fs, isFsPath ? path : fs.getWorkingDirectory()), combinable,
                        confClone);
                splits.addAll(oneInputPigSplits);
            } catch (ExecException ee) {
                throw ee;
            } catch (Exception e) {
                int errCode = 2118;
                String msg = "Unable to create input splits for: " + inputs.get(i).getFileName();
                if (e.getMessage() != null && (!e.getMessage().isEmpty())) {
                    throw new ExecException(e.getMessage(), errCode, PigException.BUG, e);
                } else {
                    throw new ExecException(msg, errCode, PigException.BUG, e);
                }
            }
        }

        // XXX hadoop 20 new API integration: get around a hadoop 20 bug by
        // passing total # of splits to each split so that it can be retrieved
        // in the RecordReader method when called by mapreduce framework later.
        int n = splits.size();
        // also passing the multi-input flag to the back-end so that
        // the multi-input record counters can be created
        int m = inputs.size();

        boolean disableCounter = conf.getBoolean("pig.disable.counter", false);
        if ((m > 1) && disableCounter) {
            log.info("Disable Pig custom input counters");
        }

        for (InputSplit split : splits) {
            ((PigSplit) split).setTotalSplits(n);
            if (m > 1)
                ((PigSplit) split).setMultiInputs(true);
            ((PigSplit) split).setDisableCounter(disableCounter);
        }
        // return the assembled splits
        return splits;
    }

    protected List<InputSplit> getPigSplits(List<InputSplit> oneInputSplits, int inputIndex,
            ArrayList<OperatorKey> targetOps, long blockSize, boolean combinable, Configuration conf)
            throws IOException, InterruptedException {
        ArrayList<InputSplit> pigSplits = new ArrayList<InputSplit>();
        if (!combinable) {
            int splitIndex = 0;
            for (InputSplit inputSplit : oneInputSplits) {
                PigSplit pigSplit = new PigSplit(new InputSplit[] { inputSplit }, inputIndex, targetOps,
                        splitIndex++);
                pigSplit.setConf(conf);
                pigSplits.add(pigSplit);
            }
            return pigSplits;
        } else {
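            // Combine small splits, up to pig.maxCombinedSplitSize bytes per combined split,
            // to reduce the number of map tasks.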
            long maxCombinedSplitSize = conf.getLong("pig.maxCombinedSplitSize", 0);
            if (maxCombinedSplitSize == 0)
                // default is the block size
                maxCombinedSplitSize = blockSize;
            List<List<InputSplit>> combinedSplits = MapRedUtil.getCombinePigSplits(oneInputSplits,
                    maxCombinedSplitSize, conf);
            for (int i = 0; i < combinedSplits.size(); i++)
                pigSplits.add(createPigSplit(combinedSplits.get(i), inputIndex, targetOps, i, conf));
            return pigSplits;
        }
    }

    private InputSplit createPigSplit(List<InputSplit> combinedSplits, int inputIndex,
            ArrayList<OperatorKey> targetOps, int splitIndex, Configuration conf) {
        PigSplit pigSplit = new PigSplit(combinedSplits.toArray(new InputSplit[0]), inputIndex, targetOps,
                splitIndex);
        pigSplit.setConf(conf);
        return pigSplit;
    }

    public static PigSplit getActiveSplit() {
        return activeSplit;
    }

    private static PigSplit activeSplit;

}
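
Example

For reference, here is a minimal, hypothetical sketch (not part of the file above) of how the sampling-related properties read by getSplitsSample() might be supplied when a job is configured. The property name pig.h2irg.rollup.variablesplit and the PigConfiguration.PIG_H2IRG_ROLLUP_RATE constant are taken from the listing; the surrounding job wiring is an assumption and will differ in a real Pig deployment.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.pig.PigConfiguration;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigInputSplitFormat;

public class SamplingJobSetupSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Sampling rate read by getSplitsSample() to decide how many splits to keep per size bucket.
        conf.setFloat(PigConfiguration.PIG_H2IRG_ROLLUP_RATE, 0.1f);
        // Candidate split sizes in KB; when unset, the listing falls back to 256, 512, 1024, 2048.
        conf.set("pig.h2irg.rollup.variablesplit", "256,512,1024,2048");

        Job job = Job.getInstance(conf, "rollup-sampling-sketch");
        // Hypothetical wiring: use the InputFormat from the listing above.
        job.setInputFormatClass(PigInputSplitFormat.class);
        // Remaining setup (mapper, output format, input/output paths) is omitted from this sketch.
    }
}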