org.apache.pig.backend.hadoop.executionengine.mapreduceExec.PigInputFormat.java Source code

Introduction

Here is the source code for org.apache.pig.backend.hadoop.executionengine.mapreduceExec.PigInputFormat.java. This class is Pig's Hadoop InputFormat: getSplits() deserializes the job's input and function specs and slices each input into SliceWrapper splits, and getRecordReader() returns a reader built by the active split.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pig.backend.hadoop.executionengine.mapreduceExec;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobConfigurable;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.pig.Slice;
import org.apache.pig.backend.datastorage.DataStorage;
import org.apache.pig.backend.executionengine.PigSlicer;
import org.apache.pig.backend.hadoop.datastorage.ConfigurationUtil;
import org.apache.pig.backend.hadoop.datastorage.HDataStorage;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.PigContext;
import org.apache.pig.impl.eval.EvalSpec;
import org.apache.pig.impl.io.FileSpec;
import org.apache.pig.impl.io.ValidatingInputFileSpec;
import org.apache.pig.impl.util.ObjectSerializer;

public class PigInputFormat implements InputFormat<Text, Tuple>, JobConfigurable {

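    /**
     * Produces one SliceWrapper split per Slice of each input. The input
     * file specs and the per-input map and group functions are deserialized
     * from the job configuration ("pig.inputs", "pig.mapFuncs",
     * "pig.groupFuncs"), and each input's Slicer decides how that input is
     * partitioned into slices.
     */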
    @SuppressWarnings("unchecked")
    public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
        boolean isSplittable = job.getBoolean("pig.input.splittable", true);
        ArrayList<FileSpec> inputs = (ArrayList<FileSpec>) ObjectSerializer.deserialize(job.get("pig.inputs"));
        ArrayList<EvalSpec> mapFuncs = (ArrayList<EvalSpec>) ObjectSerializer
                .deserialize(job.get("pig.mapFuncs", ""));
        ArrayList<EvalSpec> groupFuncs = (ArrayList<EvalSpec>) ObjectSerializer
                .deserialize(job.get("pig.groupFuncs", ""));

        PigContext pigContext = (PigContext) ObjectSerializer.deserialize(job.get("pig.pigContext"));
        // added for UNION: when the number of group functions does not match
        // the number of inputs, replace them with one null entry per input so
        // that the group-func arity matches the arity of the inputs.
        // (The original loop iterated over the freshly emptied list, so it
        // never ran; it must fill in inputs.size() nulls.)
        if (groupFuncs != null && groupFuncs.size() != inputs.size()) {
            groupFuncs = new ArrayList<EvalSpec>();
            for (int i = 0; i < inputs.size(); i++) {
                groupFuncs.add(null);
            }
        }

        if (inputs.size() != mapFuncs.size()) {
            StringBuilder sb = new StringBuilder();
            sb.append("number of inputs != number of map functions: ");
            sb.append(inputs.size());
            sb.append(" != ");
            sb.append(mapFuncs.size());
            sb.append(": ");
            sb.append(job.get("pig.mapFuncs", "missing"));
            throw new IOException(sb.toString());
        }
        if (groupFuncs != null && inputs.size() != groupFuncs.size()) {
            StringBuilder sb = new StringBuilder();
            sb.append("number of inputs != number of group functions: ");
            sb.append(inputs.size());
            sb.append(" != ");
            sb.append(groupFuncs.size());
            throw new IOException(sb.toString());
        }

        FileSystem fs = FileSystem.get(job);
        List<SliceWrapper> splits = new ArrayList<SliceWrapper>();
        for (int i = 0; i < inputs.size(); i++) {
            DataStorage store = new HDataStorage(ConfigurationUtil.toProperties(job));
            ValidatingInputFileSpec spec;
            if (inputs.get(i) instanceof ValidatingInputFileSpec) {
                spec = (ValidatingInputFileSpec) inputs.get(i);
            } else {
                spec = new ValidatingInputFileSpec(inputs.get(i), store);
            }
            EvalSpec groupBy = groupFuncs == null ? null : groupFuncs.get(i);
            // Propagate the job's splittable setting to PigSlicer-backed
            // inputs. The flag must be passed through even when false;
            // guarding the call on isSplittable would make the
            // "pig.input.splittable" property a no-op.
            if (spec.getSlicer() instanceof PigSlicer) {
                ((PigSlicer) spec.getSlicer()).setSplittable(isSplittable);
            }
            Slice[] pigs = spec.getSlicer().slice(store, spec.getFileName());
            for (Slice split : pigs) {
                splits.add(new SliceWrapper(split, pigContext, groupBy, mapFuncs.get(i), i, fs));
            }
        }
        return splits.toArray(new SliceWrapper[splits.size()]);
    }

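    /**
     * Records the split being read in a static field (see getActiveSplit())
     * and delegates construction of the reader to the SliceWrapper itself.
     */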
    public RecordReader<Text, Tuple> getRecordReader(InputSplit split, JobConf job, Reporter reporter)
            throws IOException {
        activeSplit = (SliceWrapper) split;
        return ((SliceWrapper) split).makeReader(job);
    }

    public void configure(JobConf conf) {
        // No per-job setup is needed; present to satisfy JobConfigurable.
    }

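    /**
     * Returns the split currently being read by this task. It is kept in a
     * static field so that other code running in the same task JVM can look
     * up which input the current records came from; this assumes a single
     * active split per JVM.
     */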
    public static SliceWrapper getActiveSplit() {
        return activeSplit;
    }

    private static SliceWrapper activeSplit;

    public void validateInput(JobConf job) throws IOException {
        // Intentionally a no-op: inputs are validated in getSplits() when
        // each FileSpec is wrapped in a ValidatingInputFileSpec.
    }

}
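
For context, here is a minimal sketch of how a job might be wired up to use this InputFormat through the old mapred API. Only the property names and PigInputFormat itself come from the source above; the PigJobSetupSketch class and the empty placeholder lists are illustrative assumptions, since in real Pig the launcher serializes the populated FileSpec and EvalSpec lists from a compiled plan.

import java.util.ArrayList;

import org.apache.hadoop.mapred.JobConf;
import org.apache.pig.backend.hadoop.executionengine.mapreduceExec.PigInputFormat;
import org.apache.pig.impl.eval.EvalSpec;
import org.apache.pig.impl.io.FileSpec;
import org.apache.pig.impl.util.ObjectSerializer;

// Hypothetical helper, not part of Pig: shows which job properties
// getSplits() above expects to find.
public class PigJobSetupSketch {
    public static JobConf configure(JobConf job) throws Exception {
        job.setBoolean("pig.input.splittable", true);
        // Real jobs serialize populated lists here; empty lists are stand-ins.
        job.set("pig.inputs", ObjectSerializer.serialize(new ArrayList<FileSpec>()));
        job.set("pig.mapFuncs", ObjectSerializer.serialize(new ArrayList<EvalSpec>()));
        job.set("pig.groupFuncs", ObjectSerializer.serialize(new ArrayList<EvalSpec>()));
        // A serialized PigContext must also be stored under "pig.pigContext";
        // constructing one is beyond this sketch.
        job.setInputFormat(PigInputFormat.class);
        return job;
    }
}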