org.apache.pig.backend.hadoop.executionengine.mapreduceExec.PigCombine.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.pig.backend.hadoop.executionengine.mapreduceExec.PigCombine.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pig.backend.hadoop.executionengine.mapreduceExec;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Datum;
import org.apache.pig.data.IndexedTuple;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.PigContext;
import org.apache.pig.impl.eval.EvalSpec;
import org.apache.pig.impl.eval.collector.DataCollector;
import org.apache.pig.impl.io.FileSpec;
import org.apache.pig.impl.util.ObjectSerializer;

/**
 * Combiner stage for a Pig map-reduce job.
 *
 * <p>For each key, the incoming {@link IndexedTuple}s are sorted into one bag
 * per job input (selected by {@code IndexedTuple.index}). A tuple of the form
 * {@code (key.getField(0), bag_0, ..., bag_{inputCount-1})} is then pushed
 * through the eval pipeline built from the {@link EvalSpec} found under the
 * {@code pig.combineFunc} job property. Whatever the pipeline emits is written
 * back out under the same key, wrapped in an {@link IndexedTuple}.
 */
public class PigCombine implements Reducer<Tuple, IndexedTuple, Tuple, IndexedTuple> {

    private final Log log = LogFactory.getLog(getClass());

    // Stored by configure(); not read anywhere else in this class.
    private JobConf job;
    // Sink at the end of the eval pipeline; created lazily on the first reduce() call.
    private CombineDataOutputCollector finalout;
    // Head of the eval pipeline; created lazily on the first reduce() call.
    private DataCollector evalPipe;
    // Index stamped on emitted tuples; refreshed from the active split when one exists.
    private int index;
    // Number of entries in the deserialized "pig.inputs" list.
    private int inputCount;
    // One reusable bag per input; cleared at the start of every reduce() call.
    private DataBag bags[];
    private PigContext pigContext;
    // Combine spec deserialized from "pig.combineFunc"; may be null if that property was empty.
    private EvalSpec esp;

    /**
     * Combines all values of one key and feeds the result to the eval pipeline.
     *
     * <p>If any of the per-input bags ends up empty, nothing is emitted for
     * this key.
     *
     * @param key      the grouping key; its field 0 becomes field 0 of the combined tuple
     * @param values   the tuples to combine; each is added to the bag at {@code index + 1}
     * @param output   the collector that receives the pipeline's output
     * @param reporter not used by this method
     * @throws IOException declared by the {@link Reducer} contract; failures raised inside
     *                     this method are rethrown as {@link RuntimeException} with the
     *                     original throwable attached as the cause
     */
    public void reduce(Tuple key, Iterator<IndexedTuple> values, OutputCollector<Tuple, IndexedTuple> output,
            Reporter reporter) throws IOException {

        try {
            if (evalPipe == null) {
                if (esp == null) {
                    // Fail with a clear message instead of an anonymous NullPointerException.
                    throw new IllegalStateException(
                            "no combine EvalSpec available; 'pig.combineFunc' did not deserialize to one");
                }
                finalout = new CombineDataOutputCollector(output);
                evalPipe = esp.setupPipe(null, finalout);

                bags = new DataBag[inputCount];
                for (int i = 0; i < inputCount; i++) {
                    bags[i] = BagFactory.getInstance().newDefaultBag();
                }
            }

            // Always write to the collector handed to this call, in case it
            // differs from the one seen when the pipeline was first built.
            finalout.oc = output;

            // Without an active split the previously recorded index is kept.
            if (PigInputFormat.getActiveSplit() != null) {
                index = PigInputFormat.getActiveSplit().getIndex();
            }

            Datum groupName = key.getField(0);
            finalout.group = key;
            finalout.index = index;

            // Field 0 holds the group value; fields 1..inputCount hold the (emptied) bags.
            Tuple t = new Tuple(1 + inputCount);
            t.setField(0, groupName);
            for (int i = 0; i < inputCount; i++) {
                bags[i].clear();
                t.setField(i + 1, bags[i]);
            }

            while (values.hasNext()) {
                IndexedTuple it = values.next();
                t.getBagField(it.index + 1).add(it.toTuple());
            }

            // XXX: shouldn't we only do this if INNER flag is set?
            for (int i = 0; i < inputCount; i++) {
                if (t.getBagField(1 + i).size() == 0) {
                    return;
                }
            }

            evalPipe.add(t);
            // evalPipe.add(null); // EOF marker
        } catch (Throwable tr) {
            log.error("PigCombine.reduce failed", tr);
            // Keep the original message, and chain the throwable so that its
            // type, stack trace and nested causes are not lost.
            throw new RuntimeException(tr.getMessage(), tr);
        }
    }

    /**
     * Stores the job configuration and restores the combiner's state from it.
     *
     * <p>Reads three job properties: {@code pig.pigContext} (the
     * {@link PigContext}), {@code pig.inputs} (whose list size becomes the
     * number of bags) and {@code pig.combineFunc} (the combine
     * {@link EvalSpec}, defaulting to the empty string). A non-null spec has
     * its functions instantiated against the restored context.
     *
     * @param job the job configuration
     * @throws RuntimeException if deserializing any property raises an {@link IOException}
     */
    public void configure(JobConf job) {
        this.job = job;
        try {
            this.pigContext = (PigContext) ObjectSerializer.deserialize(job.get("pig.pigContext"));

            // The element type cannot be checked at runtime; only size() is used here.
            @SuppressWarnings("unchecked")
            ArrayList<FileSpec> inputs = (ArrayList<FileSpec>) ObjectSerializer.deserialize(job.get("pig.inputs"));
            this.inputCount = inputs.size();

            String evalSpec = job.get("pig.combineFunc", "");
            this.esp = (EvalSpec) ObjectSerializer.deserialize(evalSpec);
            if (esp != null) {
                esp.instantiateFunc(pigContext);
            }
        } catch (IOException e) {
            throw new RuntimeException("unable to deserialize data", e);
        }
    }

    /**
     * Nothing happens here.
     */
    public void close() throws IOException {
    }

    /**
     * Terminal collector of the eval pipeline: wraps every datum it receives
     * in an {@link IndexedTuple} and emits it under the current group key.
     */
    private static class CombineDataOutputCollector extends DataCollector {
        // Destination for emitted records.
        OutputCollector<Tuple, IndexedTuple> oc = null;
        // Key under which records are emitted; set by reduce() before each key is processed.
        Tuple group = null;
        // Index stamped on every emitted IndexedTuple.
        int index = -1;

        public CombineDataOutputCollector(OutputCollector<Tuple, IndexedTuple> oc) {
            super(null);
            this.oc = oc;
        }

        /**
         * Emits {@code d} (cast to {@link Tuple}) under the current group key.
         *
         * @param d the datum to emit; {@code null} is ignored
         * @throws RuntimeException wrapping any {@link IOException} from the output collector
         */
        @Override
        public void add(Datum d) {
            if (d == null) {
                return; // EOF marker from eval pipeline; ignore
            }
            try {
                oc.collect(group, new IndexedTuple((Tuple) d, index));
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }
    }

}