Java tutorial: ReduceProcessor from Apache Tez (org.apache.tez.mapreduce.processor.reduce)
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tez.mapreduce.processor.reduce;

import java.io.IOException;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience.Private;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.mapred.Counters.Counter;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobContext;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.util.Progress;
import org.apache.hadoop.util.Progressable;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.tez.common.counters.TaskCounter;
import org.apache.tez.dag.api.TezException;
import org.apache.tez.mapreduce.output.MROutputLegacy;
import org.apache.tez.mapreduce.processor.MRTask;
import org.apache.tez.mapreduce.processor.MRTaskReporter;
import org.apache.tez.runtime.api.Event;
import org.apache.tez.runtime.api.Input;
import org.apache.tez.runtime.api.LogicalInput;
import org.apache.tez.runtime.api.LogicalOutput;
import org.apache.tez.runtime.api.ProcessorContext;
import org.apache.tez.runtime.library.api.KeyValueWriter;
import org.apache.tez.runtime.library.api.KeyValuesReader;
import org.apache.tez.runtime.library.common.ConfigUtils;
import org.apache.tez.runtime.library.common.sort.impl.TezRawKeyValueIterator;
import org.apache.tez.runtime.library.input.OrderedGroupedInputLegacy;
import org.apache.tez.runtime.library.output.OrderedPartitionedKVOutput;

@Private
@SuppressWarnings({ "unchecked", "rawtypes" })
public class ReduceProcessor extends MRTask {

  private static final Log LOG = LogFactory.getLog(ReduceProcessor.class);

  private Counter reduceInputKeyCounter;
  private Counter reduceInputValueCounter;

  public ReduceProcessor(ProcessorContext processorContext) {
    super(processorContext, false);
  }

  @Override
  public void handleEvents(List<Event> processorEvents) {
    // TODO Auto-generated method stub
  }

  public void close() throws IOException {
    // TODO Auto-generated method stub
  }

  @Override
  public void run(Map<String, LogicalInput> inputs,
      Map<String, LogicalOutput> outputs) throws Exception {

    LOG.info("Running reduce: " + processorContext.getUniqueIdentifier());

    if (outputs.size() <= 0 || outputs.size() > 1) {
      throw new IOException("Invalid number of outputs"
          + ", outputCount=" + outputs.size());
    }

    if (inputs.size() <= 0 || inputs.size() > 1) {
      throw new IOException("Invalid number of inputs"
          + ", inputCount=" + inputs.size());
    }

    LogicalInput in = inputs.values().iterator().next();
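    // Inputs and outputs must be started explicitly; waitForAllInputsReady
    // blocks until the shuffled input has data available to read.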
    in.start();
    List<Input> pendingInputs = new LinkedList<Input>();
    pendingInputs.add(in);
    processorContext.waitForAllInputsReady(pendingInputs);
    LOG.info("Input is ready for consumption. Starting Output");

    LogicalOutput out = outputs.values().iterator().next();
    out.start();

    initTask(out);

    this.statusUpdate();

    Class keyClass = ConfigUtils.getIntermediateInputKeyClass(jobConf);
    Class valueClass = ConfigUtils.getIntermediateInputValueClass(jobConf);
    LOG.info("Using keyClass: " + keyClass);
    LOG.info("Using valueClass: " + valueClass);
    RawComparator comparator =
        ConfigUtils.getInputKeySecondaryGroupingComparator(jobConf);
    LOG.info("Using comparator: " + comparator);

    reduceInputKeyCounter =
        mrReporter.getCounter(TaskCounter.REDUCE_INPUT_GROUPS);
    reduceInputValueCounter =
        mrReporter.getCounter(TaskCounter.REDUCE_INPUT_RECORDS);

    // Sanity check
    if (!(in instanceof OrderedGroupedInputLegacy)) {
      throw new IOException("Illegal input to reduce: " + in.getClass());
    }
    OrderedGroupedInputLegacy shuffleInput = (OrderedGroupedInputLegacy) in;
    KeyValuesReader kvReader = shuffleInput.getReader();

    KeyValueWriter kvWriter = null;
    if ((out instanceof MROutputLegacy)) {
      kvWriter = ((MROutputLegacy) out).getWriter();
    } else if ((out instanceof OrderedPartitionedKVOutput)) {
      kvWriter = ((OrderedPartitionedKVOutput) out).getWriter();
    } else {
      throw new IOException("Illegal output to reduce: " + out.getClass());
    }

    if (useNewApi) {
      try {
        runNewReducer(jobConf, mrReporter, shuffleInput, comparator,
            keyClass, valueClass, kvWriter);
      } catch (ClassNotFoundException cnfe) {
        throw new IOException(cnfe);
      }
    } else {
      runOldReducer(jobConf, mrReporter, kvReader, comparator,
          keyClass, valueClass, kvWriter);
    }

    done();
  }

  void runOldReducer(JobConf job, final MRTaskReporter reporter,
      KeyValuesReader input, RawComparator comparator,
      Class keyClass, Class valueClass,
      final KeyValueWriter output) throws IOException, InterruptedException {

    Reducer reducer =
        ReflectionUtils.newInstance(job.getReducerClass(), job);

    // make output collector
    OutputCollector collector = new OutputCollector() {
      public void collect(Object key, Object value) throws IOException {
        output.write(key, value);
      }
    };

    // apply reduce function
    try {
      ReduceValuesIterator values =
          new ReduceValuesIterator(input, reporter, reduceInputValueCounter);

      values.informReduceProgress();
      while (values.more()) {
        reduceInputKeyCounter.increment(1);
        reducer.reduce(values.getKey(), values, collector, reporter);
        values.informReduceProgress();
      }

      // Set progress to 1.0f if there was no exception.
      reporter.setProgress(1.0f);

      // Clean up: repeated in catch block below
      reducer.close();
      // End of clean up.
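      // close() may flush buffered output and may itself throw, so it runs
      // inside the try; the catch below repeats it so the reducer is also
      // closed on failure.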
    } catch (IOException ioe) {
      try {
        reducer.close();
      } catch (IOException ignored) {
      }
      throw ioe;
    }
  }

  private static class ReduceValuesIterator<KEY, VALUE>
      implements Iterator<VALUE> {

    private Counter reduceInputValueCounter;
    private KeyValuesReader in;
    private Progressable reporter;
    private Object currentKey;
    private Iterator<Object> currentValues;

    public ReduceValuesIterator(KeyValuesReader in, Progressable reporter,
        Counter reduceInputValueCounter) throws IOException {
      this.reduceInputValueCounter = reduceInputValueCounter;
      this.in = in;
      this.reporter = reporter;
    }

    public boolean more() throws IOException {
      boolean more = in.next();
      if (more) {
        currentKey = in.getCurrentKey();
        currentValues = in.getCurrentValues().iterator();
      } else {
        currentKey = null;
        currentValues = null;
      }
      return more;
    }

    public KEY getKey() throws IOException {
      return (KEY) currentKey;
    }

    public void informReduceProgress() {
      reporter.progress();
    }

    @Override
    public boolean hasNext() {
      return currentValues.hasNext();
    }

    @Override
    public VALUE next() {
      reduceInputValueCounter.increment(1);
      return (VALUE) currentValues.next();
    }

    @Override
    public void remove() {
      throw new UnsupportedOperationException();
    }
  }

  void runNewReducer(JobConf job, final MRTaskReporter reporter,
      OrderedGroupedInputLegacy input, RawComparator comparator,
      Class keyClass, Class valueClass,
      final KeyValueWriter out)
      throws IOException, InterruptedException,
      ClassNotFoundException, TezException {

    // make a task context so we can get the classes
    org.apache.hadoop.mapreduce.TaskAttemptContext taskContext =
        getTaskAttemptContext();

    // make a reducer
    org.apache.hadoop.mapreduce.Reducer reducer =
        (org.apache.hadoop.mapreduce.Reducer) ReflectionUtils
            .newInstance(taskContext.getReducerClass(), job);

    // wrap value iterator to report progress.
    final TezRawKeyValueIterator rawIter = input.getIterator();
    TezRawKeyValueIterator rIter = new TezRawKeyValueIterator() {
      public void close() throws IOException {
        rawIter.close();
      }

      public DataInputBuffer getKey() throws IOException {
        return rawIter.getKey();
      }

      public Progress getProgress() {
        return rawIter.getProgress();
      }

      @Override
      public boolean isSameKey() throws IOException {
        return rawIter.isSameKey();
      }

      public DataInputBuffer getValue() throws IOException {
        return rawIter.getValue();
      }

      public boolean next() throws IOException {
        boolean ret = rawIter.next();
        reporter.setProgress(rawIter.getProgress().getProgress());
        return ret;
      }
    };

    org.apache.hadoop.mapreduce.RecordWriter trackedRW =
        new org.apache.hadoop.mapreduce.RecordWriter() {
          @Override
          public void write(Object key, Object value)
              throws IOException, InterruptedException {
            out.write(key, value);
          }

          @Override
          public void close(TaskAttemptContext context)
              throws IOException, InterruptedException {
          }
        };

    org.apache.hadoop.mapreduce.Reducer.Context reducerContext =
        createReduceContext(reducer, job, taskAttemptId,
            rIter, reduceInputKeyCounter, reduceInputValueCounter,
            trackedRW, committer, reporter, comparator,
            keyClass, valueClass);

    reducer.run(reducerContext);

    // Set progress to 1.0f if there was no exception.
    reporter.setProgress(1.0f);

    trackedRW.close(reducerContext);
  }

  @Override
  public void localizeConfiguration(JobConf jobConf)
      throws IOException, InterruptedException {
    super.localizeConfiguration(jobConf);
    jobConf.setBoolean(JobContext.TASK_ISMAP, false);
  }
}
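For context on what runOldReducer drives: it instantiates whatever class the job registered through JobConf.setReducerClass and calls its reduce method once per distinct key, handing it ReduceValuesIterator as the value iterator. Below is a minimal sketch of such an old-API reducer; the class name IntSumReducer and the word-count-style Text/IntWritable types are illustrative assumptions, not part of the file above.

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;

// Hypothetical old-API reducer of the kind ReduceProcessor creates via
// ReflectionUtils.newInstance(job.getReducerClass(), job) in runOldReducer.
public class IntSumReducer extends MapReduceBase
    implements Reducer<Text, IntWritable, Text, IntWritable> {

  @Override
  public void reduce(Text key, Iterator<IntWritable> values,
      OutputCollector<Text, IntWritable> output, Reporter reporter)
      throws IOException {
    int sum = 0;
    // Each next() here also increments TaskCounter.REDUCE_INPUT_RECORDS,
    // since the iterator passed in is ReduceValuesIterator.
    while (values.hasNext()) {
      sum += values.next().get();
    }
    // collect() is routed through the anonymous OutputCollector above,
    // which forwards to the Tez KeyValueWriter.
    output.collect(key, new IntWritable(sum));
  }
}

A job would register such a class with jobConf.setReducerClass(IntSumReducer.class); because this is the old org.apache.hadoop.mapred API, run() would then take the runOldReducer path rather than runNewReducer.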