org.apache.tez.mapreduce.output.MROutput.java Source code

Introduction

Here is the source code for org.apache.tez.mapreduce.output.MROutput.java, from the Apache Tez project. MROutput is a logical Output that lets a Tez processor write key/value pairs through standard Hadoop MapReduce OutputFormat implementations.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tez.mapreduce.output;

import javax.annotation.Nullable;

import java.io.IOException;
import java.net.URI;
import java.text.NumberFormat;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.atomic.AtomicBoolean;

import com.google.common.base.Preconditions;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience.Public;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileOutputCommitter;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobContext;
import org.apache.hadoop.mapred.TaskAttemptID;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.security.Credentials;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.tez.client.TezClientUtils;
import org.apache.tez.common.TezUtils;
import org.apache.tez.common.counters.TaskCounter;
import org.apache.tez.common.counters.TezCounter;
import org.apache.tez.dag.api.DataSinkDescriptor;
import org.apache.tez.dag.api.OutputCommitterDescriptor;
import org.apache.tez.dag.api.OutputDescriptor;
import org.apache.tez.dag.api.TezUncheckedException;
import org.apache.tez.dag.api.UserPayload;
import org.apache.tez.mapreduce.committer.MROutputCommitter;
import org.apache.tez.mapreduce.hadoop.MRConfig;
import org.apache.tez.mapreduce.hadoop.MRHelpers;
import org.apache.tez.mapreduce.hadoop.MRJobConfig;
import org.apache.tez.mapreduce.hadoop.mapred.MRReporter;
import org.apache.tez.mapreduce.hadoop.mapreduce.TaskAttemptContextImpl;
import org.apache.tez.mapreduce.processor.MRTaskReporter;
import org.apache.tez.runtime.api.AbstractLogicalOutput;
import org.apache.tez.runtime.api.Event;
import org.apache.tez.runtime.api.Output;
import org.apache.tez.runtime.api.OutputContext;
import org.apache.tez.runtime.library.api.KeyValueWriter;
import org.apache.tez.runtime.library.api.TezRuntimeConfiguration;

/**
 * {@link MROutput} is an {@link Output} which allows key/value pairs
 * to be written by a processor.
 *
 * It is compatible with all standard Apache Hadoop MapReduce 
 * OutputFormat implementations.
 * 
 * This class is not meant to be extended by external projects.
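 *
 * A minimal processor-side sketch (the output name "MROutput" and the
 * key/value variables are illustrative, not fixed by this class):
 * <pre>
 * MROutput mrOutput = (MROutput) outputs.get("MROutput");
 * KeyValueWriter writer = mrOutput.getWriter();
 * writer.write(key, value);
 * </pre>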
 */
@Public
public class MROutput extends AbstractLogicalOutput {

    /**
     * Helper class to configure {@link MROutput}
     *
     */
    public static class MROutputConfigBuilder {
        final Configuration conf;
        final Class<?> outputFormat;
        final boolean outputFormatProvided;
        boolean useNewApi;
        boolean getCredentialsForSinkFilesystem = true;
        String outputClassName = MROutput.class.getName();
        String outputPath;
        boolean doCommit = true;

        private MROutputConfigBuilder(Configuration conf, Class<?> outputFormatParam) {
            this.conf = conf;
            if (outputFormatParam != null) {
                outputFormatProvided = true;
                this.outputFormat = outputFormatParam;
                if (org.apache.hadoop.mapred.OutputFormat.class.isAssignableFrom(outputFormatParam)) {
                    useNewApi = false;
                } else if (org.apache.hadoop.mapreduce.OutputFormat.class.isAssignableFrom(outputFormatParam)) {
                    useNewApi = true;
                } else {
                    throw new TezUncheckedException("outputFormat must be assignable from either "
                            + "org.apache.hadoop.mapred.OutputFormat or "
                            + "org.apache.hadoop.mapreduce.OutputFormat" + " Given: "
                            + outputFormatParam.getName());
                }
            } else {
                outputFormatProvided = false;
                useNewApi = conf.getBoolean(MRJobConfig.NEW_API_REDUCER_CONFIG, true);
                try {
                    if (useNewApi) {
                        this.outputFormat = conf.getClassByName(conf.get(MRJobConfig.OUTPUT_FORMAT_CLASS_ATTR));
                        Preconditions.checkState(
                                org.apache.hadoop.mapreduce.OutputFormat.class.isAssignableFrom(this.outputFormat));
                    } else {
                        this.outputFormat = conf.getClassByName(conf.get("mapred.output.format.class"));
                        Preconditions.checkState(
                                org.apache.hadoop.mapred.OutputFormat.class.isAssignableFrom(this.outputFormat));
                    }
                } catch (ClassNotFoundException e) {
                    throw new TezUncheckedException(e);
                }
                initializeOutputPath();
            }
        }

        private MROutputConfigBuilder setOutputPath(String outputPath) {
            if (!(org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.class.isAssignableFrom(outputFormat)
                    || FileOutputFormat.class.isAssignableFrom(outputFormat))) {
                throw new TezUncheckedException("When setting outputPath the outputFormat must "
                        + "be assignable from either org.apache.hadoop.mapred.FileOutputFormat or "
                        + "org.apache.hadoop.mapreduce.lib.output.FileOutputFormat. "
                        + "Otherwise use the non-path config builder." + " Given: " + outputFormat.getName());
            }
            conf.set(org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.OUTDIR, outputPath);
            this.outputPath = outputPath;
            return this;
        }

        private void initializeOutputPath() {
            Preconditions.checkState(!outputFormatProvided,
                    "Should only be invoked when no outputFormat is provided");
            if (org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.class.isAssignableFrom(outputFormat)
                    || FileOutputFormat.class.isAssignableFrom(outputFormat)) {
                outputPath = conf.get(org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.OUTDIR);
            }
        }

        /**
         * Create the {@link DataSinkDescriptor}
         * @return {@link DataSinkDescriptor}
         */
        public DataSinkDescriptor build() {
            if (org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.class.isAssignableFrom(outputFormat)
                    || FileOutputFormat.class.isAssignableFrom(outputFormat)) {
                if (outputPath == null) {
                    throw new TezUncheckedException("OutputPaths must be specified for OutputFormats based on "
                            + org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.class.getName() + " or "
                            + FileOutputFormat.class.getName());
                }
            }
            Collection<URI> uris = null;
            if (getCredentialsForSinkFilesystem && outputPath != null) {
                try {
                    Path path = new Path(outputPath);
                    FileSystem fs = path.getFileSystem(conf);
                    Path qPath = fs.makeQualified(path);
                    uris = Collections.singletonList(qPath.toUri());
                } catch (IOException e) {
                    throw new TezUncheckedException(e);
                }
            }

            DataSinkDescriptor ds = DataSinkDescriptor.create(
                    OutputDescriptor.create(outputClassName).setUserPayload(createUserPayload()),
                    (doCommit ? OutputCommitterDescriptor.create(MROutputCommitter.class.getName()) : null), null);
            if (conf.getBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_CONVERT_USER_PAYLOAD_TO_HISTORY_TEXT,
                    TezRuntimeConfiguration.TEZ_RUNTIME_CONVERT_USER_PAYLOAD_TO_HISTORY_TEXT_DEFAULT)) {
                ds.getOutputDescriptor().setHistoryText(TezUtils.convertToHistoryText(conf));
            }

            if (uris != null) {
                ds.addURIsForCredentials(uris);
            }
            return ds;
        }

        /**
         * Get the credentials for the output from its {@link FileSystem}s.
         * Use this method to turn credential fetching off when not using a
         * {@link FileSystem} or when {@link Credentials} are not supported.
         * @param value whether to get credentials or not (true by default)
         * @return {@link org.apache.tez.mapreduce.output.MROutput.MROutputConfigBuilder}
         */
        public MROutputConfigBuilder getCredentialsForSinkFileSystem(boolean value) {
            getCredentialsForSinkFilesystem = value;
            return this;
        }

        /**
         * Enable or disable commit operations for the output (enabled by default).
         * If the value is set to false then no {@link org.apache.tez.runtime.api.OutputCommitter} will
         * be specified for the output.
         * @param value whether to commit the output (true by default)
         * @return {@link org.apache.tez.mapreduce.output.MROutput.MROutputConfigBuilder}
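         * <p/>
         * For example, a sketch that skips the committer entirely (the builder
         * arguments are illustrative):
         * <pre>
         * MROutput.createConfigBuilder(conf, outputFormatClass, outputPath)
         *     .setDoCommit(false).build();
         * </pre>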
         */
        public MROutputConfigBuilder setDoCommit(boolean value) {
            doCommit = value;
            return this;
        }

        MROutputConfigBuilder setOutputClassName(String outputClassName) {
            this.outputClassName = outputClassName;
            return this;
        }

        /**
         * Creates the user payload to be set on the OutputDescriptor for MROutput
         */
        private UserPayload createUserPayload() {
            if (outputFormatProvided) {
                conf.setBoolean(MRJobConfig.NEW_API_REDUCER_CONFIG, useNewApi);
                if (useNewApi) {
                    conf.set(MRJobConfig.OUTPUT_FORMAT_CLASS_ATTR, outputFormat.getName());
                } else {
                    conf.set("mapred.output.format.class", outputFormat.getName());
                }
            }
            MRHelpers.translateMRConfToTez(conf);
            try {
                return TezUtils.createUserPayloadFromConf(conf);
            } catch (IOException e) {
                throw new TezUncheckedException(e);
            }
        }
    }

    /**
     * Create an {@link org.apache.tez.mapreduce.output.MROutput.MROutputConfigBuilder}
     * <p/>
     * The preferred usage model is to provide all of the parameters, and use methods to configure
     * the Output.
     * <p/>
     * For legacy applications, which may already have a fully configured {@link Configuration}
     * instance, the outputFormat can be specified as null
     *
     * @param conf         Configuration for the {@link MROutput}. This configuration instance will be
     *                     modified in place
     * @param outputFormat OutputFormat derived class. If the OutputFormat specified is
     *                     null, the provided configuration should be complete.
     * @return {@link org.apache.tez.mapreduce.output.MROutput.MROutputConfigBuilder}
     */
    public static MROutputConfigBuilder createConfigBuilder(Configuration conf, @Nullable Class<?> outputFormat) {
        return new MROutputConfigBuilder(conf, outputFormat);
    }

    /**
     * Create an {@link org.apache.tez.mapreduce.output.MROutput.MROutputConfigBuilder} for {@link org.apache.hadoop.mapreduce.lib.output.FileOutputFormat}
     * or {@link org.apache.hadoop.mapred.FileOutputFormat} based OutputFormats.
     * <p/>
     * The preferred usage model is to provide all of the parameters, and use methods to configure the
     * Output.
     * <p/>
     * For legacy applications, which may already have a fully configured {@link Configuration}
     * instance, the outputFormat and outputPath can be specified as null
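     * <p/>
     * A minimal DAG-side sketch (the {@code TextOutputFormat} choice, the path,
     * and the sink name are assumptions for illustration):
     * <pre>
     * DataSinkDescriptor sink = MROutput.createConfigBuilder(new Configuration(),
     *     TextOutputFormat.class, "/output/path").build();
     * vertex.addDataSink("MROutput", sink);
     * </pre>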
     *
     * @param conf         Configuration for the {@link MROutput}. This configuration instance will be
     *                     modified in place
     * @param outputFormat FileOutputFormat derived class. If the OutputFormat specified is
     *                     null, the provided configuration should be complete.
     * @param outputPath   Output path. This can be null if already setup in the configuration
     * @return {@link org.apache.tez.mapreduce.output.MROutput.MROutputConfigBuilder}
     */
    public static MROutputConfigBuilder createConfigBuilder(Configuration conf, @Nullable Class<?> outputFormat,
            @Nullable String outputPath) {
        MROutputConfigBuilder configurer = new MROutputConfigBuilder(conf, outputFormat);
        if (outputPath != null) {
            configurer.setOutputPath(outputPath);
        }
        return configurer;
    }

    private static final Log LOG = LogFactory.getLog(MROutput.class);

    private final NumberFormat taskNumberFormat = NumberFormat.getInstance();
    private final NumberFormat nonTaskNumberFormat = NumberFormat.getInstance();

    private JobConf jobConf;
    boolean useNewApi;
    private AtomicBoolean flushed = new AtomicBoolean(false);

    @SuppressWarnings("rawtypes")
    org.apache.hadoop.mapreduce.OutputFormat newOutputFormat;
    @SuppressWarnings("rawtypes")
    org.apache.hadoop.mapreduce.RecordWriter newRecordWriter;

    @SuppressWarnings("rawtypes")
    org.apache.hadoop.mapred.OutputFormat oldOutputFormat;
    @SuppressWarnings("rawtypes")
    org.apache.hadoop.mapred.RecordWriter oldRecordWriter;

    private TezCounter outputRecordCounter;

    private TaskAttemptContext newApiTaskAttemptContext;
    private org.apache.hadoop.mapred.TaskAttemptContext oldApiTaskAttemptContext;

    private boolean isMapperOutput;

    protected OutputCommitter committer;

    public MROutput(OutputContext outputContext, int numPhysicalOutputs) {
        super(outputContext, numPhysicalOutputs);
    }

    @Override
    public List<Event> initialize() throws IOException, InterruptedException {
        LOG.info("Initializing Simple Output");
        getContext().requestInitialMemory(0L, null); // mandatory call
        taskNumberFormat.setMinimumIntegerDigits(5);
        taskNumberFormat.setGroupingUsed(false);
        nonTaskNumberFormat.setMinimumIntegerDigits(3);
        nonTaskNumberFormat.setGroupingUsed(false);
        Configuration conf = TezUtils.createConfFromUserPayload(getContext().getUserPayload());
        this.jobConf = new JobConf(conf);
        // Add tokens to the jobConf - in case they are accessed within the RW / OF
        jobConf.getCredentials().mergeAll(UserGroupInformation.getCurrentUser().getCredentials());
        this.useNewApi = this.jobConf.getUseNewMapper();
        this.isMapperOutput = jobConf.getBoolean(MRConfig.IS_MAP_PROCESSOR, false);
        jobConf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, getContext().getDAGAttemptNumber());
        TaskAttemptID taskAttemptId = org.apache.tez.mapreduce.hadoop.mapreduce.TaskAttemptContextImpl
                .createMockTaskAttemptID(getContext().getApplicationId().getClusterTimestamp(),
                        getContext().getTaskVertexIndex(), getContext().getApplicationId().getId(),
                        getContext().getTaskIndex(), getContext().getTaskAttemptNumber(), isMapperOutput);
        jobConf.set(JobContext.TASK_ATTEMPT_ID, taskAttemptId.toString());
        jobConf.set(JobContext.TASK_ID, taskAttemptId.getTaskID().toString());
        jobConf.setBoolean(JobContext.TASK_ISMAP, isMapperOutput);
        jobConf.setInt(JobContext.TASK_PARTITION, taskAttemptId.getTaskID().getId());
        jobConf.set(JobContext.ID, taskAttemptId.getJobID().toString());

        if (useNewApi) {
            // set the output part name to have a unique prefix
            if (jobConf.get("mapreduce.output.basename") == null) {
                jobConf.set("mapreduce.output.basename", getOutputFileNamePrefix());
            }
        }

        outputRecordCounter = getContext().getCounters().findCounter(TaskCounter.OUTPUT_RECORDS);

        if (useNewApi) {
            newApiTaskAttemptContext = createTaskAttemptContext(taskAttemptId);
            try {
                newOutputFormat = org.apache.hadoop.util.ReflectionUtils
                        .newInstance(newApiTaskAttemptContext.getOutputFormatClass(), jobConf);
            } catch (ClassNotFoundException cnfe) {
                throw new IOException(cnfe);
            }

            try {
                newRecordWriter = newOutputFormat.getRecordWriter(newApiTaskAttemptContext);
            } catch (InterruptedException e) {
                throw new IOException("Interrupted while creating record writer", e);
            }
        } else {
            oldApiTaskAttemptContext = new org.apache.tez.mapreduce.hadoop.mapred.TaskAttemptContextImpl(jobConf,
                    taskAttemptId, new MRTaskReporter(getContext()));
            oldOutputFormat = jobConf.getOutputFormat();

            FileSystem fs = FileSystem.get(jobConf);
            String finalName = getOutputName();

            oldRecordWriter = oldOutputFormat.getRecordWriter(fs, jobConf, finalName,
                    new MRReporter(getContext().getCounters()));
        }
        initCommitter(jobConf, useNewApi);

        LOG.info("Initialized Simple Output" + ", using_new_api: " + useNewApi);
        return null;
    }

    @Override
    public void start() {
    }

    public void initCommitter(JobConf job, boolean useNewApi) throws IOException, InterruptedException {

        if (useNewApi) {
            if (LOG.isDebugEnabled()) {
                LOG.debug("using new api for output committer");
            }

            this.committer = newOutputFormat.getOutputCommitter(newApiTaskAttemptContext);
        } else {
            this.committer = job.getOutputCommitter();
        }

        Path outputPath = FileOutputFormat.getOutputPath(job);
        if (outputPath != null) {
            if ((this.committer instanceof FileOutputCommitter)) {
                FileOutputFormat.setWorkOutputPath(job,
                        ((FileOutputCommitter) this.committer).getTaskAttemptPath(oldApiTaskAttemptContext));
            } else {
                FileOutputFormat.setWorkOutputPath(job, outputPath);
            }
        }
        if (useNewApi) {
            this.committer.setupTask(newApiTaskAttemptContext);
        } else {
            this.committer.setupTask(oldApiTaskAttemptContext);
        }
    }

    public boolean isCommitRequired() throws IOException {
        if (useNewApi) {
            return committer.needsTaskCommit(newApiTaskAttemptContext);
        } else {
            return committer.needsTaskCommit(oldApiTaskAttemptContext);
        }
    }

    private TaskAttemptContext createTaskAttemptContext(TaskAttemptID attemptId) {
        return new TaskAttemptContextImpl(this.jobConf, attemptId, getContext().getCounters(), isMapperOutput,
                null);
    }

    private String getOutputFileNamePrefix() {
        String prefix = jobConf.get(MRJobConfig.MROUTPUT_FILE_NAME_PREFIX);
        if (prefix == null) {
            prefix = "part-v" + nonTaskNumberFormat.format(getContext().getTaskVertexIndex()) + "-o"
                    + nonTaskNumberFormat.format(getContext().getOutputIndex());
        }
        return prefix;
    }

    private String getOutputName() {
        // give a unique prefix to the output name
        return getOutputFileNamePrefix() + "-" + taskNumberFormat.format(getContext().getTaskIndex());
    }

    /**
     * Get a {@link KeyValueWriter} to write MapReduce compatible output.
     */
    @Override
    public KeyValueWriter getWriter() throws IOException {
        return new KeyValueWriter() {
            private final boolean useNewWriter = useNewApi;

            @SuppressWarnings("unchecked")
            @Override
            public void write(Object key, Object value) throws IOException {
                if (useNewWriter) {
                    try {
                        newRecordWriter.write(key, value);
                    } catch (InterruptedException e) {
                        Thread.currentThread().interrupt();
                        throw new IOException("Interrupted while writing next key-value", e);
                    }
                } else {
                    oldRecordWriter.write(key, value);
                }
                outputRecordCounter.increment(1);
            }
        };
    }

    @Override
    public void handleEvents(List<Event> outputEvents) {
        // Not expecting any events at the moment.
    }

    @Override
    public synchronized List<Event> close() throws IOException {
        flush();
        return null;
    }

    /**
     * Call this in the processor before finishing to ensure that outputs
     * have been flushed. Must be called before commit.
     * @throws IOException
     */
    public void flush() throws IOException {
        if (flushed.getAndSet(true)) {
            return;
        }

        LOG.info("Flushing Simple Output");
        if (useNewApi) {
            try {
                newRecordWriter.close(newApiTaskAttemptContext);
            } catch (InterruptedException e) {
                throw new IOException("Interrupted while closing record writer", e);
            }
        } else {
            oldRecordWriter.close(null);
        }
        LOG.info("Flushed Simple Output");
    }

    /**
     * MROutput expects that a Processor calls commit prior to the
     * Processor's completion.
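     * <p/>
     * A typical end-of-processor pattern, sketched (the variable name is
     * illustrative):
     * <pre>
     * out.flush();
     * try {
     *     if (out.isCommitRequired()) {
     *         out.commit();
     *     }
     * } catch (IOException e) {
     *     out.abort();
     *     throw e;
     * }
     * </pre>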
     * @throws IOException
     */
    public void commit() throws IOException {
        flush();
        if (useNewApi) {
            committer.commitTask(newApiTaskAttemptContext);
        } else {
            committer.commitTask(oldApiTaskAttemptContext);
        }
    }

    /**
     * MROutput expects that a Processor calls abort in case of any error
     * (including an error during commit) prior to the Processor's completion.
     * @throws IOException
     */
    public void abort() throws IOException {
        flush();
        if (useNewApi) {
            committer.abortTask(newApiTaskAttemptContext);
        } else {
            committer.abortTask(oldApiTaskAttemptContext);
        }
    }

}
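
Example usage

Below is a minimal sketch of how this class is typically used from a Tez processor. It is illustrative, not part of the source above: the processor class name, the sink name "MROutput", and the record written are all assumptions, and it assumes SimpleMRProcessor (which is expected to commit MROutput sinks after run() returns).

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.tez.mapreduce.output.MROutput;
import org.apache.tez.mapreduce.processor.SimpleMRProcessor;
import org.apache.tez.runtime.api.ProcessorContext;
import org.apache.tez.runtime.library.api.KeyValueWriter;

public class ExampleWriteProcessor extends SimpleMRProcessor {

    public ExampleWriteProcessor(ProcessorContext context) {
        super(context);
    }

    @Override
    public void run() throws Exception {
        // Look up the logical output by the name it was registered under
        // via Vertex.addDataSink() during DAG construction.
        MROutput output = (MROutput) getOutputs().get("MROutput");
        KeyValueWriter writer = output.getWriter();
        writer.write(NullWritable.get(), new Text("hello"));
        // SimpleMRProcessor is expected to flush and commit MROutput sinks
        // after run() returns; with a raw processor you would call
        // output.flush() and, if output.isCommitRequired(), output.commit().
    }
}

The DataSinkDescriptor attached to the vertex running this processor would be built with createConfigBuilder(), as shown in the javadoc above.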