alluxio.hadoop.mapreduce.KeyValueOutputCommitter.java Source code

Java tutorial

Introduction

Here is the source code for alluxio.hadoop.mapreduce.KeyValueOutputCommitter.java

Source

/*
 * The Alluxio Open Foundation licenses this work under the Apache License, version 2.0
 * (the "License"). You may not use this work except in compliance with the License, which is
 * available at www.apache.org/licenses/LICENSE-2.0
 *
 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 * either express or implied, as more fully set forth in the License.
 *
 * See the NOTICE file distributed with this work for information regarding copyright ownership.
 */

package alluxio.hadoop.mapreduce;

import alluxio.Constants;
import alluxio.annotation.PublicApi;
import alluxio.client.keyvalue.KeyValueSystem;
import alluxio.exception.AlluxioException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;

import javax.annotation.concurrent.ThreadSafe;

/**
 * Extension of {@link FileOutputCommitter} where creating, completing, or deleting a {@link
 * KeyValueSystem} in different phases of a job's or task's lifecycle is considered.
 * <p>
 * This committer must be used along with {@link KeyValueOutputFormat} to merge the key-value stores
 * created by each Reducer into one key-value store under the MapReduce output directory.
 */
@PublicApi
@ThreadSafe
public final class KeyValueOutputCommitter extends FileOutputCommitter {
    private static final Logger LOG = LoggerFactory.getLogger(Constants.LOGGER_TYPE);

    /** Key-value system client shared by every instance of this committer. */
    private static final KeyValueSystem KEY_VALUE_SYSTEM = KeyValueSystem.Factory.create();

    /**
     * Constructor.
     *
     * @param outputPath the job's output path, or null if the output committer is a noop
     * @param taskContext the task's context
     * @throws IOException if the construction fails
     */
    public KeyValueOutputCommitter(Path outputPath, TaskAttemptContext taskContext)
            throws IOException {
        super(outputPath, taskContext);
    }

    /**
     * Decides whether the task has output to commit by checking that the task output location
     * reported by {@link KeyValueOutputFormat} exists.
     *
     * @param taskContext MapReduce task configuration
     * @return true if the task output directory exists, otherwise false
     * @throws IOException if fails to determine whether the output directory exists
     */
    @Override
    public boolean needsTaskCommit(TaskAttemptContext taskContext) throws IOException {
        Path taskOutputPath = new Path(KeyValueOutputFormat.getTaskOutputURI(taskContext).toString());
        // Resolve the file system from the path itself so the check runs against the store that
        // actually hosts the task output.
        FileSystem fs = taskOutputPath.getFileSystem(taskContext.getConfiguration());
        return fs.exists(taskOutputPath);
    }

    /**
     * {@inheritDoc}
     * <p>
     * Merges the completed key-value store under the task's temporary output directory to the
     * key-value store at job output directory, then calls {@link
     * FileOutputCommitter#commitTask(TaskAttemptContext)}.
     *
     * @param taskContext the context of the task attempt being committed
     * @throws IOException if merging the stores fails (the {@link AlluxioException} is wrapped as
     *         the cause) or if the parent committer fails
     */
    @Override
    public void commitTask(TaskAttemptContext taskContext) throws IOException {
        try {
            KEY_VALUE_SYSTEM.mergeStore(KeyValueOutputFormat.getTaskOutputURI(taskContext),
                    KeyValueOutputFormat.getJobOutputURI(taskContext));
        } catch (AlluxioException e) {
            // Surface Alluxio failures through the exception type declared by the committer API.
            throw new IOException(e);
        }
        super.commitTask(taskContext);
    }

    /**
     * {@inheritDoc}
     * <p>
     * Deletes the completed key-value stores under the task's temporary output directory, and then
     * calls {@link FileOutputCommitter#abortTask(TaskAttemptContext)}.
     * <p>
     * Failures are logged together with their cause and are not propagated to the caller. If
     * deleting the key-value store fails, the parent's abort is not invoked.
     *
     * @param taskContext the context of the task attempt being aborted
     */
    @Override
    public void abortTask(TaskAttemptContext taskContext) {
        // TODO(binfan): in Hadoop 1.x FileOutputCommitter#abortTask doesn't throw IOException. To
        // keep the code compile with early Hadoop versions, we catch this exception.
        try {
            try {
                KEY_VALUE_SYSTEM.deleteStore(KeyValueOutputFormat.getTaskOutputURI(taskContext));
            } catch (AlluxioException e) {
                throw new IOException(e);
            }
            super.abortTask(taskContext);
        } catch (IOException e) {
            // Pass the exception as the last argument so its stack trace is logged; the task
            // context fills the placeholder to identify which task failed to abort.
            LOG.error("Failed to abort task {}", taskContext, e);
        }
    }

    /**
     * @return the temp directory name
     */
    public static String getPendingDirName() {
        // Due to Hadoop 1 support we stick with the deprecated version. If we drop support for it
        // FileOutputCommitter.PENDING_DIR_NAME will be the new one.
        return FileOutputCommitter.TEMP_DIR_NAME;
    }
}