ml.shifu.guagua.mapreduce.GuaguaMRRecordReader.java Source code

Java tutorial

Introduction

Here is the source code for ml.shifu.guagua.mapreduce.GuaguaMRRecordReader.java

Source

/*
 * Copyright [2013-2014] PayPal Software Foundation
 *  
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *  
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package ml.shifu.guagua.mapreduce;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

/**
 * {@link GuaguaMRRecordReader} is used as a mock for mapreduce reader interface, not real reading data.
 * 
 * <p>
 * To update progress, both {@link #currentIteration} and {@link #totalIterations} should be set.
 * {@link #currentIteration} can only be set in GuaguaMapper.run.
 * 
 * <p>
 * Why is {@link #currentIteration} static? The current iteration of a task cannot be passed to
 * {@link GuaguaMRRecordReader} because the mapper context offers no API for it, so a static field is used here to
 * update the current iteration.
 * 
 * <p>
 * If {@link #currentIteration} is not set in each iteration, it can only start from 0. This progress update doesn't
 * work well for task fail-over (TODO).
 */
public class GuaguaMRRecordReader extends RecordReader<LongWritable, Text> {
    /** The single key object returned for every record. */
    private static final LongWritable ONLY_KEY = new LongWritable(0);
    /** The single value object returned for every record. */
    private static final Text ONLY_VALUE = new Text("only value");

    /**
     * Total number of iterations; the denominator when computing progress.
     */
    private final int totalIterations;

    /**
     * Current iteration; the numerator when computing progress. It is static because it is updated through the
     * static {@link #setCurrentIteration(int)} rather than through a reader instance.
     *
     * <p>
     * NOTE(review): the field is neither volatile nor synchronized. If {@link #getProgress()} is polled from a thread
     * other than the one calling {@link #setCurrentIteration(int)}, updates may become visible late - confirm whether
     * that matters for the caller.
     */
    private static int currentIteration;

    /**
     * Default constructor, {@link #totalIterations} is set to 0.
     */
    public GuaguaMRRecordReader() {
        this(0);
    }

    /**
     * Constructor with {@link #totalIterations} setting.
     * 
     * @param totalIterations
     *            total iterations for such guagua job.
     */
    public GuaguaMRRecordReader(int totalIterations) {
        this.totalIterations = totalIterations;
    }

    /**
     * Nothing to release; this reader holds no resources.
     */
    @Override
    public void close() throws IOException {
        // currently no logic
    }

    /**
     * Returns the fraction of iterations completed, i.e. {@code currentIteration / totalIterations}.
     *
     * <p>
     * The result is always a finite value in {@code [0.0, 1.0]}: if {@link #totalIterations} is not positive (for
     * example when the default constructor was used) {@code 0.0} is returned instead of dividing by zero, and the
     * ratio is clamped if the current iteration is negative or exceeds the total.
     *
     * @return progress in the range {@code [0.0, 1.0]}
     */
    @Override
    public float getProgress() throws IOException {
        if (this.totalIterations <= 0) {
            // Float division by zero would yield NaN or Infinity rather than a usable progress value.
            return 0.0f;
        }
        float progress = (float) currentIteration / this.totalIterations;
        return Math.max(0.0f, Math.min(1.0f, progress));
    }

    /**
     * This is a mock to hide Hadoop raw map iteration on map input key.
     *
     * @return always the same shared key instance
     */
    @Override
    public LongWritable getCurrentKey() throws IOException, InterruptedException {
        return ONLY_KEY;
    }

    /**
     * This is a mock to hide Hadoop raw map iteration on map input value.
     *
     * @return always the same shared value instance
     */
    @Override
    public Text getCurrentValue() throws IOException, InterruptedException {
        return ONLY_VALUE;
    }

    /**
     * Nothing to initialize; the split and context are ignored.
     */
    @Override
    public void initialize(InputSplit inputSplit, TaskAttemptContext context)
            throws IOException, InterruptedException {
        // currently nothing to be initialized
    }

    /**
     * Reports whether another (mock) record is available. This method does not change the iteration counter itself;
     * it only compares the value last set through {@link #setCurrentIteration(int)} with {@link #totalIterations}.
     *
     * @return {@code true} while the current iteration is less than or equal to the total iterations
     */
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        return currentIteration <= this.totalIterations;
    }

    /**
     * Sets the current iteration shared by all readers. Should only be called in GuaguaMapper Progress callback.
     *
     * @param currentIteration
     *            the iteration number used for subsequent progress and next-record checks.
     */
    public static void setCurrentIteration(int currentIteration) {
        GuaguaMRRecordReader.currentIteration = currentIteration;
    }

}