com.inmobi.conduit.distcp.tools.mapred.lib.DynamicRecordReader.java Source code

Introduction

Here is the source code for com.inmobi.conduit.distcp.tools.mapred.lib.DynamicRecordReader.java
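
The DynamicRecordReader is used together with the DynamicInputFormat to implement the "Worker pattern" for DistCp: it presents the contents of each input chunk to DistCp's mapper and, once the current chunk has been completely consumed, transparently acquires a new one until no chunks remain.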

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.inmobi.conduit.distcp.tools.mapred.lib;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.conf.Configuration;

import com.inmobi.conduit.distcp.tools.DistCpConstants;
import com.inmobi.conduit.distcp.tools.util.DistCpUtils;
import com.inmobi.conduit.distcp.tools.util.HadoopCompat;

import java.io.IOException;
import java.util.concurrent.TimeUnit;

/**
 * The DynamicRecordReader is used in conjunction with the DynamicInputFormat
 * to implement the "Worker pattern" for DistCp.
 * The DynamicRecordReader is responsible for:
 * 1. Presenting the contents of each chunk to DistCp's mapper.
 * 2. Acquiring a new chunk when the current chunk has been completely consumed,
 *    transparently.
 */
public class DynamicRecordReader<K, V> extends RecordReader<K, V> {
    private static final Log LOG = LogFactory.getLog(DynamicRecordReader.class);
    private TaskAttemptContext taskAttemptContext;
    private Configuration configuration;
    private DynamicInputChunkSet chunkSet;
    private DynamicInputChunk<K, V> chunk;
    private TaskID taskId;

    // Data required for progress indication.
    private final int numRecordsPerChunk; // Constant per job.
    private int totalNumRecords; // Constant per job.
    private int numRecordsProcessedByThisMap = 0;
    private long timeOfLastChunkDirScan = 0;
    private boolean isChunkDirAlreadyScanned = false;

    private static final long TIME_THRESHOLD_FOR_DIR_SCANS = TimeUnit.MINUTES.toMillis(5);

    public DynamicRecordReader(int numEntriesPerChunk) {
        numRecordsPerChunk = numEntriesPerChunk;
    }

    /**
     * Implementation of RecordReader::initialize(). Initializes the internal
     * RecordReader to read from chunks.
     * @param inputSplit The InputSplit for the map. Ignored entirely.
     * @param taskAttemptContext The task-attempt context.
     * @throws IOException on failure.
     * @throws InterruptedException
     */
    @Override
    public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext)
            throws IOException, InterruptedException {
        this.taskAttemptContext = taskAttemptContext;
        configuration = HadoopCompat.getTaskConfiguration(taskAttemptContext);
        taskId = HadoopCompat.getTaskAttemptID(taskAttemptContext).getTaskID();
        chunkSet = new DynamicInputChunkSet(configuration);
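        // Claim the first chunk for this task; acquire() may return null if no work is available.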
        chunk = chunkSet.acquire(this.taskAttemptContext);
        timeOfLastChunkDirScan = System.currentTimeMillis();
        isChunkDirAlreadyScanned = false;

        totalNumRecords = getTotalNumRecords();
    }

    private int getTotalNumRecords() {
        return DistCpUtils.getInt(configuration, DistCpConstants.CONF_LABEL_TOTAL_NUMBER_OF_RECORDS);
    }

    /**
     * Implementation of RecordReader::nextKeyValue().
     * Reads the contents of the current chunk and returns them. When a chunk has
     * been completely exhausted, a new chunk is acquired and read,
     * transparently.
     * @return True if the next key/value pair was read; false otherwise.
     * @throws IOException on failure.
     * @throws InterruptedException
     */
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {

        if (chunk == null) {
            if (LOG.isDebugEnabled())
                LOG.debug(taskId + ": No chunk assigned. No records to be read.");
            return false;
        }

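        // Serve the next record from the current chunk, if one remains.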
        if (chunk.getReader().nextKeyValue()) {
            ++numRecordsProcessedByThisMap;
            return true;
        }

        if (LOG.isDebugEnabled())
            LOG.debug(taskId + ": Current chunk exhausted. Attempting to pick up a new one.");

        chunk.release();
        timeOfLastChunkDirScan = System.currentTimeMillis();
        isChunkDirAlreadyScanned = false;

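        // Try to claim the next available chunk; null means the pool is exhausted.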
        chunk = chunkSet.acquire(taskAttemptContext);

        if (chunk == null)
            return false;

        if (chunk.getReader().nextKeyValue()) {
            ++numRecordsProcessedByThisMap;
            return true;
        }
        return false;
    }

    /**
     * Implementation of RecordReader::getCurrentKey().
     * @return The key of the current record. (i.e. the source-path.)
     * @throws IOException on failure.
     * @throws InterruptedException
     */
    @Override
    public K getCurrentKey() throws IOException, InterruptedException {
        return chunk.getReader().getCurrentKey();
    }

    /**
     * Implementation of RecordReader::getCurrentValue().
     * @return The value of the current record. (i.e. the target-path.)
     * @throws IOException on failure.
     * @throws InterruptedException
     */
    @Override
    public V getCurrentValue() throws IOException, InterruptedException {
        return chunk.getReader().getCurrentValue();
    }

    /**
     * Implementation of RecordReader::getProgress().
     * @return A fraction [0.0,1.0] indicating the progress of a DistCp mapper.
     * @throws IOException on failure.
     * @throws InterruptedException
     */
    @Override
    public float getProgress() throws IOException, InterruptedException {
        final int numChunksLeft = getNumChunksLeft();
        if (numChunksLeft < 0) { // Uninitialized, i.e. before the first dir-scan.
            // Conservative estimate until the first directory scan.
            return ((float) numRecordsProcessedByThisMap) / totalNumRecords;
        }

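        // Estimated total work for this map: records already processed, plus a
        // full chunk's worth of records for every chunk still unclaimed.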
        return ((float) numRecordsProcessedByThisMap)
                / (numRecordsProcessedByThisMap + numRecordsPerChunk * numChunksLeft);
    }

    private int getNumChunksLeft() throws IOException {
        long now = System.currentTimeMillis();
        boolean tooLongSinceLastDirScan = now - timeOfLastChunkDirScan > TIME_THRESHOLD_FOR_DIR_SCANS;

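        // Re-scan the chunk directory if the last scan has gone stale, or (at
        // most once per chunk) after this map passes the midpoint of its
        // current chunk.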
        if (tooLongSinceLastDirScan || (!isChunkDirAlreadyScanned
                && numRecordsProcessedByThisMap % numRecordsPerChunk > numRecordsPerChunk / 2)) {
            chunkSet.getListOfChunkFiles();
            isChunkDirAlreadyScanned = true;
            timeOfLastChunkDirScan = now;
        }

        return chunkSet.getNumChunksLeft();
    }

    /**
     * Implementation of RecordReader::close().
     * Closes the RecordReader.
     * @throws IOException on failure.
     */
    @Override
    public void close() throws IOException {
        if (chunk != null)
            chunk.close();
    }
}
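
For orientation, here is a minimal sketch of how a job could be wired to use this reader through the companion DynamicInputFormat, which the class javadoc above names as its counterpart. Only DistCpConstants.CONF_LABEL_TOTAL_NUMBER_OF_RECORDS and DynamicInputFormat are taken from the source above; the job name, the record count, and the rest of the driver boilerplate are illustrative assumptions, not this project's actual driver code.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

import com.inmobi.conduit.distcp.tools.DistCpConstants;
import com.inmobi.conduit.distcp.tools.mapred.lib.DynamicInputFormat;

public class DynamicInputJobSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        // Hypothetical total; the reader divides records processed by this
        // value to report a conservative progress estimate before the first
        // chunk-directory scan.
        conf.setInt(DistCpConstants.CONF_LABEL_TOTAL_NUMBER_OF_RECORDS, 10000);

        Job job = Job.getInstance(conf, "dynamic-distcp-sketch");

        // Each map task then receives a DynamicRecordReader, which serves the
        // records of its current chunk and claims further chunks until none
        // remain in the pool.
        job.setInputFormatClass(DynamicInputFormat.class);

        // The mapper, output format, and paths would be configured here, as in
        // the real DistCp driver.

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Note that Job.getInstance is the Hadoop 2 API; this project's use of HadoopCompat suggests it also supports older releases, where new Job(conf, name) would be used instead. The numEntriesPerChunk constructor argument is presumably supplied by DynamicInputFormat when it creates the reader in createRecordReader().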