com.linkedin.cubert.io.CombinedFileRecordReader.java Source code

Java tutorial

Introduction

Here is the source code for com.linkedin.cubert.io.CombinedFileRecordReader.java

Source

/* (c) 2014 LinkedIn Corp. All rights reserved.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 * this file except in compliance with the License. You may obtain a copy of the
 * License at  http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software distributed
 * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied.
 */

package com.linkedin.cubert.io;

import java.io.IOException;

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

/**
 * Generic record reader for {@code CombineFileSplit}.
 * <p>
 * This record reader extracts individual {@code FileSplit} from the
 * {@code CombineFileSplit}, and creates record readers for each of the FileSplits. The
 * record readers are created sequentially (a new one is created when the current record
 * reader has exhausted all data).
 * 
 * @author Maneesh Varshney
 * 
 * @param <K>
 * @param <V>
 */
public class CombinedFileRecordReader<K, V> extends RecordReader<K, V> {
    private TaskAttemptContext context;
    private final InputFormat<K, V> inputFormat;
    private CombineFileSplit combineFileSplit;
    private RecordReader<K, V> current;
    private int currentFileIndex = 0;
    private float[] fractionLength;

    public CombinedFileRecordReader(InputFormat<K, V> inputFormat, CombineFileSplit combineFileSplit,
            TaskAttemptContext context) {
        this.inputFormat = inputFormat;
        this.combineFileSplit = combineFileSplit;
        this.context = context;

        long[] lengths = combineFileSplit.getLengths();
        long totalLength = 0;
        for (long length : lengths)
            totalLength += length;
        fractionLength = new float[lengths.length];
        for (int i = 0; i < lengths.length; i++)
            fractionLength[i] = ((float) lengths[i]) / totalLength;
    }

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
        createNewRecordReader();
    }

    /**
     * Create new record record from the original InputFormat and initialize it.
     * 
     * @throws IOException
     * @throws InterruptedException
     */
    private void createNewRecordReader() throws IOException, InterruptedException {
        FileSplit split = new FileSplit(combineFileSplit.getPath(currentFileIndex),
                combineFileSplit.getOffset(currentFileIndex), combineFileSplit.getLength(currentFileIndex), null);

        if (split instanceof Configurable) {
            ((Configurable) split).setConf(context.getConfiguration());
        }

        current = inputFormat.createRecordReader(split, context);
        current.initialize(split, context);
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        boolean next = current.nextKeyValue();
        if (next)
            return next;

        currentFileIndex++;

        // need to create new reader
        if (currentFileIndex == combineFileSplit.getNumPaths())
            return false;

        createNewRecordReader();
        return nextKeyValue();
    }

    @Override
    public K getCurrentKey() throws IOException, InterruptedException {
        return current.getCurrentKey();
    }

    @Override
    public V getCurrentValue() throws IOException, InterruptedException {
        return current.getCurrentValue();
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
        if (currentFileIndex >= combineFileSplit.getNumPaths())
            return (float) 1.0;

        float progress = 0;
        for (int i = 0; i < currentFileIndex - 1; i++)
            progress += fractionLength[i];
        progress += current.getProgress() * fractionLength[currentFileIndex];
        return progress;
    }

    @Override
    public void close() throws IOException {
        if (current != null)
            current.close();
        currentFileIndex = combineFileSplit.getPaths().length;
    }

}