org.archive.hadoop.mapreduce.LineDereferencingRecordReader.java Source code

Java tutorial

Introduction

Here is the source code for org.archive.hadoop.mapreduce.LineDereferencingRecordReader.java

Source

/*
 *  This file is part of the Wayback archival access software
 *   (http://archive-access.sourceforge.net/projects/wayback/).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.hadoop.mapreduce;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;

/**
 * RecordReader which reads pointers to actual files from an internal
 * LineRecordReader, and emits the lines of the files pointed to by the
 * actual input.
 *
 * <p>Each line of the split handed to {@link #initialize} is used as a path
 * that is opened on the split's file system. Every line of that file is
 * emitted as one record:
 * <ul>
 *   <li>key: {@code <path>:<lineNumber>}, where the line number is
 *       zero-based within the dereferenced file;</li>
 *   <li>value: the text of the line, decoded as UTF-8.</li>
 * </ul>
 *
 * <p>Files whose path ends with {@code .gz}, or all files when
 * {@link #forceCompressed(Configuration)} has been applied to the job
 * configuration, are decompressed through the gzip codec.
 *
 * @author brad
 *
 */
public class LineDereferencingRecordReader extends RecordReader<Text, Text> {
    /** Reader over the pointer file itself; each of its values is a path. */
    LineRecordReader internal = new LineRecordReader();
    public static final Charset UTF8 = Charset.forName("UTF-8");

    /** Configuration key which, when true, treats every file as compressed. */
    protected static final String FORCE_COMPRESSED_FLAG = "line-reref.force-compressed";

    FileSystem fileSystem = null;
    Text key = null;
    Text value = null;
    /** Reader over the file currently being dereferenced, or null if none. */
    BufferedReader curReader = null;
    /** Path string of the file behind {@link #curReader}, or null if none. */
    String curFile = null;
    /** Zero-based index of the next line to be read from {@link #curFile}. */
    long curLine = 0;
    float progress = 0.0f;
    boolean forceCompressed = false;
    CompressionCodec codec;

    /**
     * Marks the given configuration so that every dereferenced file is read
     * through the gzip codec, regardless of its file name.
     *
     * @param conf the job configuration to modify
     */
    public static void forceCompressed(Configuration conf) {
        conf.setBoolean(FORCE_COMPRESSED_FLAG, true);
    }

    /**
     * Reads the force-compressed flag, looks up the gzip codec, resolves the
     * file system of the split's path and initializes the internal line
     * reader over the split.
     *
     * @param split the split holding the pointer lines; must be a FileSplit
     * @param context the task context supplying the configuration
     * @throws IOException if the file system or the internal reader fails
     * @throws InterruptedException if the internal reader is interrupted
     */
    @Override
    public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
        Configuration conf = context.getConfiguration();
        forceCompressed = conf.getBoolean(FORCE_COMPRESSED_FLAG, false);

        CompressionCodecFactory ccf = new CompressionCodecFactory(conf);
        codec = ccf.getCodecByClassName(GzipCodec.class.getName());

        FileSplit fileSplit = (FileSplit) split;
        fileSystem = fileSplit.getPath().getFileSystem(conf);
        internal.initialize(split, context);
    }

    /**
     * Advances to the next line of the current dereferenced file, moving on
     * to the next pointer (and opening the file it names) whenever the
     * current file is exhausted. Empty files are skipped.
     *
     * @return true if a record is available, false once the pointers and all
     *         the files they name have been fully consumed
     * @throws IOException if a pointer or a dereferenced file cannot be read
     * @throws InterruptedException if the internal reader is interrupted
     */
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (key == null) {
            key = new Text();
        }
        if (value == null) {
            value = new Text();
        }
        while (true) {
            if (curReader == null) {
                // are there more?
                if (!internal.nextKeyValue()) {
                    // all done:
                    return false;
                }
                // progress is tracked at pointer granularity only
                progress = internal.getProgress();
                curFile = internal.getCurrentValue().toString();
                curReader = openReader(curFile);
            }
            // try to read another line:
            String nextLine = curReader.readLine();
            if (nextLine != null) {
                key.set(curFile + ":" + curLine);
                value.set(nextLine);
                curLine++;
                return true;
            }
            // current file is exhausted: release it before moving on, so
            // that streams do not accumulate over the life of the split
            closeCurrentReader();
        }
    }

    /**
     * Opens the named file as a UTF-8 line reader, decompressing it when
     * compression is forced or the name ends with ".gz".
     */
    private BufferedReader openReader(String file) throws IOException {
        InputStream is = fileSystem.open(new Path(file));
        boolean opened = false;
        try {
            if (forceCompressed || file.endsWith(".gz")) {
                is = codec.createInputStream(is);
            }
            BufferedReader reader = new BufferedReader(new InputStreamReader(is, UTF8));
            opened = true;
            return reader;
        } finally {
            if (!opened) {
                // wrapping failed: do not leak the underlying stream
                try {
                    is.close();
                } catch (IOException ignored) {
                    // the failure that brought us here is the one to report
                }
            }
        }
    }

    /**
     * Closes the reader over the current dereferenced file, if any, and
     * resets the per-file state even when closing fails.
     */
    private void closeCurrentReader() throws IOException {
        BufferedReader reader = curReader;
        curReader = null;
        curFile = null;
        curLine = 0;
        if (reader != null) {
            reader.close();
        }
    }

    /**
     * @return the key of the current record, {@code <path>:<lineNumber>}
     */
    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
        return key;
    }

    /**
     * @return the value of the current record, one line of the current file
     */
    @Override
    public Text getCurrentValue() throws IOException, InterruptedException {
        return value;
    }

    /**
     * @return the progress of the internal pointer reader, as sampled when
     *         the most recent pointer was read
     */
    @Override
    public float getProgress() throws IOException, InterruptedException {
        return progress;
    }

    /**
     * Closes the dereferenced file that is currently open, if any, and then
     * the internal pointer reader. The internal reader is closed even if
     * closing the dereferenced file fails.
     *
     * @throws IOException if either reader fails to close
     */
    @Override
    public void close() throws IOException {
        try {
            closeCurrentReader();
        } finally {
            internal.close();
        }
    }
}