Java tutorial: a custom Hadoop InputFormat that reads each file as a single record (Mavuno's TextFileInputFormat)
/*
 * Mavuno: A Hadoop-Based Text Mining Toolkit
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package edu.isi.mavuno.input;

import java.io.DataInputStream;
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

/**
 * An InputFormat that treats each input file (optionally compressed) as a
 * single record: the key is the file's starting offset and the value is a
 * TextDocument holding the file name and the file's entire contents.
 */
public class TextFileInputFormat extends FileInputFormat<LongWritable, TextDocument> {

    @Override
    public RecordReader<LongWritable, TextDocument> createRecordReader(InputSplit split, TaskAttemptContext context) {
        Configuration conf = context.getConfiguration();
        try {
            return new TextFileRecordReader((FileSplit) split, conf);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    // Each file is read in full as one record, so splitting is disabled.
    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        return false;
    }

    public static class TextFileRecordReader extends RecordReader<LongWritable, TextDocument> {

        private final LongWritable mCurKey = new LongWritable();
        private final TextDocument mCurValue = new TextDocument();

        private long mStart;
        private long mEnd;
        private long mPos;
        private long mRecordStartPos;

        private final String mFileName;

        private DataInputStream mInput = null;

        private final DataOutputBuffer mBuffer = new DataOutputBuffer();

        public TextFileRecordReader(FileSplit split, Configuration conf) throws IOException {
            mStart = split.getStart();

            Path file = split.getPath();
            mFileName = file.getName();

            CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf);
            CompressionCodec codec = compressionCodecs.getCodec(file);

            FileSystem fs = file.getFileSystem(conf);

            if (codec != null) {
                // Compressed input: the decompressed length is unknown, so
                // read until end of stream.
                mInput = new DataInputStream(codec.createInputStream(fs.open(file)));
                mEnd = Long.MAX_VALUE;
            } else {
                FSDataInputStream fileIn = fs.open(file);
                fileIn.seek(mStart);
                mInput = fileIn;
                mEnd = mStart + split.getLength();
            }

            mRecordStartPos = mStart;

            // Because input streams of gzipped files are not seekable
            // (specifically, do not support getPos), we need to keep
            // track of bytes consumed ourselves.
            mPos = mStart;
        }

        @Override
        public void close() throws IOException {
            if (mInput != null) {
                mInput.close();
            }
        }

        @Override
        public float getProgress() throws IOException {
            return ((float) (mPos - mStart)) / ((float) (mEnd - mStart));
        }

        @Override
        public LongWritable getCurrentKey() throws IOException, InterruptedException {
            return mCurKey;
        }

        @Override
        public TextDocument getCurrentValue() throws IOException, InterruptedException {
            return mCurValue;
        }

        @Override
        public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
            mBuffer.reset();
        }

        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            if (mInput == null) {
                return false;
            }

            // Read the whole file (or remaining split) into the buffer.
            while (mPos < mEnd) {
                int b = mInput.read();

                // increment position (bytes consumed)
                mPos++;

                // end of file
                if (b == -1) {
                    break;
                }

                mBuffer.write(b);
            }

            mCurKey.set(mRecordStartPos);
            // Only the first getLength() bytes of the backing array are valid;
            // passing the whole array would append stale bytes to the document.
            mCurValue.set(mFileName, new String(mBuffer.getData(), 0, mBuffer.getLength()));

            mInput.close();
            mInput = null;

            return true;
        }
    }
}
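To show where this class fits, below is a minimal, hypothetical driver that wires TextFileInputFormat into a MapReduce job. Everything in it other than TextFileInputFormat and TextDocument is an illustrative assumption: the class name, the argument handling, the map-only configuration, and the SequenceFile output. It assumes Hadoop 2.x or later (for Job.getInstance) and that it lives in the same edu.isi.mavuno.input package so TextDocument resolves without an extra import. The identity Mapper is used deliberately, so nothing is assumed about TextDocument beyond it being a Writable.

// Hypothetical driver for illustration only; not part of Mavuno.
package edu.isi.mavuno.input;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

public class TextFileInputFormatDemo {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "whole-file records demo");
        job.setJarByClass(TextFileInputFormatDemo.class);

        // Each input file becomes one (offset, TextDocument) record.
        job.setInputFormatClass(TextFileInputFormat.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));

        // Identity mapper, no reducers: records pass straight through to a
        // SequenceFile, so no accessors on TextDocument are assumed.
        job.setMapperClass(Mapper.class);
        job.setNumReduceTasks(0);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(TextDocument.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

A run would look like: hadoop jar <your-jar> edu.isi.mavuno.input.TextFileInputFormatDemo <input dir> <output dir>, where the jar name and paths are placeholders.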