kogiri.common.hadoop.io.reader.fasta.FastaReadDescriptionReader.java Source code

Java tutorial

Introduction

Here is the source code for kogiri.common.hadoop.io.reader.fasta.FastaReadDescriptionReader.java

Source

/*
 * Copyright (C) 2015 iychoi
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
package kogiri.common.hadoop.io.reader.fasta;

import java.io.IOException;
import kogiri.common.fasta.FastaRead;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.util.LineReader;

/**
 * Record reader that walks a FASTA input split and emits one record per read
 * header (a line whose first character is {@link #READ_DELIMITER}).
 *
 * <p>The key is the byte offset at which the header line starts, as counted by
 * this reader (for compressed input the count is over the decompressed
 * stream). The value is a {@link FastaRead} carrying the file name, that
 * offset and the header line text; the sequence is always set to
 * {@code null} because sequence lines are only skipped over, never stored.
 *
 * <p>Split ownership: a read belongs to the split in which its header line
 * <em>starts</em>. Compressed input is treated as a single unbounded split.
 *
 * @author iychoi
 */
public class FastaReadDescriptionReader extends RecordReader<LongWritable, FastaRead> {

    private static final Log LOG = LogFactory.getLog(FastaReadDescriptionReader.class);

    public static final char READ_DELIMITER = '>';

    /** Size in bytes of the ISIZE field that ends a gzip member. */
    private static final int GZIP_ISIZE_LENGTH = 4;

    private CompressionCodecFactory compressionCodecs = null;
    private long start;
    private long pos;
    private long end;
    private LineReader in;
    private int maxLineLength;
    private String filename;
    private boolean hasNextRecord;
    private LongWritable key;
    private FastaRead value;
    private Text prevLine;
    private long prevSize;
    private boolean isCompressed;
    private long uncompressedSize;
    private boolean firstRead = true;

    /**
     * @return the offset key of the current record, or {@code null} when no
     *         record has been read yet or the input is exhausted
     */
    @Override
    public LongWritable getCurrentKey() throws IOException, InterruptedException {
        return this.key;
    }

    /**
     * @return the current read description, or {@code null} when no record
     *         has been read yet or the input is exhausted
     */
    @Override
    public FastaRead getCurrentValue() throws IOException, InterruptedException {
        return this.value;
    }

    /**
     * Opens the split's file and positions the reader on the first read
     * header located at or after the split start.
     *
     * @param genericSplit the split to read; must be a {@link FileSplit}
     * @param context      task context supplying the job configuration
     * @throws IOException if the file cannot be opened or read
     */
    @Override
    public void initialize(InputSplit genericSplit, TaskAttemptContext context)
            throws IOException, InterruptedException {

        FileSplit split = (FileSplit) genericSplit;
        Configuration job = context.getConfiguration();
        this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
        this.start = split.getStart();
        this.end = this.start + split.getLength();
        final Path file = split.getPath();
        this.compressionCodecs = new CompressionCodecFactory(job);
        final CompressionCodec codec = this.compressionCodecs.getCodec(file);

        this.filename = file.getName();

        this.firstRead = true;

        FileSystem fs = file.getFileSystem(job);

        if (codec instanceof GzipCodec) {
            // gzip records the uncompressed length (mod 2^32) in its trailer;
            // it is only used for progress reporting.
            this.isCompressed = true;

            LOG.info("compressed input : " + file.getName());
            LOG.info("compressed file size : " + this.end);
            this.uncompressedSize = readGzipUncompressedSize(fs, file, this.end);
            LOG.info("uncompressed file size : " + this.uncompressedSize);

            // Offsets in a decompressed stream are unrelated to the split
            // bounds, so read until EOF.
            this.end = Long.MAX_VALUE;
        } else if (codec != null) {
            this.isCompressed = true;
            this.end = Long.MAX_VALUE;
            this.uncompressedSize = Long.MAX_VALUE;
        } else {
            this.isCompressed = false;
        }

        // open the file and seek to the start of the split
        FSDataInputStream fileIn = fs.open(file);
        boolean readerReady = false;
        try {
            if (codec != null) {
                this.in = new LineReader(codec.createInputStream(fileIn), job);
            } else {
                if (this.start != 0) {
                    fileIn.seek(this.start);
                }
                this.in = new LineReader(fileIn, job);
            }
            readerReady = true;
        } finally {
            if (!readerReady) {
                // Do not leak the raw stream when wrapping it fails.
                IOUtils.closeStream(fileIn);
            }
        }

        // Reset scan state so an empty split (or a re-initialization) never
        // sees stale values; "no record" leaves pos parked at the split end.
        this.hasNextRecord = false;
        this.prevLine = null;
        this.prevSize = 0;
        this.pos = this.end;

        // skip lines until we meet new record start
        // NOTE(review): if a split starts in the middle of a header line whose
        // byte at that offset is '>', it is taken for a header — confirm
        // whether descriptions may contain '>'.
        while (this.start < this.end) {
            Text line = new Text();
            int consumed = readLineFrom(line, this.start);
            if (consumed == 0) {
                // EOF
                break;
            }

            if (isReadHeader(line)) {
                this.prevLine = line;
                this.prevSize = consumed;
                this.hasNextRecord = true;
                this.pos = this.start;
                break;
            }

            this.start += consumed;
        }

        this.key = null;
        this.value = null;
    }

    /**
     * Emits the read whose header was found last, then scans forward (skipping
     * sequence lines) to the following header.
     *
     * @return {@code true} if a record was produced, {@code false} when the
     *         split is exhausted
     * @throws IOException if reading from the underlying stream fails
     */
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (!this.hasNextRecord) {
            this.pos = this.end;
            this.prevLine = null;
            this.prevSize = 0;
            this.key = null;
            this.value = null;
            this.hasNextRecord = false;
            return false;
        }

        long readStartOffset = this.pos;
        this.key = new LongWritable(readStartOffset);
        this.value = new FastaRead(this.filename);

        Text description = this.prevLine;
        this.pos += this.prevSize;

        // seek to new record start
        boolean foundNextRead = false;
        while (true) {
            Text line = new Text();
            int consumed = readLineFrom(line, this.pos);
            if (consumed == 0) {
                // EOF
                this.prevLine = null;
                this.prevSize = 0;
                this.pos = this.end;
                break;
            }

            if (isReadHeader(line)) {
                this.prevLine = line;
                this.prevSize = consumed;
                // A read is owned by the split in which its header STARTS.
                // Testing the header's end offset instead would drop reads
                // whose header line touches or straddles the boundary, since
                // the following split skips up to its first full header.
                foundNextRead = this.pos < this.end;
                break;
            }

            this.pos += consumed;
        }

        this.value.setReadOffset(readStartOffset);
        this.value.setDescription(description.toString());
        this.value.setSequence(null);
        // Only the first read emitted by this reader is flagged as
        // non-continuous.
        this.value.setContinuousRead(!this.firstRead);
        this.firstRead = false;

        this.hasNextRecord = foundNextRead;
        return true;
    }

    /**
     * @return the fraction of the input consumed so far, in {@code [0, 1]};
     *         for compressed input it is measured against the uncompressed
     *         size determined in {@link #initialize}
     */
    @Override
    public float getProgress() throws IOException {
        long limit = this.isCompressed ? this.uncompressedSize : this.end;
        if (this.start == limit) {
            return 0.0f;
        }
        float fraction = (this.pos - this.start) / (float) (limit - this.start);
        return Math.max(0.0f, Math.min(1.0f, fraction));
    }

    /**
     * Closes the underlying line reader, if one was opened.
     */
    @Override
    public void close() throws IOException {
        if (this.in != null) {
            this.in.close();
        }
    }

    /**
     * Reads one line, allowing the reader to consume at least up to the split
     * end (or {@code maxLineLength}, whichever is larger).
     *
     * @param line   receives the line content
     * @param offset current logical offset, used to size the consume budget
     * @return number of bytes consumed, {@code 0} at end of stream
     */
    private int readLineFrom(Text line, long offset) throws IOException {
        int remaining = (int) Math.min(Integer.MAX_VALUE, this.end - offset);
        return this.in.readLine(line, this.maxLineLength, Math.max(remaining, this.maxLineLength));
    }

    /** Tells whether the line starts a new read (first char is the delimiter). */
    private static boolean isReadHeader(Text line) {
        return line.getLength() > 0 && line.charAt(0) == READ_DELIMITER;
    }

    /**
     * Reads the ISIZE trailer of a gzip file.
     *
     * @param compressedLength length of the compressed file in bytes
     * @return the uncompressed size modulo 2^32, or {@code compressedLength}
     *         when the file is too short to hold a trailer
     */
    private static long readGzipUncompressedSize(FileSystem fs, Path file, long compressedLength)
            throws IOException {
        if (compressedLength < GZIP_ISIZE_LENGTH) {
            return compressedLength;
        }

        byte[] trailer = new byte[GZIP_ISIZE_LENGTH];
        FSDataInputStream trailerIn = fs.open(file);
        try {
            // seek() positions exactly; skip() may legally skip fewer bytes.
            trailerIn.seek(compressedLength - GZIP_ISIZE_LENGTH);
            IOUtils.readFully(trailerIn, trailer, 0, trailer.length);
        } finally {
            trailerIn.close();
        }
        return decodeLittleEndianUInt32(trailer);
    }

    /**
     * Decodes four little-endian bytes as an unsigned 32-bit value. Each byte
     * is masked first because Java bytes are signed and would otherwise
     * sign-extend into the higher bits.
     */
    private static long decodeLittleEndianUInt32(byte[] bytes) {
        return ((bytes[3] & 0xFFL) << 24)
                | ((bytes[2] & 0xFFL) << 16)
                | ((bytes[1] & 0xFFL) << 8)
                | (bytes[0] & 0xFFL);
    }
}