libra.common.hadoop.io.reader.fasta.FastaKmerReader.java Source code

Java tutorial

Introduction

Here is the source code for libra.common.hadoop.io.reader.fasta.FastaKmerReader.java

Source

/*
 * Copyright 2016 iychoi.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package libra.common.hadoop.io.reader.fasta;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import libra.common.fasta.FastaRawRead;
import libra.common.fasta.FastaRawReadLine;
import libra.common.hadoop.io.format.fasta.FastaKmerInputFormat;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.util.LineReader;

/**
 * Record reader that turns the lines of a FASTA input split into a stream of
 * overlapping k-mers.
 *
 * <p>Each record's key is the reader's byte position at the moment the record
 * was requested and its value is a string of exactly {@code k} characters,
 * where {@code k} comes from {@link FastaKmerInputFormat#getKmerSize}. Lines
 * are trimmed and appended to an internal buffer; every call emits the first
 * {@code k} characters of that buffer and then drops its first character.
 *
 * <p>For uncompressed inputs that do not start at offset 0, the reader skips
 * the line the split starts in and seeds the buffer with the last
 * {@code k - 1} characters of the preceding line so that k-mers spanning the
 * split boundary are not lost.
 *
 * <p>NOTE(review): lines beginning with {@link #READ_DELIMITER} are only
 * inspected while seeding the buffer at a split boundary; {@link #nextKeyValue()}
 * appends every line it reads, header or not. Confirm whether headers are
 * expected to be filtered elsewhere.
 *
 * @author iychoi
 */
public class FastaKmerReader extends RecordReader<LongWritable, Text> {

    private static final Log LOG = LogFactory.getLog(FastaKmerReader.class);

    public static final char READ_DELIMITER = '>';

    /** How far before a split start we re-read to recover the preceding line. */
    private static final long BACKOFF_BYTES = 1000L;

    /** Width of the ISIZE field that terminates a gzip member. */
    private static final int GZIP_ISIZE_BYTES = 4;

    private int kmersize = 0;
    private CompressionCodecFactory compressionCodecs = null;
    private long start;
    private long pos;
    private long end;
    private LineReader in;
    private int maxLineLength;
    private LongWritable key;
    private Text value;
    private Text buffer;
    private Text tempLine;
    private boolean isCompressed;
    private long uncompressedSize;

    /**
     * @return the key of the current record, or {@code null} before the first
     *         record and after the input is exhausted
     */
    @Override
    public LongWritable getCurrentKey() throws IOException, InterruptedException {
        return this.key;
    }

    /**
     * @return the current k-mer, or {@code null} before the first record and
     *         after the input is exhausted
     */
    @Override
    public Text getCurrentValue() throws IOException, InterruptedException {
        return this.value;
    }

    /**
     * Opens the split's file and positions the reader on the first line that
     * belongs to this split.
     *
     * @param genericSplit the split to read; must be a {@link FileSplit}
     * @param context task context supplying the job configuration
     * @throws IOException if the file cannot be opened or read, or if the
     *         configured k-mer size is not positive
     */
    @Override
    public void initialize(InputSplit genericSplit, TaskAttemptContext context)
            throws IOException, InterruptedException {
        FileSplit split = (FileSplit) genericSplit;
        Configuration conf = context.getConfiguration();
        this.kmersize = FastaKmerInputFormat.getKmerSize(conf);
        if (this.kmersize <= 0) {
            // fail early with a clear message instead of a substring error later
            throw new IOException("invalid k-mer size : " + this.kmersize);
        }
        this.maxLineLength = conf.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
        this.start = split.getStart();
        this.end = this.start + split.getLength();
        final Path file = split.getPath();
        this.compressionCodecs = new CompressionCodecFactory(conf);
        final CompressionCodec codec = this.compressionCodecs.getCodec(file);

        FileSystem fs = file.getFileSystem(conf);

        // work out how many uncompressed bytes to expect (used by getProgress)
        if (codec instanceof GzipCodec) {
            this.isCompressed = true;
            LOG.info("compressed input : " + file.getName());
            LOG.info("compressed file size : " + this.end);
            this.uncompressedSize = readGzipUncompressedSize(fs, file, this.end);
            LOG.info("uncompressed file size : " + this.uncompressedSize);

            // a compressed stream is read until EOF, not until a byte offset
            this.end = Long.MAX_VALUE;
        } else if (codec != null) {
            this.isCompressed = true;
            this.end = Long.MAX_VALUE;
            this.uncompressedSize = Long.MAX_VALUE;
        } else {
            this.isCompressed = false;
        }

        // open the data stream; release it again if wrapping or seeking fails
        FSDataInputStream fileIn = fs.open(file);
        boolean inTheMiddle = false;
        boolean readerReady = false;
        try {
            if (codec != null) {
                this.in = new LineReader(codec.createInputStream(fileIn), conf);
            } else {
                if (this.start != 0) {
                    // step back one byte so a split starting exactly on a line
                    // boundary does not lose that line when the first line is skipped
                    this.start--;
                    fileIn.seek(this.start);

                    inTheMiddle = true;
                }
                this.in = new LineReader(fileIn, conf);
            }
            readerReady = true;
        } finally {
            if (!readerReady) {
                IOUtils.closeStream(fileIn);
            }
        }

        this.buffer = new Text();

        if (inTheMiddle) {
            // skip the line the split starts in (maxLineLength 0 = store nothing)
            this.start += this.in.readLine(new Text(), 0,
                    (int) Math.min((long) Integer.MAX_VALUE, this.end - this.start));

            // seed the buffer with the tail of the preceding line, if any
            this.buffer.set(readCarryOver(fs, file, conf));
        }

        this.pos = this.start;

        this.key = null;
        this.value = null;
    }

    /**
     * Reads the ISIZE trailer of a gzip file.
     *
     * <p>The trailer is the little-endian, unsigned 32-bit uncompressed size
     * modulo 2^32, so the result is only exact for inputs below 4 GiB.
     *
     * @param fs file system holding {@code file}
     * @param file the gzip file
     * @param compressedSize length of the compressed file in bytes
     * @return the decoded ISIZE value, or {@code compressedSize} when the file
     *         is too short to contain a trailer
     * @throws IOException if the trailer cannot be read
     */
    private static long readGzipUncompressedSize(FileSystem fs, Path file, long compressedSize)
            throws IOException {
        if (compressedSize < GZIP_ISIZE_BYTES) {
            return compressedSize;
        }

        byte[] len = new byte[GZIP_ISIZE_BYTES];
        FSDataInputStream trailerIn = fs.open(file);
        try {
            // seek is exact; InputStream.skip may legally skip fewer bytes
            trailerIn.seek(compressedSize - GZIP_ISIZE_BYTES);
            IOUtils.readFully(trailerIn, len, 0, len.length);
        } finally {
            trailerIn.close();
        }
        return decodeUnsignedIntLE(len);
    }

    /**
     * Decodes four little-endian bytes as an unsigned 32-bit value.
     *
     * @param b array whose first four bytes hold the value
     * @return the value in the range {@code [0, 2^32)}
     */
    private static long decodeUnsignedIntLE(byte[] b) {
        // mask each byte: Java bytes are signed and would otherwise sign-extend
        return ((b[3] & 0xFFL) << 24)
                | ((b[2] & 0xFFL) << 16)
                | ((b[1] & 0xFFL) << 8)
                | (b[0] & 0xFFL);
    }

    /**
     * Re-reads the bytes just before {@code this.start} and returns the part of
     * the preceding line that must be carried into this split.
     *
     * @param fs file system holding {@code file}
     * @param file the input file
     * @param conf configuration handed to the {@link LineReader}
     * @return an empty string when the preceding line starts with
     *         {@link #READ_DELIMITER}; otherwise the last {@code k - 1}
     *         characters of that line (the whole line if it is shorter)
     * @throws IOException if the file cannot be read
     */
    private String readCarryOver(FileSystem fs, Path file, Configuration conf) throws IOException {
        Text previousLine = new Text();
        long curpos = Math.max(0L, this.start - BACKOFF_BYTES);

        FSDataInputStream backIn = fs.open(file);
        try {
            backIn.seek(curpos);
            LineReader backReader = new LineReader(backIn, conf);
            while (curpos < this.start) {
                // the line content is needed here, so it must actually be stored
                int consumed = backReader.readLine(previousLine, Integer.MAX_VALUE,
                        (int) (this.start - curpos));
                if (consumed == 0) {
                    // EOF
                    break;
                }
                curpos += consumed;
            }
        } finally {
            backIn.close();
        }

        if (previousLine.getLength() > 0 && previousLine.charAt(0) == READ_DELIMITER) {
            // clean start
            return "";
        }

        // leave k-1 seq in the buffer
        String seq = previousLine.toString().trim();
        int keep = Math.min(seq.length(), this.kmersize - 1);
        return seq.substring(seq.length() - keep);
    }

    /**
     * Advances to the next k-mer.
     *
     * <p>Lines are read and appended to the buffer until it holds at least
     * {@code k} characters, so blank or short lines no longer cause a failure.
     *
     * @return {@code true} if a k-mer is available through
     *         {@link #getCurrentKey()} / {@link #getCurrentValue()};
     *         {@code false} once the split is exhausted
     * @throws IOException if reading the input fails
     */
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (this.buffer == null) {
            // the end of input was already reported by an earlier call
            return false;
        }

        if (this.key == null) {
            this.key = new LongWritable();
        }
        this.key.set(this.pos);

        if (this.value == null) {
            this.value = new Text();
        }
        if (this.tempLine == null) {
            this.tempLine = new Text();
        }

        // compare character counts, since substring below works on characters
        String pending = this.buffer.toString();
        while (pending.length() < this.kmersize) {
            if (!readNextLine()) {
                this.key = null;
                this.value = null;
                this.buffer = null;
                return false;
            }
            pending = pending + this.tempLine.toString().trim();
        }

        this.value.set(pending.substring(0, this.kmersize));
        // slide the window by one character for the next call
        this.buffer.set(pending.substring(1));
        return true;
    }

    /**
     * Reads the next line of the split into {@code tempLine}, skipping lines
     * that reach the configured maximum length.
     *
     * @return {@code false} when nothing was read because the split end or EOF
     *         was reached, {@code true} otherwise
     * @throws IOException if reading the input fails
     */
    private boolean readNextLine() throws IOException {
        int newSize = 0;
        while (this.pos < this.end) {
            newSize = this.in.readLine(this.tempLine, this.maxLineLength,
                    (int) Math.max(Math.min(Integer.MAX_VALUE, this.end - this.pos), this.maxLineLength));
            if (newSize == 0) {
                // EOF
                break;
            }

            this.pos += newSize;
            if (newSize < this.maxLineLength) {
                break;
            }

            // line too long
            LOG.info("Skipped line of size " + newSize + " at pos " + (this.pos - newSize));
        }
        return newSize != 0;
    }

    /**
     * Reports how much of the split has been consumed.
     *
     * @return a fraction in the range up to {@code 1.0}; for compressed input
     *         it is measured against the estimated uncompressed size
     */
    @Override
    public float getProgress() throws IOException {
        long total = this.isCompressed ? this.uncompressedSize : this.end;
        if (this.start == total) {
            return 0.0f;
        }
        return Math.min(1.0f, (this.pos - this.start) / (float) (total - this.start));
    }

    /**
     * Closes the underlying line reader, if one was opened.
     */
    @Override
    public void close() throws IOException {
        if (this.in != null) {
            this.in.close();
        }
    }
}