Java tutorial: a Hadoop RecordReader for FASTA input (FastaRawReadReader)
/*
 * Copyright (C) 2015 iychoi
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */
package kogiri.common.hadoop.io.reader.fasta;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import kogiri.common.fasta.FastaRawRead;
import kogiri.common.fasta.FastaRawReadLine;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.util.LineReader;

/**
 * RecordReader that turns a FASTA file into (file offset, FastaRawRead) pairs.
 * Each record begins at a '>' description line and spans the sequence lines
 * that follow it, up to the next '>' line or end of file.
 *
 * @author iychoi
 */
public class FastaRawReadReader extends RecordReader<LongWritable, FastaRawRead> {

    private static final Log LOG = LogFactory.getLog(FastaRawReadReader.class);

    public static final char READ_DELIMITER = '>';

    private CompressionCodecFactory compressionCodecs = null;
    private long start;
    private long pos;
    private long end;
    private LineReader in;
    private int maxLineLength;
    private String filename;
    private boolean hasNextRead;
    private LongWritable key;
    private FastaRawRead value;
    private Text prevLine;
    private long prevSize;
    private boolean isCompressed;
    private long uncompressedSize;
    private boolean firstRead = true;

    @Override
    public LongWritable getCurrentKey() throws IOException, InterruptedException {
        return this.key;
    }

    @Override
    public FastaRawRead getCurrentValue() throws IOException, InterruptedException {
        return this.value;
    }

    @Override
    public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException, InterruptedException {
        FileSplit split = (FileSplit) genericSplit;
        Configuration job = context.getConfiguration();
        this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
        this.start = split.getStart();
        this.end = this.start + split.getLength();
        final Path file = split.getPath();
        this.compressionCodecs = new CompressionCodecFactory(job);
        final CompressionCodec codec = this.compressionCodecs.getCodec(file);
        this.filename = file.getName();
        this.firstRead = true;

        // open the file and seek to the start of the split
        FileSystem fs = file.getFileSystem(job);

        // get uncompressed length
        if (codec instanceof GzipCodec) {
            this.isCompressed = true;

            FSDataInputStream fileInCheckSize = fs.open(file);
            byte[] len = new byte[4];
            try {
                LOG.info("compressed input : " + file.getName());
                LOG.info("compressed file size : " + this.end);
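                // gzip stores the uncompressed input size (ISIZE) in the last 4
                // bytes of the member, little-endian; read it from the file tail.
                // Note: ISIZE holds the size modulo 2^32, so this is only an
                // estimate for inputs larger than 4 GB.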
                fileInCheckSize.skip(this.end - 4);
                IOUtils.readFully(fileInCheckSize, len, 0, len.length);
                // mask each byte to avoid sign extension when assembling the value
                this.uncompressedSize = ((len[3] & 0xffL) << 24) | ((len[2] & 0xffL) << 16) | ((len[1] & 0xffL) << 8) | (len[0] & 0xffL);
                if (this.uncompressedSize < 0) {
                    this.uncompressedSize = this.end;
                }
                LOG.info("uncompressed file size : " + this.uncompressedSize);
            } finally {
                fileInCheckSize.close();
            }
            this.end = Long.MAX_VALUE;
        } else if (codec != null) {
            this.isCompressed = true;
            this.end = Long.MAX_VALUE;
            this.uncompressedSize = Long.MAX_VALUE;
        } else {
            this.isCompressed = false;
        }

        // get inputstream
        FSDataInputStream fileIn = fs.open(file);

        if (codec != null) {
            this.in = new LineReader(codec.createInputStream(fileIn), job);
        } else {
            if (this.start != 0) {
                fileIn.seek(this.start);
            }
            this.in = new LineReader(fileIn, job);
        }

        // skip lines until we meet new read start
        while (this.start < this.end) {
            Text skipText = new Text();
            long newSize = this.in.readLine(skipText, this.maxLineLength,
                    Math.max((int) Math.min(Integer.MAX_VALUE, this.end - this.start), this.maxLineLength));
            if (newSize == 0) {
                // EOF
                this.hasNextRead = false;
                this.pos = this.end;
                break;
            }

            if (skipText.getLength() > 0 && skipText.charAt(0) == READ_DELIMITER) {
                this.prevLine = skipText;
                this.prevSize = newSize;
                this.hasNextRead = true;
                this.pos = this.start;
                break;
            }

            this.start += newSize;

            if (this.start >= this.end) {
                // EOF
                this.hasNextRead = false;
                this.pos = this.end;
                break;
            }
        }

        this.key = null;
        this.value = null;
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        // seek to new read start
        if (this.hasNextRead) {
            this.key = new LongWritable(this.pos);
            this.value = new FastaRawRead(this.filename);

            Text description = this.prevLine;
            this.pos += this.prevSize;

            long readStartOffset = this.key.get();
            long descriptionStartOffset = readStartOffset + 1;
            long sequenceStartOffset = this.pos;
            long descriptionLen = sequenceStartOffset - descriptionStartOffset;

            List<String> sequences = new ArrayList<String>();
            List<Long> sequenceStarts = new ArrayList<Long>();

            // collect sequence lines until the next '>' header or EOF
            boolean foundNextRead = false;
            while (!foundNextRead) {
                Text newLine = new Text();
                long newSize = this.in.readLine(newLine, this.maxLineLength,
                        Math.max((int) Math.min(Integer.MAX_VALUE, this.end - this.pos), this.maxLineLength));
                if (newSize == 0) {
                    // EOF
                    this.prevLine = null;
                    this.prevSize = 0;
                    this.pos = this.end;
                    break;
                }

                if (newLine.getLength() > 0 && newLine.charAt(0) == READ_DELIMITER) {
                    this.prevLine = newLine;
                    this.prevSize = newSize;
                    if (this.pos + newSize < this.end) {
                        foundNextRead = true;
                    } else {
                        foundNextRead = false;
                    }
                    break;
                } else {
                    sequences.add(newLine.toString());
                    sequenceStarts.add(this.pos);
                }

                this.pos += newSize;
            }

            long newReadStartOffset = this.pos;
            long readLen = newReadStartOffset - readStartOffset;
            long sequenceLen = newReadStartOffset - sequenceStartOffset;

            this.value.setReadOffset(readStartOffset);
            this.value.setDescriptionOffset(descriptionStartOffset);
            this.value.setSequenceOffset(sequenceStartOffset);
            this.value.setReadLen(readLen);
            this.value.setDescriptionLen(descriptionLen);
            this.value.setSequenceLen(sequenceLen);
            this.value.setDescription(description.toString());

            if (this.firstRead) {
                this.value.setContinuousRead(false);
                this.firstRead = false;
            } else {
                this.value.setContinuousRead(true);
            }

            FastaRawReadLine[] readLines = new FastaRawReadLine[sequences.size()];
            for (int i = 0; i < sequences.size(); i++) {
                readLines[i] = new FastaRawReadLine(sequenceStarts.get(i), sequences.get(i));
            }
            this.value.setRawSequence(readLines);

            this.hasNextRead = foundNextRead;
            return true;
        } else {
            this.pos = this.end;
            this.prevLine = null;
            this.prevSize = 0;
            this.key = null;
            this.value = null;
            this.hasNextRead = false;
            return false;
        }
    }

    @Override
    public float getProgress() throws IOException {
        if (this.isCompressed) {
            if (this.start == this.uncompressedSize) {
                return 0.0f;
            } else {
                return Math.min(1.0f, (this.pos - this.start) / (float) (this.uncompressedSize - this.start));
            }
        } else {
            if (this.start == this.end) {
                return 0.0f;
            } else {
                return Math.min(1.0f, (this.pos - this.start) / (float) (this.end - this.start));
            }
        }
    }

    @Override
    public void close() throws IOException {
        if (this.in != null) {
            this.in.close();
        }
    }
}
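To use this reader in a MapReduce job, it has to be returned by an InputFormat. The Kogiri project presumably pairs it with its own input format class; the sketch below is a minimal, hypothetical FastaRawReadInputFormat written only to illustrate the wiring. The class name, the splitability policy, and the omitted package declaration are assumptions, not part of the original source.

import java.io.IOException;

import kogiri.common.fasta.FastaRawRead;
import kogiri.common.hadoop.io.reader.fasta.FastaRawReadReader;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

// Hypothetical input format that hands each split to a FastaRawReadReader.
public class FastaRawReadInputFormat extends FileInputFormat<LongWritable, FastaRawRead> {

    @Override
    public RecordReader<LongWritable, FastaRawRead> createRecordReader(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        return new FastaRawReadReader();
    }

    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        // FastaRawReadReader streams a compressed file from the beginning
        // (initialize() sets end to Long.MAX_VALUE), so only allow splitting
        // for uncompressed inputs.
        CompressionCodecFactory factory = new CompressionCodecFactory(context.getConfiguration());
        return factory.getCodec(file) == null;
    }
}

A driver would then register the format with job.setInputFormatClass(FastaRawReadInputFormat.class) and receive FastaRawRead values, keyed by the byte offset of each '>' line, in its mapper.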