Java tutorial
// Copyright (c) 2012 Aalto University // // This file is part of Hadoop-BAM. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal in the Software without restriction, including without limitation the // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or // sell copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS // IN THE SOFTWARE. package fi.tkk.ics.hadoop.bam; import hbparquet.hadoop.util.ContextUtil; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.List; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.compress.CompressionCodec; import org.apache.hadoop.io.compress.CompressionCodecFactory; import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.JobContext; import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.input.FileSplit; /** * Reads the FASTA reference sequence format. * Key: sequence description and position offset, delimited by ':' characters. * Value: a ReferenceFragment object representing the entry. * * Note: here sections in the input file are assumed to be delimited by single * line descriptions that start with '>'. */ public class FastaInputFormat extends FileInputFormat<Text, ReferenceFragment> { @Override public List<InputSplit> getSplits(JobContext job) throws IOException { // Note: We generate splits that correspond to different sections in the FASTA // input (which here are called "chromosomes", delimited by '>' and // followed by a single line description. // Some locality is preserved since the locations are formed from the input // splits, although no special attention is given to this issues (FASTA files // are assumed to be smallish). // The splits are generated on the client. In the future the split generation // should be only performed once and an index file stored inside HDFS for // peformance reasons. Currently this is not attempted (again: FASTA files // aren't all that big). // we first make sure we are given only a single file List<InputSplit> splits = super.getSplits(job); // first sort by input path Collections.sort(splits, new Comparator<InputSplit>() { public int compare(InputSplit a, InputSplit b) { FileSplit fa = (FileSplit) a, fb = (FileSplit) b; return fa.getPath().compareTo(fb.getPath()); } }); for (int i = 0; i < splits.size() - 1;) { FileSplit fa = (FileSplit) splits.get(i); FileSplit fb = (FileSplit) splits.get(i + 1); if (fa.getPath().compareTo(fb.getPath()) != 0) throw new IOException("FastaInputFormat assumes single FASTA input file!"); } // now we are sure we only have one FASTA input file final List<InputSplit> newSplits = new ArrayList<InputSplit>(splits.size()); FileSplit fileSplit = (FileSplit) splits.get(0); Path path = fileSplit.getPath(); FileSystem fs = path.getFileSystem(ContextUtil.getConfiguration(job)); FSDataInputStream fis = fs.open(path); byte[] buffer = new byte[1024]; long byte_counter = 0; long prev_chromosome_byte_offset = 0; boolean first_chromosome = true; for (int j = 0; j < splits.size(); j++) { FileSplit origsplit = (FileSplit) splits.get(j); while (byte_counter < origsplit.getStart() + origsplit.getLength()) { long bytes_read = fis.read(byte_counter, buffer, 0, (int) Math.min(buffer.length, origsplit.getStart() + origsplit.getLength() - byte_counter)); //System.err.println("bytes_read: "+Integer.toString((int)bytes_read)+" of "+Integer.toString(splits.size())+" splits"); if (bytes_read > 0) { for (int i = 0; i < bytes_read; i++) { if (buffer[i] == (byte) '>') { //System.err.println("found chromosome at position "+Integer.toString((int)byte_counter+i)); if (!first_chromosome) { FileSplit fsplit = new FileSplit(path, prev_chromosome_byte_offset, byte_counter + i - 1 - prev_chromosome_byte_offset, origsplit.getLocations()); //System.err.println("adding split: start: "+Integer.toString((int)fsplit.getStart())+" length: "+Integer.toString((int)fsplit.getLength())); newSplits.add(fsplit); } first_chromosome = false; prev_chromosome_byte_offset = byte_counter + i; } } byte_counter += bytes_read; } } if (j == splits.size() - 1) { //System.err.println("EOF"); FileSplit fsplit = new FileSplit(path, prev_chromosome_byte_offset, byte_counter - prev_chromosome_byte_offset, origsplit.getLocations()); newSplits.add(fsplit); //conf)); //System.err.println("adding split: "+fsplit.toString()); break; } } return newSplits; } public static class FastaRecordReader extends RecordReader<Text, ReferenceFragment> { // start: first valid data index private long start; // end: first index value beyond the slice, i.e. slice is in range [start,end) private long end; // pos: current position in file private long pos; // file: the file being read private Path file; // current_split_pos: the current (chromosome) position within the split private int current_split_pos; // current_split_indexseq: the description/chromosome name private String current_split_indexseq = null; private LineReader lineReader; private InputStream inputStream; private Text currentKey = new Text(); private ReferenceFragment currentValue = new ReferenceFragment(); private Text buffer = new Text(); // How long can a FASTA line get? public static final int MAX_LINE_LENGTH = 20000; public FastaRecordReader(Configuration conf, FileSplit split) throws IOException { setConf(conf); file = split.getPath(); start = split.getStart(); end = start + split.getLength(); current_split_pos = 1; FileSystem fs = file.getFileSystem(conf); FSDataInputStream fileIn = fs.open(file); CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf); CompressionCodec codec = codecFactory.getCodec(file); if (codec == null) // no codec. Uncompressed file. { positionAtFirstRecord(fileIn); inputStream = fileIn; } else { // compressed file if (start != 0) throw new RuntimeException( "Start position for compressed file is not 0! (found " + start + ")"); inputStream = codec.createInputStream(fileIn); end = Long.MAX_VALUE; // read until the end of the file } lineReader = new LineReader(inputStream); } /* * Position the input stream at the start of the first record. */ private void positionAtFirstRecord(FSDataInputStream stream) throws IOException { if (start > 0) { stream.seek(start); } // we are now in a new chromosome/fragment, so read its name/index sequence // and reset position counter // index sequence LineReader reader = new LineReader(stream); int bytesRead = reader.readLine(buffer, (int) Math.min(MAX_LINE_LENGTH, end - start)); current_split_indexseq = buffer.toString(); // now get rid of '>' character current_split_indexseq = current_split_indexseq.substring(1, current_split_indexseq.length()); // initialize position counter current_split_pos = 1; //System.err.println("read index sequence: "+current_split_indexseq); start = start + bytesRead; stream.seek(start); pos = start; } protected void setConf(Configuration conf) { } /** * Added to use mapreduce API. */ public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { } /** * Added to use mapreduce API. */ public Text getCurrentKey() { return currentKey; } /** * Added to use mapreduce API. */ public ReferenceFragment getCurrentValue() { return currentValue; } /** * Added to use mapreduce API. */ public boolean nextKeyValue() throws IOException, InterruptedException { return next(currentKey, currentValue); } /** * Close this RecordReader to future operations. */ public void close() throws IOException { inputStream.close(); } /** * Create an object of the appropriate type to be used as a key. */ public Text createKey() { return new Text(); } /** * Create an object of the appropriate type to be used as a value. */ public ReferenceFragment createValue() { return new ReferenceFragment(); } /** * Returns the current position in the input. */ public long getPos() { return pos; } /** * How much of the input has the RecordReader consumed i.e. */ public float getProgress() { if (start == end) return 1.0f; else return Math.min(1.0f, (pos - start) / (float) (end - start)); } public String makePositionMessage(long pos) { return file.toString() + ":" + pos; } public String makePositionMessage() { return file.toString() + ":" + pos; } /** * Reads the next key/value pair from the input for processing. */ public boolean next(Text key, ReferenceFragment value) throws IOException { if (pos >= end) return false; // past end of slice int bytesRead = lineReader.readLine(buffer, MAX_LINE_LENGTH); pos += bytesRead; if (bytesRead >= MAX_LINE_LENGTH) throw new RuntimeException("found abnormally large line (length " + bytesRead + ") at " + makePositionMessage(pos - bytesRead) + ": " + Text.decode(buffer.getBytes(), 0, 500)); else if (bytesRead <= 0) return false; // EOF else { scanFastaLine(buffer, key, value); current_split_pos += bytesRead; return true; } } private void scanFastaLine(Text line, Text key, ReferenceFragment fragment) { // Build the key. We concatenate the chromosome/fragment descripion and // the start position of the FASTA sequence line, replacing the tabs with colons. key.clear(); key.append(current_split_indexseq.getBytes(), 0, current_split_indexseq.getBytes().length); key.append(Integer.toString(current_split_pos).getBytes(), 0, Integer.toString(current_split_pos).getBytes().length); // replace tabs with : byte[] bytes = key.getBytes(); int temporaryEnd = key.getLength(); for (int i = 0; i < temporaryEnd; ++i) if (bytes[i] == '\t') bytes[i] = ':'; fragment.clear(); fragment.setPosition(current_split_pos); fragment.setIndexSequence(current_split_indexseq); fragment.getSequence().append(line.getBytes(), 0, line.getBytes().length); } } @Override public boolean isSplitable(JobContext context, Path path) { CompressionCodec codec = new CompressionCodecFactory(ContextUtil.getConfiguration(context)).getCodec(path); return codec == null; } public RecordReader<Text, ReferenceFragment> createRecordReader(InputSplit genericSplit, TaskAttemptContext context) throws IOException, InterruptedException { context.setStatus(genericSplit.toString()); return new FastaRecordReader(ContextUtil.getConfiguration(context), (FileSplit) genericSplit); // cast as per example in TextInputFormat } }