Java tutorial
/** * Licensed to Big Data Genomics (BDG) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The BDG licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package brush; import java.io.EOFException; import java.io.IOException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.input.FileSplit; /** * This class is a Hadoop reader for "interleaved fastq" -- that is, * fastq with paired reads in the same file, interleaved, rather than * in two separate files. This makes it much easier to Hadoopily slice * up a single file and feed the slices into an aligner. * The format is the same as fastq, but records are expected to alternate * between /1 and /2. As a precondition, we assume that the interleaved * FASTQ files are always uncompressed; if the files are compressed, they * cannot be split, and thus there is no reason to use the interleaved * format. * * This reader is based on the FastqInputFormat that's part of Hadoop-BAM, * found at https://github.com/HadoopGenomics/Hadoop-BAM/blob/master/src/main/java/org/seqdoop/hadoop_bam/FastqInputFormat.java */ public class InterleavedFastqInputFormat extends FileInputFormat<Void, Text> { /** * A record reader for the interleaved FASTQ format. * * Reads over an input file and parses interleaved FASTQ read pairs into * a single Text output. This is then fed into the FastqConverter, which * converts the single Text instance into two AlignmentRecords. */ private static class InterleavedFastqRecordReader extends FastqRecordReader { InterleavedFastqRecordReader(final Configuration conf, final FileSplit split) throws IOException { super(conf, split); } /** * Checks to see whether the buffer is positioned at a valid record. * * We are properly positioned if the buffer contains a read name (starts * with '@'), and this read name has a first-of-pair suffix (ends with * '/1'). * * @param bufferLength The length of the line currently in the buffer. * @param buffer A buffer containing a peek at the first line in the current * stream. * @return Returns true if the buffer contains the first line of a properly * formatted pair of FASTQ records. */ protected boolean checkBuffer(int bufferLength, Text buffer) { return (bufferLength >= 2 && buffer.getBytes()[0] == '@' && buffer.getBytes()[bufferLength - 2] == '/' && buffer.getBytes()[bufferLength - 1] == '1'); } /** * Reads a read pair from the input split. * * @param value Text record to write input value into. * @return Returns whether this read was successful or not. * * @throws RuntimeException Throws exception if we hit an EOF in the * middle of a read, or if we have a read that is incorrectly * formatted (missing readname delimiters). */ protected boolean next(Text value) throws IOException { if (pos >= end) return false; // past end of slice try { Text readName1 = new Text(); Text readName2 = new Text(); value.clear(); // first read of the pair boolean gotData = lowLevelFastqRead(readName1, value); if (!gotData) return false; // second read of the pair gotData = lowLevelFastqRead(readName2, value); if (!gotData) return false; return true; } catch (EOFException e) { throw new RuntimeException("unexpected end of file in fastq record at " + makePositionMessage()); } } } /** * Creates the new record reader that underlies this input format. * * @param genericSplit The split that the record reader should read. * @param context The Hadoop task context. * @return Returns the interleaved FASTQ record reader. */ public RecordReader<Void, Text> createRecordReader(InputSplit genericSplit, TaskAttemptContext context) throws IOException, InterruptedException { context.setStatus(genericSplit.toString()); // cast as per example in TextInputFormat return new InterleavedFastqRecordReader(context.getConfiguration(), (FileSplit) genericSplit); } }