tests.it.crs4.seal.common.TestSamInputFormat.java Source code

Java tutorial

Introduction

Here is the source code for tests.it.crs4.seal.common.TestSamInputFormat.java

Source

// Copyright (C) 2011-2012 CRS4.
//
// This file is part of Seal.
//
// Seal is free software: you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation, either version 3 of the License, or (at your option)
// any later version.
//
// Seal is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
// or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with Seal.  If not, see <http://www.gnu.org/licenses/>.

package tests.it.crs4.seal.common;

import it.crs4.seal.common.AlignOp;
import it.crs4.seal.common.ReadPair;
import it.crs4.seal.common.SamInputFormat;
import it.crs4.seal.common.SamInputFormat.SamRecordReader;
import it.crs4.seal.common.AbstractTaggedMapping;
import it.crs4.seal.common.Utils;

import org.junit.*;
import static org.junit.Assert.*;

import java.io.BufferedOutputStream;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.nio.ByteBuffer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.RecordReader;

import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;

public class TestSamInputFormat {
    public static final String oneRecord = "ERR020229.100000/1   81   chr6   3558357   37   91M   =   3558678   400   AGCTTCTTTGACTCTCGAATTTTAGCACTAGAAGAAATAGTGAGGATTATATATTTCAGAAGTTCTCACCCAGGATATCAGAACACATTCA   5:CB:CCBCCB>:C@;BBBB??B;?>1@@=C=4ACCAB3A8=CC=C?CBC=CBCCCCCCCCCCCCC@5>?=?CAAB=3=>====5>=AC?C   XT:A:U   NM:i:0   SM:i:37   AM:i:0   X0:i:1   X1:i:0   XM:i:0   XO:i:0   XG:i:0   MD:Z:91";

    public static final String twoRecords = "Read/2   153   1   10018   25   101M   =   10018   0   CTAACCCTAACCCTAACCCTACCCCTAACCCTAACCCTTAACCTAACCCTAACCCTAACCCTTAACCTAACCCTAACCCTAACCCTAACCCAACCCTAACC   ##@@A;;4<2<=@.>.@7?5='?B;@B>EGEADAD?@<.>5@B<A0>>>:=>EE@EF@BGEF:ECFEAEDEEEDE>:=>:9AFEFDGFHHHHDHFHFHHHH"
            + "\n"
            + "Read/1   117   1   10018   0   *   =   10018   0   TTAGGGCTAGGGCTAGGGCTAGGGCTAGGGTTAGGGTTAGGGCTAGGGCTAGGGCTAGGGCTAGGGCTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGG   ##########DD=A2DBD><D:??.@D8DF>FBDFGDFFHHHADFFHEEFEEC=AFFCFF9GFEGECBGEFHHHGHHHHEHGHEHHHHGHHHHGHHGHHFH";

    public static final String unmapped = "UNMAPPED   93   *   *   0   *   =   3558678   *   AGCTT   5:CB:";

    private Configuration conf;
    private FileSplit split;
    private File tempFile;
    private File tempGz;

    private LongWritable key;
    private ReadPair pair;

    @Before
    public void setup() throws IOException {
        tempFile = File.createTempFile("test_sam_input_format", ".sam");
        tempGz = File.createTempFile("test_sam_input_format", ".gz");

        conf = new Configuration();

        key = null;
        pair = null;
    }

    @After
    public void tearDown() {
        tempFile.delete();
        tempGz.delete();
        split = null;
    }

    private void writeToTemp(String s) throws IOException {
        PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(tempFile)));
        out.write(s);
        out.close();
    }

    private SamRecordReader createReaderForOneRecord() throws IOException {
        writeToTemp(oneRecord);
        split = new FileSplit(new Path(tempFile.toURI().toString()), 0, oneRecord.length(), null);

        SamRecordReader reader = new SamRecordReader();
        reader.initialize(split, Utils.getTaskAttemptContext(conf));

        return reader;
    }

    @Test
    public void testReadFromStart() throws IOException, NoSuchFieldException {
        SamRecordReader reader = createReaderForOneRecord();

        assertEquals(0.0, reader.getProgress(), 0.01);

        boolean retval = reader.nextKeyValue();
        assertTrue(retval);
        assertEquals(new LongWritable(0), reader.getCurrentKey());

        pair = reader.getCurrentValue();

        AbstractTaggedMapping map = pair.getRead1();
        assertNotNull(map);
        assertNull(pair.getRead2());

        // test that the record has been read correctly
        assertEquals("ERR020229.100000/1", map.getName());
        assertEquals(81, map.getFlag());
        assertEquals("chr6", map.getContig());
        assertEquals(3558357, map.get5Position());
        assertEquals(37, map.getMapQ());
        assertEquals("91M", AlignOp.cigarStr(map.getAlignment()));
        assertTrue(map.isTemplateLengthAvailable());
        assertEquals(400, map.getTemplateLength());

        ByteBuffer buffer = map.getSequence();
        assertEquals("AGCTTCTTTGACTCTCGAATTTTAGCACTAGAAGAAATAGTGAGGATTATATATTTCAGAAGTTCTCACCCAGGATATCAGAACACATTCA",
                new String(buffer.array(), buffer.position(), (buffer.limit() - buffer.position())));

        buffer = map.getBaseQualities();
        assertEquals("5:CB:CCBCCB>:C@;BBBB??B;?>1@@=C=4ACCAB3A8=CC=C?CBC=CBCCCCCCCCCCCCC@5>?=?CAAB=3=>====5>=AC?C",
                new String(buffer.array(), buffer.position(), (buffer.limit() - buffer.position())));

        assertEquals(0, map.getIntTag("NM"));
        assertEquals(37, map.getIntTag("SM"));
        assertEquals(0, map.getIntTag("AM"));
        assertEquals(1, map.getIntTag("X0"));
        assertEquals(0, map.getIntTag("X1"));
        assertEquals(0, map.getIntTag("XM"));
        assertEquals(0, map.getIntTag("XO"));
        assertEquals(0, map.getIntTag("XG"));
        assertEquals("U", map.getTag("XT"));
        assertEquals("91", map.getTag("MD"));

        assertEquals(1.0, reader.getProgress(), 0.01);

        retval = reader.nextKeyValue();
        assertFalse(retval);
    }

    @Test
    public void testReadStartInMiddle() throws IOException {
        writeToTemp(twoRecords);
        split = new FileSplit(new Path(tempFile.toURI().toString()), 10, twoRecords.length() - 10, null);

        SamRecordReader reader = new SamRecordReader();
        reader.initialize(split, Utils.getTaskAttemptContext(conf));

        assertEquals(0.0, reader.getProgress(), 0.01);

        boolean retval = reader.nextKeyValue();
        assertTrue(retval);
        assertEquals(new LongWritable(241), reader.getCurrentKey());

        pair = reader.getCurrentValue();

        assertEquals("Read", pair.getName());
        assertNull(pair.getRead2());

        AbstractTaggedMapping map = pair.getAnyRead();
        assertEquals("Read/1", map.getName());

        assertEquals(1.0, reader.getProgress(), 0.01);

        retval = reader.nextKeyValue();
        assertFalse(retval);
    }

    @Test
    public void testSliceEndsBeforeEndOfFile() throws IOException {
        writeToTemp(twoRecords);
        // slice ends at position 10--i.e. somewhere in the first record.  The second record should not be read.
        split = new FileSplit(new Path(tempFile.toURI().toString()), 0, 10, null);

        SamRecordReader reader = new SamRecordReader();
        reader.initialize(split, Utils.getTaskAttemptContext(conf));

        boolean retval = reader.nextKeyValue();
        assertTrue(retval);
        assertEquals(new LongWritable(0), reader.getCurrentKey());

        assertFalse("SamRecordReader is reading a record that starts after the end of the slice",
                reader.nextKeyValue());
    }

    @Test
    public void testProgress() throws IOException {
        writeToTemp(twoRecords);
        split = new FileSplit(new Path(tempFile.toURI().toString()), 0, twoRecords.length(), null);

        SamRecordReader reader = new SamRecordReader();
        reader.initialize(split, Utils.getTaskAttemptContext(conf));

        assertEquals(0.0, reader.getProgress(), 0.01);

        reader.nextKeyValue();
        assertEquals(0.5, reader.getProgress(), 0.01);

        reader.nextKeyValue();
        assertEquals(1.0, reader.getProgress(), 0.01);
    }

    @Test
    public void testClose() throws IOException {
        SamRecordReader reader = createReaderForOneRecord();
        // doesn't really do anything but exercise the code
        reader.close();
    }

    @Test
    public void testGzCompressedInput() throws IOException {
        // write gzip-compressed data
        GzipCodec codec = new GzipCodec();
        PrintWriter out = new PrintWriter(
                new BufferedOutputStream(codec.createOutputStream(new FileOutputStream(tempGz))));
        out.write(twoRecords);
        out.close();

        // now try to read it
        split = new FileSplit(new Path(tempGz.toURI().toString()), 0, twoRecords.length(), null);

        SamRecordReader reader = new SamRecordReader();
        reader.initialize(split, Utils.getTaskAttemptContext(conf));

        boolean retval = reader.nextKeyValue();
        assertTrue(retval);
        assertEquals("Read/2", reader.getCurrentValue().getAnyRead().getName());

        retval = reader.nextKeyValue();
        assertTrue(retval);
        assertEquals("Read/1", reader.getCurrentValue().getAnyRead().getName());
    }

    @Test(expected = RuntimeException.class)
    public void testCompressedSplit() throws IOException {
        // write gzip-compressed data
        GzipCodec codec = new GzipCodec();
        PrintWriter out = new PrintWriter(
                new BufferedOutputStream(codec.createOutputStream(new FileOutputStream(tempGz))));
        out.write(twoRecords);
        out.close();

        // now try to read it starting from the middle

        SamInputFormat inputFormat = new SamInputFormat();

        split = new FileSplit(new Path(tempGz.toURI().toString()), 10, twoRecords.length(), null);
        RecordReader<LongWritable, ReadPair> reader = inputFormat.createRecordReader(split,
                Utils.getTaskAttemptContext(conf));
    }

    public static void main(String args[]) {
        org.junit.runner.JUnitCore.main(TestSamInputFormat.class.getName());
    }
}