org.bgi.flexlab.gaea.data.mapreduce.input.bam.GaeaBamInputFormat.java Source code

Java tutorial

Introduction

Here is the source code for org.bgi.flexlab.gaea.data.mapreduce.input.bam.GaeaBamInputFormat.java

Source

/*******************************************************************************
 * Copyright (c) 2017, BGI-Shenzhen
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>
 *
 * This file incorporates work covered by the following copyright and
 * permission notices:
 *
 * Copyright (c) 2010 Aalto University 
 *
 *     Permission is hereby granted, free of charge, to any person
 *     obtaining a copy of this software and associated documentation
 *     files (the "Software"), to deal in the Software without
 *     restriction, including without limitation the rights to use,
 *     copy, modify, merge, publish, distribute, sublicense, and/or sell
 *     copies of the Software, and to permit persons to whom the
 *     Software is furnished to do so, subject to the following
 *     conditions:
 *  
 *     The above copyright notice and this permission notice shall be
 *     included in all copies or substantial portions of the Software.
 *  
 *     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 *     EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 *     OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 *     NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 *     HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 *     WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 *     FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 *     OTHER DEALINGS IN THE SOFTWARE.
 *******************************************************************************/
package org.bgi.flexlab.gaea.data.mapreduce.input.bam;

import htsjdk.samtools.seekablestream.SeekableStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.bgi.flexlab.gaea.data.mapreduce.writable.SamRecordWritable;
import org.seqdoop.hadoop_bam.FileVirtualSplit;
import org.seqdoop.hadoop_bam.SplittingBAMIndex;
import org.seqdoop.hadoop_bam.util.WrapSeekable;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;

public class GaeaBamInputFormat extends FileInputFormat<LongWritable, SamRecordWritable> {
    public static boolean DEBUG_BAM_SPLITTER = false;

    private Path getIdxPath(Path path) {
        return path.suffix(".splitting-bai");
    }

    public RecordReader<LongWritable, SamRecordWritable> createRecordReader(InputSplit split,
            TaskAttemptContext ctx) throws InterruptedException, IOException {
        RecordReader<LongWritable, SamRecordWritable> rr = new GaeaBamRecordReader();
        Configuration conf = ctx.getConfiguration();
        DEBUG_BAM_SPLITTER = conf.getBoolean("debug.bam.splitter", false);
        rr.initialize(split, ctx);
        return rr;
    }

    public List<InputSplit> getSplits(JobContext job) throws IOException {
        return getSplits(super.getSplits(job), job.getConfiguration());
    }

    @SuppressWarnings({ "unchecked", "rawtypes" })
    public List<InputSplit> getSplits(List<InputSplit> splits, Configuration cfg) throws IOException {
        Collections.sort(splits, new Comparator() {
            @SuppressWarnings("unused")
            public int compare(InputSplit a, InputSplit b) {
                FileSplit fa = (FileSplit) a;
                FileSplit fb = (FileSplit) b;
                return fa.getPath().compareTo(fb.getPath());
            }

            @Override
            public int compare(Object a, Object b) {
                FileSplit fa = (FileSplit) a;
                FileSplit fb = (FileSplit) b;
                return fa.getPath().compareTo(fb.getPath());
            }
        });
        List<InputSplit> newSplits = new ArrayList<InputSplit>(splits.size());

        for (int i = 0; i < splits.size();) {
            try {
                i = addIndexedSplits(splits, i, newSplits, cfg);
            } catch (IOException e) {
                i = addProbabilisticSplits(splits, i, newSplits, cfg);
            }
        }
        return newSplits;
    }

    private int addIndexedSplits(List<InputSplit> splits, int i, List<InputSplit> newSplits, Configuration cfg)
            throws IOException {
        Path file = ((FileSplit) splits.get(i)).getPath();

        SplittingBAMIndex idx = new SplittingBAMIndex(file.getFileSystem(cfg).open(getIdxPath(file)));

        int splitsEnd = splits.size();
        for (int j = i; j < splitsEnd; j++) {
            if (!file.equals(((FileSplit) splits.get(j)).getPath()))
                splitsEnd = j;
        }
        for (int j = i; j < splitsEnd; j++) {
            FileSplit fileSplit = (FileSplit) splits.get(j);

            long start = fileSplit.getStart();
            long end = start + fileSplit.getLength();

            Long blockStart = idx.nextAlignment(start);

            Long blockEnd = Long.valueOf(j == splitsEnd - 1 ? idx.prevAlignment(end).longValue() | 0xFFFF
                    : idx.nextAlignment(end).longValue());

            if (blockStart == null) {
                throw new RuntimeException("Internal error or invalid index: no block start for " + start);
            }
            if (blockEnd == null) {
                throw new RuntimeException("Internal error or invalid index: no block end for " + end);
            }
            newSplits.add(new FileVirtualSplit(file, blockStart.longValue(), blockEnd.longValue(),
                    fileSplit.getLocations()));
        }
        return splitsEnd;
    }

    private int addProbabilisticSplits(List<InputSplit> splits, int i, List<InputSplit> newSplits,
            Configuration cfg) throws IOException {
        Path path = ((FileSplit) splits.get(i)).getPath();
        SeekableStream sin = WrapSeekable.openPath(path.getFileSystem(cfg), path);

        GaeaBamSplitGuesser guesser = new GaeaBamSplitGuesser(sin, cfg);

        FileVirtualSplit previousSplit = null;

        for (; i < splits.size(); i++) {
            FileSplit fspl = (FileSplit) splits.get(i);
            if (!fspl.getPath().equals(path)) {
                break;
            }
            long beg = fspl.getStart();
            long end = beg + fspl.getLength();

            long alignedBeg = guesser.guessNextBAMRecordStart(beg, end);

            long alignedEnd = end << 16 | 0xFFFF;

            if (alignedBeg == end) {
                if (previousSplit == null) {
                    System.err.println(
                            "'" + path + "': " + "no reads in first split: bad BAM file or tiny split size?");
                } else {
                    previousSplit.setEndVirtualOffset(alignedEnd);
                }
            } else
                newSplits.add(
                        previousSplit = new FileVirtualSplit(path, alignedBeg, alignedEnd, fspl.getLocations()));

        }

        sin.close();
        return i;
    }

    public boolean isSplitable(JobContext job, Path path) {
        return true;
    }

}