org.broadinstitute.sting.utils.activeregion.ActivityProfile.java Source code

Java tutorial

Introduction

Here is the source code for org.broadinstitute.sting.utils.activeregion.ActivityProfile.java

Source

/*
 * Copyright (c) 2012, The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

package org.broadinstitute.sting.utils.activeregion;

import org.apache.commons.lang.ArrayUtils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

/**
 * Class holding information about per-base activity scores for the
 * active region traversal
 *
 * @author Mark DePristo
 * @since Date created
 */
public class ActivityProfile {
    final GenomeLocParser parser;
    final boolean presetRegions;
    GenomeLoc regionStartLoc = null;
    final List<Double> isActiveList;
    private GenomeLoc lastLoc = null;
    private static final int FILTER_SIZE = 65;
    private static final Double[] GaussianKernel;

    static {
        GaussianKernel = new Double[2 * FILTER_SIZE + 1];
        for (int iii = 0; iii < 2 * FILTER_SIZE + 1; iii++) {
            GaussianKernel[iii] = MathUtils.NormalDistribution(FILTER_SIZE, 40.0, iii);
        }
    }

    // todo -- add upfront the start and stop of the intervals
    // todo -- check that no regions are unexpectedly missing
    // todo -- add unit tests
    // TODO -- own preset regions
    public ActivityProfile(final GenomeLocParser parser, final boolean presetRegions) {
        this(parser, presetRegions, new ArrayList<Double>(), null);
    }

    protected ActivityProfile(final GenomeLocParser parser, final boolean presetRegions,
            final List<Double> isActiveList, final GenomeLoc regionStartLoc) {
        this.parser = parser;
        this.presetRegions = presetRegions;
        this.isActiveList = isActiveList;
        this.regionStartLoc = regionStartLoc;
    }

    public void add(final GenomeLoc loc, final double score) {
        if (loc.size() != 1)
            throw new ReviewedStingException("Bad add call to ActivityProfile: loc " + loc + " size != 1");
        if (lastLoc != null && loc.getStart() != lastLoc.getStop() + 1)
            throw new ReviewedStingException(
                    "Bad add call to ActivityProfile: lastLoc added " + lastLoc + " and next is " + loc);
        isActiveList.add(score);
        if (regionStartLoc == null) {
            regionStartLoc = loc;
        }
    }

    public int size() {
        return isActiveList.size();
    }

    /**
     * Band pass this ActivityProfile, producing a new profile that's band pass filtered
     * @return a new ActivityProfile that's the band-pass filtered version of this profile
     */
    public ActivityProfile bandPassFilter() {
        final Double[] activeProbArray = isActiveList.toArray(new Double[isActiveList.size()]);
        final Double[] filteredProbArray = new Double[activeProbArray.length];
        if (!presetRegions) {
            for (int iii = 0; iii < activeProbArray.length; iii++) {
                final Double[] kernel = (Double[]) ArrayUtils.subarray(GaussianKernel,
                        Math.max(FILTER_SIZE - iii, 0),
                        Math.min(GaussianKernel.length, FILTER_SIZE + activeProbArray.length - iii));
                final Double[] activeProbSubArray = (Double[]) ArrayUtils.subarray(activeProbArray,
                        Math.max(0, iii - FILTER_SIZE), Math.min(activeProbArray.length, iii + FILTER_SIZE + 1));
                filteredProbArray[iii] = MathUtils.dotProduct(activeProbSubArray, kernel);
            }
        }
        return new ActivityProfile(parser, presetRegions, Arrays.asList(filteredProbArray), regionStartLoc);
    }

    /**
     * Partition this profile into active regions
     * @param activeRegionExtension
     * @return
     */
    public List<ActiveRegion> createActiveRegions(final int activeRegionExtension, final int maxRegionSize) {
        final double ACTIVE_PROB_THRESHOLD = 0.002; // TODO: needs to be set-able by the walker author
        final ArrayList<ActiveRegion> returnList = new ArrayList<ActiveRegion>();

        if (isActiveList.size() == 0) {
            // no elements in the active list, just return an empty one
            return Collections.emptyList();
        } else if (isActiveList.size() == 1) {
            // there's a single element, it's either active or inactive
            boolean isActive = isActiveList.get(0) > ACTIVE_PROB_THRESHOLD;
            returnList.addAll(createActiveRegion(isActive, 0, 0, activeRegionExtension, maxRegionSize));
        } else {
            // there are 2+ elements, divide these up into regions
            boolean isActive = isActiveList.get(0) > ACTIVE_PROB_THRESHOLD;
            int curStart = 0;
            for (int iii = 1; iii < isActiveList.size(); iii++) {
                final boolean thisStatus = isActiveList.get(iii) > ACTIVE_PROB_THRESHOLD;
                if (isActive != thisStatus) {
                    returnList.addAll(
                            createActiveRegion(isActive, curStart, iii - 1, activeRegionExtension, maxRegionSize));
                    isActive = thisStatus;
                    curStart = iii;
                }
            }
            returnList.addAll(createActiveRegion(isActive, curStart, isActiveList.size() - 1, activeRegionExtension,
                    maxRegionSize)); // close out the current active region
        }
        return returnList;
    }

    /**
     * Helper routine to create an active region based on our current start and end offsets
     * @param isActive should the region be active?
     * @param curStart offset (0-based) from the start of this region
     * @param curEnd offset (0-based) from the start of this region
     * @param activeRegionExtension
     * @return a fully initialized ActiveRegion with the above properties
     */
    private final List<ActiveRegion> createActiveRegion(final boolean isActive, final int curStart,
            final int curEnd, final int activeRegionExtension, final int maxRegionSize) {
        return createActiveRegion(isActive, curStart, curEnd, activeRegionExtension, maxRegionSize,
                new ArrayList<ActiveRegion>());
    }

    private final List<ActiveRegion> createActiveRegion(final boolean isActive, final int curStart,
            final int curEnd, final int activeRegionExtension, final int maxRegionSize,
            final List<ActiveRegion> returnList) {
        if (!isActive || curEnd - curStart < maxRegionSize) {
            final GenomeLoc loc = parser.createGenomeLoc(regionStartLoc.getContig(),
                    regionStartLoc.getStart() + curStart, regionStartLoc.getStart() + curEnd);
            returnList.add(new ActiveRegion(loc, isActive, parser, activeRegionExtension));
            return returnList;
        }
        // find the best place to break up the large active region
        Double minProb = Double.MAX_VALUE;
        int cutPoint = -1;
        for (int iii = curStart + 50; iii < curEnd - 50; iii++) { // BUGBUG: assumes maxRegionSize >> 50
            if (isActiveList.get(iii) < minProb) {
                minProb = isActiveList.get(iii);
                cutPoint = iii;
            }
        }
        final List<ActiveRegion> leftList = createActiveRegion(isActive, curStart, cutPoint, activeRegionExtension,
                maxRegionSize, new ArrayList<ActiveRegion>());
        final List<ActiveRegion> rightList = createActiveRegion(isActive, cutPoint + 1, curEnd,
                activeRegionExtension, maxRegionSize, new ArrayList<ActiveRegion>());
        returnList.addAll(leftList);
        returnList.addAll(rightList);
        return returnList;
    }
}