Example usage for org.apache.commons.math3.distribution PoissonDistribution inverseCumulativeProbability

Introduction

On this page you can find example usage for org.apache.commons.math3.distribution PoissonDistribution inverseCumulativeProbability.

Prototype

public int inverseCumulativeProbability(final double p) throws OutOfRangeException 

Document

The default implementation returns
  • getSupportLowerBound() for p = 0,
  • getSupportUpperBound() for p = 1, and
  • solveInverseCumulativeProbability(double, int, int) for 0 < p < 1.
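
The method maps a probability p in [0, 1] to the smallest integer k for which the cumulative probability P(X <= k) is at least p. A common use, as in the example below, is turning a significance level into a count threshold: for a Poisson background with a given mean, inverseCumulativeProbability(1 - pValue) is the smallest count that chance alone is unlikely to reach. Here is a minimal, self-contained sketch; the mean of 4.2, the p-value of 0.001 and the class name are arbitrary illustration values:

import org.apache.commons.math3.distribution.PoissonDistribution;

public class PoissonCutoffSketch {

    public static void main(String[] args) {

        // Poisson background model; the mean is an arbitrary illustration value
        PoissonDistribution poisson = new PoissonDistribution(4.2);

        double pValue = 0.001;

        // Smallest count k with P(X <= k) >= 1 - pValue; counts above k occur
        // with probability below pValue under the background model
        int cutoff = poisson.inverseCumulativeProbability(1 - pValue);

        System.out.println("Count cutoff at p=" + pValue + " is " + cutoff);

        // The documented edge cases: p = 0 gives the support lower bound
        // (0 for a Poisson distribution), p = 1 gives the upper bound
        // (Integer.MAX_VALUE)
        System.out.println(poisson.inverseCumulativeProbability(0));
        System.out.println(poisson.inverseCumulativeProbability(1));
    }
}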

Usage

From source file: uk.ac.babraham.SeqMonk.ProbeGenerators.MacsPeakCaller.java

public void run() {

    //      for (int i=0;i<selectedChIPStores.length;i++) {
    //         System.err.println("Selected ChIP="+selectedChIPStores[i]);
    //      }
    //      for (int i=0;i<selectedInputStores.length;i++) {
    //         System.err.println("Selected Input="+selectedInputStores[i]);
    //      }

    // First find the tag offsets between the watson and crick strands

    // Work out the total average coverage for all of the combined ChIP samples
    long totalChIPCoverage = 0;

    for (int i = 0; i < selectedChIPStores.length; i++) {
        totalChIPCoverage += selectedChIPStores[i].getTotalReadLength();
    }

    if (cancel) {
        generationCancelled();
        return;
    }

    double averageChIPCoveragePerBase = totalChIPCoverage / (double) collection.genome().getTotalGenomeLength();

    double lowerCoverage = averageChIPCoveragePerBase * minFoldEnrichment;
    double upperCoverage = averageChIPCoveragePerBase * maxFoldEnrichment;

    System.err.println("Coverage range for high confidence peaks is " + lowerCoverage + " - " + upperCoverage);

    // Now we go through the data to find locations for our high confidence peaks so we can
    // randomly select 1000 of these to use to find the offset between the two strands

    Chromosome[] chromosomes = collection.genome().getAllChromosomes();

    Vector<Probe> potentialHighConfidencePeaks = new Vector<Probe>();

    for (int c = 0; c < chromosomes.length; c++) {

        if (cancel) {
            generationCancelled();
            return;
        }

        // Time for an update
        updateGenerationProgress("Finding high confidence peaks on chromosome " + chromosomes[c].name(), c,
                chromosomes.length);

        Probe lastValidProbe = null;

        for (int startPosition = 1; startPosition < chromosomes[c].length()
                - fragmentSize; startPosition += fragmentSize / 2) {

            // See if we need to quit
            if (cancel) {
                generationCancelled();
                return;
            }

            long totalLength = 0;
            Probe probe = new Probe(chromosomes[c], startPosition, startPosition + fragmentSize);

            for (int s = 0; s < selectedChIPStores.length; s++) {
                long[] reads = selectedChIPStores[s].getReadsForProbe(probe);
                for (int j = 0; j < reads.length; j++) {
                    totalLength += SequenceRead.length(reads[j]);
                }
            }

            if (totalLength >= (lowerCoverage * probe.length())
                    && totalLength <= upperCoverage * probe.length()) {

                // We have a potential high quality peak.

                // See if we can merge it with the last valid probe

                if (lastValidProbe != null
                        && SequenceRead.overlaps(lastValidProbe.packedPosition(), probe.packedPosition())) {

                    lastValidProbe = new Probe(chromosomes[c], lastValidProbe.start(), probe.end());

                } else if (lastValidProbe != null) {

                    // Check that the overall density over the region falls within our limits
                    totalLength = 0;
                    for (int s = 0; s < selectedChIPStores.length; s++) {
                        long[] reads = selectedChIPStores[s].getReadsForProbe(lastValidProbe);
                        for (int j = 0; j < reads.length; j++) {
                            totalLength += SequenceRead.length(reads[j]);
                        }
                    }

                    if (totalLength >= (lowerCoverage * lastValidProbe.length())
                            && totalLength <= upperCoverage * lastValidProbe.length()) {
                        potentialHighConfidencePeaks.add(lastValidProbe);
                    }

                    lastValidProbe = probe;
                } else {
                    lastValidProbe = probe;
                }
            }

        }

        if (lastValidProbe != null) {
            long totalLength = 0;
            for (int s = 0; s < selectedChIPStores.length; s++) {
                long[] reads = selectedChIPStores[s].getReadsForProbe(lastValidProbe);
                for (int j = 0; j < reads.length; j++) {
                    totalLength += SequenceRead.length(reads[j]);
                }
            }

            if (totalLength >= (lowerCoverage * lastValidProbe.length())
                    && totalLength <= upperCoverage * lastValidProbe.length()) {
                potentialHighConfidencePeaks.add(lastValidProbe);
            }
        }
    }

    if (potentialHighConfidencePeaks.size() == 0) {

        JOptionPane.showMessageDialog(SeqMonkApplication.getInstance(), "No high confidence peaks found",
                "Quitting generator", JOptionPane.INFORMATION_MESSAGE);
        generationCancelled();
        return;
    }

    //      System.err.println("Found "+potentialHighConfidencePeaks.size()+" high confidence peaks");

    // Now we select up to 1000 random probes from this set
    Probe[] highConfidencePeaks = potentialHighConfidencePeaks.toArray(new Probe[0]);

    Collections.shuffle(Arrays.asList(highConfidencePeaks));
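
    // Arrays.asList returns a fixed-size List view backed by the array, so this
    // shuffle permutes highConfidencePeaks in place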

    Probe[] randomHighConfidenceProbes = new Probe[Math.min(highConfidencePeaks.length, 1000)];

    for (int i = 0; i < randomHighConfidenceProbes.length; i++) {
        randomHighConfidenceProbes[i] = highConfidencePeaks[i];
    }

    // Now find the average distance between forward / reverse reads in the candidate peaks

    // One distance per sampled probe; sizing this from the full peak list would
    // pad the median with zeroes whenever there are more than 1000 peaks
    int[] distances = new int[randomHighConfidenceProbes.length];

    // Sort the candidates so we read them in positional order and don't thrash the cache
    Arrays.sort(randomHighConfidenceProbes);

    for (int p = 0; p < randomHighConfidenceProbes.length; p++) {

        // See if we need to quit
        if (cancel) {
            generationCancelled();
            return;
        }

        distances[p] = getInterStrandDistance(randomHighConfidenceProbes[p], selectedChIPStores);
    }

    int medianInterStrandDistance = (int) SimpleStats.median(distances);

    if (medianInterStrandDistance < 0)
        medianInterStrandDistance = 0;

    //      System.err.println("Median inter strand difference = "+medianInterStrandDistance);

    // Now we find the depth cutoff for overrepresented single tags using a binomial distribution
    int totalReadCount = 0;
    for (int i = 0; i < selectedChIPStores.length; i++) {
        totalReadCount += selectedChIPStores[i].getTotalReadCount();
    }

    BinomialDistribution bin = new BinomialDistribution(totalReadCount,
            1d / collection.genome().getTotalGenomeLength());

    // We want to know what depth has a chance of less than 1e-5
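    // i.e. the smallest depth d with P(X <= d) >= 1 - 1e-5 under a binomial null
    // in which each of the totalReadCount reads lands uniformly on the genome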
    int redundantThreshold = bin.inverseCumulativeProbability(1 - 0.00001d);

    if (redundantThreshold < 1)
        redundantThreshold = 1;

    //      System.err.println("Redundancy threshold is "+redundantThreshold);

    // Now we construct a poisson distribution to work out the threshold to use for
    // constructing a full candidate peak set.

    updateGenerationProgress("Counting non-redundant reads", 0, 1);

    // To do this we need to get the full non-redundant length from the whole set
    int totalNonRedCount = getNonRedundantReadCount(selectedChIPStores, redundantThreshold);

    //      System.err.println("Total non-redundant sequences is "+totalNonRedCount);

    // We need to know the median read length for the data
    int readLength = 0;
    for (int i = 0; i < selectedChIPStores.length; i++) {
        readLength += selectedChIPStores[i].getTotalReadLength() / selectedChIPStores[i].getTotalReadCount();
    }
    readLength /= selectedChIPStores.length;

    double expectedCountsPerWindow = getExpectedCountPerWindow(totalNonRedCount,
            collection.genome().getTotalGenomeLength(), fragmentSize, readLength);

    PoissonDistribution poisson = new PoissonDistribution(expectedCountsPerWindow);

    int readCountCutoff = poisson.inverseCumulativeProbability(1 - pValue);
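
    // inverseCumulativeProbability(1 - pValue) returns the smallest count k with
    // P(X <= k) >= 1 - pValue, so a window needs more than readCountCutoff reads
    // to be unlikely (probability < pValue) under the background model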

    //      System.err.println("Threshold for enrichment in a window is "+readCountCutoff+" reads using a p-value of "+pValue+" and a mean of "+(totalNonRedCount/(collection.genome().getTotalGenomeLength()/(double)fragmentSize)));

    // Now we go back through the whole dataset to do a search for all possible candidate probes 

    // We re-use the peak vector we came up with before.
    potentialHighConfidencePeaks.clear();

    for (int c = 0; c < chromosomes.length; c++) {
        // Time for an update
        updateGenerationProgress("Finding candidate peaks on chromosome " + chromosomes[c].name(), c,
                chromosomes.length);

        Probe lastValidProbe = null;

        for (int startPosition = 1; startPosition < chromosomes[c].length()
                - fragmentSize; startPosition += fragmentSize / 2) {

            // See if we need to quit
            if (cancel) {
                generationCancelled();
                return;
            }

            // The reads for this window are fetched with their positions adjusted
            // by the median inter-strand distance
            Probe probe = new Probe(chromosomes[c], startPosition, (startPosition + fragmentSize - 1));

            long[] mergedProbeReads = getReadsFromDataStoreCollection(probe, selectedChIPStores,
                    medianInterStrandDistance);

            mergedProbeReads = deduplicateReads(mergedProbeReads, redundantThreshold);

            SequenceRead.sort(mergedProbeReads);

            int thisProbeOverlapCount = 0;
            for (int i = 0; i < mergedProbeReads.length; i++) {
                if (SequenceRead.overlaps(mergedProbeReads[i], probe.packedPosition())) {
                    ++thisProbeOverlapCount;
                }
            }

            if (thisProbeOverlapCount > readCountCutoff) {

                // We have a potential high quality peak.

                // See if we can merge it with the last valid probe

                if (lastValidProbe != null
                        && SequenceRead.overlaps(lastValidProbe.packedPosition(), probe.packedPosition())) {

                    lastValidProbe = new Probe(chromosomes[c], lastValidProbe.start(), probe.end());

                } else if (lastValidProbe != null) {
                    potentialHighConfidencePeaks.add(lastValidProbe);
                    lastValidProbe = probe;
                } else {
                    lastValidProbe = probe;
                }
            }

        }

        if (lastValidProbe != null) {
            potentialHighConfidencePeaks.add(lastValidProbe);
        }
    }

    // Finally we re-filter the peaks we have using local poisson distributions with densities taken
    // from either the input samples (if there are any) or the local region.  The densities are
    // estimated over 1, 5 and 10kb around the peak, plus genome-wide, and the max of these is taken.
    // If there is no input then the 1kb region is not used.

    Probe[] allCandidateProbes = potentialHighConfidencePeaks.toArray(new Probe[0]);

    // Work out which stores we're using to validate against.
    DataStore[] validationStores;
    boolean useInput = false;
    double inputCorrection = 1;

    int validationNonRedCount;

    if (selectedInputStores != null && selectedInputStores.length > 0) {

        // See if we need to quit
        if (cancel) {
            generationCancelled();
            return;
        }

        validationStores = selectedInputStores;
        useInput = true;

        // We also need to work out the total number of non-redundant sequences
        // in the input so we can work out a scaling factor so that the densities
        // for input and ChIP are comparable.
        validationNonRedCount = getNonRedundantReadCount(validationStores, redundantThreshold);

        inputCorrection = totalNonRedCount / (double) validationNonRedCount;

        System.err.println("From chip=" + totalNonRedCount + " input=" + validationNonRedCount
                + " correction is " + inputCorrection);

    } else {
        validationStores = selectedChIPStores;
        validationNonRedCount = totalNonRedCount;
    }

    Vector<Probe> finalValidatedProbes = new Vector<Probe>();

    for (int p = 0; p < allCandidateProbes.length; p++) {

        // See if we need to quit
        if (cancel) {
            generationCancelled();
            return;
        }

        if (p % 100 == 0) {
            updateGenerationProgress("Validated " + p + " out of " + allCandidateProbes.length + " raw peaks",
                    p, allCandidateProbes.length);
        }

        //         System.err.println("Validating "+allCandidateProbes[p].chromosome()+":"+allCandidateProbes[p].start()+"-"+allCandidateProbes[p].end());

        // We now need to find the maximum read density per 2*bandwidth against which
        // we're going to validate this peak

        // We're going to get all reads within 10kb of the peak, and then we can subselect from there

        int midPoint = allCandidateProbes[p].middle();

        Probe region10kb = new Probe(allCandidateProbes[p].chromosome(), Math.max(midPoint - 5000, 1),
                Math.min(midPoint + 4999, allCandidateProbes[p].chromosome().length()),
                allCandidateProbes[p].strand());
        Probe region5kb = new Probe(allCandidateProbes[p].chromosome(), Math.max(midPoint - 2500, 1),
                Math.min(midPoint + 2499, allCandidateProbes[p].chromosome().length()),
                allCandidateProbes[p].strand());
        Probe region1kb = new Probe(allCandidateProbes[p].chromosome(), Math.max(midPoint - 500, 1),
                Math.min(midPoint + 499, allCandidateProbes[p].chromosome().length()),
                allCandidateProbes[p].strand());

        // Get the probes for the largest region
        long[] thisRegionReads = getReadsFromDataStoreCollection(region10kb, validationStores, 0);

        // Deduplicate so it's a fair comparison
        thisRegionReads = deduplicateReads(thisRegionReads, redundantThreshold); // Should we recalculate the redundant threshold based on the input coverage?

        int region10kbcount = thisRegionReads.length;
        int region5kbcount = 0;
        int region1kbcount = 0;

        // Go through the reads seeing if they fit into the 5 or 1kb regions
        for (int r = 0; r < thisRegionReads.length; r++) {
            if (SequenceRead.overlaps(region5kb.packedPosition(), thisRegionReads[r]))
                ++region5kbcount;
            if (SequenceRead.overlaps(region1kb.packedPosition(), thisRegionReads[r]))
                ++region1kbcount;
        }

        //         System.err.println("Input counts 10kb="+region10kbcount+" 5kb="+region5kbcount+" 1kb="+region1kbcount);

        // Convert to densities per window and adjust for global coverage

        double globalDensity = getExpectedCountPerWindow(validationNonRedCount,
                collection.genome().getTotalGenomeLength(), allCandidateProbes[p].length(), readLength)
                * inputCorrection;
        double density10kb = getExpectedCountPerWindow(region10kbcount, region10kb.length(),
                allCandidateProbes[p].length(), readLength) * inputCorrection;
        double density5kb = getExpectedCountPerWindow(region5kbcount, region5kb.length(),
                allCandidateProbes[p].length(), readLength) * inputCorrection;
        double density1kb = getExpectedCountPerWindow(region1kbcount, region1kb.length(),
                allCandidateProbes[p].length(), readLength) * inputCorrection;

        // Find the highest density to use for the validation
        double highestDensity = globalDensity;
        if (density10kb > highestDensity)
            highestDensity = density10kb;
        if (density5kb > highestDensity)
            highestDensity = density5kb;
        if (useInput && density1kb > highestDensity)
            highestDensity = density1kb;

        //         System.err.println("Global="+globalDensity+" 10kb="+density10kb+" 5kb="+density5kb+" 1kb="+density1kb+" using="+highestDensity);

        // Construct a poisson distribution with this density
        PoissonDistribution localPoisson = new PoissonDistribution(highestDensity);
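        // Same inverse-CDF cutoff as the genome-wide test, but using the highest
        // (most conservative) of the local and global background densities as the mean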

        //         System.err.println("Cutoff from global="+(new PoissonDistribution(globalDensity)).inverseCumulativeProbability(1-pValue)+" 10kb="+(new PoissonDistribution(density10kb)).inverseCumulativeProbability(1-pValue)+" 5kb="+(new PoissonDistribution(density5kb)).inverseCumulativeProbability(1-pValue)+" 1kb="+(new PoissonDistribution(density1kb)).inverseCumulativeProbability(1-pValue));
        // Now check to see if the actual count from this peak is enough to still pass
        long[] mergedProbeReads = getReadsFromDataStoreCollection(allCandidateProbes[p], selectedChIPStores,
                medianInterStrandDistance);
        mergedProbeReads = deduplicateReads(mergedProbeReads, redundantThreshold);

        SequenceRead.sort(mergedProbeReads);

        int thisProbeOverlapCount = 0;
        for (int i = 0; i < mergedProbeReads.length; i++) {
            if (SequenceRead.overlaps(mergedProbeReads[i], allCandidateProbes[p].packedPosition())) {
                ++thisProbeOverlapCount;
            }
        }

        //         System.err.println("Read count in ChIP is "+thisProbeOverlapCount);

        if (thisProbeOverlapCount > localPoisson.inverseCumulativeProbability(1 - pValue)) {
            finalValidatedProbes.add(allCandidateProbes[p]);
            //            System.err.println("Adding probe to final set");
        }

    }

    //      System.err.println("From "+allCandidateProbes.length+" candidates "+finalValidatedProbes.size()+" peaks were validated");

    ProbeSet finalSet = new ProbeSet(getDescription(), finalValidatedProbes.toArray(new Probe[0]));

    generationComplete(finalSet);

}