List of usage examples for the org.apache.commons.math3.distribution.PoissonDistribution constructor
public PoissonDistribution(double p) throws NotStrictlyPositiveException
Creates a new Poisson distribution with the specified mean p; the constructor throws NotStrictlyPositiveException when p <= 0.
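Before the longer listings below, a minimal self-contained sketch of the constructor and the query methods those listings rely on (the class name and the values are illustrative):

import org.apache.commons.math3.distribution.PoissonDistribution;

public class PoissonDistributionDemo {
    public static void main(String[] args) {
        // The mean (lambda) must be strictly positive, otherwise the
        // constructor throws NotStrictlyPositiveException
        PoissonDistribution poisson = new PoissonDistribution(4.0);

        int draw = poisson.sample();                       // one random variate
        double pFive = poisson.probability(5);             // P(X = 5)
        double cdfFive = poisson.cumulativeProbability(5); // P(X <= 5)
        // Smallest k with P(X <= k) >= 0.99, the usual way to turn a
        // Poisson model into an integer count cutoff
        int cutoff = poisson.inverseCumulativeProbability(0.99);

        System.out.println(draw + " " + pFive + " " + cdfFive + " " + cutoff);
    }
}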
From source file:SimpleGui.QueryGenerator.java
/**
 * Creates the specified number (numOfQuerySegments) of sub-queries.
 * This should run first so that the sub-query set stays fixed for all inputs.
 */
public ArrayList<String> generateSubQuerySamples(int numOfQuerySegments) throws IOException {
    /*
    File file = new File("//home//santhilata//Dropbox//CacheLearning//QGen//src//main//java//QueryInput"
            + "//SubQueries.txt");
    FileWriter fw = new FileWriter(file);
    fw.flush();
    */
    DatabaseDetails[] ddetails = this.getConfiguration().getDistributedEnvironment().getDatabaseSchema();
    int querySegmentSample = 0;
    String tempSTR = "";
    ArrayList<String> queries = new ArrayList<>(numOfQuerySegments);
    createSQSS();
    createPQSS();

    for (int i = 0; i < numOfQuerySegments; i++) {
        String query = "";
        ArrayList<String> tempListSQS = new ArrayList<>();

        // Select more than one SQS so that we get data from multiple databases
        int noOfSQSInQuery = 0;
        while (noOfSQSInQuery == 0) {
            noOfSQSInQuery = new Random().nextInt(ddetails.length) + 1; // up to the no. of databases
        }

        int seed = new Random().nextInt(8);
        while (seed == 0) {
            seed = new Random().nextInt(8);
        }

        // Caution: noOfSQS / seed is integer division; if noOfSQS < seed the mean
        // is 0 and the constructor throws NotStrictlyPositiveException
        PoissonDistribution poisson = new PoissonDistribution(noOfSQS / seed);
        UniformIntegerDistribution uid = new UniformIntegerDistribution(noOfSQS / (seed + 1), noOfSQS / seed);

        // Add SQS segments to the query
        for (int j = 0; j < noOfSQSInQuery; j++) {
            // String tempSTR = sqss.get(poisson.sample());
            if (querySegmentSample > sqss.size() - 1) {
                querySegmentSample = 0;
            } else {
                tempSTR = sqss.get(querySegmentSample);
                querySegmentSample++;
            }
            if (!tempListSQS.contains(tempSTR)) {
                tempListSQS.add(tempSTR);
            } else {
                j--;
            }
        }
        for (int j = 0; j < noOfSQSInQuery; j++) {
            query = query + "<" + tempListSQS.get(j) + ">";
        }
        // All SQS segments are in the query at this point

        // Ensure the first attribute is taken from the chosen subject query segments
        int tempAttr = new Random().nextInt(noOfSQSInQuery);
        while (tempAttr == 0) {
            if (noOfSQSInQuery == 1) {
                tempAttr = 1;
            } else {
                tempAttr = new Random().nextInt(noOfSQSInQuery);
            }
        }
        String attr1 = "";
        if (tempAttr == 1) {
            attr1 = tempListSQS.get(0);
        } else {
            attr1 = tempListSQS.get(tempAttr);
        }

        String attr2 = attr1 + "*"; // what is this star for?
        while (attr2.contains(attr1)) { // avoid repeating attributes in the predicates
            PoissonDistribution poisson_PQS = new PoissonDistribution(pqss.size() / seed);
            int poisson_sample = poisson_PQS.sample();
            while (poisson_sample >= pqss.size()) { // resample until the index is in range
                poisson_sample = poisson_PQS.sample();
            }
            attr2 = pqss.get(poisson_sample);
        }
        if (attr2 == null) {
            attr1 = attr1 + "," + conditionArray[new Random().nextInt(conditionArray.length)];
            long cardinality = Math.round(Math.abs(new Random().nextGaussian() * 1000));
            attr1 = attr1 + "," + conditionArray[new Random().nextInt(conditionArray.length)] + "-" + cardinality;
        }
        String str = "<" + attr1 + attr2 + ">";
        query = query + "#" + str;
        queries.add(query);
    }
    /*
    for (String query : queries) {
        fw.append(query + "\n");
    }
    fw.close();
    */
    this.sub_queries = queries;
    return queries;
}
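As flagged in the comments above, noOfSQS / seed and pqss.size() / seed are integer divisions, so whenever the numerator is smaller than seed the computed mean is 0 and the constructor throws NotStrictlyPositiveException. A minimal guard might look like the following sketch (safePoisson is a hypothetical helper, not part of QueryGenerator):

// Hypothetical helper: compute the mean in floating point and clamp it so the
// PoissonDistribution constructor never sees a non-positive value.
static PoissonDistribution safePoisson(int numerator, int divisor) {
    double mean = (double) numerator / divisor; // avoid integer division
    return new PoissonDistribution(Math.max(mean, 1e-6));
}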
From source file:SimpleGui.QueryGenerator.java
/**
 * Returns an array of values drawn from the given distribution.
 * @param distribution one of "Poisson", "Random", "Uniform", "Exponential" or "Grading"
 * @param numQs number of values to generate
 * @param mean mean for the Poisson/Normal case
 * @param variance variance for the Normal case
 * @param exponentialMean mean for the Exponential case
 * @param u_lower lower bound for the Uniform case
 * @param u_upper upper bound for the Uniform case
 * @param slope slope for the Grading case
 * @param upperBoundary values at or above this bound are rejected and redrawn
 * @return the generated values
 */
public int[] getDistributionValues(String distribution, int numQs, double mean, double variance,
        double exponentialMean, int u_lower, int u_upper, double slope, int upperBoundary) {
    int[] values = new int[numQs];
    switch (distribution) {
    case "Poisson": {
        RandomEngine engine = new DRand();
        Poisson poisson = new Poisson(mean, engine);
        int poissonObs = poisson.nextInt();
        Normal normal = new Normal(mean, variance, engine);
        for (int i = 0; i < numQs; i++) {
            // Note: despite the case label, the values actually kept come from the
            // absolute normal draw; the Commons Math Poisson sample below is discarded
            double normalObs = normal.nextDouble();
            int sample = (int) Math.abs(normalObs);
            while (sample >= upperBoundary) {
                normalObs = normal.nextDouble();
                sample = (int) Math.abs(normalObs);
            }
            values[i] = sample;
            //=============================================
            PoissonDistribution p = new PoissonDistribution(mean);
            int randomInt = p.sample();
            // values[i] = randomInt;
        }
        // Arrays.sort(values);
        break;
    }
    case "Random": {
        Random randomGenerator = new Random();
        for (int i = 0; i < numQs; i++) {
            int randomInt = randomGenerator.nextInt(numQs - 1);
            values[i] = randomInt + 1; // to avoid '0'
        }
        break;
    }
    case "Uniform": {
        for (int i = 0; i < numQs; i++) {
            UniformIntegerDistribution u = new UniformIntegerDistribution(u_lower, u_upper);
            int randomInt = u.sample();
            while (randomInt >= upperBoundary) {
                randomInt = u.sample();
            }
            if (randomInt == 0) {
                values[i] = randomInt + 1; // to avoid a random '0'
            } else {
                values[i] = randomInt;
            }
        }
        break;
    }
    case "Exponential": {
        for (int i = 0; i < numQs; i++) {
            ExponentialDistribution e = new ExponentialDistribution(exponentialMean);
            double exponential = e.sample();
            while (exponential >= upperBoundary - 1) {
                exponential = e.sample();
            }
            values[i] = (int) exponential + 1; // to avoid a random '0'
        }
        break;
    }
    case "Grading": {
        // y = mx + c: increment, then decrement once a positive slope hits the boundary
        int constant = 0;
        for (int i = 0; i < numQs; i++) {
            constant += slope;
            int nextVal = constant;
            System.out.println(upperBoundary);
            if (nextVal >= upperBoundary || nextVal < 0) {
                slope = -1 * slope;
                i--;
            } else {
                values[i] = nextVal;
            }
        }
        break;
    }
    }
    // Arrays.sort(values);
    return values;
}
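The recurring pattern in the method above is rejection sampling: draw from a distribution and redraw while the value falls outside an upper boundary. A standalone sketch of that loop for a plain Commons Math Poisson (the method name and bound handling are illustrative, not taken from the class above):

import org.apache.commons.math3.distribution.PoissonDistribution;

// Draws numQs Poisson(mean) samples, redrawing any value >= upperBoundary.
// Caution: if the mean sits far above the boundary the loop rarely terminates,
// so the boundary should lie well into the distribution's upper tail.
static int[] boundedPoissonSamples(int numQs, double mean, int upperBoundary) {
    PoissonDistribution p = new PoissonDistribution(mean);
    int[] values = new int[numQs];
    for (int i = 0; i < numQs; i++) {
        int s = p.sample();
        while (s >= upperBoundary) {
            s = p.sample();
        }
        values[i] = s;
    }
    return values;
}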
From source file:uk.ac.babraham.SeqMonk.ProbeGenerators.MacsPeakCaller.java
public void run() {
    // for (int i = 0; i < selectedChIPStores.length; i++) {
    //     System.err.println("Selected ChIP=" + selectedChIPStores[i]);
    // }
    // for (int i = 0; i < selectedInputStores.length; i++) {
    //     System.err.println("Selected Input=" + selectedInputStores[i]);
    // }

    // First find the tag offsets between the Watson and Crick strands.
    // Work out the total average coverage for all of the combined ChIP samples
    long totalChIPCoverage = 0;
    for (int i = 0; i < selectedChIPStores.length; i++) {
        totalChIPCoverage += selectedChIPStores[i].getTotalReadLength();
    }
    if (cancel) {
        generationCancelled();
        return;
    }
    double averageChIPCoveragePerBase = totalChIPCoverage / (double) collection.genome().getTotalGenomeLength();
    double lowerCoverage = averageChIPCoveragePerBase * minFoldEnrichment;
    double upperCoverage = averageChIPCoveragePerBase * maxFoldEnrichment;
    System.err.println("Coverage range for high confidence peaks is " + lowerCoverage + " - " + upperCoverage);

    // Now we go through the data to find locations for our high confidence peaks so we can
    // randomly select 1000 of these to use to find the offset between the two strands
    Chromosome[] chromosomes = collection.genome().getAllChromosomes();
    Vector<Probe> potentialHighConfidencePeaks = new Vector<Probe>();
    for (int c = 0; c < chromosomes.length; c++) {
        if (cancel) {
            generationCancelled();
            return;
        }
        // Time for an update
        updateGenerationProgress("Finding high confidence peaks on chromosome " + chromosomes[c].name(), c,
                chromosomes.length);
        Probe lastValidProbe = null;
        for (int startPosition = 1; startPosition < chromosomes[c].length() - fragmentSize;
                startPosition += fragmentSize / 2) {
            // See if we need to quit
            if (cancel) {
                generationCancelled();
                return;
            }
            long totalLength = 0;
            Probe probe = new Probe(chromosomes[c], startPosition, startPosition + fragmentSize);
            for (int s = 0; s < selectedChIPStores.length; s++) {
                long[] reads = selectedChIPStores[s].getReadsForProbe(probe);
                for (int j = 0; j < reads.length; j++) {
                    totalLength += SequenceRead.length(reads[j]);
                }
            }
            if (totalLength >= (lowerCoverage * probe.length())
                    && totalLength <= upperCoverage * probe.length()) {
                // We have a potential high quality peak.
                // See if we can merge it with the last valid probe
                if (lastValidProbe != null
                        && SequenceRead.overlaps(lastValidProbe.packedPosition(), probe.packedPosition())) {
                    lastValidProbe = new Probe(chromosomes[c], lastValidProbe.start(), probe.end());
                } else if (lastValidProbe != null) {
                    // Check that the overall density over the region falls within our limits
                    totalLength = 0;
                    for (int s = 0; s < selectedChIPStores.length; s++) {
                        long[] reads = selectedChIPStores[s].getReadsForProbe(lastValidProbe);
                        for (int j = 0; j < reads.length; j++) {
                            totalLength += SequenceRead.length(reads[j]);
                        }
                    }
                    if (totalLength >= (lowerCoverage * lastValidProbe.length())
                            && totalLength <= upperCoverage * lastValidProbe.length()) {
                        potentialHighConfidencePeaks.add(lastValidProbe);
                    }
                    lastValidProbe = probe;
                } else {
                    lastValidProbe = probe;
                }
            }
        }
        if (lastValidProbe != null) {
            long totalLength = 0;
            for (int s = 0; s < selectedChIPStores.length; s++) {
                long[] reads = selectedChIPStores[s].getReadsForProbe(lastValidProbe);
                for (int j = 0; j < reads.length; j++) {
                    totalLength += SequenceRead.length(reads[j]);
                }
            }
            if (totalLength >= (lowerCoverage * lastValidProbe.length())
                    && totalLength <= upperCoverage * lastValidProbe.length()) {
                potentialHighConfidencePeaks.add(lastValidProbe);
            }
        }
    }

    if (potentialHighConfidencePeaks.size() == 0) {
        JOptionPane.showMessageDialog(SeqMonkApplication.getInstance(), "No high confidence peaks found",
                "Quitting generator", JOptionPane.INFORMATION_MESSAGE);
        generationCancelled();
        return;
    }
    // System.err.println("Found " + potentialHighConfidencePeaks.size() + " high confidence peaks");

    // Now we select 1000 random probes from this set
    Probe[] highConfidencePeaks = potentialHighConfidencePeaks.toArray(new Probe[0]);
    Collections.shuffle(Arrays.asList(highConfidencePeaks));
    Probe[] randomHighConfidenceProbes = new Probe[Math.min(highConfidencePeaks.length, 1000)];
    for (int i = 0; i < randomHighConfidenceProbes.length; i++) {
        randomHighConfidenceProbes[i] = highConfidencePeaks[i];
    }

    // Now find the average distance between forward / reverse reads in the candidate peaks.
    // Note: this array is sized to the full candidate set, so when more than 1000
    // candidates exist the unfilled tail of zeros pulls the median down.
    int[] distances = new int[highConfidencePeaks.length];
    // Sort the candidates so we don't do stupid stuff with the cache
    Arrays.sort(randomHighConfidenceProbes);
    for (int p = 0; p < randomHighConfidenceProbes.length; p++) {
        // See if we need to quit
        if (cancel) {
            generationCancelled();
            return;
        }
        distances[p] = getInterStrandDistance(randomHighConfidenceProbes[p], selectedChIPStores);
    }
    int medianInterStrandDistance = (int) SimpleStats.median(distances);
    if (medianInterStrandDistance < 0)
        medianInterStrandDistance = 0;
    // System.err.println("Median inter strand difference = " + medianInterStrandDistance);

    // Now we find the depth cutoff for overrepresented single tags using a binomial distribution
    int totalReadCount = 0;
    for (int i = 0; i < selectedChIPStores.length; i++) {
        totalReadCount += selectedChIPStores[i].getTotalReadCount();
    }
    BinomialDistribution bin = new BinomialDistribution(totalReadCount,
            1d / collection.genome().getTotalGenomeLength());
    // We want to know what depth has a chance of less than 10^-5
    int redundantThreshold = bin.inverseCumulativeProbability(1 - 0.00001d);
    if (redundantThreshold < 1)
        redundantThreshold = 1;
    // System.err.println("Redundancy threshold is " + redundantThreshold);

    // Now we construct a poisson distribution to work out the threshold to use for
    // constructing a full candidate peak set.
    updateGenerationProgress("Counting non-redundant reads", 0, 1);

    // To do this we need to get the full non-redundant length from the whole set
    int totalNonRedCount = getNonRedundantReadCount(selectedChIPStores, redundantThreshold);
    // System.err.println("Total non-redundant sequences is " + totalNonRedCount);

    // We need to know the median read length for the data
    int readLength = 0;
    for (int i = 0; i < selectedChIPStores.length; i++) {
        readLength += selectedChIPStores[i].getTotalReadLength() / selectedChIPStores[i].getTotalReadCount();
    }
    readLength /= selectedChIPStores.length;

    double expectedCountsPerWindow = getExpectedCountPerWindow(totalNonRedCount,
            collection.genome().getTotalGenomeLength(), fragmentSize, readLength);
    PoissonDistribution poisson = new PoissonDistribution(expectedCountsPerWindow);
    int readCountCutoff = poisson.inverseCumulativeProbability(1 - pValue);
    // System.err.println("Threshold for enrichment in a window is " + readCountCutoff + " reads using a p-value of " + pValue + " and a mean of " + (totalNonRedCount / (collection.genome().getTotalGenomeLength() / (double) fragmentSize)));

    // Now we go back through the whole dataset to do a search for all possible candidate probes.
    // We re-use the peak vector we came up with before.
    potentialHighConfidencePeaks.clear();
    for (int c = 0; c < chromosomes.length; c++) {
        // Time for an update
        updateGenerationProgress("Finding candidate peaks on chromosome " + chromosomes[c].name(), c,
                chromosomes.length);
        Probe lastValidProbe = null;
        for (int startPosition = 1; startPosition < chromosomes[c].length() - fragmentSize;
                startPosition += fragmentSize / 2) {
            // See if we need to quit
            if (cancel) {
                generationCancelled();
                return;
            }
            // We expand the region we're looking at by the inter-strand distance as we're going to
            // be adjusting the read positions
            Probe probe = new Probe(chromosomes[c], startPosition, (startPosition + fragmentSize - 1));
            long[] mergedProbeReads = getReadsFromDataStoreCollection(probe, selectedChIPStores,
                    medianInterStrandDistance);
            mergedProbeReads = deduplicateReads(mergedProbeReads, redundantThreshold);
            SequenceRead.sort(mergedProbeReads);
            int thisProbeOverlapCount = 0;
            for (int i = 0; i < mergedProbeReads.length; i++) {
                if (SequenceRead.overlaps(mergedProbeReads[i], probe.packedPosition())) {
                    ++thisProbeOverlapCount;
                }
            }
            if (thisProbeOverlapCount > readCountCutoff) {
                // We have a potential high quality peak.
                // See if we can merge it with the last valid probe
                if (lastValidProbe != null
                        && SequenceRead.overlaps(lastValidProbe.packedPosition(), probe.packedPosition())) {
                    lastValidProbe = new Probe(chromosomes[c], lastValidProbe.start(), probe.end());
                } else if (lastValidProbe != null) {
                    potentialHighConfidencePeaks.add(lastValidProbe);
                    lastValidProbe = probe;
                } else {
                    lastValidProbe = probe;
                }
            }
        }
        if (lastValidProbe != null) {
            potentialHighConfidencePeaks.add(lastValidProbe);
        }
    }

    // Finally we re-filter the peaks we have using local poisson distributions with densities taken
    // from either the input samples (if there are any), or the local region. The densities are
    // estimated over 1, 5 and 10 kb around the peak and genome wide, and the max of these is taken.
    // If there is no input then the 1 kb region is not used.
    Probe[] allCandidateProbes = potentialHighConfidencePeaks.toArray(new Probe[0]);

    // Work out which stores we're using to validate against
    DataStore[] validationStores;
    boolean useInput = false;
    double inputCorrection = 1;
    int validationNonRedCount;
    if (selectedInputStores != null && selectedInputStores.length > 0) {
        // See if we need to quit
        if (cancel) {
            generationCancelled();
            return;
        }
        validationStores = selectedInputStores;
        useInput = true;
        // We also need to work out the total number of non-redundant sequences
        // in the input so we can work out a scaling factor so that the densities
        // for input and ChIP are comparable
        validationNonRedCount = getNonRedundantReadCount(validationStores, redundantThreshold);
        inputCorrection = totalNonRedCount / (double) validationNonRedCount;
        System.err.println("From chip=" + totalNonRedCount + " input=" + validationNonRedCount
                + " correction is " + inputCorrection);
    } else {
        validationStores = selectedChIPStores;
        validationNonRedCount = totalNonRedCount;
    }

    Vector<Probe> finalValidatedProbes = new Vector<Probe>();
    for (int p = 0; p < allCandidateProbes.length; p++) {
        // See if we need to quit
        if (cancel) {
            generationCancelled();
            return;
        }
        if (p % 100 == 0) {
            updateGenerationProgress("Validated " + p + " out of " + allCandidateProbes.length + " raw peaks",
                    p, allCandidateProbes.length);
        }
        // System.err.println("Validating " + allCandidateProbes[p].chromosome() + ":" + allCandidateProbes[p].start() + "-" + allCandidateProbes[p].end());

        // We now need to find the maximum read density per 2*bandwidth against which
        // we're going to validate this peak.
        // We're going to get all reads within 10kb of the peak, and then we can subselect from there
        int midPoint = allCandidateProbes[p].middle();
        Probe region10kb = new Probe(allCandidateProbes[p].chromosome(), Math.max(midPoint - 5000, 1),
                Math.min(midPoint + 4999, allCandidateProbes[p].chromosome().length()),
                allCandidateProbes[p].strand());
        Probe region5kb = new Probe(allCandidateProbes[p].chromosome(), Math.max(midPoint - 2500, 1),
                Math.min(midPoint + 2499, allCandidateProbes[p].chromosome().length()),
                allCandidateProbes[p].strand());
        Probe region1kb = new Probe(allCandidateProbes[p].chromosome(), Math.max(midPoint - 500, 1),
                Math.min(midPoint + 499, allCandidateProbes[p].chromosome().length()),
                allCandidateProbes[p].strand());

        // Get the reads for the largest region
        long[] thisRegionReads = getReadsFromDataStoreCollection(region10kb, validationStores, 0);
        // Deduplicate so it's a fair comparison
        thisRegionReads = deduplicateReads(thisRegionReads, redundantThreshold);
        // Should we recalculate the redundant threshold based on the input coverage?
        int region10kbcount = thisRegionReads.length;
        int region5kbcount = 0;
        int region1kbcount = 0;
        // Go through the reads seeing if they fit into the 5 or 1kb regions
        for (int r = 0; r < thisRegionReads.length; r++) {
            if (SequenceRead.overlaps(region5kb.packedPosition(), thisRegionReads[r]))
                ++region5kbcount;
            if (SequenceRead.overlaps(region1kb.packedPosition(), thisRegionReads[r]))
                ++region1kbcount;
        }
        // System.err.println("Input counts 10kb=" + region10kbcount + " 5kb=" + region5kbcount + " 1kb=" + region1kbcount);

        // Convert to densities per window and adjust for global coverage
        double globalDensity = getExpectedCountPerWindow(validationNonRedCount,
                collection.genome().getTotalGenomeLength(), allCandidateProbes[p].length(), readLength)
                * inputCorrection;
        double density10kb = getExpectedCountPerWindow(region10kbcount, region10kb.length(),
                allCandidateProbes[p].length(), readLength) * inputCorrection;
        double density5kb = getExpectedCountPerWindow(region5kbcount, region5kb.length(),
                allCandidateProbes[p].length(), readLength) * inputCorrection;
        double density1kb = getExpectedCountPerWindow(region1kbcount, region1kb.length(),
                allCandidateProbes[p].length(), readLength) * inputCorrection;

        // Find the highest density to use for the validation
        double highestDensity = globalDensity;
        if (density10kb > highestDensity)
            highestDensity = density10kb;
        if (density5kb > highestDensity)
            highestDensity = density5kb;
        if (useInput && density1kb > highestDensity)
            highestDensity = density1kb;
        // System.err.println("Global=" + globalDensity + " 10kb=" + density10kb + " 5kb=" + density5kb + " 1kb=" + density1kb + " using=" + highestDensity);

        // Construct a poisson distribution with this density
        PoissonDistribution localPoisson = new PoissonDistribution(highestDensity);
        // System.err.println("Cutoff from global=" + (new PoissonDistribution(globalDensity)).inverseCumulativeProbability(1 - pValue) + " 10kb=" + (new PoissonDistribution(density10kb)).inverseCumulativeProbability(1 - pValue) + " 5kb=" + (new PoissonDistribution(density5kb)).inverseCumulativeProbability(1 - pValue) + " 1kb=" + (new PoissonDistribution(density1kb)).inverseCumulativeProbability(1 - pValue));

        // Now check to see if the actual count from this peak is enough to still pass
        long[] mergedProbeReads = getReadsFromDataStoreCollection(allCandidateProbes[p], selectedChIPStores,
                medianInterStrandDistance);
        mergedProbeReads = deduplicateReads(mergedProbeReads, redundantThreshold);
        SequenceRead.sort(mergedProbeReads);
        int thisProbeOverlapCount = 0;
        for (int i = 0; i < mergedProbeReads.length; i++) {
            if (SequenceRead.overlaps(mergedProbeReads[i], allCandidateProbes[p].packedPosition())) {
                ++thisProbeOverlapCount;
            }
        }
        // System.err.println("Read count in ChIP is " + thisProbeOverlapCount);
        if (thisProbeOverlapCount > localPoisson.inverseCumulativeProbability(1 - pValue)) {
            finalValidatedProbes.add(allCandidateProbes[p]);
            // System.err.println("Adding probe to final set");
        }
    }
    // System.err.println("From " + allCandidateProbes.length + " candidates " + finalValidatedProbes.size() + " peaks were validated");

    ProbeSet finalSet = new ProbeSet(getDescription(), finalValidatedProbes.toArray(new Probe[0]));
    generationComplete(finalSet);
}
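The statistical core of this peak caller is the cutoff calculation: model the background read count per window as Poisson with the expected count as its mean, then take inverseCumulativeProbability(1 - pValue) as the smallest read count that is significantly enriched. A self-contained sketch of that step with made-up numbers (it also ignores the read-length adjustment that getExpectedCountPerWindow applies in the listing above):

import org.apache.commons.math3.distribution.PoissonDistribution;

public class EnrichmentCutoffDemo {
    public static void main(String[] args) {
        // Illustrative numbers, not taken from any real dataset
        double nonRedundantReads = 2_000_000;
        double genomeLength = 3_000_000_000d;
        double windowSize = 300;
        double pValue = 1e-5;

        // Expected reads falling in one window under a uniform background
        double expectedPerWindow = nonRedundantReads * windowSize / genomeLength;

        // Smallest count k with P(X <= k) >= 1 - pValue; counts above k have
        // probability < pValue under the background model
        PoissonDistribution background = new PoissonDistribution(expectedPerWindow);
        int readCountCutoff = background.inverseCumulativeProbability(1 - pValue);

        System.out.println("Expected per window = " + expectedPerWindow
                + ", cutoff = " + readCountCutoff);
    }
}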