List of usage examples for org.apache.commons.math.stat.descriptive.DescriptiveStatistics.getN()

public long getN()

Returns the number of values currently stored by the DescriptiveStatistics instance, as a long. When a finite window size is set, the count never exceeds the window size.
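Before the project examples below, a minimal self-contained sketch (written for this page, not taken from any of the projects) of what getN() reports:

import org.apache.commons.math.stat.descriptive.DescriptiveStatistics;

public class GetNExample {
    public static void main(String[] args) {
        DescriptiveStatistics stats = new DescriptiveStatistics();
        stats.addValue(1.0);
        stats.addValue(2.0);
        stats.addValue(3.0);
        // getN() returns the number of stored values as a long
        System.out.println("N = " + stats.getN());       // prints: N = 3
        System.out.println("Mean = " + stats.getMean()); // prints: Mean = 2.0
    }
}

Because getN() returns a long, several of the examples below cast it to int (for example, to size an array) or use it in modular arithmetic for progress reporting.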
From source file: com.mozilla.socorro.RawDumpSizeScan.java
public static void main(String[] args) throws ParseException {
    String startDateStr = args[0];
    String endDateStr = args[1];

    // Set both start/end time and start/stop row
    Calendar startCal = Calendar.getInstance();
    Calendar endCal = Calendar.getInstance();
    SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMdd");
    if (!StringUtils.isBlank(startDateStr)) {
        startCal.setTime(sdf.parse(startDateStr));
    }
    if (!StringUtils.isBlank(endDateStr)) {
        endCal.setTime(sdf.parse(endDateStr));
    }

    DescriptiveStatistics stats = new DescriptiveStatistics();
    long numNullRawBytes = 0L;
    HTable table = null;
    Map<String, Integer> rowValueSizeMap = new HashMap<String, Integer>();
    try {
        table = new HTable(TABLE_NAME_CRASH_REPORTS);
        Scan[] scans = generateScans(startCal, endCal);
        for (Scan s : scans) {
            ResultScanner rs = table.getScanner(s);
            Iterator<Result> iter = rs.iterator();
            while (iter.hasNext()) {
                Result r = iter.next();
                ImmutableBytesWritable rawBytes = r.getBytes();
                //length = r.getValue(RAW_DATA_BYTES, DUMP_BYTES);
                if (rawBytes != null) {
                    int length = rawBytes.getLength();
                    if (length > 20971520) { // flag rows larger than 20 MB
                        rowValueSizeMap.put(new String(r.getRow()), length);
                    }
                    stats.addValue(length);
                } else {
                    numNullRawBytes++;
                }
                // getN() is a running count of added values; report progress every 10,000 rows
                if (stats.getN() % 10000 == 0) {
                    System.out.println("Processed " + stats.getN());
                    System.out.println(String.format("Min: %.02f Max: %.02f Mean: %.02f", stats.getMin(),
                            stats.getMax(), stats.getMean()));
                    System.out.println(
                            String.format("1st Quartile: %.02f 2nd Quartile: %.02f 3rd Quartile: %.02f",
                                    stats.getPercentile(25.0d), stats.getPercentile(50.0d),
                                    stats.getPercentile(75.0d)));
                    System.out.println("Number of large entries: " + rowValueSizeMap.size());
                }
            }
            rs.close();
        }
        System.out.println("Finished Processing!");
        System.out.println(String.format("Min: %.02f Max: %.02f Mean: %.02f", stats.getMin(), stats.getMax(),
                stats.getMean()));
        System.out.println(String.format("1st Quartile: %.02f 2nd Quartile: %.02f 3rd Quartile: %.02f",
                stats.getPercentile(25.0d), stats.getPercentile(50.0d), stats.getPercentile(75.0d)));
        for (Map.Entry<String, Integer> entry : rowValueSizeMap.entrySet()) {
            System.out.println(String.format("RowId: %s => Length: %d", entry.getKey(), entry.getValue()));
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        if (table != null) {
            try {
                table.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
From source file: cs.cirg.cida.analysis.MannWhitneyUTest.java
@Override
public DataTable performTest(String... variableNames) {
    if (this.getExperiments().size() < 2) {
        return null;
    }
    IExperiment experiment1 = this.getExperiments().get(0);
    IExperiment experiment2 = this.getExperiments().get(1);
    DescriptiveStatistics stats1 = experiment1.getBottomRowStatistics(variableNames[0]);
    DescriptiveStatistics stats2 = experiment2.getBottomRowStatistics(variableNames[0]);

    // getN() returns a long, so a cast is needed to size the sample arrays.
    double[] xA = new double[(int) stats1.getN()];
    double[] xB = new double[(int) stats2.getN()];
    // Fill each sample in its own loop so unequal sample sizes cannot overrun the shorter array.
    for (int i = 0; i < xA.length; i++) {
        xA[i] = stats1.getElement(i);
    }
    for (int i = 0; i < xB.length; i++) {
        xB[i] = stats2.getElement(i);
    }

    MannWhitneyTest mannWhitneyTest = new MannWhitneyTest(xA, xB, alternativeHypothesis, 0.0, false);

    results = new StandardDataTable<StringType>();
    ArrayList<StringType> row = new ArrayList<StringType>();
    row.add(new StringType("N_A"));
    row.add(new StringType(stats1.getN() + ""));
    results.addRow(row);

    row = new ArrayList<StringType>();
    row.add(new StringType("N_B"));
    row.add(new StringType(stats2.getN() + ""));
    results.addRow(row);

    row = new ArrayList<StringType>();
    row.add(new StringType("P value"));
    row.add(new StringType(mannWhitneyTest.exactSP() + ""));
    results.addRow(row);

    row = new ArrayList<StringType>();
    row.add(new StringType("Approx P value"));
    row.add(new StringType(mannWhitneyTest.approxSP() + ""));
    results.addRow(row);

    row = new ArrayList<StringType>();
    row.add(new StringType("Ranks Sum A"));
    row.add(new StringType(mannWhitneyTest.getRankSumA() + ""));
    results.addRow(row);

    row = new ArrayList<StringType>();
    row.add(new StringType("Ranks Sum B"));
    row.add(new StringType(mannWhitneyTest.getRankSumB() + ""));
    results.addRow(row);

    row = new ArrayList<StringType>();
    row.add(new StringType("U"));
    row.add(new StringType(mannWhitneyTest.getStatistic() + ""));
    results.addRow(row);

    row = new ArrayList<StringType>();
    row.add(new StringType("Z"));
    row.add(new StringType(mannWhitneyTest.getZ() + ""));
    results.addRow(row);

    results.setColumnName(0, "Statistic");
    results.setColumnName(1, "Value");
    return this.getResults();
}
From source file: cal.binBased.BinMSnSpectrum.java
public double[] getBin_spectrum(int shift) {
    ArrayList<Double> bin_spec_al = new ArrayList<Double>();
    double binSize = (fragment_tolerance * 2), upperLimit = max_value + 0.00001;
    for (double lowerLimit = min_value; lowerLimit < upperLimit; lowerLimit = lowerLimit + binSize) {
        double tmp_intensity_bin = 0;
        DescriptiveStatistics obj = new DescriptiveStatistics();
        for (Peak p : peakList) {
            double mz = p.getMz() + shift;
            if (mz >= lowerLimit && mz < lowerLimit + binSize) {
                obj.addValue(p.intensity);
            }
        }
        // Only compute an intensity if the bin received at least one peak.
        if (obj.getN() > 0) {
            if (intensities_sum_or_mean_or_median == 0) {
                tmp_intensity_bin = obj.getSum();
            } else if (intensities_sum_or_mean_or_median == 1) {
                tmp_intensity_bin = obj.getMean();
            } else if (intensities_sum_or_mean_or_median == 2) {
                tmp_intensity_bin = obj.getPercentile(50);
            }
        }
        // store every bin of the spectrum
        bin_spec_al.add(tmp_intensity_bin);
    }
    // convert the ArrayList to a double array
    bin_size = bin_spec_al.size();
    double[] bin_spectrum = new double[bin_spec_al.size()];
    for (int i = 0; i < bin_spec_al.size(); i++) {
        bin_spectrum[i] = bin_spec_al.get(i);
    }
    return bin_spectrum;
}
From source file: cal.binBased.BinMSnSpectrum.java
private ArrayList<double[]> prepareBinSpectra() {
    // first prepare the bin spectra to be filled with zeros
    int size = (2 * correctionFactor) + 1;
    ArrayList<double[]> shiftedSpectra = new ArrayList<double[]>(size);
    for (int i = 0; i < size; i++) {
        double[] shiftedSpectrum = new double[bin_size];
        shiftedSpectra.add(shiftedSpectrum);
    }
    // now fill each bin spectrum with the correct mz values
    double binSize = (fragment_tolerance * 2), upperLimit = max_value + 0.00001;
    int current_index = 0;
    for (double lowerLimit = min_value + correctionFactor; lowerLimit < upperLimit - correctionFactor;
            lowerLimit = lowerLimit + binSize) {
        double tmp_intensity_bin = 0;
        DescriptiveStatistics obj = new DescriptiveStatistics();
        for (Peak p : peakList) {
            double mz = p.getMz();
            if (mz >= lowerLimit && mz < lowerLimit + binSize) {
                obj.addValue(p.intensity);
            }
        }
        // Only compute an intensity if the bin received at least one peak.
        if (obj.getN() > 0) {
            if (intensities_sum_or_mean_or_median == 0) {
                tmp_intensity_bin = obj.getSum();
            } else if (intensities_sum_or_mean_or_median == 1) {
                tmp_intensity_bin = obj.getMean();
            } else if (intensities_sum_or_mean_or_median == 2) {
                tmp_intensity_bin = obj.getPercentile(50);
            }
        }
        // write the bin intensity into every shifted spectrum
        int filling_index = current_index;
        for (double[] shifted : shiftedSpectra) {
            shifted[filling_index] = tmp_intensity_bin;
            filling_index++;
        }
        current_index++;
    }
    return shiftedSpectra;
}
From source file: edu.usc.goffish.gopher.sample.N_Hop_Stat_Collector.java
@Override
public void compute(List<SubGraphMessage> subGraphMessages) {
    /*
     * We do this in the following steps:
     * 1. Calculate stats for each subgraph.
     * 2. Calculate aggregate stats for the partition.
     *    In this case a single sub-graph will do the aggregation.
     * 3. Aggregate partition-level stats and combine at the smallest partition.
     */
    if (superStep == 0) {
        SubGraphMessage msg = subGraphMessages.get(0);
        String data = new String(msg.getData());
        String[] dataSplit = data.split("#");
        N = Integer.parseInt(dataSplit[0]);
        String[] vps = dataSplit[1].split(",");
        for (String vp : vps) {
            vantagePoints.add(vp.trim());
        }
        try {
            Iterable<? extends ISubgraphInstance> subgraphInstances = subgraph.getInstances(Long.MIN_VALUE,
                    Long.MAX_VALUE, PropertySet.EmptyPropertySet, subgraph.getEdgeProperties(), false);
            // sliceManager.readInstances(subgraph, Long.MIN_VALUE, Long.MAX_VALUE,
            //         PropertySet.EmptyPropertySet, subgraph.getEdgeProperties());
            for (ISubgraphInstance instance : subgraphInstances) {
                Map<String, DescriptiveStatistics> statsMap = new HashMap<String, DescriptiveStatistics>();
                for (TemplateEdge edge : subgraph.edges()) {
                    ISubgraphObjectProperties edgeProps = instance.getPropertiesForEdge(edge.getId());
                    Integer isExist = (Integer) edgeProps.getValue(IS_EXIST_PROP);
                    if (isExist == 1) {
                        String[] vantageIps = ((String) edgeProps.getValue(VANTAGE_IP_PROP)).split(",");
                        String[] latencies = ((String) edgeProps.getValue(LATENCY_PROP)).split(",");
                        String[] hops = ((String) edgeProps.getValue(HOP_PROP)).split(",");
                        Integer[] vantangeIdx = vantageIpIndex(vantageIps);
                        if (vantangeIdx == null) {
                            continue;
                        }
                        for (int i : vantangeIdx) {
                            String vantage = vantageIps[i];
                            String latency = latencies[i];
                            String hop = hops[i];
                            double latency_num = Double.parseDouble(latency);
                            int hop_num = Integer.parseInt(hop);
                            if (latency_num >= 0 && hop_num == N) {
                                if (statsMap.containsKey(vantage)) {
                                    statsMap.get(vantage).addValue(latency_num);
                                } else {
                                    DescriptiveStatistics statistics = new DescriptiveStatistics();
                                    statistics.addValue(latency_num);
                                    statsMap.put(vantage, statistics);
                                }
                            }
                        }
                    }
                }
                int c = 0;
                StringBuffer msgBuffer = new StringBuffer();
                for (String v : statsMap.keySet()) {
                    c++;
                    DescriptiveStatistics statistics = statsMap.get(v);
                    String m = createMessageString(v, instance.getTimestampStart(), instance.getTimestampEnd(),
                            statistics.getStandardDeviation(), statistics.getMean(), statistics.getN());
                    if (c == statsMap.keySet().size()) {
                        msgBuffer.append(m);
                    } else {
                        msgBuffer.append(m).append("|");
                    }
                }
                SubGraphMessage subMsg = new SubGraphMessage(msgBuffer.toString().getBytes());
                sentMessage(partition.getId(), subMsg);
            }
        } catch (IOException e) {
            e.printStackTrace();
            throw new RuntimeException(e);
        }
    } else if (superStep == 1) {
        // Here every sub-graph will receive messages from its own partition.
        // Each message belongs to a given time span.
        Map<String, List<String[]>> vantageGroup = new HashMap<String, List<String[]>>();
        for (SubGraphMessage subGraphMessage : subGraphMessages) {
            String msgData = new String(subGraphMessage.getData());
            // "|" must be escaped: String.split() takes a regex, and a bare "|" splits on every character.
            String[] dataParts = msgData.split("\\|");
            for (String data : dataParts) {
                String[] vantageParts = data.split(",");
                // Group by vantage point and startTime
                if (vantageGroup.containsKey(vantageParts[0] + "|" + vantageParts[1])) {
                    vantageGroup.get(vantageParts[0] + "|" + vantageParts[1]).add(vantageParts);
                } else {
                    ArrayList<String[]> arrayList = new ArrayList<String[]>();
                    arrayList.add(vantageParts);
                    vantageGroup.put(vantageParts[0] + "|" + vantageParts[1], arrayList);
                }
            }
        }
        for (String key : vantageGroup.keySet()) {
            if (!acquireLock(key)) {
                continue;
            }
            List<String[]> data = vantageGroup.get(key);
            double totalN = 0;
            double totalAvgVal = 0;
            double totalVar = 0;
            for (String[] d : data) {
                // weighted average of the per-subgraph means; d[5] is the count reported by getN()
                double mean = Double.parseDouble(d[4]);
                long sN = Long.parseLong(d[5]);
                totalN += sN;
                totalAvgVal += mean * sN;
                double sd = Double.parseDouble(d[3]);
                totalVar += ((double) sd * sd) / ((double) sN);
            }
            double avg = totalAvgVal / totalN;
            double newSD = Math.sqrt(totalVar);
            // create message, sent to all the partitions except me
            String msg = key + "," + newSD + "," + avg + "," + totalN;
            for (int pid : partitions) {
                sentMessage(pid, new SubGraphMessage(msg.getBytes()));
            }
        }
    } else if (superStep >= 2) {
        if (partition.getId() == Collections.min(partitions)) {
            Map<String, List<String[]>> group = new HashMap<String, List<String[]>>();
            for (SubGraphMessage msg : subGraphMessages) {
                String data = new String(msg.getData());
                String[] dataParts = data.split(",");
                if (group.containsKey(dataParts[0])) {
                    group.get(dataParts[0]).add(dataParts);
                } else {
                    List<String[]> list = new ArrayList<String[]>();
                    list.add(dataParts);
                    group.put(dataParts[0], list);
                }
            }
            if (!acquireLock("" + partition.getId())) {
                voteToHalt();
                return;
            }
            PrintWriter writer;
            try {
                writer = new PrintWriter(new FileWriter("TimeSeriesStats.csv"));
            } catch (IOException e) {
                e.printStackTrace();
                throw new RuntimeException(e);
            }
            for (String key : group.keySet()) {
                List<String[]> data = group.get(key);
                double totalN = 0;
                double totalAvgVal = 0;
                double totalVar = 0;
                for (String[] d : data) {
                    // message layout: key, newSD, avg, totalN
                    double mean = Double.parseDouble(d[2]);
                    long sN = Long.parseLong(d[3]);
                    totalN += sN;
                    totalAvgVal += mean * sN;
                    double sd = Double.parseDouble(d[1]);
                    totalVar += ((double) sd * sd) / ((double) sN);
                }
                double avg = totalAvgVal / totalN;
                double newSD = Math.sqrt(totalVar);
                // again, "|" must be escaped in the split regex
                String vantage = key.split("\\|")[0];
                String timeStamp = key.split("\\|")[1];
                log(writer, vantage, timeStamp, avg, newSD);
            }
            writer.flush();
            voteToHalt();
        }
    }
}
From source file: guineu.modules.dataanalysis.foldChanges.FoldTestTask.java
public double Foldtest(int mol) throws IllegalArgumentException, MathException {
    DescriptiveStatistics stats1 = new DescriptiveStatistics();
    DescriptiveStatistics stats2 = new DescriptiveStatistics();
    String parameter1 = "";
    try {
        // Determine groups for selected raw data files
        List<String> availableParameterValues = dataset.getParameterAvailableValues(parameter);
        int numberOfGroups = availableParameterValues.size();
        if (numberOfGroups > 1) {
            parameter1 = availableParameterValues.get(0);
            String parameter2 = availableParameterValues.get(1);
            for (String sampleName : dataset.getAllColumnNames()) {
                if (dataset.getParametersValue(sampleName, parameter) != null
                        && dataset.getParametersValue(sampleName, parameter).equals(parameter1)) {
                    stats1.addValue((Double) this.dataset.getRow(mol).getPeak(sampleName));
                } else if (dataset.getParametersValue(sampleName, parameter) != null
                        && dataset.getParametersValue(sampleName, parameter).equals(parameter2)) {
                    stats2.addValue((Double) this.dataset.getRow(mol).getPeak(sampleName));
                }
            }
        } else {
            return -1;
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    // Only compute the fold change when both groups contain at least one value.
    if (stats1.getN() > 0 && stats2.getN() > 0) {
        /*double[] sortValues1 = stats1.getSortedValues();
        double[] sortValues2 = stats2.getSortedValues();
        return sortValues1[((int) stats1.getN() / 2)] / sortValues2[((int) stats2.getN() / 2)];*/
        return stats1.getMean() / stats2.getMean();
    } else {
        return 0;
    }
}
From source file: com.joliciel.talismane.other.corpus.CorpusStatistics.java
@Override
public void onNextParseConfiguration(ParseConfiguration parseConfiguration, Writer writer) {
    sentenceCount++;
    sentenceLengthStats.addValue(parseConfiguration.getPosTagSequence().size());

    for (PosTaggedToken posTaggedToken : parseConfiguration.getPosTagSequence()) {
        if (posTaggedToken.getTag().equals(PosTag.ROOT_POS_TAG))
            continue;
        Token token = posTaggedToken.getToken();
        String word = token.getOriginalText();
        words.add(word);
        if (referenceWords != null) {
            if (!referenceWords.contains(word))
                unknownTokenCount++;
        }
        if (alphanumeric.matcher(token.getOriginalText()).find()) {
            String lowercase = word.toLowerCase(TalismaneSession.getLocale());
            lowerCaseWords.add(lowercase);
            alphanumericCount++;
            if (referenceLowercaseWords != null) {
                if (!referenceLowercaseWords.contains(lowercase))
                    unknownAlphanumericCount++;
            }
        }
        tokenCount++;

        Integer countObj = posTagCounts.get(posTaggedToken.getTag().getCode());
        int count = countObj == null ? 0 : countObj.intValue();
        count++;
        posTagCounts.put(posTaggedToken.getTag().getCode(), count);
    }

    int maxDepth = 0;
    DescriptiveStatistics avgSyntaxDepthForSentenceStats = new DescriptiveStatistics();
    for (DependencyArc arc : parseConfiguration.getDependencies()) {
        Integer countObj = depLabelCounts.get(arc.getLabel());
        int count = countObj == null ? 0 : countObj.intValue();
        count++;
        depLabelCounts.put(arc.getLabel(), count);
        totalDepCount++;
        if (arc.getHead().getTag().equals(PosTag.ROOT_POS_TAG)
                && (arc.getLabel() == null || arc.getLabel().length() == 0)) {
            // do nothing for unattached stuff (e.g. punctuation)
        } else if (arc.getLabel().equals("ponct")) {
            // do nothing for punctuation
        } else {
            int depth = 0;
            DependencyArc theArc = arc;
            while (theArc != null && !theArc.getHead().getTag().equals(PosTag.ROOT_POS_TAG)) {
                theArc = parseConfiguration.getGoverningDependency(theArc.getHead());
                depth++;
            }
            if (depth > maxDepth)
                maxDepth = depth;
            syntaxDepthStats.addValue(depth);
            avgSyntaxDepthForSentenceStats.addValue(depth);
            int distance = Math
                    .abs(arc.getHead().getToken().getIndex() - arc.getDependent().getToken().getIndex());
            syntaxDistanceStats.addValue(distance);
        }
        maxSyntaxDepthStats.addValue(maxDepth);
        // guard against taking the mean of an empty per-sentence sample
        if (avgSyntaxDepthForSentenceStats.getN() > 0)
            avgSyntaxDepthStats.addValue(avgSyntaxDepthForSentenceStats.getMean());
    }

    // We cheat a little bit by only allowing each arc to count once.
    // There could be a situation where there are two independent non-projective arcs
    // crossing the same mother arc, but we prefer here to underestimate,
    // as this phenomenon is quite rare.
    Set<DependencyArc> nonProjectiveArcs = new HashSet<DependencyArc>();
    int i = 0;
    for (DependencyArc arc : parseConfiguration.getDependencies()) {
        i++;
        if (arc.getHead().getTag().equals(PosTag.ROOT_POS_TAG)
                && (arc.getLabel() == null || arc.getLabel().length() == 0))
            continue;
        if (nonProjectiveArcs.contains(arc))
            continue;
        int headIndex = arc.getHead().getToken().getIndex();
        int depIndex = arc.getDependent().getToken().getIndex();
        int startIndex = headIndex < depIndex ? headIndex : depIndex;
        int endIndex = headIndex >= depIndex ? headIndex : depIndex;
        int j = 0;
        for (DependencyArc otherArc : parseConfiguration.getDependencies()) {
            j++;
            if (j <= i)
                continue;
            if (otherArc.getHead().getTag().equals(PosTag.ROOT_POS_TAG)
                    && (otherArc.getLabel() == null || otherArc.getLabel().length() == 0))
                continue;
            if (nonProjectiveArcs.contains(otherArc))
                continue;
            int headIndex2 = otherArc.getHead().getToken().getIndex();
            int depIndex2 = otherArc.getDependent().getToken().getIndex();
            int startIndex2 = headIndex2 < depIndex2 ? headIndex2 : depIndex2;
            int endIndex2 = headIndex2 >= depIndex2 ? headIndex2 : depIndex2;
            // two arcs are non-projective if their spans cross
            boolean nonProjective = false;
            if (startIndex2 < startIndex && endIndex2 > startIndex && endIndex2 < endIndex) {
                nonProjective = true;
            } else if (startIndex2 > startIndex && startIndex2 < endIndex && endIndex2 > endIndex) {
                nonProjective = true;
            }
            if (nonProjective) {
                nonProjectiveArcs.add(arc);
                nonProjectiveArcs.add(otherArc);
                nonProjectiveCount++;
                LOG.debug("Non-projective arcs in sentence: " + parseConfiguration.getSentence().getText());
                LOG.debug(arc.toString());
                LOG.debug(otherArc.toString());
                break;
            }
        }
    }
}
From source file: net.sf.mzmine.modules.peaklistmethods.dataanalysis.heatmaps.HeatMapTask.java
private double[][] groupingDataset(UserParameter selectedParameter, String referenceGroup) {
    // Collect all data files
    Vector<RawDataFile> allDataFiles = new Vector<RawDataFile>();
    DescriptiveStatistics meanControlStats = new DescriptiveStatistics();
    DescriptiveStatistics meanGroupStats = new DescriptiveStatistics();
    allDataFiles.addAll(Arrays.asList(peakList.getRawDataFiles()));

    // Determine the reference group and non-reference group (the rest of
    // the samples) for raw data files
    List<RawDataFile> referenceDataFiles = new ArrayList<RawDataFile>();
    List<RawDataFile> nonReferenceDataFiles = new ArrayList<RawDataFile>();
    List<String> groups = new ArrayList<String>();
    MZmineProject project = MZmineCore.getCurrentProject();

    for (RawDataFile rawDataFile : allDataFiles) {
        Object paramValue = project.getParameterValue(selectedParameter, rawDataFile);
        if (!groups.contains(String.valueOf(paramValue))) {
            groups.add(String.valueOf(paramValue));
        }
        if (String.valueOf(paramValue).equals(referenceGroup)) {
            referenceDataFiles.add(rawDataFile);
        } else {
            nonReferenceDataFiles.add(rawDataFile);
        }
    }

    int numRows = 0;
    for (int row = 0; row < peakList.getNumberOfRows(); row++) {
        if (!onlyIdentified || (onlyIdentified && peakList.getRow(row).getPeakIdentities().length > 0)) {
            numRows++;
        }
    }

    // Create a new aligned peak list with all the samples if the reference
    // group has to be shown, or with only the non-reference group if not.
    double[][] dataMatrix = new double[groups.size() - 1][numRows];
    pValueMatrix = new String[groups.size() - 1][numRows];

    // data files that should be in the heat map
    List<RawDataFile> shownDataFiles = nonReferenceDataFiles;

    for (int row = 0, rowIndex = 0; row < peakList.getNumberOfRows(); row++) {
        PeakListRow rowPeak = peakList.getRow(row);
        if (!onlyIdentified || (onlyIdentified && rowPeak.getPeakIdentities().length > 0)) {

            // Average area or height of the reference group
            meanControlStats.clear();
            for (int column = 0; column < referenceDataFiles.size(); column++) {
                if (rowPeak.getPeak(referenceDataFiles.get(column)) != null) {
                    if (area) {
                        meanControlStats.addValue(rowPeak.getPeak(referenceDataFiles.get(column)).getArea());
                    } else {
                        meanControlStats.addValue(rowPeak.getPeak(referenceDataFiles.get(column)).getHeight());
                    }
                }
            }

            // Divide the area or height of each peak by the average of the
            // area or height of the reference peaks in each row
            int columnIndex = 0;
            for (int column = 0; column < groups.size(); column++) {
                String group = groups.get(column);
                meanGroupStats.clear();
                if (!group.equals(referenceGroup)) {
                    for (int dataColumn = 0; dataColumn < shownDataFiles.size(); dataColumn++) {
                        Object paramValue = project.getParameterValue(selectedParameter,
                                shownDataFiles.get(dataColumn));
                        if (rowPeak.getPeak(shownDataFiles.get(dataColumn)) != null
                                && String.valueOf(paramValue).equals(group)) {
                            Feature peak = rowPeak.getPeak(shownDataFiles.get(dataColumn));
                            if (!Double.isInfinite(peak.getArea()) && !Double.isNaN(peak.getArea())) {
                                if (area) {
                                    meanGroupStats.addValue(peak.getArea());
                                } else {
                                    meanGroupStats.addValue(peak.getHeight());
                                }
                            }
                        }
                    }

                    double value = meanGroupStats.getMean() / meanControlStats.getMean();
                    // a p-value is only meaningful when both groups have more than one observation
                    if (meanGroupStats.getN() > 1 && meanControlStats.getN() > 1) {
                        pValueMatrix[columnIndex][rowIndex] = this.getPvalue(meanGroupStats, meanControlStats);
                    } else {
                        pValueMatrix[columnIndex][rowIndex] = "";
                    }
                    if (log) {
                        value = Math.log(value);
                    }
                    dataMatrix[columnIndex++][rowIndex] = value;
                }
            }
            rowIndex++;
        }
    }

    // Scale the data by dividing the peak area/height by the standard
    // deviation of each column
    if (scale) {
        scale(dataMatrix);
    }

    // Create two arrays: row and column names
    rowNames = new String[dataMatrix[0].length];
    colNames = new String[groups.size() - 1];
    int columnIndex = 0;
    for (String group : groups) {
        if (!group.equals(referenceGroup)) {
            colNames[columnIndex++] = group;
        }
    }
    for (int row = 0, rowIndex = 0; row < peakList.getNumberOfRows(); row++) {
        if (!onlyIdentified || (onlyIdentified && peakList.getRow(row).getPeakIdentities().length > 0)) {
            if (peakList.getRow(row).getPeakIdentities() != null
                    && peakList.getRow(row).getPeakIdentities().length > 0) {
                rowNames[rowIndex++] = peakList.getRow(row).getPreferredPeakIdentity().getName();
            } else {
                rowNames[rowIndex++] = "Unknown";
            }
        }
    }

    return dataMatrix;
}
From source file: datafu.hourglass.jobs.StagedOutputJob.java
/**
 * Writes Hadoop counters and other task statistics to a file in the file system.
 *
 * @param fs the file system to write the counters file to
 * @throws IOException
 */
private void writeCounters(final FileSystem fs) throws IOException {
    final Path actualOutputPath = FileOutputFormat.getOutputPath(this);

    SimpleDateFormat timestampFormat = new SimpleDateFormat("yyyyMMddHHmmss");
    String suffix = timestampFormat.format(new Date());

    if (_countersParentPath != null) {
        if (!fs.exists(_countersParentPath)) {
            _log.info("Creating counter parent path " + _countersParentPath);
            fs.mkdirs(_countersParentPath, FsPermission.valueOf("-rwxrwxr-x"));
        }
        // make the name as unique as possible in this case because this may be a directory
        // where other counter files will be dropped
        _countersPath = new Path(_countersParentPath, ".counters." + suffix);
    } else {
        _countersPath = new Path(actualOutputPath, ".counters." + suffix);
    }

    _log.info(String.format("Writing counters to %s", _countersPath));
    FSDataOutputStream counterStream = fs.create(_countersPath);
    BufferedOutputStream buffer = new BufferedOutputStream(counterStream, 256 * 1024);
    OutputStreamWriter writer = new OutputStreamWriter(buffer);

    for (String groupName : getCounters().getGroupNames()) {
        for (Counter counter : getCounters().getGroup(groupName)) {
            writeAndLog(writer, String.format("%s=%d", counter.getName(), counter.getValue()));
        }
    }

    JobID jobID = this.getJobID();
    org.apache.hadoop.mapred.JobID oldJobId = new org.apache.hadoop.mapred.JobID(jobID.getJtIdentifier(),
            jobID.getId());

    long minStart = Long.MAX_VALUE;
    long maxFinish = 0;
    long setupStart = Long.MAX_VALUE;
    long cleanupFinish = 0;
    DescriptiveStatistics mapStats = new DescriptiveStatistics();
    DescriptiveStatistics reduceStats = new DescriptiveStatistics();
    boolean success = true;

    JobClient jobClient = new JobClient(this.conf);
    Map<String, String> taskIdToType = new HashMap<String, String>();

    TaskReport[] setupReports = jobClient.getSetupTaskReports(oldJobId);
    if (setupReports.length > 0) {
        _log.info("Processing setup reports");
        for (TaskReport report : jobClient.getSetupTaskReports(oldJobId)) {
            taskIdToType.put(report.getTaskID().toString(), "SETUP");
            if (report.getStartTime() == 0) {
                _log.warn("Skipping report with zero start time");
                continue;
            }
            setupStart = Math.min(setupStart, report.getStartTime());
        }
    } else {
        _log.error("No setup reports");
    }

    TaskReport[] mapReports = jobClient.getMapTaskReports(oldJobId);
    if (mapReports.length > 0) {
        _log.info("Processing map reports");
        for (TaskReport report : mapReports) {
            taskIdToType.put(report.getTaskID().toString(), "MAP");
            if (report.getFinishTime() == 0 || report.getStartTime() == 0) {
                _log.warn("Skipping report with zero start or finish time");
                continue;
            }
            minStart = Math.min(minStart, report.getStartTime());
            mapStats.addValue(report.getFinishTime() - report.getStartTime());
        }
    } else {
        _log.error("No map reports");
    }

    TaskReport[] reduceReports = jobClient.getReduceTaskReports(oldJobId);
    if (reduceReports.length > 0) {
        _log.info("Processing reduce reports");
        for (TaskReport report : reduceReports) {
            taskIdToType.put(report.getTaskID().toString(), "REDUCE");
            if (report.getFinishTime() == 0 || report.getStartTime() == 0) {
                _log.warn("Skipping report with zero start or finish time");
                continue;
            }
            maxFinish = Math.max(maxFinish, report.getFinishTime());
            reduceStats.addValue(report.getFinishTime() - report.getStartTime());
        }
    } else {
        _log.error("No reduce reports");
    }

    TaskReport[] cleanupReports = jobClient.getCleanupTaskReports(oldJobId);
    if (cleanupReports.length > 0) {
        _log.info("Processing cleanup reports");
        for (TaskReport report : cleanupReports) {
            taskIdToType.put(report.getTaskID().toString(), "CLEANUP");
            if (report.getFinishTime() == 0) {
                _log.warn("Skipping report with finish time of zero");
                continue;
            }
            cleanupFinish = Math.max(cleanupFinish, report.getFinishTime());
        }
    } else {
        _log.error("No cleanup reports");
    }

    if (minStart == Long.MAX_VALUE) {
        _log.error("Could not determine map-reduce start time");
        success = false;
    }
    if (maxFinish == 0) {
        _log.error("Could not determine map-reduce finish time");
        success = false;
    }
    if (setupStart == Long.MAX_VALUE) {
        _log.error("Could not determine setup start time");
        success = false;
    }
    if (cleanupFinish == 0) {
        _log.error("Could not determine cleanup finish time");
        success = false;
    }

    // Collect statistics on successful/failed/killed task attempts, categorized by setup/map/reduce/cleanup.
    // Unfortunately the job client doesn't have an easier way to get these statistics.
    Map<String, Integer> attemptStats = new HashMap<String, Integer>();
    _log.info("Processing task attempts");
    for (TaskCompletionEvent event : getTaskCompletionEvents(jobClient, oldJobId)) {
        String type = taskIdToType.get(event.getTaskAttemptId().getTaskID().toString());
        String status = event.getTaskStatus().toString();
        String key = String.format("%s_%s_ATTEMPTS", status, type);
        if (!attemptStats.containsKey(key)) {
            attemptStats.put(key, 0);
        }
        attemptStats.put(key, attemptStats.get(key) + 1);
    }

    if (success) {
        writeAndLog(writer, String.format("SETUP_START_TIME_MS=%d", setupStart));
        writeAndLog(writer, String.format("CLEANUP_FINISH_TIME_MS=%d", cleanupFinish));
        writeAndLog(writer, String.format("COMPLETE_WALL_CLOCK_TIME_MS=%d", cleanupFinish - setupStart));
        writeAndLog(writer, String.format("MAP_REDUCE_START_TIME_MS=%d", minStart));
        writeAndLog(writer, String.format("MAP_REDUCE_FINISH_TIME_MS=%d", maxFinish));
        writeAndLog(writer, String.format("MAP_REDUCE_WALL_CLOCK_TIME_MS=%d", maxFinish - minStart));
        // getN() reports how many task durations were recorded
        writeAndLog(writer, String.format("MAP_TOTAL_TASKS=%d", (long) mapStats.getN()));
        writeAndLog(writer, String.format("MAP_MAX_TIME_MS=%d", (long) mapStats.getMax()));
        writeAndLog(writer, String.format("MAP_MIN_TIME_MS=%d", (long) mapStats.getMin()));
        writeAndLog(writer, String.format("MAP_AVG_TIME_MS=%d", (long) mapStats.getMean()));
        writeAndLog(writer, String.format("MAP_STD_TIME_MS=%d", (long) mapStats.getStandardDeviation()));
        writeAndLog(writer, String.format("MAP_SUM_TIME_MS=%d", (long) mapStats.getSum()));
        writeAndLog(writer, String.format("REDUCE_TOTAL_TASKS=%d", (long) reduceStats.getN()));
        writeAndLog(writer, String.format("REDUCE_MAX_TIME_MS=%d", (long) reduceStats.getMax()));
        writeAndLog(writer, String.format("REDUCE_MIN_TIME_MS=%d", (long) reduceStats.getMin()));
        writeAndLog(writer, String.format("REDUCE_AVG_TIME_MS=%d", (long) reduceStats.getMean()));
        writeAndLog(writer, String.format("REDUCE_STD_TIME_MS=%d", (long) reduceStats.getStandardDeviation()));
        writeAndLog(writer, String.format("REDUCE_SUM_TIME_MS=%d", (long) reduceStats.getSum()));
        writeAndLog(writer, String.format("MAP_REDUCE_SUM_TIME_MS=%d",
                (long) mapStats.getSum() + (long) reduceStats.getSum()));
        for (Map.Entry<String, Integer> attemptStat : attemptStats.entrySet()) {
            writeAndLog(writer, String.format("%s=%d", attemptStat.getKey(), attemptStat.getValue()));
        }
    }

    writer.close();
    buffer.close();
    counterStream.close();
}
From source file: de.tudarmstadt.ukp.experiments.argumentation.sequence.feature.coreference.CoreferenceFeatures.java
@Override
protected List<Feature> extract(JCas jCas, Sentence sentence, String sentencePrefix)
        throws TextClassificationException {
    List<List<CoreferenceLink>> coreferenceChains = extractCoreferenceChains(jCas);

    FrequencyDistribution<String> featuresAcrossAllChains = new FrequencyDistribution<>();
    DescriptiveStatistics chainLength = new DescriptiveStatistics();
    DescriptiveStatistics distanceToPreviousSentence = new DescriptiveStatistics();
    DescriptiveStatistics distanceToNextSentence = new DescriptiveStatistics();
    DescriptiveStatistics interSentencesCorLinks = new DescriptiveStatistics();

    for (List<CoreferenceLink> chain : coreferenceChains) {
        SortedMap<Integer, List<CoreferenceLink>> sentencesAndLinks =
                extractSentencesAndLinksFromChain(chain, jCas);
        int currentSentencePos = getCurrentSentencePos(jCas, sentence);

        log.debug(sentencesAndLinks.keySet() + ", current " + currentSentencePos);

        // is the sentence in a chain that spans more sentences?
        boolean partOfChain = sentencesAndLinks.containsKey(currentSentencePos) && sentencesAndLinks.size() > 1;

        // is part of a chain?
        if (partOfChain) {
            log.debug(chainToString(chain));
            featuresAcrossAllChains.inc(FN_PART_OF_CHAIN);

            // starts the chain?
            if (sentencesAndLinks.firstKey().equals(currentSentencePos)) {
                featuresAcrossAllChains.inc(FN_STARTS_THE_CHAIN);
            } else if (sentencesAndLinks.lastKey().equals(currentSentencePos)) {
                // ends the chain?
                featuresAcrossAllChains.inc(FN_ENDS_THE_CHAIN);
            } else {
                // in the middle of the chain?
                featuresAcrossAllChains.inc(FN_IN_THE_MIDDLE_OF_CHAIN);
            }

            // length of the chain
            chainLength.addValue(sentencesAndLinks.size());

            List<CoreferenceLink> currentSentenceLinks = sentencesAndLinks.get(currentSentencePos);
            CoreferenceLink currentSentenceFirstLink = currentSentenceLinks.get(0);
            CoreferenceLink currentSentenceLastLink = currentSentenceLinks.get(currentSentenceLinks.size() - 1);

            // transition from the previous link, i.e. NOMINAL -> PRONOMINAL
            if (!sentencesAndLinks.firstKey().equals(currentSentencePos)) {
                // find the previous sentence
                List<CoreferenceLink> previousSentenceLinks = null;
                int prevSentNo = currentSentencePos;
                while (previousSentenceLinks == null && prevSentNo >= 0) {
                    prevSentNo--;
                    if (sentencesAndLinks.containsKey(prevSentNo)) {
                        previousSentenceLinks = sentencesAndLinks.get(prevSentNo);
                    }
                }
                if (previousSentenceLinks == null) {
                    throw new IllegalStateException("Oops :))");
                }

                // distance to previous sentence
                distanceToPreviousSentence.addValue(currentSentencePos - prevSentNo);

                // get the last link from the previous sentence
                CoreferenceLink prevSentenceLastLink = previousSentenceLinks
                        .get(previousSentenceLinks.size() - 1);

                // add type - type transition
                String prevSentenceLastLinkReferenceType = prevSentenceLastLink.getReferenceType();
                String currentSentenceFirstLinkReferenceType = currentSentenceFirstLink.getReferenceType();
                String transitionType = prevSentenceLastLinkReferenceType + GLUE
                        + currentSentenceFirstLinkReferenceType;
                featuresAcrossAllChains.addSample(FN_TRANSITION_IN_TYPE_TYPE + transitionType, 1);

                // add token - type transition
                String glueCoreferenceCurrentSentence = glueCoreferenceLinkTokens(currentSentenceFirstLink);
                String typeToken = prevSentenceLastLinkReferenceType + GLUE + glueCoreferenceCurrentSentence;
                featuresAcrossAllChains.addSample(FN_TRANSITION_IN_TYPE_TOKEN + typeToken, 1);

                // add type - token transition
                String glueCoreferencePrevSentence = glueCoreferenceLinkTokens(prevSentenceLastLink);
                String tokenType = glueCoreferencePrevSentence + GLUE + currentSentenceFirstLinkReferenceType;
                featuresAcrossAllChains.addSample(FN_TRANSITION_IN_TOKEN_TYPE + tokenType, 1);

                // add token - token transition
                String tokenToken = glueCoreferencePrevSentence + GLUE + glueCoreferenceCurrentSentence;
                featuresAcrossAllChains.addSample(FN_TRANSITION_IN_TOKEN_TOKEN + tokenToken, 1);

                // exact matching token-token reference?
                if (glueCoreferencePrevSentence.equals(glueCoreferenceCurrentSentence)) {
                    featuresAcrossAllChains.addSample(FN_TRANSITION_IN_TOKEN_TOKEN_MATCH, 1);
                }
            }

            // transition to the next link, i.e. NOMINAL -> PRONOMINAL
            if (!sentencesAndLinks.lastKey().equals(currentSentencePos)) {
                // find the next sentence
                List<CoreferenceLink> nextSentenceLinks = null;
                int nextSentNo = currentSentencePos;
                while (nextSentenceLinks == null && nextSentNo <= sentencesAndLinks.lastKey()) {
                    nextSentNo++;
                    if (sentencesAndLinks.containsKey(nextSentNo)) {
                        nextSentenceLinks = sentencesAndLinks.get(nextSentNo);
                    }
                }
                if (nextSentenceLinks == null) {
                    throw new IllegalStateException("Oops :))");
                }

                // distance to next sentence
                distanceToNextSentence.addValue(nextSentNo - currentSentencePos);

                // get the first link from the next sentence
                CoreferenceLink nextSentenceFirstLink = nextSentenceLinks.get(0);

                // add type - type transition
                String currentSentenceLastLinkReferenceType = currentSentenceLastLink.getReferenceType();
                String nextSentenceFirstLinkReferenceType = nextSentenceFirstLink.getReferenceType();
                String transitionType = currentSentenceLastLinkReferenceType + GLUE
                        + nextSentenceFirstLinkReferenceType;
                featuresAcrossAllChains.addSample(FN_TRANSITION_OUT_TYPE_TYPE + transitionType, 1);

                // add token - type transition
                String glueCoreferenceCurrentSent = glueCoreferenceLinkTokens(currentSentenceLastLink);
                String typeToken = glueCoreferenceCurrentSent + GLUE + nextSentenceFirstLinkReferenceType;
                featuresAcrossAllChains.addSample(FN_TRANSITION_OUT_TOKEN_TYPE + typeToken, 1);

                // add type - token transition
                String glueCoreferenceNextSent = glueCoreferenceLinkTokens(nextSentenceFirstLink);
                String tokenType = currentSentenceLastLinkReferenceType + GLUE + glueCoreferenceNextSent;
                featuresAcrossAllChains.addSample(FN_TRANSITION_OUT_TYPE_TOKEN + tokenType, 1);

                // add token - token transition
                String tokenToken = glueCoreferenceCurrentSent + GLUE + glueCoreferenceNextSent;
                featuresAcrossAllChains.addSample(FN_TRANSITION_OUT_TOKEN_TOKEN + tokenToken, 1);

                // exact matching token-token reference?
                if (glueCoreferenceNextSent.equals(glueCoreferenceCurrentSent)) {
                    featuresAcrossAllChains.addSample(FN_TRANSITION_OUT_TOKEN_TOKEN_MATCH, 1);
                }
            }
        }

        // number of inter-sentence coreference links
        if (sentencesAndLinks.containsKey(currentSentencePos)) {
            int coreferenceLinks = sentencesAndLinks.get(currentSentencePos).size();
            interSentencesCorLinks.addValue(coreferenceLinks);
        }

        /*
        List<Integer> positions = positionsOfSentenceInCurrentChain(chain, sentence);
        // ok, we're in a chain
        if (!positions.isEmpty()) {
            log.debug(printChain(chain));
            log.debug(sentence.getCoveredText());
            log.debug(positions);
            Integer lastPosition = positions.get(positions.size() - 1);
            Integer firstPosition = positions.get(0);
            if (lastPosition == positions.size() - 1) {
                log.debug("Last sentence of chain");
            }
            log.debug("-----");
        }
        */
    }

    List<Feature> result = new ArrayList<>();

    log.debug(featuresAcrossAllChains);
    // getN() > 0 guards against calling getMin()/getMax()/getMean() on empty statistics,
    // which would yield NaN features
    if (distanceToNextSentence.getN() > 0) {
        log.debug("Next: " + distanceToNextSentence);
        result.add(new Feature(sentencePrefix + FEATURE_NAME + FN_DIST_TO_NEXT_MIN,
                distanceToNextSentence.getMin()));
        result.add(new Feature(sentencePrefix + FEATURE_NAME + FN_DIST_TO_NEXT_MAX,
                distanceToNextSentence.getMax()));
        result.add(new Feature(sentencePrefix + FEATURE_NAME + FN_DIST_TO_NEXT_AVG,
                distanceToNextSentence.getMean()));
    }
    if (distanceToPreviousSentence.getN() > 0) {
        log.debug("Prev: " + distanceToPreviousSentence);
        result.add(new Feature(sentencePrefix + FEATURE_NAME + FN_DIST_TO_PREV_MIN,
                distanceToPreviousSentence.getMin()));
        result.add(new Feature(sentencePrefix + FEATURE_NAME + FN_DIST_TO_PREV_MAX,
                distanceToPreviousSentence.getMax()));
        result.add(new Feature(sentencePrefix + FEATURE_NAME + FN_DIST_TO_PREV_AVG,
                distanceToPreviousSentence.getMean()));
    }
    if (interSentencesCorLinks.getN() > 0) {
        result.add(new Feature(sentencePrefix + FEATURE_NAME + FN_INTER_SENT_COR_MIN,
                interSentencesCorLinks.getMin()));
        result.add(new Feature(sentencePrefix + FEATURE_NAME + FN_INTER_SENT_COR_MAX,
                interSentencesCorLinks.getMax()));
        result.add(new Feature(sentencePrefix + FEATURE_NAME + FN_INTER_SENT_COR_AVG,
                interSentencesCorLinks.getMean()));
    }

    log.debug("----");
    for (String feat : featuresAcrossAllChains.getKeys()) {
        // binary
        result.add(new Feature(sentencePrefix + FEATURE_NAME + feat, 1));
    }
    return result;
}