List of usage examples for org.apache.commons.math.stat.descriptive.DescriptiveStatistics.getMean()
public double getMean()
From source file: com.joliciel.jochre.graphics.SegmenterImpl.java
/**
 * Detects paragraph splits and assigns rows to the correct paragraphs.
 *
 * <p>Steps, in the order they appear below:
 * <ol>
 *   <li>Find horizontal bands of white space taller than 1.3 times the median row x-height and
 *       use them to split the page rows into "areas".</li>
 *   <li>Within each area, chain rows into vertical columns by horizontal overlap with the last
 *       row of each still-open column.</li>
 *   <li>Group columns of consecutive areas that overlap one-to-one, so that margin statistics
 *       can be shared across them.</li>
 *   <li>For each column group, cluster the adjusted right and left row coordinates with DBSCAN
 *       to derive a margin and (optionally) a tab position on each side, and store the
 *       resulting start margin / start tab / end margin on every column.</li>
 *   <li>Count indents versus outdents following partial rows to decide whether the page is
 *       indented or outdented.</li>
 *   <li>Walk each column and create paragraphs on {@code sourceImage}; rows with at most two
 *       shapes become standalone one-row paragraphs.</li>
 * </ol>
 *
 * <p>When {@code drawSegmentation} is set, the margin and tab lines are also drawn through
 * {@code graphics2D}; both names are declared outside this method.
 *
 * @param sourceImage the page whose rows are read via {@code getRows()} and on which
 *        paragraphs are created via {@code newParagraph()}
 */
void groupRowsIntoParagraphs(SourceImage sourceImage) {
    LOG.debug("########## groupRowsIntoParagraphs #########");
    // We'll use various possible indicators, including
    // indented start, indented end, and spacing between rows.

    // On pages with a single big paragraph makes it hypersensitive to differences in row-start/row-end
    // This means we cannot use deviation. Instead, we use the average shape width on the page.
    // We also adjust maxLeft & minRight to match the vertical line slope

    // This is now complicated by the possibility of multiple columns

    // Need to take into account a big horizontal space - Pietrushka page 14
    // Find horizontal spaces that go all the way across and are wider than a certain threshold
    // simply do a boolean column and black out everything in a row, then see if there are any remaining spaces above a certain threshold
    // Columns are thus arranged into "areas", separated by white-space.

    // fullRows[y] is true when pixel row y lies inside at least one text row.
    boolean[] fullRows = new boolean[sourceImage.getHeight()];
    for (RowOfShapes row : sourceImage.getRows()) {
        for (int y = row.getTop(); y <= row.getBottom(); y++) {
            fullRows[y] = true;
        }
    }

    DescriptiveStatistics rowHeightStats = new DescriptiveStatistics();
    for (RowOfShapes row : sourceImage.getRows()) {
        int height = row.getXHeight();
        rowHeightStats.addValue(height);
    }
    // Despite the variable name and the log label, this is the median (50th percentile), not the mean.
    double avgRowHeight = rowHeightStats.getPercentile(50);
    LOG.debug("meanRowHeight: " + avgRowHeight);
    double minHeightForWhiteSpace = avgRowHeight * 1.3;
    LOG.debug("minHeightForWhiteSpace: " + minHeightForWhiteSpace);

    // find the "white rows" - any horizontal white space
    // in the page which is sufficiently high
    // Each entry is {firstWhiteY, lastWhiteY}.
    List<int[]> whiteRows = new ArrayList<int[]>();
    boolean inWhite = false;
    int startWhite = 0;
    for (int y = 0; y < sourceImage.getHeight(); y++) {
        if (!inWhite && !fullRows[y]) {
            inWhite = true;
            startWhite = y;
        } else if (inWhite && fullRows[y]) {
            int length = y - startWhite;
            if (length > minHeightForWhiteSpace) {
                LOG.debug("Adding whiteRow " + startWhite + "," + (y - 1));
                whiteRows.add(new int[] { startWhite, y - 1 });
            }
            inWhite = false;
        }
    }
    // White space running to the bottom of the page is added regardless of its height.
    if (inWhite)
        whiteRows.add(new int[] { startWhite, sourceImage.getHeight() - 1 });
    // Sentinel entry at the page height so that the loop below also emits the final area.
    whiteRows.add(new int[] { sourceImage.getHeight(), sourceImage.getHeight() });

    // place rows in "areas" defined by the "white rows" found above
    List<List<RowOfShapes>> areas = new ArrayList<List<RowOfShapes>>();
    int startY = -1;
    for (int[] whiteRow : whiteRows) {
        List<RowOfShapes> area = new ArrayList<RowOfShapes>();
        for (RowOfShapes row : sourceImage.getRows()) {
            if (row.getTop() >= startY && row.getBottom() <= whiteRow[0]) {
                area.add(row);
            }
        }
        if (area.size() > 0) {
            areas.add(area);
        }
        startY = whiteRow[1];
    }

    // break up each area into vertical columns
    LOG.debug("break up each area into vertical columns");
    // columns collects every column on the page; columnsPerAreaList keeps them per area, in area order.
    List<Column> columns = new ArrayList<Column>();
    List<List<Column>> columnsPerAreaList = new ArrayList<List<Column>>();
    for (List<RowOfShapes> area : areas) {
        LOG.debug("Next area");
        List<Column> columnsPerArea = new ArrayList<SegmenterImpl.Column>();
        columnsPerAreaList.add(columnsPerArea);
        // Iterate the area's rows in the order given by RowOfShapesVerticalLocationComparator
        // (presumably top to bottom - confirm against the comparator).
        TreeSet<RowOfShapes> rows = new TreeSet<RowOfShapes>(new RowOfShapesVerticalLocationComparator());
        rows.addAll(area);
        for (RowOfShapes row : rows) {
            // try to place this row in one of the columns directly above it.
            // this means that a row which overlaps more than one column has to "close" this column, so it is no longer considered
            List<Column> overlappingColumns = new ArrayList<Column>();
            for (Column column : columnsPerArea) {
                if (!column.closed) {
                    RowOfShapes lastRowInColumn = column.get(column.size() - 1);
                    // Horizontal overlap test on x-adjusted coordinates.
                    if (row.getRight() - row.getXAdjustment() >= lastRowInColumn.getLeft()
                            - lastRowInColumn.getXAdjustment()
                            && row.getLeft() - row.getXAdjustment() <= lastRowInColumn.getRight()
                                    - lastRowInColumn.getXAdjustment()) {
                        overlappingColumns.add(column);
                    }
                }
            }
            if (overlappingColumns.size() == 1) {
                Column myColumn = overlappingColumns.get(0);
                // NOTE(review): the variable is named "last row" but index 0 is the FIRST row added
                // to the column; every other lookup here uses size() - 1. Confirm which is intended.
                RowOfShapes lastRowInMyColumn = myColumn.get(0);

                // close any columns that are now at a distance of more than one row
                for (Column column : columnsPerArea) {
                    if (!column.closed && !column.equals(myColumn)) {
                        RowOfShapes lastRowInColumn = column.get(column.size() - 1);
                        if (lastRowInMyColumn.getTop() > lastRowInColumn.getBottom()) {
                            column.closed = true;
                            LOG.debug("Closing distant column " + lastRowInColumn);
                        }
                    }
                }

                myColumn.add(row);
                LOG.debug(row.toString());
                LOG.debug("    added to column " + lastRowInMyColumn);
            } else {
                // Zero or several overlapping columns: close all of them and start a new column.
                for (Column overlappingColumn : overlappingColumns) {
                    overlappingColumn.closed = true;
                    RowOfShapes lastRowInColumn = overlappingColumn.get(overlappingColumn.size() - 1);
                    LOG.debug("Closing overlapping column " + lastRowInColumn);
                }
                Column myColumn = new Column(sourceImage);
                myColumn.add(row);
                LOG.debug("Found new column");
                LOG.debug(row.toString());
                columns.add(myColumn);
                columnsPerArea.add(myColumn);
            }
        }
    } // next area

    // Refresh each column's derived values (top, bottom, adjustedLeft, adjustedRight are read below;
    // presumably recalculate() sets them - confirm in Column).
    for (Column column : columns)
        column.recalculate();

    // Intermediate step to reform the vertical columns, if they exist
    // basically the idea is that if the columns are aligned vertically, then the thresholds for paragraph indents
    // should be shared, to increase the statistical sample size and reduce anomalies.
    // We'll assume that two columns from two consecutive areas are in the same vertical group if they overlap with each other horizontally
    // and don't overlap with any other column in the other column's area.
    List<List<Column>> columnGroups = new ArrayList<List<Column>>();
    List<Column> columnsInPrevArea = null;
    for (List<Column> columnsPerArea : columnsPerAreaList) {
        if (columnsInPrevArea != null) {
            for (Column prevColumn : columnsInPrevArea) {
                LOG.debug("Checking " + prevColumn);
                // find the column group containing the previous column
                List<Column> myColumnGroup = null;
                for (List<Column> columnGroup : columnGroups) {
                    if (columnGroup.contains(prevColumn)) {
                        myColumnGroup = columnGroup;
                        break;
                    }
                }
                if (myColumnGroup == null) {
                    myColumnGroup = new ArrayList<SegmenterImpl.Column>();
                    LOG.debug("Creating column group for column " + prevColumn.toString());
                    columnGroups.add(myColumnGroup);
                    myColumnGroup.add(prevColumn);
                }

                // does only one column overlap with this one?
                Column overlappingColumn = null;
                for (Column column : columnsPerArea) {
                    if (column.adjustedRight >= prevColumn.adjustedLeft
                            && column.adjustedLeft <= prevColumn.adjustedRight) {
                        if (overlappingColumn == null) {
                            LOG.debug("I overlap with " + column);
                            overlappingColumn = column;
                        } else {
                            LOG.debug("But I overlap also with " + column);
                            overlappingColumn = null;
                            break;
                        }
                    }
                }
                if (overlappingColumn != null) {
                    // does it overlap with only me?
                    for (Column otherPrevColumn : columnsInPrevArea) {
                        if (otherPrevColumn.equals(prevColumn))
                            continue;
                        if (overlappingColumn.adjustedRight >= otherPrevColumn.adjustedLeft
                                && overlappingColumn.adjustedLeft <= otherPrevColumn.adjustedRight) {
                            LOG.debug("But it overlaps also with " + otherPrevColumn);
                            overlappingColumn = null;
                            break;
                        }
                    }
                }
                if (overlappingColumn != null) {
                    myColumnGroup.add(overlappingColumn);
                    LOG.debug("Adding " + overlappingColumn);
                    LOG.debug(" to group with " + prevColumn);
                }
            } // next previous column
        } // have previous columns
        columnsInPrevArea = columnsPerArea;
    } // next area

    // The loop above only creates groups for columns of a "previous" area, so columns of the
    // final area that were not attached to any group still need a group of their own.
    if (columnsInPrevArea != null) {
        for (Column prevColumn : columnsInPrevArea) {
            // find the column group containing the previous column
            List<Column> myColumnGroup = null;
            for (List<Column> columnGroup : columnGroups) {
                if (columnGroup.contains(prevColumn)) {
                    myColumnGroup = columnGroup;
                    break;
                }
            }
            if (myColumnGroup == null) {
                myColumnGroup = new ArrayList<SegmenterImpl.Column>();
                LOG.debug("Creating column group for column " + prevColumn.toString());
                columnGroups.add(myColumnGroup);
                myColumnGroup.add(prevColumn);
            }
        }
    }

    // What we really want here is, for each column (in the case of right-to-left),
    // two clusters on the right
    // and one relatively big cluster on the left.
    // anything outside of the cluster on the left is an EOP.
    // NOTE(review): hasTab is never assigned after this line within this method, so the
    // "hasTab: " debug line further down always logs false; the per-column flag is column.hasTab.
    boolean hasTab = false;
    for (List<Column> columnGroup : columnGroups) {
        LOG.debug("Next column group");
        double averageShapeWidth = sourceImage.getAverageShapeWidth();
        LOG.debug("averageShapeWidth: " + averageShapeWidth);
        // Value passed as the first argument to both DBSCAN cluster() calls below.
        double epsilon = averageShapeWidth / 2.0;
        LOG.debug("epsilon: " + epsilon);

        // Bounding box of the whole column group (rounded to int).
        int columnGroupTop = sourceImage.getHeight();
        int columnGroupBottom = 0;
        int columnGroupLeft = sourceImage.getWidth();
        int columnGroupRight = 0;
        for (Column column : columnGroup) {
            if (column.top < columnGroupTop)
                columnGroupTop = (int) Math.round(column.top);
            if (column.bottom > columnGroupBottom)
                columnGroupBottom = (int) Math.round(column.bottom);
            if (column.adjustedLeft < columnGroupLeft)
                columnGroupLeft = (int) Math.round(column.adjustedLeft);
            if (column.adjustedRight > columnGroupRight)
                columnGroupRight = (int) Math.round(column.adjustedRight);
        }

        // right thresholds
        LOG.debug("Calculating right thresholds");

        // first, create a DBScan cluster of all rows by their adjusted right coordinate
        // rightHandRows and rightCoordinates are parallel lists (same index = same row).
        List<RowOfShapes> rightHandRows = new ArrayList<RowOfShapes>();
        List<double[]> rightCoordinates = new ArrayList<double[]>();

        for (Column column : columnGroup) {
            for (RowOfShapes row : column) {
                double right = row.getRight() - row.getXAdjustment();
                // double rightOverlap = this.findLargeShapeOverlapOnRight(row, column, sourceImage);
                // if (rightOverlap==0) {
                //     // leave out any right-overlapping rows here
                //     // since we need accurate statistics for margin detection
                //     // This is questionable - especially since a long vertical bar (see Petriushka)
                //     // tends to give all rows a left overlap. Also, because the overlap is calculated based
                //     // on the mean right & mean left, not based on any sort of margin clusters.
                //     rightHandRows.add(row);
                //     rightCoordinates.add(new double[] {right});
                // }
                rightHandRows.add(row);
                rightCoordinates.add(new double[] { right });
            }
        }

        int minCardinalityForRightMargin = 5;
        DBSCANClusterer<RowOfShapes> rightMarginClusterer = new DBSCANClusterer<RowOfShapes>(rightHandRows,
                rightCoordinates);
        Set<Set<RowOfShapes>> rowClusters = rightMarginClusterer.cluster(epsilon, minCardinalityForRightMargin,
                true);

        // NOTE(review): the loop below breaks at the first cluster smaller than the minimum
        // cardinality, which only works if CardinalityComparator orders largest-first - confirm.
        TreeSet<Set<RowOfShapes>> orderedRowClusters = new TreeSet<Set<RowOfShapes>>(
                new CardinalityComparator<RowOfShapes>());
        orderedRowClusters.addAll(rowClusters);

        int i = 0;

        // find the two right-most clusters, and assume they are the margin & the tab
        DescriptiveStatistics rightMarginStats = null;
        DescriptiveStatistics rightTabStats = null;
        for (Set<RowOfShapes> cluster : orderedRowClusters) {
            DescriptiveStatistics rightStats = new DescriptiveStatistics();
            MeanAbsoluteDeviation rightDev = new MeanAbsoluteDeviation();
            for (RowOfShapes row : cluster) {
                int rowIndex = rightHandRows.indexOf(row);
                double right = rightCoordinates.get(rowIndex)[0];
                rightStats.addValue(right);
                rightDev.increment(right);
            }

            LOG.debug("Cluster " + i + ". Cardinality=" + cluster.size());
            LOG.debug("Right mean : " + rightStats.getMean());
            LOG.debug("Right dev: " + rightDev.getResult());

            if (cluster.size() >= minCardinalityForRightMargin) {
                // The cluster with the largest mean becomes the margin; the displaced or
                // next-largest one becomes the tab.
                if (rightMarginStats == null || rightMarginStats.getMean() < rightStats.getMean()) {
                    if (rightMarginStats != null)
                        rightTabStats = rightMarginStats;
                    rightMarginStats = rightStats;
                } else if (rightTabStats == null || rightTabStats.getMean() < rightStats.getMean()) {
                    rightTabStats = rightStats;
                }
            } else {
                break;
            }
            i++;
        } // next right-coordinate cluster

        double rightMargin = sourceImage.getWidth();
        double rightTab = sourceImage.getWidth();
        if (rightMarginStats != null) {
            rightMargin = rightMarginStats.getMean();
        } else {
            // No usable cluster: fall back to the left edge of the nearest column separator that
            // spans the group vertically and lies at or beyond its right edge.
            List<Rectangle> columnSeparators = sourceImage.findColumnSeparators();
            for (Rectangle columnSeparator : columnSeparators) {
                if (columnSeparator.getTop() <= columnGroupTop && columnSeparator.getBottom() >= columnGroupBottom
                        && columnSeparator.getLeft() >= columnGroupRight) {
                    if (columnSeparator.getLeft() < rightMargin)
                        rightMargin = columnSeparator.getLeft();
                }
            }
        }
        if (rightTabStats != null) {
            rightTab = rightTabStats.getMean();
        }

        LOG.debug("rightMargin: " + rightMargin);
        LOG.debug("rightTab: " + rightTab);

        // left thresholds
        LOG.debug("Calculating left thresholds");

        // first, create a DBScan cluster of all rows by their adjusted left coordinate
        // leftHandRows and leftCoordinates are parallel lists (same index = same row).
        List<RowOfShapes> leftHandRows = new ArrayList<RowOfShapes>();
        List<double[]> leftCoordinates = new ArrayList<double[]>();

        for (Column column : columnGroup) {
            for (RowOfShapes row : column) {
                double left = row.getLeft() - row.getXAdjustment();
                // double leftOverlap = this.findLargeShapeOverlapOnLeft(row, column, sourceImage);
                // if (leftOverlap == 0) {
                //     // leave out any overlapping rows from margin calcs,
                //     // since we need accurate statistics here
                //     leftHandRows.add(row);
                //     leftCoordinates.add(new double[] {left});
                // }
                leftHandRows.add(row);
                leftCoordinates.add(new double[] { left });
            }
        }

        int minCardinalityForLeftMargin = 5;
        DBSCANClusterer<RowOfShapes> leftMarginClusterer = new DBSCANClusterer<RowOfShapes>(leftHandRows,
                leftCoordinates);
        Set<Set<RowOfShapes>> leftRowClusters = leftMarginClusterer.cluster(epsilon, minCardinalityForLeftMargin,
                true);

        TreeSet<Set<RowOfShapes>> orderedLeftRowClusters = new TreeSet<Set<RowOfShapes>>(
                new CardinalityComparator<RowOfShapes>());
        orderedLeftRowClusters.addAll(leftRowClusters);

        i = 0;

        // find the two left-most clusters, and assume they are the margin & the tab
        DescriptiveStatistics leftMarginStats = null;
        DescriptiveStatistics leftTabStats = null;
        for (Set<RowOfShapes> cluster : orderedLeftRowClusters) {
            DescriptiveStatistics leftStats = new DescriptiveStatistics();
            MeanAbsoluteDeviation leftDev = new MeanAbsoluteDeviation();
            for (RowOfShapes row : cluster) {
                int rowIndex = leftHandRows.indexOf(row);
                double left = leftCoordinates.get(rowIndex)[0];
                leftStats.addValue(left);
                leftDev.increment(left);
            }

            LOG.debug("Cluster " + i + ". Cardinality=" + cluster.size());
            LOG.debug("Left mean : " + leftStats.getMean());
            LOG.debug("Left dev: " + leftDev.getResult());

            if (cluster.size() >= minCardinalityForLeftMargin) {
                // Mirror of the right-hand side: the smallest mean becomes the margin.
                if (leftMarginStats == null || leftMarginStats.getMean() > leftStats.getMean()) {
                    if (leftMarginStats != null)
                        leftTabStats = leftMarginStats;
                    leftMarginStats = leftStats;
                } else if (leftTabStats == null || leftTabStats.getMean() > leftStats.getMean()) {
                    leftTabStats = leftStats;
                }
            } else {
                break;
            }
            i++;
        } // next left-coordinate cluster

        double leftMargin = 0;
        double leftTab = 0;
        if (leftMarginStats != null) {
            leftMargin = leftMarginStats.getMean();
        } else {
            // No usable cluster: fall back to the right edge of the nearest column separator that
            // spans the group vertically and lies at or before its left edge.
            List<Rectangle> columnSeparators = sourceImage.findColumnSeparators();
            for (Rectangle columnSeparator : columnSeparators) {
                if (columnSeparator.getTop() <= columnGroupTop && columnSeparator.getBottom() >= columnGroupBottom
                        && columnSeparator.getRight() <= columnGroupLeft) {
                    if (columnSeparator.getRight() > leftMargin)
                        leftMargin = columnSeparator.getRight();
                }
            }
        }
        if (leftTabStats != null) {
            leftTab = leftTabStats.getMean();
        }

        LOG.debug("leftMargin: " + leftMargin);
        LOG.debug("leftTab: " + leftTab);

        // Map left/right onto start/end according to the reading direction; when no tab cluster
        // was found, place the tab five average shape widths inside the start margin.
        for (Column column : columnGroup) {
            if (sourceImage.isLeftToRight()) {
                column.startMargin = leftMargin;
                if (leftTabStats != null) {
                    column.startTab = leftTab;
                    column.hasTab = true;
                } else {
                    LOG.debug("No left tab - setting based on left margin");
                    column.startTab = leftMargin + (5.0 * sourceImage.getAverageShapeWidth());
                    column.hasTab = false;
                }

                column.endMargin = rightMargin;
            } else {
                column.startMargin = rightMargin;
                if (rightTabStats != null) {
                    column.startTab = rightTab;
                    column.hasTab = true;
                } else {
                    LOG.debug("No right tab - setting based on right margin");
                    column.startTab = rightMargin - (5.0 * sourceImage.getAverageShapeWidth());
                    column.hasTab = false;
                }

                column.endMargin = leftMargin;
            }
            LOG.debug("Margins for " + column);
            LOG.debug("startMargin: " + column.startMargin);
            LOG.debug("startTab: " + column.startTab);
            LOG.debug("endMargin: " + column.endMargin);
        } // next column
    } // next column group
    LOG.debug("hasTab: " + hasTab);

    // Tolerance applied to every margin/tab comparison below.
    double safetyMargin = 1.5 * sourceImage.getAverageShapeWidth();

    // Now, paragraphs are either "indented", "outdented" or not "dented" at all (no tabs).
    // This applies to the entire page.
    // To recognise indenting vs. outdenting, we have to see if the row preceding each
    // indent/outdent is full or partial. In the case of indentation, partial rows will
    // typically be followed by an indent. In the case of outdentation, partial rows will
    // typically be followed by an outdent.
    boolean isIndented = true;

    int indentCount = 0;
    int outdentCount = 0;
    for (List<Column> columnGroup : columnGroups) {
        LOG.debug("Next column group");
        // Declared per group, so the "previous row was partial" state carries across the
        // columns of a group.
        boolean prevRowPartial = false;
        for (Column column : columnGroup) {
            if (column.hasTab) {
                for (RowOfShapes row : column) {
                    if (sourceImage.isLeftToRight()) {
                        if (prevRowPartial) {
                            if (row.getLeft() - row.getXAdjustment() > column.startTab - safetyMargin) {
                                indentCount++;
                            } else if (row.getLeft() - row.getXAdjustment() < column.startMargin
                                    + safetyMargin) {
                                outdentCount++;
                            }
                        }
                        if (row.getRight() - row.getXAdjustment() < column.endMargin - safetyMargin) {
                            prevRowPartial = true;
                        } else {
                            prevRowPartial = false;
                        }
                    } else {
                        if (prevRowPartial) {
                            if (row.getRight() - row.getXAdjustment() < column.startTab + safetyMargin) {
                                indentCount++;
                            } else if (row.getRight() - row.getXAdjustment() > column.startMargin
                                    - safetyMargin) {
                                outdentCount++;
                            }
                        }
                        if (row.getLeft() - row.getXAdjustment() > column.endMargin + safetyMargin) {
                            prevRowPartial = true;
                        } else {
                            prevRowPartial = false;
                        }
                    } // left-to-right?
                } // next row
            } // column has tab
        } // next column
    } // next column group
    // Indentation is the default: outdenting is chosen only when outdents exceed indents by more than 2.
    isIndented = (indentCount + 2 >= outdentCount);
    LOG.debug("indentCount: " + indentCount);
    LOG.debug("outdentCount: " + outdentCount);
    LOG.debug("isIndented: " + isIndented);

    // order the columns
    // Uses Column's natural ordering (presumably Comparable - confirm); the sorted result
    // replaces the contents of the columns list.
    TreeSet<Column> orderedColumns = new TreeSet<SegmenterImpl.Column>(columns);
    columns.clear();
    columns.addAll(orderedColumns);

    // find the paragraphs found in each column
    for (Column column : columns) {
        LOG.debug("--- Next column ---");

        // break up the column into paragraphs
        Paragraph paragraph = null;
        RowOfShapes previousRow = null;
        int maxShapesForStandaloneParagraph = 2;
        // Short rows are buffered here and emitted as one-row paragraphs just before the next
        // regular paragraph starts (or at the end of the column).
        List<RowOfShapes> rowsForStandaloneParagraphs = new ArrayList<RowOfShapes>();
        // Previous row's debug-drawing points; only updated when drawSegmentation is set.
        Point2D previousPointStartMargin = null;
        Point2D previousPointStartTab = null;
        Point2D previousPointEndMargin = null;

        for (RowOfShapes row : column) {
            boolean rowForStandaloneParagraph = false;
            boolean newParagraph = false;
            if (row.getShapes().size() <= maxShapesForStandaloneParagraph) {
                rowsForStandaloneParagraphs.add(row);
                rowForStandaloneParagraph = true;
            } else {
                double rightOverlap = this.findLargeShapeOverlapOnRight(row, column, sourceImage);
                double leftOverlap = this.findLargeShapeOverlapOnLeft(row, column, sourceImage);

                if (drawSegmentation) {
                    // Debug rendering: compute this row's threshold positions (margins shifted by
                    // the safety margin and the overlaps) and join them to the previous row's.
                    double rowVerticalMidPoint = row.getBaseLineMiddlePoint();
                    double startMarginX = column.startMargin + row.getXAdjustment();
                    double startTabX = column.startTab + row.getXAdjustment();
                    double endMarginX = column.endMargin + row.getXAdjustment();

                    if (sourceImage.isLeftToRight()) {
                        startMarginX += safetyMargin;
                        startTabX -= safetyMargin;
                        endMarginX -= safetyMargin;

                        startMarginX += leftOverlap;
                        startTabX += leftOverlap;
                        endMarginX -= rightOverlap;
                    } else {
                        startMarginX -= safetyMargin;
                        startTabX += safetyMargin;
                        endMarginX += safetyMargin;

                        startMarginX -= rightOverlap;
                        startTabX -= rightOverlap;
                        endMarginX += leftOverlap;
                    }

                    Point2D.Double currentPointStartMargin = new Point2D.Double(startMarginX,
                            rowVerticalMidPoint);
                    Point2D.Double currentPointStartTab = new Point2D.Double(startTabX, rowVerticalMidPoint);
                    Point2D.Double currentPointEndMargin = new Point2D.Double(endMarginX, rowVerticalMidPoint);

                    if (previousPointStartMargin != null) {
                        graphics2D.setStroke(new BasicStroke(1));
                        graphics2D.setPaint(Color.BLUE);
                        graphics2D.drawLine((int) Math.round(previousPointStartMargin.getX()),
                                (int) Math.round(previousPointStartMargin.getY()),
                                (int) Math.round(currentPointStartMargin.getX()),
                                (int) Math.round(currentPointStartMargin.getY()));
                        graphics2D.drawLine((int) Math.round(previousPointEndMargin.getX()),
                                (int) Math.round(previousPointEndMargin.getY()),
                                (int) Math.round(currentPointEndMargin.getX()),
                                (int) Math.round(currentPointEndMargin.getY()));

                        graphics2D.setPaint(Color.RED);
                        graphics2D.drawLine((int) Math.round(previousPointStartTab.getX()),
                                (int) Math.round(previousPointStartTab.getY()),
                                (int) Math.round(currentPointStartTab.getX()),
                                (int) Math.round(currentPointStartTab.getY()));

                        // NOTE(review): this repeats the end-margin segment already drawn in blue
                        // above, now in red (paint was already RED) - confirm it is intentional.
                        graphics2D.setPaint(Color.RED);
                        graphics2D.drawLine((int) Math.round(previousPointEndMargin.getX()),
                                (int) Math.round(previousPointEndMargin.getY()),
                                (int) Math.round(currentPointEndMargin.getX()),
                                (int) Math.round(currentPointEndMargin.getY()));
                    }
                    previousPointStartMargin = currentPointStartMargin;
                    previousPointStartTab = currentPointStartTab;
                    previousPointEndMargin = currentPointEndMargin;
                }

                if (previousRow == null) {
                    LOG.debug("New paragraph (first)");
                    newParagraph = true;
                } else {
                    // NOTE(review): the "previous EOP" tests combine previousRow's coordinates with
                    // the overlap computed for the CURRENT row - confirm that is intended.
                    if (sourceImage.isLeftToRight()) {
                        if (previousRow.getRight() - previousRow.getXAdjustment()
                                - rightOverlap < column.endMargin - safetyMargin) {
                            LOG.debug("New paragraph (previous EOP)");
                            newParagraph = true;
                        } else if (column.hasTab && isIndented && row.getLeft() - row.getXAdjustment()
                                + leftOverlap > column.startTab - safetyMargin) {
                            LOG.debug("New paragraph (indent)");
                            newParagraph = true;
                        } else if (column.hasTab && !isIndented && row.getLeft() - row.getXAdjustment()
                                + leftOverlap < column.startMargin + safetyMargin) {
                            LOG.debug("New paragraph (outdent)");
                            newParagraph = true;
                        }
                    } else {
                        if (previousRow.getLeft() - previousRow.getXAdjustment()
                                + leftOverlap > column.endMargin + safetyMargin) {
                            LOG.debug("New paragraph (previous EOP)");
                            newParagraph = true;
                        } else if (column.hasTab && isIndented && row.getRight() - row.getXAdjustment()
                                - rightOverlap < column.startTab + safetyMargin) {
                            LOG.debug("New paragraph (indent)");
                            newParagraph = true;
                        } else if (column.hasTab && !isIndented && row.getRight() - row.getXAdjustment()
                                - rightOverlap > column.startMargin - safetyMargin) {
                            LOG.debug("New paragraph (outdent)");
                            newParagraph = true;
                        }
                    } // left-to-right?
                } // have previous row
            } // standalone paragraph?

            if (!rowForStandaloneParagraph)
                LOG.debug(row.toString());

            if (newParagraph) {
                // Emit buffered standalone rows first, so their paragraphs are created before
                // the paragraph that follows them.
                if (rowsForStandaloneParagraphs.size() > 0) {
                    for (RowOfShapes oneRow : rowsForStandaloneParagraphs) {
                        LOG.debug("Standalone paragraph");
                        LOG.debug("Standalone row: left(" + oneRow.getLeft() + "), top(" + oneRow.getTop()
                                + "), right(" + oneRow.getRight() + "), bottom(" + oneRow.getBottom() + ")");
                        Paragraph standaloneParagraph = sourceImage.newParagraph();
                        standaloneParagraph.getRows().add(oneRow);
                    }
                    rowsForStandaloneParagraphs.clear();
                }
                paragraph = sourceImage.newParagraph();
            }
            //LOG.debug("Row: left(" + row.getLeft() + "), right(" + row.getRight() + "), width(" + (row.getRight() - row.getLeft() + 1) + ")");

            // paragraph is non-null here: the first non-standalone row of a column always has
            // previousRow == null and therefore sets newParagraph above.
            if (!rowForStandaloneParagraph) {
                paragraph.getRows().add(row);
                previousRow = row;
            }
        } // next row in column

        // Emit any standalone rows left over at the end of the column.
        if (rowsForStandaloneParagraphs.size() > 0) {
            for (RowOfShapes oneRow : rowsForStandaloneParagraphs) {
                LOG.debug("Standalone paragraph");
                LOG.debug("Standalone row: left(" + oneRow.getLeft() + "), top(" + oneRow.getTop() + "), right("
                        + oneRow.getRight() + "), bottom(" + oneRow.getBottom() + ")");
                Paragraph standaloneParagraph = sourceImage.newParagraph();
                standaloneParagraph.getRows().add(oneRow);
            }
            rowsForStandaloneParagraphs.clear();
        }
    } // next column
}
From source file: com.joliciel.csvLearner.CSVLearner.java
private void doCommandEvaluate() throws IOException { if (resultFilePath == null) throw new RuntimeException("Missing argument: resultFile"); if (featureDir == null) throw new RuntimeException("Missing argument: featureDir"); if (testIdFilePath != null) { if (crossValidation) throw new RuntimeException("Cannot combine testIdFile with cross validation"); if (testSegment >= 0) { throw new RuntimeException("Cannot combine testIdFile with test segment"); }/* w ww . j a v a2 s. com*/ } if (!crossValidation && testIdFilePath == null) { if (testSegment < 0) throw new RuntimeException("Missing argument: testSegment"); if (testSegment > 9) throw new RuntimeException("testSegment must be an integer between 0 and 9"); } if (outDirPath == null) throw new RuntimeException("Missing argument: outDir"); LOG.info("Generating event list from CSV files..."); CSVEventListReader reader = this.getReader(TrainingSetType.TEST_SEGMENT, false); GenericEvents events = reader.getEvents(); File outDir = new File(outDirPath); outDir.mkdirs(); String fileBase = this.featureDir.replace('/', '_'); fileBase = fileBase.replace(':', '_'); fileBase = fileBase + "_cutoff" + cutoff; if (generateEventFile) { File eventFile = new File(outDir, fileBase + "_events.txt"); this.generateEventFile(eventFile, events); } File fscoreFile = new File(outDir, fileBase + "_fscores.csv"); Writer fscoreFileWriter = new BufferedWriter( new OutputStreamWriter(new FileOutputStream(fscoreFile, false), "UTF8")); File outcomeFile = new File(outDir, fileBase + "_outcomes.csv"); Writer outcomeFileWriter = new BufferedWriter( new OutputStreamWriter(new FileOutputStream(outcomeFile, false), "UTF8")); try { if (!crossValidation) { MaxentModel maxentModel = this.train(events, null); this.evaluate(maxentModel, events, fscoreFileWriter, outcomeFileWriter); } else { DescriptiveStatistics accuracyStats = new DescriptiveStatistics(); Map<String, DescriptiveStatistics[]> outcomeFscoreStats = new TreeMap<String, 
DescriptiveStatistics[]>(); for (int segment = 0; segment <= 9; segment++) { outcomeFileWriter.write("Run " + segment + ",\n"); fscoreFileWriter.write("Run " + segment + ",\n"); if (balanceOutcomes) { for (String outcome : reader.getOutcomes()) { int i = 0; for (GenericEvent event : events) { if (event.getOutcome().equals(outcome)) { boolean test = i % 10 == segment; event.setTest(test); i++; } } } } else { int i = 0; for (GenericEvent event : events) { boolean test = i % 10 == segment; event.setTest(test); i++; } } MaxentModel maxentModel = this.train(events, null); FScoreCalculator<String> fscoreCalculator = this.evaluate(maxentModel, events, fscoreFileWriter, outcomeFileWriter); accuracyStats.addValue(fscoreCalculator.getTotalFScore()); for (String outcome : fscoreCalculator.getOutcomeSet()) { DescriptiveStatistics[] stats = outcomeFscoreStats.get(outcome); if (stats == null) { stats = new DescriptiveStatistics[3]; stats[0] = new DescriptiveStatistics(); stats[1] = new DescriptiveStatistics(); stats[2] = new DescriptiveStatistics(); outcomeFscoreStats.put(outcome, stats); } stats[0].addValue(fscoreCalculator.getPrecision(outcome)); stats[1].addValue(fscoreCalculator.getRecall(outcome)); stats[2].addValue(fscoreCalculator.getFScore(outcome)); } // next outcome outcomeFileWriter.write("\n"); } // next segment fscoreFileWriter.write( "outcome,precision avg., precision dev., recall avg., recall dev., f-score avg., f-score dev.,\n"); for (String outcome : outcomeFscoreStats.keySet()) { DescriptiveStatistics[] stats = outcomeFscoreStats.get(outcome); fscoreFileWriter .write(CSVFormatter.format(outcome) + "," + CSVFormatter.format(stats[0].getMean()) + "," + CSVFormatter.format(stats[0].getStandardDeviation()) + "," + CSVFormatter.format(stats[1].getMean()) + "," + CSVFormatter.format(stats[1].getStandardDeviation()) + "," + CSVFormatter.format(stats[2].getMean()) + "," + CSVFormatter.format(stats[2].getStandardDeviation()) + "," + "\n"); } 
fscoreFileWriter.write("TOTAL,,,,," + CSVFormatter.format(accuracyStats.getMean()) + "," + CSVFormatter.format(accuracyStats.getStandardDeviation()) + ",\n"); LOG.info("Accuracy mean: " + accuracyStats.getMean()); LOG.info("Accuracy std dev: " + accuracyStats.getStandardDeviation()); } } finally { fscoreFileWriter.flush(); fscoreFileWriter.close(); outcomeFileWriter.flush(); outcomeFileWriter.close(); } LOG.info("#### Complete ####"); }
From source file: net.sf.mzmine.modules.peaklistmethods.dataanalysis.heatmaps.HeatMapTask.java
private double[][] groupingDataset(UserParameter selectedParameter, String referenceGroup) { // Collect all data files Vector<RawDataFile> allDataFiles = new Vector<RawDataFile>(); DescriptiveStatistics meanControlStats = new DescriptiveStatistics(); DescriptiveStatistics meanGroupStats = new DescriptiveStatistics(); allDataFiles.addAll(Arrays.asList(peakList.getRawDataFiles())); // Determine the reference group and non reference group (the rest of // the samples) for raw data files List<RawDataFile> referenceDataFiles = new ArrayList<RawDataFile>(); List<RawDataFile> nonReferenceDataFiles = new ArrayList<RawDataFile>(); List<String> groups = new ArrayList<String>(); MZmineProject project = MZmineCore.getCurrentProject(); for (RawDataFile rawDataFile : allDataFiles) { Object paramValue = project.getParameterValue(selectedParameter, rawDataFile); if (!groups.contains(String.valueOf(paramValue))) { groups.add(String.valueOf(paramValue)); }//w w w. java 2 s .co m if (String.valueOf(paramValue).equals(referenceGroup)) { referenceDataFiles.add(rawDataFile); } else { nonReferenceDataFiles.add(rawDataFile); } } int numRows = 0; for (int row = 0; row < peakList.getNumberOfRows(); row++) { if (!onlyIdentified || (onlyIdentified && peakList.getRow(row).getPeakIdentities().length > 0)) { numRows++; } } // Create a new aligned peak list with all the samples if the reference // group has to be shown or with only // the non reference group if not. 
double[][] dataMatrix = new double[groups.size() - 1][numRows]; pValueMatrix = new String[groups.size() - 1][numRows]; // data files that should be in the heat map List<RawDataFile> shownDataFiles = nonReferenceDataFiles; for (int row = 0, rowIndex = 0; row < peakList.getNumberOfRows(); row++) { PeakListRow rowPeak = peakList.getRow(row); if (!onlyIdentified || (onlyIdentified && rowPeak.getPeakIdentities().length > 0)) { // Average area or height of the reference group meanControlStats.clear(); for (int column = 0; column < referenceDataFiles.size(); column++) { if (rowPeak.getPeak(referenceDataFiles.get(column)) != null) { if (area) { meanControlStats.addValue(rowPeak.getPeak(referenceDataFiles.get(column)).getArea()); } else { meanControlStats.addValue(rowPeak.getPeak(referenceDataFiles.get(column)).getHeight()); } } } // Divide the area or height of each peak by the average of the // area or height of the reference peaks in each row int columnIndex = 0; for (int column = 0; column < groups.size(); column++) { String group = groups.get(column); meanGroupStats.clear(); if (!group.equals(referenceGroup)) { for (int dataColumn = 0; dataColumn < shownDataFiles.size(); dataColumn++) { Object paramValue = project.getParameterValue(selectedParameter, shownDataFiles.get(dataColumn)); if (rowPeak.getPeak(shownDataFiles.get(dataColumn)) != null && String.valueOf(paramValue).equals(group)) { Feature peak = rowPeak.getPeak(shownDataFiles.get(dataColumn)); if (!Double.isInfinite(peak.getArea()) && !Double.isNaN(peak.getArea())) { if (area) { meanGroupStats.addValue(peak.getArea()); } else { meanGroupStats.addValue(peak.getHeight()); } } } } double value = meanGroupStats.getMean() / meanControlStats.getMean(); if (meanGroupStats.getN() > 1 && meanControlStats.getN() > 1) { pValueMatrix[columnIndex][rowIndex] = this.getPvalue(meanGroupStats, meanControlStats); } else { pValueMatrix[columnIndex][rowIndex] = ""; } if (log) { value = Math.log(value); } 
dataMatrix[columnIndex++][rowIndex] = value; } } rowIndex++; } } // Scale the data dividing the peak area/height by the standard // deviation of each column if (scale) { scale(dataMatrix); } // Create two arrays: row and column names rowNames = new String[dataMatrix[0].length]; colNames = new String[groups.size() - 1]; int columnIndex = 0; for (String group : groups) { if (!group.equals(referenceGroup)) { colNames[columnIndex++] = group; } } for (int row = 0, rowIndex = 0; row < peakList.getNumberOfRows(); row++) { if (!onlyIdentified || (onlyIdentified && peakList.getRow(row).getPeakIdentities().length > 0)) { if (peakList.getRow(row).getPeakIdentities() != null && peakList.getRow(row).getPeakIdentities().length > 0) { rowNames[rowIndex++] = peakList.getRow(row).getPreferredPeakIdentity().getName(); } else { rowNames[rowIndex++] = "Unknown"; } } } return dataMatrix; }
From source file:datafu.hourglass.jobs.StagedOutputJob.java
/** * Writes Hadoop counters and other task statistics to a file in the file system. * /*from w w w. j ava2 s .co m*/ * @param fs * @throws IOException */ private void writeCounters(final FileSystem fs) throws IOException { final Path actualOutputPath = FileOutputFormat.getOutputPath(this); SimpleDateFormat timestampFormat = new SimpleDateFormat("yyyyMMddHHmmss"); String suffix = timestampFormat.format(new Date()); if (_countersParentPath != null) { if (!fs.exists(_countersParentPath)) { _log.info("Creating counter parent path " + _countersParentPath); fs.mkdirs(_countersParentPath, FsPermission.valueOf("-rwxrwxr-x")); } // make the name as unique as possible in this case because this may be a directory // where other counter files will be dropped _countersPath = new Path(_countersParentPath, ".counters." + suffix); } else { _countersPath = new Path(actualOutputPath, ".counters." + suffix); } _log.info(String.format("Writing counters to %s", _countersPath)); FSDataOutputStream counterStream = fs.create(_countersPath); BufferedOutputStream buffer = new BufferedOutputStream(counterStream, 256 * 1024); OutputStreamWriter writer = new OutputStreamWriter(buffer); for (String groupName : getCounters().getGroupNames()) { for (Counter counter : getCounters().getGroup(groupName)) { writeAndLog(writer, String.format("%s=%d", counter.getName(), counter.getValue())); } } JobID jobID = this.getJobID(); org.apache.hadoop.mapred.JobID oldJobId = new org.apache.hadoop.mapred.JobID(jobID.getJtIdentifier(), jobID.getId()); long minStart = Long.MAX_VALUE; long maxFinish = 0; long setupStart = Long.MAX_VALUE; long cleanupFinish = 0; DescriptiveStatistics mapStats = new DescriptiveStatistics(); DescriptiveStatistics reduceStats = new DescriptiveStatistics(); boolean success = true; JobClient jobClient = new JobClient(this.conf); Map<String, String> taskIdToType = new HashMap<String, String>(); TaskReport[] setupReports = jobClient.getSetupTaskReports(oldJobId); if (setupReports.length 
> 0) { _log.info("Processing setup reports"); for (TaskReport report : jobClient.getSetupTaskReports(oldJobId)) { taskIdToType.put(report.getTaskID().toString(), "SETUP"); if (report.getStartTime() == 0) { _log.warn("Skipping report with zero start time"); continue; } setupStart = Math.min(setupStart, report.getStartTime()); } } else { _log.error("No setup reports"); } TaskReport[] mapReports = jobClient.getMapTaskReports(oldJobId); if (mapReports.length > 0) { _log.info("Processing map reports"); for (TaskReport report : mapReports) { taskIdToType.put(report.getTaskID().toString(), "MAP"); if (report.getFinishTime() == 0 || report.getStartTime() == 0) { _log.warn("Skipping report with zero start or finish time"); continue; } minStart = Math.min(minStart, report.getStartTime()); mapStats.addValue(report.getFinishTime() - report.getStartTime()); } } else { _log.error("No map reports"); } TaskReport[] reduceReports = jobClient.getReduceTaskReports(oldJobId); if (reduceReports.length > 0) { _log.info("Processing reduce reports"); for (TaskReport report : reduceReports) { taskIdToType.put(report.getTaskID().toString(), "REDUCE"); if (report.getFinishTime() == 0 || report.getStartTime() == 0) { _log.warn("Skipping report with zero start or finish time"); continue; } maxFinish = Math.max(maxFinish, report.getFinishTime()); reduceStats.addValue(report.getFinishTime() - report.getStartTime()); } } else { _log.error("No reduce reports"); } TaskReport[] cleanupReports = jobClient.getCleanupTaskReports(oldJobId); if (cleanupReports.length > 0) { _log.info("Processing cleanup reports"); for (TaskReport report : cleanupReports) { taskIdToType.put(report.getTaskID().toString(), "CLEANUP"); if (report.getFinishTime() == 0) { _log.warn("Skipping report with finish time of zero"); continue; } cleanupFinish = Math.max(cleanupFinish, report.getFinishTime()); } } else { _log.error("No cleanup reports"); } if (minStart == Long.MAX_VALUE) { _log.error("Could not determine map-reduce 
start time"); success = false; } if (maxFinish == 0) { _log.error("Could not determine map-reduce finish time"); success = false; } if (setupStart == Long.MAX_VALUE) { _log.error("Could not determine setup start time"); success = false; } if (cleanupFinish == 0) { _log.error("Could not determine cleanup finish time"); success = false; } // Collect statistics on successful/failed/killed task attempts, categorized by setup/map/reduce/cleanup. // Unfortunately the job client doesn't have an easier way to get these statistics. Map<String, Integer> attemptStats = new HashMap<String, Integer>(); _log.info("Processing task attempts"); for (TaskCompletionEvent event : getTaskCompletionEvents(jobClient, oldJobId)) { String type = taskIdToType.get(event.getTaskAttemptId().getTaskID().toString()); String status = event.getTaskStatus().toString(); String key = String.format("%s_%s_ATTEMPTS", status, type); if (!attemptStats.containsKey(key)) { attemptStats.put(key, 0); } attemptStats.put(key, attemptStats.get(key) + 1); } if (success) { writeAndLog(writer, String.format("SETUP_START_TIME_MS=%d", setupStart)); writeAndLog(writer, String.format("CLEANUP_FINISH_TIME_MS=%d", cleanupFinish)); writeAndLog(writer, String.format("COMPLETE_WALL_CLOCK_TIME_MS=%d", cleanupFinish - setupStart)); writeAndLog(writer, String.format("MAP_REDUCE_START_TIME_MS=%d", minStart)); writeAndLog(writer, String.format("MAP_REDUCE_FINISH_TIME_MS=%d", maxFinish)); writeAndLog(writer, String.format("MAP_REDUCE_WALL_CLOCK_TIME_MS=%d", maxFinish - minStart)); writeAndLog(writer, String.format("MAP_TOTAL_TASKS=%d", (long) mapStats.getN())); writeAndLog(writer, String.format("MAP_MAX_TIME_MS=%d", (long) mapStats.getMax())); writeAndLog(writer, String.format("MAP_MIN_TIME_MS=%d", (long) mapStats.getMin())); writeAndLog(writer, String.format("MAP_AVG_TIME_MS=%d", (long) mapStats.getMean())); writeAndLog(writer, String.format("MAP_STD_TIME_MS=%d", (long) mapStats.getStandardDeviation())); writeAndLog(writer, 
String.format("MAP_SUM_TIME_MS=%d", (long) mapStats.getSum())); writeAndLog(writer, String.format("REDUCE_TOTAL_TASKS=%d", (long) reduceStats.getN())); writeAndLog(writer, String.format("REDUCE_MAX_TIME_MS=%d", (long) reduceStats.getMax())); writeAndLog(writer, String.format("REDUCE_MIN_TIME_MS=%d", (long) reduceStats.getMin())); writeAndLog(writer, String.format("REDUCE_AVG_TIME_MS=%d", (long) reduceStats.getMean())); writeAndLog(writer, String.format("REDUCE_STD_TIME_MS=%d", (long) reduceStats.getStandardDeviation())); writeAndLog(writer, String.format("REDUCE_SUM_TIME_MS=%d", (long) reduceStats.getSum())); writeAndLog(writer, String.format("MAP_REDUCE_SUM_TIME_MS=%d", (long) mapStats.getSum() + (long) reduceStats.getSum())); for (Map.Entry<String, Integer> attemptStat : attemptStats.entrySet()) { writeAndLog(writer, String.format("%s=%d", attemptStat.getKey(), attemptStat.getValue())); } } writer.close(); buffer.close(); counterStream.close(); }
From source file:de.tudarmstadt.ukp.experiments.argumentation.sequence.feature.coreference.CoreferenceFeatures.java
@Override protected List<Feature> extract(JCas jCas, Sentence sentence, String sentencePrefix) throws TextClassificationException { List<List<CoreferenceLink>> coreferenceChains = extractCoreferenceChains(jCas); FrequencyDistribution<String> featuresAcrossAllChains = new FrequencyDistribution<>(); DescriptiveStatistics chainLength = new DescriptiveStatistics(); DescriptiveStatistics distanceToPreviousSentence = new DescriptiveStatistics(); DescriptiveStatistics distanceToNextSentence = new DescriptiveStatistics(); DescriptiveStatistics interSentencesCorLinks = new DescriptiveStatistics(); for (List<CoreferenceLink> chain : coreferenceChains) { SortedMap<Integer, List<CoreferenceLink>> sentencesAndLinks = extractSentencesAndLinksFromChain(chain, jCas);// www . j av a 2 s . co m int currentSentencePos = getCurrentSentencePos(jCas, sentence); log.debug(sentencesAndLinks.keySet() + ", current " + currentSentencePos); // is the sentence in chain that spans more sentences? boolean partOfChain = sentencesAndLinks.containsKey(currentSentencePos) && sentencesAndLinks.size() > 1; // is part of a chain? if (partOfChain) { log.debug(chainToString(chain)); featuresAcrossAllChains.inc(FN_PART_OF_CHAIN); // starts the chain? if (sentencesAndLinks.firstKey().equals(currentSentencePos)) { featuresAcrossAllChains.inc(FN_STARTS_THE_CHAIN); } else if (sentencesAndLinks.lastKey().equals(currentSentencePos)) { // ends the chain? featuresAcrossAllChains.inc(FN_ENDS_THE_CHAIN); } else { // in the middle of chain? featuresAcrossAllChains.inc(FN_IN_THE_MIDDLE_OF_CHAIN); } // length of the chain chainLength.addValue(sentencesAndLinks.size()); List<CoreferenceLink> currentSentenceLinks = sentencesAndLinks.get(currentSentencePos); CoreferenceLink currentSentenceFirstLink = currentSentenceLinks.get(0); CoreferenceLink currentSentenceLastLink = currentSentenceLinks.get(currentSentenceLinks.size() - 1); // transition to the previous link, i.e. 
NOMINAL -> PRONOMINAL if (!sentencesAndLinks.firstKey().equals(currentSentencePos)) { // find the previous sentence List<CoreferenceLink> previousSentenceLinks = null; int prevSentNo = currentSentencePos; while (previousSentenceLinks == null && prevSentNo >= 0) { prevSentNo--; if (sentencesAndLinks.containsKey(prevSentNo)) { previousSentenceLinks = sentencesAndLinks.get(prevSentNo); } } if (previousSentenceLinks == null) { throw new IllegalStateException("Oops :))"); } // distance to previous sentence distanceToPreviousSentence.addValue(currentSentencePos - prevSentNo); // get the last link from the previous sentence CoreferenceLink prevSentenceLastLink = previousSentenceLinks .get(previousSentenceLinks.size() - 1); // add type type transition String prevSentenceLastLinkReferenceType = prevSentenceLastLink.getReferenceType(); String currentSentenceFirstLinkReferenceType = currentSentenceFirstLink.getReferenceType(); String transitionType = prevSentenceLastLinkReferenceType + GLUE + currentSentenceFirstLinkReferenceType; featuresAcrossAllChains.addSample(FN_TRANSITION_IN_TYPE_TYPE + transitionType, 1); // add token - type transition String glueCoreferenceCurrentSentence = glueCoreferenceLinkTokens(currentSentenceFirstLink); String typeToken = prevSentenceLastLinkReferenceType + GLUE + glueCoreferenceCurrentSentence; featuresAcrossAllChains.addSample(FN_TRANSITION_IN_TYPE_TOKEN + typeToken, 1); // add type - token transition String glueCoreferencePrevSentence = glueCoreferenceLinkTokens(prevSentenceLastLink); String tokenType = glueCoreferencePrevSentence + GLUE + currentSentenceFirstLinkReferenceType; featuresAcrossAllChains.addSample(FN_TRANSITION_IN_TOKEN_TYPE + tokenType, 1); // add token token transition String tokenToken = glueCoreferencePrevSentence + GLUE + glueCoreferenceCurrentSentence; featuresAcrossAllChains.addSample(FN_TRANSITION_IN_TOKEN_TOKEN + tokenToken, 1); // exact matching token-token reference? 
if (glueCoreferencePrevSentence.equals(glueCoreferenceCurrentSentence)) { featuresAcrossAllChains.addSample(FN_TRANSITION_IN_TOKEN_TOKEN_MATCH, 1); } } // transition to the previous link, i.e. NOMINAL -> PRONOMINAL if (!sentencesAndLinks.lastKey().equals(currentSentencePos)) { // find the previous sentence List<CoreferenceLink> nextSentenceLinks = null; int nextSentNo = currentSentencePos; while (nextSentenceLinks == null && nextSentNo <= sentencesAndLinks.lastKey()) { nextSentNo++; if (sentencesAndLinks.containsKey(nextSentNo)) { nextSentenceLinks = sentencesAndLinks.get(nextSentNo); } } if (nextSentenceLinks == null) { throw new IllegalStateException("Oops :))"); } // distance to next sentence distanceToNextSentence.addValue(nextSentNo - currentSentencePos); // get the last link from the previous sentence CoreferenceLink nextSentenceFirstLink = nextSentenceLinks.get(0); // add type type transition String currentSentenceLastLinkReferenceType = currentSentenceLastLink.getReferenceType(); String nextSentenceFirstLinkReferenceType = nextSentenceFirstLink.getReferenceType(); String transitionType = currentSentenceLastLinkReferenceType + GLUE + nextSentenceFirstLinkReferenceType; featuresAcrossAllChains.addSample(FN_TRANSITION_OUT_TYPE_TYPE + transitionType, 1); // add token - type transition String glueCoreferenceCurrentSent = glueCoreferenceLinkTokens(currentSentenceLastLink); String typeToken = glueCoreferenceCurrentSent + GLUE + nextSentenceFirstLinkReferenceType; featuresAcrossAllChains.addSample(FN_TRANSITION_OUT_TOKEN_TYPE + typeToken, 1); // add type - token transition String glueCoreferenceNextSent = glueCoreferenceLinkTokens(nextSentenceFirstLink); String tokenType = currentSentenceLastLinkReferenceType + GLUE + glueCoreferenceNextSent; featuresAcrossAllChains.addSample(FN_TRANSITION_OUT_TYPE_TOKEN + tokenType, 1); // add token token transition String tokenToken = glueCoreferenceCurrentSent + GLUE + glueCoreferenceNextSent; 
featuresAcrossAllChains.addSample(FN_TRANSITION_OUT_TOKEN_TOKEN + tokenToken, 1); // exact matching token-token reference? if (glueCoreferenceNextSent.equals(glueCoreferenceCurrentSent)) { featuresAcrossAllChains.addSample(FN_TRANSITION_OUT_TOKEN_TOKEN_MATCH, 1); } } } // number of inter-sentence coreference links if (sentencesAndLinks.containsKey(currentSentencePos)) { int coreferenceLinks = sentencesAndLinks.get(currentSentencePos).size(); interSentencesCorLinks.addValue(coreferenceLinks); } /* List<Integer> positions = positionsOfSentenceInCurrentChain(chain, sentence); // ok, we're in a chain if (!positions.isEmpty()) { log.debug(printChain(chain)); log.debug(sentence.getCoveredText()); log.debug(positions); Integer lastPosition = positions.get(positions.size() - 1); Integer firstPosition = positions.get(0); if (lastPosition == positions.size() - 1) { log.debug("Last sentence of chain"); } log.debug("-----"); } */ } List<Feature> result = new ArrayList<>(); log.debug(featuresAcrossAllChains); if (distanceToNextSentence.getN() > 0) { log.debug("Next:" + distanceToNextSentence); result.add(new Feature(sentencePrefix + FEATURE_NAME + FN_DIST_TO_NEXT_MIN, distanceToNextSentence.getMin())); result.add(new Feature(sentencePrefix + FEATURE_NAME + FN_DIST_TO_NEXT_MAX, distanceToNextSentence.getMax())); result.add(new Feature(sentencePrefix + FEATURE_NAME + FN_DIST_TO_NEXT_AVG, distanceToNextSentence.getMean())); } if (distanceToPreviousSentence.getN() > 0) { log.debug("Prev: " + distanceToPreviousSentence); result.add(new Feature(sentencePrefix + FEATURE_NAME + FN_DIST_TO_PREV_MIN, distanceToPreviousSentence.getMin())); result.add(new Feature(sentencePrefix + FEATURE_NAME + FN_DIST_TO_PREV_MAX, distanceToPreviousSentence.getMax())); result.add(new Feature(sentencePrefix + FEATURE_NAME + FN_DIST_TO_PREV_AVG, distanceToPreviousSentence.getMean())); } if (interSentencesCorLinks.getN() > 0) { result.add(new Feature(sentencePrefix + FEATURE_NAME + FN_INTER_SENT_COR_MIN, 
interSentencesCorLinks.getMin())); result.add(new Feature(sentencePrefix + FEATURE_NAME + FN_INTER_SENT_COR_MAX, interSentencesCorLinks.getMax())); result.add(new Feature(sentencePrefix + FEATURE_NAME + FN_INTER_SENT_COR_AVG, interSentencesCorLinks.getMean())); } log.debug("----"); for (String feat : featuresAcrossAllChains.getKeys()) { // binary result.add(new Feature(sentencePrefix + FEATURE_NAME + feat, 1)); } return result; }
From source file:org.a3badran.platform.logging.writer.MetricsWriter.java
private Map<String, Long> getAllMetrics() { Map<String, Long> metrics = new HashMap<String, Long>(); for (Entry<String, DescriptiveStatistics> entry : sampleMetrics.entrySet()) { // create a copy to reduce locking String name = entry.getKey(); DescriptiveStatistics stats = entry.getValue().copy(); metrics.put(name + ".sampleCount", (long) stats.getN()); metrics.put(name + ".max", (long) stats.getMax()); metrics.put(name + ".min", (long) stats.getMin()); metrics.put(name + ".avg", (long) stats.getMean()); metrics.put(name + ".50p", (long) stats.getPercentile(50)); metrics.put(name + ".90p", (long) stats.getPercentile(90)); metrics.put(name + ".99p", (long) stats.getPercentile(99)); }// w w w. j a v a 2 s .c om for (Entry<String, DescriptiveStatistics> cEntry : sampleCounterMetrics.entrySet()) { // create a copy to reduce locking String cName = cEntry.getKey(); DescriptiveStatistics cStats = cEntry.getValue().copy(); metrics.put(cName + ".max", (long) cStats.getMax()); metrics.put(cName + ".min", (long) cStats.getMin()); metrics.put(cName + ".avg", (long) cStats.getMean()); metrics.put(cName + ".50p", (long) cStats.getPercentile(50)); metrics.put(cName + ".90p", (long) cStats.getPercentile(90)); metrics.put(cName + ".99p", (long) cStats.getPercentile(99)); } for (Entry<String, AtomicLong> entry : scopeTotalMetrics.entrySet()) { metrics.put(entry.getKey(), entry.getValue().longValue()); } for (Entry<String, AtomicLong> entry : appTotalMetrics.entrySet()) { metrics.put(entry.getKey(), entry.getValue().longValue()); } return metrics; }
From source file:org.bresearch.websec.test.CommonsMathTest.java
public void test1() throws Exception { /* min, max, mean, geometric mean, n, sum, sum of squares, * standard deviation, variance, percentiles, skewness, kurtosis, median */ // Get a DescriptiveStatistics instance using factory method DescriptiveStatistics stats = new DescriptiveStatistics(); final double[] inputArray = { 4, 3, 3, 2 }; // Add the data from the array for (int i = 0; i < inputArray.length; i++) { stats.addValue(inputArray[i]);/*from w w w . j ava 2 s .com*/ } // Compute some statistics double mean = stats.getMean(); double std = stats.getStandardDeviation(); long n = stats.getN(); assertEquals("3.0", "" + mean); assertEquals("0.816496580927726", "" + std); }
From source file:org.bresearch.websec.test.CommonsMathTest.java
public void test2() { final String data2 = (new WordProcessor()).filterOnlyAlphaNumeric( " !!! Hello my name is a person. Hello how are you doing. hello, this is great. What do you think? "); final BotlistStringUtils utils = new BotlistStringUtils(); final List<String> a = utils.buildWordList(data2); assertEquals(27, a.size());/*from w w w .j av a 2 s . c o m*/ DescriptiveStatistics stats = new DescriptiveStatistics(); for (int i = 0; i < utils.mapReduceCount(a, -1).length; i++) { stats.addValue(utils.mapReduceCount(a, -1)[i]); } // Compute some statistics double mean = stats.getMean(); double std = stats.getStandardDeviation(); assertEquals("1.2666666666666666", "" + mean); assertEquals("0.5936168397046634", "" + std); long n = stats.getN(); assertEquals("15", "" + n); }
From source file:org.bresearch.websec.test.CommonsMathTest.java
public void test3() { final String data2 = (new WordProcessor()).filterOnlyAlphaNumeric( " !!! Hello my name is a person. Hello how are you doing. hello, this is great. What do you think? "); final BotlistStringUtils utils = new BotlistStringUtils(); final List<String> a = utils.buildWordList(data2); DescriptiveStatistics stats = new DescriptiveStatistics(); for (int i = 0; i < utils.mapReduceWordSize(a, -1).length; i++) { stats.addValue(utils.mapReduceWordSize(a, -1)[i]); }/*from www .j a va 2 s.c o m*/ // Compute some statistics assertEquals("3.6", "" + stats.getMean()); assertEquals("54.0", "" + stats.getSum()); }
From source file:org.bresearch.websec.test.CommonsMathTest.java
/**
 * Prints sum, mean, sample count, geometric mean and maximum of the
 * statistics returned by {@code DocumentWordStats.mapReduceStats()} for the
 * {@code ConstDoc.CONST_SM} document. Makes no assertions.
 */
public void test5() {
    final DescriptiveStatistics wordStats = new DocumentWordStats(ConstDoc.CONST_SM).mapReduceStats();

    final double sum = wordStats.getSum();
    System.out.println(sum);

    final double mean = wordStats.getMean();
    System.out.println(mean);

    final long sampleCount = wordStats.getN();
    System.out.println(sampleCount);

    final double geometricMean = wordStats.getGeometricMean();
    System.out.println(geometricMean);

    final double max = wordStats.getMax();
    System.out.println(max);
}