List of usage examples for org.apache.commons.math.stat.descriptive DescriptiveStatistics addValue
public void addValue(double v)
From source file:com.joliciel.jochre.graphics.SourceImageImpl.java
void calculateShapeStatistics() { if (!shapeStatisticsCalculated) { DescriptiveStatistics shapeWidthStats = new DescriptiveStatistics(); DescriptiveStatistics shapeHeightStats = new DescriptiveStatistics(); for (RowOfShapes row : this.getRows()) { for (Shape shape : row.getShapes()) { shapeWidthStats.addValue(shape.getWidth()); shapeHeightStats.addValue(shape.getHeight()); }//from ww w . j a v a2 s . c om } double minWidth = shapeWidthStats.getPercentile(50); double maxWidth = shapeWidthStats.getPercentile(80); double minHeight = shapeHeightStats.getPercentile(50); double maxHeight = shapeHeightStats.getPercentile(80); this.averageShapeWidth = shapeWidthStats.getPercentile(65); this.averageShapeHeight = shapeHeightStats.getPercentile(65); this.averageShapeWidthMargin = (maxWidth - minWidth) / 2.0; this.averageShapeHeightMargin = (maxHeight - minHeight) / 2.0; this.shapeStatisticsCalculated = true; } }
From source file:com.mozilla.socorro.hadoop.RawDumpSize.java
public int run(String[] args) throws Exception { if (args.length != 1) { return printUsage(); }//from w w w . j a v a 2 s .c o m int rc = -1; Job job = initJob(args); job.waitForCompletion(true); if (job.isSuccessful()) { rc = 0; FileSystem hdfs = null; DescriptiveStatistics rawStats = new DescriptiveStatistics(); long rawTotal = 0L; DescriptiveStatistics processedStats = new DescriptiveStatistics(); long processedTotal = 0L; try { hdfs = FileSystem.get(job.getConfiguration()); Pattern tabPattern = Pattern.compile("\t"); for (FileStatus status : hdfs.listStatus(FileOutputFormat.getOutputPath(job))) { if (!status.isDir()) { BufferedReader reader = null; try { reader = new BufferedReader(new InputStreamReader(hdfs.open(status.getPath()))); String line = null; while ((line = reader.readLine()) != null) { String[] splits = tabPattern.split(line); int byteSize = Integer.parseInt(splits[2]); if ("raw".equals(splits[1])) { rawStats.addValue(byteSize); rawTotal += byteSize; } else if ("processed".equals(splits[1])) { processedStats.addValue(byteSize); processedTotal += byteSize; } } } finally { if (reader != null) { reader.close(); } } } } } finally { if (hdfs != null) { hdfs.close(); } } System.out.println("===== " + job.getConfiguration().get(START_DATE) + " raw_data:dump ====="); System.out.println(String.format("Min: %.02f Max: %.02f Mean: %.02f", rawStats.getMin(), rawStats.getMax(), rawStats.getMean())); System.out.println(String.format("1st Quartile: %.02f 2nd Quartile: %.02f 3rd Quartile: %.02f", rawStats.getPercentile(25.0d), rawStats.getPercentile(50.0d), rawStats.getPercentile(75.0d))); System.out.println("Total Bytes: " + rawTotal); System.out.println("===== " + job.getConfiguration().get(START_DATE) + " processed_data:json ====="); System.out.println(String.format("Min: %.02f Max: %.02f Mean: %.02f", processedStats.getMin(), processedStats.getMax(), processedStats.getMean())); System.out.println(String.format("1st Quartile: %.02f 2nd Quartile: %.02f 3rd 
Quartile: %.02f", processedStats.getPercentile(25.0d), processedStats.getPercentile(50.0d), processedStats.getPercentile(75.0d))); System.out.println("Total Bytes: " + processedTotal); } return rc; }
From source file:com.joliciel.jochre.graphics.RowOfShapesImpl.java
/**
 * Computes the typical shape width and height for this row from all of its
 * shapes, unless that has already been done.
 * <p>
 * The "average" value is the median (50th percentile), and the margin is
 * half the distance between the 33rd and the 66th percentile.
 */
void calculateShapeStatistics() {
    if (shapeStatisticsCalculated) {
        return;
    }

    DescriptiveStatistics widths = new DescriptiveStatistics();
    DescriptiveStatistics heights = new DescriptiveStatistics();
    for (Shape shape : this.getShapes()) {
        widths.addValue(shape.getWidth());
        heights.addValue(shape.getHeight());
    }

    // lower/upper bounds of the band considered "typical"
    double lowerWidth = widths.getPercentile(33);
    double upperWidth = widths.getPercentile(66);
    double lowerHeight = heights.getPercentile(33);
    double upperHeight = heights.getPercentile(66);

    this.averageShapeWidth = widths.getPercentile(50);
    this.averageShapeHeight = heights.getPercentile(50);
    this.averageShapeWidthMargin = (upperWidth - lowerWidth) / 2.0;
    this.averageShapeHeightMargin = (upperHeight - lowerHeight) / 2.0;

    this.shapeStatisticsCalculated = true;
}
From source file:com.userweave.module.methoden.iconunderstandability.service.ComputeIconTestStatisticsImpl.java
/** * return regression, if regression can be computed * @return//from ww w . j ava2s . co m */ private OverallStatistics computeOverallStatistics() { SimpleRegression regression = new SimpleRegression(); DescriptiveStatistics overallStatistics = DescriptiveStatistics.newInstance(); Map<Integer, DescriptiveStatistics> iconCount2Statistics = new HashMap<Integer, DescriptiveStatistics>(); List<Object[]> executionTimesIconCount = testResultDao.findAllValidExecutionTimesAndIconCount(); if (!executionTimesIconCount.isEmpty()) { // check, if there is variation in x (only one x value for all observation yield NaN!) boolean canComputeRegression = false; int iconCountForFirstResult = ((Long) executionTimesIconCount.get(0)[1]).intValue(); for (Object[] executionTimeIconCount : executionTimesIconCount) { int iconCount = ((Long) executionTimeIconCount[1]).intValue(); if (iconCount != iconCountForFirstResult) { canComputeRegression = true; } double executionTime = (Long) executionTimeIconCount[0]; if (isValid(executionTime)) { regression.addData(iconCount, executionTime); overallStatistics.addValue(executionTime); getStatisticsForIconCount(iconCount2Statistics, iconCount).addValue(executionTime); } } if (canComputeRegression) { return new OverallStatistics(regression, overallStatistics.getMean(), iconCount2Statistics); } else { return new OverallStatistics(null, overallStatistics.getMean(), iconCount2Statistics); } } else { return null; } }
From source file:com.joliciel.jochre.graphics.SourceImageImpl.java
/**
 * Finds the vertical white areas separating columns of text and caches them
 * in {@code columnSeparators}; later calls return the cached list.
 * <p>
 * Steps, in order:
 * <ol>
 * <li>Build a per-pixel-row fill count (sum of shape widths covering that
 * row, after correcting each shape's top for the mean horizontal slope).</li>
 * <li>Find runs of "empty" pixel rows and use them to pick
 * {@code mainTextTop} and {@code mainTextBottom}.</li>
 * <li>Build a per-pixel-column fill count from the shapes whose adjusted top
 * lies between those two bounds.</li>
 * <li>Find runs of "empty" pixel columns at least one mean x-height wide;
 * each becomes a column break.</li>
 * <li>Narrow every column break to its widest sub-range at the lowest fill
 * threshold that yields one, and wrap it in a {@code WhiteArea}.</li>
 * </ol>
 *
 * @return the column separators found (computed on the first call only)
 */
public List<Rectangle> findColumnSeparators() {
    if (columnSeparators == null) {
        LOG.debug("############ findColumnSeparators ##############");
        double slope = this.getMeanHorizontalSlope();
        double imageMidPointX = (double) this.getWidth() / 2.0;
        int[] horizontalCounts = new int[this.getHeight()];
        DescriptiveStatistics rowXHeightStats = new DescriptiveStatistics();
        // first get the fill factor for each horizontal row in the image
        for (RowOfShapes row : this.getRows()) {
            rowXHeightStats.addValue(row.getXHeight());
            for (Shape shape : row.getShapes()) {
                double shapeMidPointX = (double) (shape.getLeft() + shape.getRight()) / 2.0;
                // shift the shape's top by the slope times its horizontal distance from the image centre
                int slopeAdjustedTop = (int) Math
                        .round(shape.getTop() + (slope * (shapeMidPointX - imageMidPointX)));
                if (slopeAdjustedTop >= 0 && slopeAdjustedTop < this.getHeight()) {
                    for (int i = 0; i < shape.getHeight(); i++) {
                        if (slopeAdjustedTop + i < horizontalCounts.length)
                            horizontalCounts[slopeAdjustedTop + i] += shape.getWidth();
                    }
                }
            }
        }
        DescriptiveStatistics horizontalStats = new DescriptiveStatistics();
        DescriptiveStatistics horizontalStatsNonEmpty = new DescriptiveStatistics();
        for (int i = 0; i < this.getHeight(); i++) {
            // LOG.trace("Row " + i + ": " + horizontalCounts[i]);
            horizontalStats.addValue(horizontalCounts[i]);
            if (horizontalCounts[i] > 0)
                horizontalStatsNonEmpty.addValue(horizontalCounts[i]);
        }
        LOG.debug("Mean horizontal count: " + horizontalStats.getMean());
        LOG.debug("Median horizontal count: " + horizontalStats.getPercentile(50));
        LOG.debug("25 percentile horizontal count: " + horizontalStats.getPercentile(25));
        LOG.debug("Mean horizontal count (non empty): " + horizontalStatsNonEmpty.getMean());
        LOG.debug("Median horizontal count (non empty): " + horizontalStatsNonEmpty.getPercentile(50));
        LOG.debug("25 percentile horizontal count (non empty): " + horizontalStatsNonEmpty.getPercentile(25));
        LOG.debug("10 percentile horizontal count (non empty): " + horizontalStatsNonEmpty.getPercentile(10));
        // a pixel row counts as "empty" when its fill is at most 1/8 of the mean non-empty fill
        double maxEmptyRowCount = horizontalStatsNonEmpty.getMean() / 8.0;
        LOG.debug("maxEmptyRowCount: " + maxEmptyRowCount);
        // collect the runs of empty pixel rows as {start, end} pairs
        boolean inEmptyHorizontalRange = false;
        List<int[]> emptyHorizontalRanges = new ArrayList<int[]>();
        int emptyHorizontalRangeStart = 0;
        for (int i = 0; i < this.getHeight(); i++) {
            if (!inEmptyHorizontalRange && horizontalCounts[i] <= maxEmptyRowCount) {
                inEmptyHorizontalRange = true;
                emptyHorizontalRangeStart = i;
            } else if (inEmptyHorizontalRange && horizontalCounts[i] > maxEmptyRowCount) {
                inEmptyHorizontalRange = false;
                emptyHorizontalRanges.add(new int[] { emptyHorizontalRangeStart, i });
            }
        }
        // close a run that is still open at the bottom edge of the image
        if (inEmptyHorizontalRange) {
            emptyHorizontalRanges.add(new int[] { emptyHorizontalRangeStart, this.getHeight() - 1 });
        }
        LOG.debug("rowXHeight mean: " + rowXHeightStats.getMean());
        LOG.debug("rowXHeight median: " + rowXHeightStats.getPercentile(50));
        // an empty run taller than two mean x-heights is a "big" break, anything else a "small" one
        double minHorizontalBreak = rowXHeightStats.getMean() * 2.0;
        LOG.debug("minHorizontalBreak: " + minHorizontalBreak);
        int smallBreakCount = 0;
        int mainTextTop = 0;
        int bigBreakCount = 0;
        // scanning top-down: mainTextTop moves to the end of a big break only while
        // fewer than two big and fewer than two small breaks have been seen
        for (int[] emptyHorizontalRange : emptyHorizontalRanges) {
            int height = emptyHorizontalRange[1] - emptyHorizontalRange[0];
            LOG.trace("empty range: " + emptyHorizontalRange[0] + ", " + emptyHorizontalRange[1] + " = "
                    + height);
            if (bigBreakCount < 2 && smallBreakCount < 2 && height > minHorizontalBreak) {
                mainTextTop = emptyHorizontalRange[1];
                bigBreakCount++;
            }
            if (height <= minHorizontalBreak)
                smallBreakCount++;
        }
        LOG.debug("mainTextTop:" + mainTextTop);
        // lift mainTextTop upwards by max an x-height or till we reach a zero row
        int minTop = mainTextTop - (int) (rowXHeightStats.getMean() / 2.0);
        if (minTop < 0)
            minTop = 0;
        for (int i = mainTextTop; i > minTop; i--) {
            mainTextTop = i;
            if (horizontalCounts[i] == 0) {
                break;
            }
        }
        LOG.debug("mainTextTop (adjusted):" + mainTextTop);
        smallBreakCount = 0;
        bigBreakCount = 0;
        int mainTextBottom = this.getHeight();
        // scanning bottom-up: stop once more than two breaks (big + small) have been seen
        for (int i = emptyHorizontalRanges.size() - 1; i >= 0; i--) {
            int[] emptyHorizontalRange = emptyHorizontalRanges.get(i);
            int height = emptyHorizontalRange[1] - emptyHorizontalRange[0];
            LOG.trace("emptyHorizontalRange: " + emptyHorizontalRange[0] + ", height: " + height
                    + ", bigBreakCount: " + bigBreakCount + ", smallBreakCount: " + smallBreakCount);
            if ((bigBreakCount + smallBreakCount) <= 2 && height > minHorizontalBreak) {
                mainTextBottom = emptyHorizontalRange[0];
                LOG.trace("Set mainTextBottom to " + mainTextBottom);
                bigBreakCount++;
            }
            if (height <= minHorizontalBreak)
                smallBreakCount++;
            if ((bigBreakCount + smallBreakCount) > 2)
                break;
        }
        LOG.debug("mainTextBottom:" + mainTextBottom);
        // lower mainTextBottom downwards by max an x-height or till we reach a zero row
        int maxBottom = mainTextBottom + (int) (rowXHeightStats.getMean() / 2.0);
        if (maxBottom > this.getHeight())
            maxBottom = this.getHeight();
        for (int i = mainTextBottom; i < maxBottom; i++) {
            mainTextBottom = i;
            if (horizontalCounts[i] == 0) {
                break;
            }
        }
        LOG.debug("mainTextBottom (adjusted):" + mainTextBottom);
        int[] verticalCounts = new int[this.getWidth()];
        // now get the fill factor for each vertical (pixel) column, using only shapes
        // whose slope-adjusted top lies between mainTextTop and mainTextBottom
        for (RowOfShapes row : this.getRows()) {
            for (Shape shape : row.getShapes()) {
                int slopeAdjustedLeft = (int) Math.round(shape.getLeft() - row.getXAdjustment());
                double shapeMidPointX = (double) (shape.getLeft() + shape.getRight()) / 2.0;
                int slopeAdjustedTop = (int) Math
                        .round(shape.getTop() + (slope * (shapeMidPointX - imageMidPointX)));
                if (slopeAdjustedTop >= mainTextTop && slopeAdjustedTop <= mainTextBottom
                        && slopeAdjustedLeft >= 0 && slopeAdjustedLeft < this.getWidth()) {
                    for (int i = 0; i < shape.getWidth(); i++) {
                        if (slopeAdjustedLeft + i < this.getWidth())
                            verticalCounts[slopeAdjustedLeft + i] += shape.getHeight();
                    }
                }
            }
        }
        DescriptiveStatistics verticalStats = new DescriptiveStatistics();
        DescriptiveStatistics verticalStatsNonEmpty = new DescriptiveStatistics();
        for (int i = 0; i < this.getWidth(); i++) {
            // LOG.trace("Column " + i + ": " + verticalCounts[i]);
            verticalStats.addValue(verticalCounts[i]);
            if (verticalCounts[i] > 0)
                verticalStatsNonEmpty.addValue(verticalCounts[i]);
        }
        LOG.debug("Mean vertical count: " + verticalStats.getMean());
        LOG.debug("Median vertical count: " + verticalStats.getPercentile(50));
        LOG.debug("25 percentile vertical count: " + verticalStats.getPercentile(25));
        LOG.debug("Mean vertical count (non empty): " + verticalStatsNonEmpty.getMean());
        LOG.debug("Median vertical count (non empty): " + verticalStatsNonEmpty.getPercentile(50));
        LOG.debug("25 percentile vertical count (non empty): " + verticalStatsNonEmpty.getPercentile(25));
        LOG.debug("10 percentile vertical count (non empty): " + verticalStatsNonEmpty.getPercentile(10));
        LOG.debug("1 percentile vertical count (non empty): " + verticalStatsNonEmpty.getPercentile(1));
        // a pixel column counts as "empty" when its fill is at most the 1st percentile of non-empty fills
        // double maxEmptyColumnCount = verticalStatsNonEmpty.getMean() / 8.0;
        double maxEmptyColumnCount = verticalStatsNonEmpty.getPercentile(1);
        LOG.debug("maxEmptyColumnCount: " + maxEmptyColumnCount);
        // collect the runs of empty pixel columns as {start, end} pairs
        boolean inEmptyVerticalRange = false;
        List<int[]> emptyVerticalRanges = new ArrayList<int[]>();
        int emptyVerticalRangeStart = 0;
        for (int i = 0; i < this.getWidth(); i++) {
            if (!inEmptyVerticalRange && verticalCounts[i] <= maxEmptyColumnCount) {
                inEmptyVerticalRange = true;
                emptyVerticalRangeStart = i;
            } else if (inEmptyVerticalRange && verticalCounts[i] > maxEmptyColumnCount) {
                inEmptyVerticalRange = false;
                emptyVerticalRanges.add(new int[] { emptyVerticalRangeStart, i });
            }
        }
        // close a run that is still open at the right edge of the image
        if (inEmptyVerticalRange) {
            emptyVerticalRanges.add(new int[] { emptyVerticalRangeStart, this.getWidth() - 1 });
        }
        LOG.debug("rowXHeight mean: " + rowXHeightStats.getMean());
        LOG.debug("rowXHeight median: " + rowXHeightStats.getPercentile(50));
        // an empty run must be at least one mean x-height wide to count as a column break
        double minVerticalBreak = rowXHeightStats.getMean() * 1.0;
        LOG.debug("minVerticalBreak: " + minVerticalBreak);
        List<int[]> columnBreaks = new ArrayList<int[]>();
        for (int[] emptyVerticalRange : emptyVerticalRanges) {
            int width = emptyVerticalRange[1] - emptyVerticalRange[0];
            LOG.trace("empty range: " + emptyVerticalRange[0] + ", " + emptyVerticalRange[1] + " = " + width);
            if (width >= minVerticalBreak) {
                columnBreaks.add(emptyVerticalRange);
                LOG.trace("Found column break!");
            }
        }
        columnSeparators = new ArrayList<Rectangle>();
        for (int[] columnBreak : columnBreaks) {
            // reduce the column break to the thickest empty area if possible
            int[] bestColumnBreak = null;
            // NOTE(review): maxEmptyColumnCount is overwritten below and never restored, so for the
            // second and later column breaks originalCount starts from whatever value the previous
            // iteration left behind rather than from the 1st-percentile value - confirm this is intended.
            double originalCount = maxEmptyColumnCount;
            maxEmptyColumnCount = 0;
            // re-scan the break with a threshold rising from 0 in steps of originalCount / 8
            // until some empty sub-range is found
            while (bestColumnBreak == null && maxEmptyColumnCount <= originalCount) {
                inEmptyVerticalRange = false;
                emptyVerticalRanges = new ArrayList<int[]>();
                emptyVerticalRangeStart = columnBreak[0];
                for (int i = columnBreak[0]; i <= columnBreak[1]; i++) {
                    if (!inEmptyVerticalRange && verticalCounts[i] <= maxEmptyColumnCount) {
                        inEmptyVerticalRange = true;
                        emptyVerticalRangeStart = i;
                    } else if (inEmptyVerticalRange && verticalCounts[i] > maxEmptyColumnCount) {
                        inEmptyVerticalRange = false;
                        emptyVerticalRanges.add(new int[] { emptyVerticalRangeStart, i });
                    }
                }
                if (inEmptyVerticalRange) {
                    emptyVerticalRanges.add(new int[] { emptyVerticalRangeStart, columnBreak[1] });
                }
                // keep the widest sub-range found at this threshold
                for (int[] emptyVerticalRange : emptyVerticalRanges) {
                    if (bestColumnBreak == null || (emptyVerticalRange[1]
                            - emptyVerticalRange[0] > bestColumnBreak[1] - bestColumnBreak[0]))
                        bestColumnBreak = emptyVerticalRange;
                }
                maxEmptyColumnCount += (originalCount / 8.0);
            }
            // fall back to the unreduced break when no sub-range was found
            if (bestColumnBreak == null)
                bestColumnBreak = columnBreak;
            Rectangle whiteArea = new WhiteArea(bestColumnBreak[0], mainTextTop, bestColumnBreak[1],
                    mainTextBottom);
            columnSeparators.add(whiteArea);
            LOG.debug("ColumnBreak: " + whiteArea);
        } // next column break
    }
    return columnSeparators;
}
From source file:com.joliciel.jochre.graphics.RowOfShapesImpl.java
/** * The regression passes through the bottom of average shapes on this line. * It gives the line's slope, and a starting point for finding the baseline and meanline. *///from w w w. j ava2 s . c o m public SimpleRegression getRegression() { if (this.regression == null) { // begin by calculating some sort of average line crossing the whole row, so that we can see if the row is // rising or falling to start with? // Calculate the line crossing the mid-point of all "average" shapes on this row // get the "smoothed" linear approximation of the mid-points regression = new SimpleRegression(); int numShapes = 0; int minShapes = 10; DescriptiveStatistics shapeWidthStats = new DescriptiveStatistics(); DescriptiveStatistics shapeHeightStats = new DescriptiveStatistics(); for (Shape shape : this.getShapes()) { shapeWidthStats.addValue(shape.getWidth()); shapeHeightStats.addValue(shape.getHeight()); } double minWidth = shapeWidthStats.getPercentile(25); double maxWidth = shapeWidthStats.getPercentile(75); double minHeight = shapeHeightStats.getPercentile(25); double maxHeight = shapeHeightStats.getPercentile(75); for (Shape shape : this.getShapes()) { // only add points whose shape is of "average" width and height (to leave out commas, etc.) 
if (shape.getWidth() >= minWidth && shape.getWidth() <= maxWidth && shape.getHeight() >= minHeight && shape.getHeight() <= maxHeight) { // using bottom only, since rows with different font sizes tend to align bottom regression.addData((((double) shape.getLeft() + (double) shape.getRight()) / 2.0), ((double) shape.getBottom())); numShapes++; } } // special case where row contains very few shapes (generally letter or number + period) boolean horizontalLine = false; if (numShapes < minShapes) { LOG.debug("Too few shapes: " + numShapes + ", assuming straight horizontal line"); horizontalLine = true; } else if ((this.getRight() - this.getLeft()) < (this.getContainer().getWidth() / 6.0)) { LOG.debug("Too narrow: " + (this.getRight() - this.getLeft()) + ", assuming straight horizontal line"); horizontalLine = true; } if (horizontalLine) { // assume a straight horizontal line Mean midPointMean = new Mean(); for (Shape shape : this.getShapes()) { // only add points whose shape is of "average" height (to leave out commas, etc.) if (shape.getWidth() >= minWidth && shape.getWidth() <= maxWidth && shape.getHeight() >= minHeight && shape.getHeight() <= maxHeight) { midPointMean.increment((double) shape.getBottom()); } } if (midPointMean.getN() == 0) { for (Shape shape : this.getShapes()) { midPointMean.increment((double) shape.getBottom()); } } double meanMidPoint = midPointMean.getResult(); regression = new SimpleRegression(); regression.addData(this.getLeft(), meanMidPoint); regression.addData(this.getRight(), meanMidPoint); } // displays intercept of regression line LOG.debug("intercept: " + regression.getIntercept()); // displays slope of regression line LOG.debug("slope: " + regression.getSlope()); // displays slope standard error LOG.debug("std err: " + regression.getSlopeStdErr()); LOG.debug("x = 0, y = " + regression.predict(0)); LOG.debug("x = " + this.getContainer().getWidth() + ", y = " + regression.predict(this.getContainer().getWidth())); } return regression; }
From source file:com.joliciel.jochre.graphics.RowOfShapesImpl.java
/**
 * Assign guidelines (mean-line and base-line) for a certain subset of
 * shapes, and return the x-height.
 * <p>
 * A histogram of black pixels per pixel row is built relative to a sloped
 * mid-line through the row. Scanning from the top down to the mid-line gives
 * the mean-line, scanning from the bottom up to the mid-line gives the
 * base-line. Both are then written to every shape via {@code setMeanLine}
 * and {@code setBaseLine}, relative to the shape's own top.
 *
 * @param groupsToAssign the groups whose shapes receive guidelines, or
 *        {@code null} to use all shapes of this row
 * @return the x-height, i.e. base-line index minus mean-line index
 */
int assignGuideLines(List<GroupOfShapes> groupsToAssign) {
    LOG.debug("assignGuideLines internal");
    double meanHorizontalSlope = this.getContainer().getMeanHorizontalSlope();
    // the base-line and mean-line will be at a fixed distance away from the midpoint
    // the question is, which distance!
    // To find this out, we count number of black pixels on each row above this line
    // And then start analysing from the top and the bottom until the number drops off sharply
    // The notion of "groupsToAssign" is used to only assign guidelines
    // to a subset of the groups on the line
    // when the line contains two different font sizes
    List<Shape> shapes = new ArrayList<Shape>();
    if (groupsToAssign != null) {
        for (GroupOfShapes group : groupsToAssign) {
            shapes.addAll(group.getShapes());
        }
    } else {
        shapes = this.getShapes();
    }
    int i = 0;
    // size statistics are taken over ALL shapes of the row, not only the subset being assigned
    DescriptiveStatistics shapeWidthStats = new DescriptiveStatistics();
    DescriptiveStatistics shapeHeightStats = new DescriptiveStatistics();
    for (Shape shape : this.getShapes()) {
        shapeWidthStats.addValue(shape.getWidth());
        shapeHeightStats.addValue(shape.getHeight());
    }
    double minWidth = shapeWidthStats.getPercentile(25);
    double maxWidth = shapeWidthStats.getPercentile(75);
    double minHeight = shapeHeightStats.getPercentile(45);
    double maxHeight = shapeHeightStats.getPercentile(75);
    double rowMidPointX = (double) (this.getLeft() + this.getRight()) / 2.0;
    // calculating the Y midpoint by the shapes in the row, instead of by the top & bottom of row
    Mean rowMidPointYMean = new Mean();
    for (Shape shape : this.getShapes()) {
        // only add points whose shape is of "average" width and height (to leave out commas, etc.)
        if (shape.getWidth() >= minWidth && shape.getWidth() <= maxWidth && shape.getHeight() >= minHeight
                && shape.getHeight() <= maxHeight) {
            rowMidPointYMean.increment((double) (shape.getBottom() + shape.getTop()) / 2.0);
        }
    }
    // default to the geometric middle of the row when no "average" shape was found
    double rowMidPointY = (double) (this.getTop() + this.getBottom()) / 2.0;
    if (rowMidPointYMean.getN() > 0)
        rowMidPointY = rowMidPointYMean.getResult();
    LOG.debug("rowMidPointX: " + rowMidPointX);
    LOG.debug("rowMidPointY: " + rowMidPointY);
    // figure out where the top-most shape starts and the bottom-most shape ends, relative to the y midline
    int minTop = Integer.MAX_VALUE;
    int maxBottom = Integer.MIN_VALUE;
    // rowYMidPoints.get(k) is the sloped mid-line y for the k-th entry of "shapes"
    List<Integer> rowYMidPoints = new ArrayList<Integer>(shapes.size());
    for (Shape shape : shapes) {
        double shapeMidPointX = (double) (shape.getLeft() + shape.getRight()) / 2.0;
        int shapeMidPointY = (int) Math
                .round(rowMidPointY + (meanHorizontalSlope * (shapeMidPointX - rowMidPointX)));
        rowYMidPoints.add(shapeMidPointY);
        int relativeTop = shape.getTop() - shapeMidPointY;
        int relativeBottom = shape.getBottom() - shapeMidPointY;
        if (relativeTop < minTop)
            minTop = relativeTop;
        if (relativeBottom > maxBottom)
            maxBottom = relativeBottom;
    }
    // clamp so the mid-line itself always lies inside the interval (also covers an empty shape list)
    if (minTop > 0)
        minTop = 0;
    if (maxBottom < 0)
        maxBottom = 0;
    // pixelCounts index yIntervalTop corresponds to the mid-line
    int yIntervalTop = 0 - minTop;
    int yIntervalBottom = maxBottom;
    int yInterval = yIntervalTop + 1 + yIntervalBottom;
    LOG.debug("yIntervalTop: " + yIntervalTop);
    LOG.debug("yIntervalBottom: " + yIntervalBottom);
    LOG.debug("yInterval: " + yInterval);
    int[] pixelCounts = new int[yInterval];
    // Get the pixel count for each row
    // examining one shape at a time to limit ourselves to the pixels that are
    // actually considered to be in this row
    int blackThreshold = this.getContainer().getSeparationThreshold();
    int shapeIndex = 0;
    int shapeCount = 0;
    for (Shape shape : shapes) {
        // shapes shorter than the 45th height percentile do not contribute to the histogram
        if (shape.getHeight() >= minHeight) {
            LOG.trace(shape.toString());
            shapeCount++;
            int shapeMidPointY = rowYMidPoints.get(shapeIndex);
            // zeroLine is the image y that maps to pixelCounts[0] for this shape
            int zeroLine = shapeMidPointY - yIntervalTop;
            int topIndex = shape.getTop() - zeroLine;
            for (int x = 0; x < shape.getWidth(); x++) {
                for (int y = 0; y < shape.getHeight(); y++) {
                    int yIndex = topIndex + y;
                    if (yIndex >= 0 && yIndex < pixelCounts.length
                            && shape.isPixelBlack(x, y, blackThreshold)) {
                        pixelCounts[yIndex]++;
                    }
                }
            }
        }
        // advanced for every shape so it stays aligned with rowYMidPoints
        shapeIndex++;
    }
    LOG.debug("Got pixels from " + shapeCount + " shapes.");
    boolean notEnoughShapes = shapeCount < 3;
    LOG.debug("notEnoughShapes? " + notEnoughShapes);
    // We start at the top
    // As soon as we reach a line with more pixels than the mean, we assume this is the mean-line
    Mean pixelCountMeanTop = new Mean();
    StandardDeviation pixelCountStdDevTop = new StandardDeviation();
    for (i = 0; i <= yIntervalTop; i++) {
        pixelCountMeanTop.increment(pixelCounts[i]);
        pixelCountStdDevTop.increment(pixelCounts[i]);
    }
    LOG.debug("Top: pixel count mean: " + pixelCountMeanTop.getResult() + ", std dev: "
            + pixelCountStdDevTop.getResult());
    double threshold = pixelCountMeanTop.getResult() * 1.1;
    if (notEnoughShapes) {
        threshold = threshold / 2.0;
    }
    double lowerThreshold = threshold / 2.0;
    LOG.debug("Top threshold: " + threshold);
    LOG.debug("Top lowerThreshold: " + lowerThreshold);
    int meanLine = 0;
    boolean findMeanLine = true;
    // the scan does not stop at the first hit: if the count later drops below lowerThreshold,
    // the search is re-armed and a later crossing of threshold replaces meanLine
    for (i = 0; i <= yIntervalTop; i++) {
        int pixelCount = pixelCounts[i];
        if (findMeanLine && pixelCount > threshold) {
            meanLine = i;
            findMeanLine = false;
        } else if (!findMeanLine && pixelCount < lowerThreshold) {
            findMeanLine = true;
        }
    }
    // We start at the bottom
    // As soon as we reach a line with more pixels than the mean, we assume this is the base-line
    Mean pixelCountMeanBottom = new Mean();
    StandardDeviation pixelCountStdDevBottom = new StandardDeviation();
    for (i = pixelCounts.length - 1; i >= yIntervalTop; i--) {
        pixelCountMeanBottom.increment(pixelCounts[i]);
        pixelCountStdDevBottom.increment(pixelCounts[i]);
    }
    LOG.debug("Bottom: pixel count mean: " + pixelCountMeanBottom.getResult() + ", std dev: "
            + pixelCountStdDevBottom.getResult());
    threshold = pixelCountMeanBottom.getResult() * 1.1;
    if (notEnoughShapes) {
        threshold = threshold / 2.0;
    }
    lowerThreshold = threshold / 2.0;
    LOG.debug("Bottom threshold: " + threshold);
    LOG.debug("Bottom lowerThreshold: " + lowerThreshold);
    // defaults to meanLine when no row in the bottom half exceeds the threshold
    int baseLine = meanLine;
    boolean findBaseLine = true;
    // same re-arming scan as for the mean-line, but from the bottom up to the mid-line
    for (i = pixelCounts.length - 1; i >= yIntervalTop; i--) {
        int pixelCount = pixelCounts[i];
        if (findBaseLine && pixelCount > threshold) {
            baseLine = i;
            findBaseLine = false;
        } else if (!findBaseLine && pixelCount < lowerThreshold) {
            findBaseLine = true;
        }
    }
    // trace the histogram with the two chosen lines marked
    for (i = 0; i < yInterval; i++) {
        int pixelCount = pixelCounts[i];
        if (i == meanLine)
            LOG.trace("======= MEAN LINE " + i + " ==========");
        LOG.trace("pixel row " + i + ". pixel count " + pixelCount);
        if (i == baseLine)
            LOG.trace("======= BASE LINE " + i + " ==========");
    }
    // assign base lines and mean lines to each shape
    shapeIndex = 0;
    for (Shape shape : shapes) {
        int shapeMidPointY = rowYMidPoints.get(shapeIndex);
        // convert histogram indexes back to image y for this shape's sloped mid-line
        int yMeanline = (shapeMidPointY - yIntervalTop) + meanLine;
        int yBaseline = (shapeMidPointY - yIntervalTop) + baseLine;
        LOG.trace(shape.toString() + ", meanLine: " + (yMeanline - shape.getTop()) + ", baseLine: "
                + (yBaseline - shape.getTop()));
        shape.setBaseLine(yBaseline - shape.getTop());
        shape.setMeanLine(yMeanline - shape.getTop());
        shapeIndex++;
    } // next shape
    int xHeight = baseLine - meanLine;
    return xHeight;
}
From source file:com.joliciel.talismane.other.corpus.CorpusStatistics.java
@Override public void onNextParseConfiguration(ParseConfiguration parseConfiguration, Writer writer) { sentenceCount++;/*w ww .j a v a 2s. c o m*/ sentenceLengthStats.addValue(parseConfiguration.getPosTagSequence().size()); for (PosTaggedToken posTaggedToken : parseConfiguration.getPosTagSequence()) { if (posTaggedToken.getTag().equals(PosTag.ROOT_POS_TAG)) continue; Token token = posTaggedToken.getToken(); String word = token.getOriginalText(); words.add(word); if (referenceWords != null) { if (!referenceWords.contains(word)) unknownTokenCount++; } if (alphanumeric.matcher(token.getOriginalText()).find()) { String lowercase = word.toLowerCase(TalismaneSession.getLocale()); lowerCaseWords.add(lowercase); alphanumericCount++; if (referenceLowercaseWords != null) { if (!referenceLowercaseWords.contains(lowercase)) unknownAlphanumericCount++; } } tokenCount++; Integer countObj = posTagCounts.get(posTaggedToken.getTag().getCode()); int count = countObj == null ? 0 : countObj.intValue(); count++; posTagCounts.put(posTaggedToken.getTag().getCode(), count); } int maxDepth = 0; DescriptiveStatistics avgSyntaxDepthForSentenceStats = new DescriptiveStatistics(); for (DependencyArc arc : parseConfiguration.getDependencies()) { Integer countObj = depLabelCounts.get(arc.getLabel()); int count = countObj == null ? 0 : countObj.intValue(); count++; depLabelCounts.put(arc.getLabel(), count); totalDepCount++; if (arc.getHead().getTag().equals(PosTag.ROOT_POS_TAG) && (arc.getLabel() == null || arc.getLabel().length() == 0)) { // do nothing for unattached stuff (e.g. 
punctuation) } else if (arc.getLabel().equals("ponct")) { // do nothing for punctuation } else { int depth = 0; DependencyArc theArc = arc; while (theArc != null && !theArc.getHead().getTag().equals(PosTag.ROOT_POS_TAG)) { theArc = parseConfiguration.getGoverningDependency(theArc.getHead()); depth++; } if (depth > maxDepth) maxDepth = depth; syntaxDepthStats.addValue(depth); avgSyntaxDepthForSentenceStats.addValue(depth); int distance = Math .abs(arc.getHead().getToken().getIndex() - arc.getDependent().getToken().getIndex()); syntaxDistanceStats.addValue(distance); } maxSyntaxDepthStats.addValue(maxDepth); if (avgSyntaxDepthForSentenceStats.getN() > 0) avgSyntaxDepthStats.addValue(avgSyntaxDepthForSentenceStats.getMean()); } // we cheat a little bit by only allowing each arc to count once // there could be a situation where there are two independent non-projective arcs // crossing the same mother arc, but we prefer here to underestimate, // as this phenomenon is quite rare. Set<DependencyArc> nonProjectiveArcs = new HashSet<DependencyArc>(); int i = 0; for (DependencyArc arc : parseConfiguration.getDependencies()) { i++; if (arc.getHead().getTag().equals(PosTag.ROOT_POS_TAG) && (arc.getLabel() == null || arc.getLabel().length() == 0)) continue; if (nonProjectiveArcs.contains(arc)) continue; int headIndex = arc.getHead().getToken().getIndex(); int depIndex = arc.getDependent().getToken().getIndex(); int startIndex = headIndex < depIndex ? headIndex : depIndex; int endIndex = headIndex >= depIndex ? 
headIndex : depIndex; int j = 0; for (DependencyArc otherArc : parseConfiguration.getDependencies()) { j++; if (j <= i) continue; if (otherArc.getHead().getTag().equals(PosTag.ROOT_POS_TAG) && (otherArc.getLabel() == null || otherArc.getLabel().length() == 0)) continue; if (nonProjectiveArcs.contains(otherArc)) continue; int headIndex2 = otherArc.getHead().getToken().getIndex(); int depIndex2 = otherArc.getDependent().getToken().getIndex(); int startIndex2 = headIndex2 < depIndex2 ? headIndex2 : depIndex2; int endIndex2 = headIndex2 >= depIndex2 ? headIndex2 : depIndex2; boolean nonProjective = false; if (startIndex2 < startIndex && endIndex2 > startIndex && endIndex2 < endIndex) { nonProjective = true; } else if (startIndex2 > startIndex && startIndex2 < endIndex && endIndex2 > endIndex) { nonProjective = true; } if (nonProjective) { nonProjectiveArcs.add(arc); nonProjectiveArcs.add(otherArc); nonProjectiveCount++; LOG.debug("Non-projective arcs in sentence: " + parseConfiguration.getSentence().getText()); LOG.debug(arc.toString()); LOG.debug(otherArc.toString()); break; } } } }
From source file:edu.usc.goffish.gopher.sample.stats.N_Hop_Stats.java
@Override public void compute(List<SubGraphMessage> messageList) { if (getIteration() == 0 && getSuperStep() == 0) { String data = new String(messageList.get(0).getData()); // debugLog("GOT DATA initial :" + data); hopCount = Integer.parseInt(data); try {// ww w . j a v a 2 s . c om init(); } catch (IOException e) { e.printStackTrace(); throw new RuntimeException(); } } long ls = System.currentTimeMillis(); ISubgraphInstance instance = getCurrentInstance(); if (instance == null) { // debugLog("Instance == null : " + getIteration()); voteToHalt(); haultApp(); return; } if (getSuperStep() == 0) { DescriptiveStatistics statistics = new DescriptiveStatistics(); long diskTimeStart = System.currentTimeMillis(); if (!instance.hasProperties()) { // debugLog("No Properties : " + getIteration()); voteToHalt(); return; } debugLog("INSTANCE_LOAD," + subgraph.getId() + "," + (System.currentTimeMillis() - diskTimeStart) + "," + getSuperStep() + "," + getIteration()); long travasalS = System.currentTimeMillis(); // DescriptiveStatistics edgePropLoadTimeStats = new DescriptiveStatistics(); for (ITemplateEdge edge : subgraph.edges()) { // long edgePropStart = System.currentTimeMillis(); ISubgraphObjectProperties edgeProps = instance.getPropertiesForEdge(edge.getId()); // edgePropLoadTimeStats.addValue(System.currentTimeMillis() - edgePropStart); String[] latencies = ((String) edgeProps.getValue(LATENCY_PROP)) == null ? null : ((String) edgeProps.getValue(LATENCY_PROP)).split(","); String[] hops = ((String) edgeProps.getValue(HOP_PROP)) == null ? 
null : ((String) edgeProps.getValue(HOP_PROP)).split(","); if (hops != null && latencies != null) { for (int i = 0; i < hops.length; i++) { String h = hops[i]; if (hopCount == Integer.parseInt(h)) { // debugLog("HOP : " + h + ": Latency : " + latencies[i]); double latency = Double.parseDouble(latencies[i]); statistics.addValue(latency); } } } } //debugLog("Travasal total : " + (System.currentTimeMillis() - travasalS)); //debugLog("Edge Load Time max,avg:" + edgePropLoadTimeStats.getMax() + "," + edgePropLoadTimeStats.getMean()); String data = "1:" + statistics.getMean(); if (!"1:nan".equalsIgnoreCase(data)) { SubGraphMessage message = new SubGraphMessage(data.getBytes()); sendMessage(partition.getId(), message); //debugLog("Sub-graph data sent : " + data); } voteToHalt(); } else { if (acquireLock("N_HOP_" + partition.getId() + " _" + getIteration() + "_" + getSuperStep())) { //debugLog("Lock Acqured"); DescriptiveStatistics statistics = new DescriptiveStatistics(); boolean finalStage = false; for (SubGraphMessage msg : messageList) { String data = new String(msg.getData()); //debugLog("Partittion got data : " + data); String[] parts = data.split(":"); if ("1".equals(parts[0].trim())) { if (!parts[1].equalsIgnoreCase("nan")) { statistics.addValue(Double.parseDouble(parts[1])); //debugLog("Stage 1 data added : " + parts[1]); } } else { finalStage = true; if (!parts[1].equalsIgnoreCase("nan")) { statistics.addValue(Double.parseDouble(parts[1])); //debugLog("Stage 2 data added : " + parts[1]); } } } if (finalStage) { try { String data = "" + statistics.getMean(); try { Double.parseDouble(data); sendMessageToReduceStep(new SubGraphMessage(data.getBytes())); } catch (Exception e) { } PrintWriter writer = new PrintWriter(new FileWriter("Hop_Stats.log", true)); log(writer, hopCount, statistics.getMean(), currentInstance.getTimestampStart()); } catch (Exception e) { e.printStackTrace(); } voteToHalt(); } else { String data = "2:" + statistics.getMean(); if 
(!"2:nan".equalsIgnoreCase(data)) { SubGraphMessage message = new SubGraphMessage(data.getBytes()); for (int i : partitions) { sendMessage(i, message); } //debugLog("Stage 2 data sent :" + data); } voteToHalt(); } } else { voteToHalt(); } } }
From source file:datafu.hourglass.jobs.StagedOutputJob.java
/** * Writes Hadoop counters and other task statistics to a file in the file system. * //from w w w.ja v a 2 s . c o m * @param fs * @throws IOException */ private void writeCounters(final FileSystem fs) throws IOException { final Path actualOutputPath = FileOutputFormat.getOutputPath(this); SimpleDateFormat timestampFormat = new SimpleDateFormat("yyyyMMddHHmmss"); String suffix = timestampFormat.format(new Date()); if (_countersParentPath != null) { if (!fs.exists(_countersParentPath)) { _log.info("Creating counter parent path " + _countersParentPath); fs.mkdirs(_countersParentPath, FsPermission.valueOf("-rwxrwxr-x")); } // make the name as unique as possible in this case because this may be a directory // where other counter files will be dropped _countersPath = new Path(_countersParentPath, ".counters." + suffix); } else { _countersPath = new Path(actualOutputPath, ".counters." + suffix); } _log.info(String.format("Writing counters to %s", _countersPath)); FSDataOutputStream counterStream = fs.create(_countersPath); BufferedOutputStream buffer = new BufferedOutputStream(counterStream, 256 * 1024); OutputStreamWriter writer = new OutputStreamWriter(buffer); for (String groupName : getCounters().getGroupNames()) { for (Counter counter : getCounters().getGroup(groupName)) { writeAndLog(writer, String.format("%s=%d", counter.getName(), counter.getValue())); } } JobID jobID = this.getJobID(); org.apache.hadoop.mapred.JobID oldJobId = new org.apache.hadoop.mapred.JobID(jobID.getJtIdentifier(), jobID.getId()); long minStart = Long.MAX_VALUE; long maxFinish = 0; long setupStart = Long.MAX_VALUE; long cleanupFinish = 0; DescriptiveStatistics mapStats = new DescriptiveStatistics(); DescriptiveStatistics reduceStats = new DescriptiveStatistics(); boolean success = true; JobClient jobClient = new JobClient(this.conf); Map<String, String> taskIdToType = new HashMap<String, String>(); TaskReport[] setupReports = jobClient.getSetupTaskReports(oldJobId); if (setupReports.length 
> 0) { _log.info("Processing setup reports"); for (TaskReport report : jobClient.getSetupTaskReports(oldJobId)) { taskIdToType.put(report.getTaskID().toString(), "SETUP"); if (report.getStartTime() == 0) { _log.warn("Skipping report with zero start time"); continue; } setupStart = Math.min(setupStart, report.getStartTime()); } } else { _log.error("No setup reports"); } TaskReport[] mapReports = jobClient.getMapTaskReports(oldJobId); if (mapReports.length > 0) { _log.info("Processing map reports"); for (TaskReport report : mapReports) { taskIdToType.put(report.getTaskID().toString(), "MAP"); if (report.getFinishTime() == 0 || report.getStartTime() == 0) { _log.warn("Skipping report with zero start or finish time"); continue; } minStart = Math.min(minStart, report.getStartTime()); mapStats.addValue(report.getFinishTime() - report.getStartTime()); } } else { _log.error("No map reports"); } TaskReport[] reduceReports = jobClient.getReduceTaskReports(oldJobId); if (reduceReports.length > 0) { _log.info("Processing reduce reports"); for (TaskReport report : reduceReports) { taskIdToType.put(report.getTaskID().toString(), "REDUCE"); if (report.getFinishTime() == 0 || report.getStartTime() == 0) { _log.warn("Skipping report with zero start or finish time"); continue; } maxFinish = Math.max(maxFinish, report.getFinishTime()); reduceStats.addValue(report.getFinishTime() - report.getStartTime()); } } else { _log.error("No reduce reports"); } TaskReport[] cleanupReports = jobClient.getCleanupTaskReports(oldJobId); if (cleanupReports.length > 0) { _log.info("Processing cleanup reports"); for (TaskReport report : cleanupReports) { taskIdToType.put(report.getTaskID().toString(), "CLEANUP"); if (report.getFinishTime() == 0) { _log.warn("Skipping report with finish time of zero"); continue; } cleanupFinish = Math.max(cleanupFinish, report.getFinishTime()); } } else { _log.error("No cleanup reports"); } if (minStart == Long.MAX_VALUE) { _log.error("Could not determine map-reduce 
start time"); success = false; } if (maxFinish == 0) { _log.error("Could not determine map-reduce finish time"); success = false; } if (setupStart == Long.MAX_VALUE) { _log.error("Could not determine setup start time"); success = false; } if (cleanupFinish == 0) { _log.error("Could not determine cleanup finish time"); success = false; } // Collect statistics on successful/failed/killed task attempts, categorized by setup/map/reduce/cleanup. // Unfortunately the job client doesn't have an easier way to get these statistics. Map<String, Integer> attemptStats = new HashMap<String, Integer>(); _log.info("Processing task attempts"); for (TaskCompletionEvent event : getTaskCompletionEvents(jobClient, oldJobId)) { String type = taskIdToType.get(event.getTaskAttemptId().getTaskID().toString()); String status = event.getTaskStatus().toString(); String key = String.format("%s_%s_ATTEMPTS", status, type); if (!attemptStats.containsKey(key)) { attemptStats.put(key, 0); } attemptStats.put(key, attemptStats.get(key) + 1); } if (success) { writeAndLog(writer, String.format("SETUP_START_TIME_MS=%d", setupStart)); writeAndLog(writer, String.format("CLEANUP_FINISH_TIME_MS=%d", cleanupFinish)); writeAndLog(writer, String.format("COMPLETE_WALL_CLOCK_TIME_MS=%d", cleanupFinish - setupStart)); writeAndLog(writer, String.format("MAP_REDUCE_START_TIME_MS=%d", minStart)); writeAndLog(writer, String.format("MAP_REDUCE_FINISH_TIME_MS=%d", maxFinish)); writeAndLog(writer, String.format("MAP_REDUCE_WALL_CLOCK_TIME_MS=%d", maxFinish - minStart)); writeAndLog(writer, String.format("MAP_TOTAL_TASKS=%d", (long) mapStats.getN())); writeAndLog(writer, String.format("MAP_MAX_TIME_MS=%d", (long) mapStats.getMax())); writeAndLog(writer, String.format("MAP_MIN_TIME_MS=%d", (long) mapStats.getMin())); writeAndLog(writer, String.format("MAP_AVG_TIME_MS=%d", (long) mapStats.getMean())); writeAndLog(writer, String.format("MAP_STD_TIME_MS=%d", (long) mapStats.getStandardDeviation())); writeAndLog(writer, 
String.format("MAP_SUM_TIME_MS=%d", (long) mapStats.getSum())); writeAndLog(writer, String.format("REDUCE_TOTAL_TASKS=%d", (long) reduceStats.getN())); writeAndLog(writer, String.format("REDUCE_MAX_TIME_MS=%d", (long) reduceStats.getMax())); writeAndLog(writer, String.format("REDUCE_MIN_TIME_MS=%d", (long) reduceStats.getMin())); writeAndLog(writer, String.format("REDUCE_AVG_TIME_MS=%d", (long) reduceStats.getMean())); writeAndLog(writer, String.format("REDUCE_STD_TIME_MS=%d", (long) reduceStats.getStandardDeviation())); writeAndLog(writer, String.format("REDUCE_SUM_TIME_MS=%d", (long) reduceStats.getSum())); writeAndLog(writer, String.format("MAP_REDUCE_SUM_TIME_MS=%d", (long) mapStats.getSum() + (long) reduceStats.getSum())); for (Map.Entry<String, Integer> attemptStat : attemptStats.entrySet()) { writeAndLog(writer, String.format("%s=%d", attemptStat.getKey(), attemptStat.getValue())); } } writer.close(); buffer.close(); counterStream.close(); }