List of usage examples for the org.apache.commons.math.stat.descriptive.DescriptiveStatistics constructor DescriptiveStatistics()
public DescriptiveStatistics()
From source file:com.joliciel.jochre.graphics.SegmenterImpl.java
List<RowOfShapes> groupShapesIntoRows(SourceImage sourceImage, List<Shape> shapes, List<Rectangle> whiteAreas, boolean useSlope) { LOG.debug("########## groupShapesIntoRows #########"); LOG.debug("useSlope? " + useSlope); List<RowOfShapes> rows = new ArrayList<RowOfShapes>(); for (Shape shape : shapes) shape.setRow(null);//from w w w.j a v a2s. c o m List<Shape> shapesToRemove = new ArrayList<Shape>(); for (Shape shape : shapes) { for (Rectangle whiteArea : whiteAreas) { double whiteAreaRight = whiteArea.getRight(); double whiteAreaLeft = whiteArea.getLeft(); if (useSlope) { double xAdjustment = sourceImage.getXAdjustment(shape.getTop()); whiteAreaRight += xAdjustment; whiteAreaLeft += xAdjustment; } if (whiteAreaRight > shape.getRight() && whiteAreaLeft < shape.getLeft() && whiteArea.getTop() < shape.getTop() && whiteArea.getBottom() > shape.getBottom()) { // shape is surrounded shapesToRemove.add(shape); LOG.debug("Removing shape " + shape); LOG.debug("Surrounded by white area: " + whiteArea); } } } shapes.removeAll(shapesToRemove); // calculate the means // get average shape width & height DescriptiveStatistics shapeWidthStats = new DescriptiveStatistics(); for (Shape shape : shapes) { shapeWidthStats.addValue(shape.getWidth()); } double averageShapeWidth = shapeWidthStats.getPercentile(50); LOG.debug("averageShapeWidth: " + averageShapeWidth); // now, arrange the shapes in rows // we're guaranteed that no two shapes overlap at this point. // Now, it's possible that two shapes in the same line have no vertical overlap (e.g. a comma and an apostrophe) // so we have to go searching a bit further afield, say five shapes in each direction // but if we go too far, we may end up joining two lines together if the page isn't quite straight // let's begin with any old shape and find the shapes closest to it horizontally // e.g. 
up to 8 horizontal means to the right and left // as we find shapes that go with it, we add them to the same line int i = 0; int j = 0; int numberOfMeanWidthsForSearch = 8; LOG.debug("numberOfMeanWidthsForSearch: " + numberOfMeanWidthsForSearch); LOG.debug("search distance: " + averageShapeWidth * numberOfMeanWidthsForSearch); for (Shape shape : shapes) { if (shape.getRow() == null) { RowOfShapes row = graphicsService.getEmptyRow(sourceImage); row.addShape(shape); row.setIndex(j++); rows.add(row); LOG.trace("========= New row " + row.getIndex() + "============"); LOG.trace("Adding " + shape + " to row " + row.getIndex()); } int searchLeft = (int) ((double) shape.getLeft() - (numberOfMeanWidthsForSearch * averageShapeWidth)); int searchRight = (int) ((double) shape.getRight() + (numberOfMeanWidthsForSearch * averageShapeWidth)); LOG.trace("Shape " + i++ + ": " + shape + "(row " + shape.getRow().getIndex() + ")"); LOG.trace("searchLeft: " + searchLeft); LOG.trace("searchRight: " + searchRight); // construct an array to represent where white areas overlap with the search area int[][] leftSearchArea = new int[shape.getLeft() - searchLeft][2]; int[][] rightSearchArea = new int[searchRight - shape.getRight()][2]; for (int k = 0; k < leftSearchArea.length; k++) { leftSearchArea[k][0] = shape.getTop(); leftSearchArea[k][1] = shape.getBottom(); } for (int k = 0; k < rightSearchArea.length; k++) { rightSearchArea[k][0] = shape.getTop(); rightSearchArea[k][1] = shape.getBottom(); } int newSearchLeft = searchLeft; int newSearchRight = searchRight; for (Rectangle whiteArea : whiteAreas) { double whiteAreaRight = whiteArea.getRight(); double whiteAreaLeft = whiteArea.getLeft(); if (useSlope) { double xAdjustment = sourceImage.getXAdjustment(shape.getTop()); whiteAreaRight += xAdjustment; whiteAreaLeft += xAdjustment; LOG.trace(whiteArea + ", xAdjustment=" + xAdjustment + " , whiteAreaLeft=" + whiteAreaLeft + " , whiteAreaRight=" + whiteAreaRight); } if (whiteAreaRight > 
newSearchLeft && whiteAreaLeft < shape.getLeft() && whiteArea.getTop() <= shape.getBottom() && whiteArea.getBottom() >= shape.getTop()) { LOG.trace("overlap on left with: " + whiteArea.toString()); if (whiteArea.getTop() <= shape.getTop() && whiteArea.getBottom() >= shape.getBottom() && whiteAreaRight > newSearchLeft) { newSearchLeft = (int) Math.round(whiteAreaRight); LOG.trace("Complete, newSearchLeft = " + newSearchLeft); } else { LOG.trace("Partial, starting at " + whiteArea.getRight()); for (int k = whiteArea.getRight() - searchLeft; k >= 0; k--) { if (k < leftSearchArea.length) { if (whiteArea.getBottom() < shape.getBottom() && leftSearchArea[k][0] < whiteArea.getBottom()) leftSearchArea[k][0] = whiteArea.getBottom() + 1; else if (whiteArea.getTop() > shape.getTop() && leftSearchArea[k][1] > whiteArea.getTop()) leftSearchArea[k][1] = whiteArea.getTop() - 1; if (leftSearchArea[k][0] >= leftSearchArea[k][1] && searchLeft + k > newSearchLeft) { newSearchLeft = searchLeft + k; LOG.trace("Complete from " + newSearchLeft); break; } } } // if (LOG.isTraceEnabled()) { // StringBuilder sb = new StringBuilder(); // for (int k=0;k<leftSearchArea.length;k++) { // String top = "" + (leftSearchArea[k][0]-shape.getTop()); // sb.append(String.format("%1$#" + 3 + "s", top)+ ","); // } // LOG.trace(sb.toString()); // sb = new StringBuilder(); // for (int k=0;k<leftSearchArea.length;k++) { // String bottom = "" + (leftSearchArea[k][1]-shape.getTop()); // sb.append(String.format("%1$#" + 3 + "s", bottom)+ ","); // } // LOG.trace(sb.toString()); // } } } else if (whiteAreaLeft < newSearchRight && whiteAreaRight > shape.getRight() && whiteArea.getTop() <= shape.getBottom() && whiteArea.getBottom() >= shape.getTop()) { LOG.trace("overlap on right with: " + whiteArea.toString()); if (whiteArea.getTop() <= shape.getTop() && whiteArea.getBottom() >= shape.getBottom() && newSearchRight > whiteAreaLeft) { newSearchRight = (int) Math.round(whiteAreaLeft); LOG.trace("Complete, 
newSearchRight = " + newSearchRight); } else { LOG.trace("Partial, starting at " + whiteArea.getLeft()); for (int k = whiteArea.getLeft() - shape.getRight(); k < rightSearchArea.length; k++) { if (k > 0 && k < leftSearchArea.length && k < rightSearchArea.length) { if (whiteArea.getBottom() < shape.getBottom() && leftSearchArea[k][0] < whiteArea.getBottom()) rightSearchArea[k][0] = whiteArea.getBottom() + 1; else if (whiteArea.getTop() > shape.getTop() && leftSearchArea[k][1] > whiteArea.getTop()) rightSearchArea[k][1] = whiteArea.getTop() - 1; if (rightSearchArea[k][0] >= rightSearchArea[k][1] && newSearchRight > shape.getRight() + k) { newSearchRight = shape.getRight() + k; LOG.trace("Complete from " + newSearchRight); break; } } } // if (LOG.isTraceEnabled()) { // StringBuilder sb = new StringBuilder(); // for (int k=0;k<rightSearchArea.length;k++) { // String top = "" + (rightSearchArea[k][0]-shape.getTop()); // sb.append(String.format("%1$#" + 3 + "s", top)+ ","); // } // LOG.trace(sb.toString()); // sb = new StringBuilder(); // for (int k=0;k<rightSearchArea.length;k++) { // String bottom = "" + (rightSearchArea[k][1]-shape.getTop()); // sb.append(String.format("%1$#" + 3 + "s", bottom)+ ","); // } // LOG.trace(sb.toString()); // } } } } LOG.trace("searchLeft adjusted for white columns: " + newSearchLeft); LOG.trace("searchRight adjusted for white columns: " + newSearchRight); // min 10% overlap to assume same row double minOverlap = 0.10; for (Shape otherShape : shapes) { boolean haveSomeOverlap = false; if (!shape.getRow().equals(otherShape.getRow()) && !otherShape.equals(shape)) { // shapes are arranged from the top down if (otherShape.getTop() > shape.getBottom()) { break; } if (otherShape.getRight() > newSearchLeft && otherShape.getRight() < shape.getLeft() && otherShape.getTop() <= shape.getBottom() && otherShape.getBottom() >= shape.getTop()) { int k = otherShape.getRight() - searchLeft; if (otherShape.getTop() <= leftSearchArea[k][1] && 
otherShape.getBottom() >= leftSearchArea[k][0]) haveSomeOverlap = true; } else if (otherShape.getLeft() < newSearchRight && otherShape.getLeft() > shape.getRight() && otherShape.getTop() <= shape.getBottom() && otherShape.getBottom() >= shape.getTop()) { int k = otherShape.getLeft() - shape.getRight(); if (otherShape.getTop() <= rightSearchArea[k][1] && otherShape.getBottom() >= rightSearchArea[k][0]) haveSomeOverlap = true; } if (haveSomeOverlap) { int overlap1 = shape.getBottom() - otherShape.getTop() + 1; int overlap2 = otherShape.getBottom() - shape.getTop() + 1; int overlap = overlap1 < overlap2 ? overlap1 : overlap2; boolean addShapeToRow = false; if ((((double) overlap / (double) shape.getHeight()) > minOverlap) || (((double) overlap / (double) otherShape.getHeight()) > minOverlap)) { addShapeToRow = true; } if (addShapeToRow) { LOG.debug("Adding " + otherShape + " to row " + shape.getRow().getIndex()); if (otherShape.getRow() == null) { shape.getRow().addShape(otherShape); } else { // two rows need to be merged LOG.debug("========= Merge rows " + shape.getRow().getIndex() + " with " + otherShape.getRow().getIndex() + "=========="); RowOfShapes otherRow = otherShape.getRow(); shape.getRow().addShapes(otherRow.getShapes()); rows.remove(otherRow); } } } // add shape to row ? } // should shape be considered? } // next other shape } // next shape return rows; }
From source file:com.joliciel.jochre.lexicon.LexiconErrorWriter.java
static void mergeCrossValidation(File evalDir, String prefix) { try {// ww w . j av a 2 s. c om File[] files = evalDir.listFiles(new FilenameFilter() { @Override public boolean accept(File dir, String name) { if (name.endsWith(".csv")) return true; else return false; } }); List<String> groupNames = new ArrayList<String>(); Map<String, Writer> writers = new HashMap<String, Writer>(); Map<String, ErrorStatistics> errorMap = new LinkedHashMap<String, ErrorStatistics>(); Map<String, Map<String, DescriptiveStatistics>> statMap = new HashMap<String, Map<String, DescriptiveStatistics>>(); for (File file : files) { String filename = file.getName(); LOG.debug("Processing " + filename); int index = Integer.parseInt(filename.substring(prefix.length(), prefix.length() + 1)); String suffix = filename.substring(prefix.length() + 2, filename.lastIndexOf('_')); String fileType = filename.substring(filename.lastIndexOf('_') + 1, filename.lastIndexOf('.')); LOG.debug("Processing " + filename); LOG.debug("index: " + index); LOG.debug("suffix: " + suffix); LOG.debug("fileType: " + fileType); Writer writer = writers.get(fileType); boolean firstFile = false; if (writer == null) { writer = new BufferedWriter(new OutputStreamWriter( new FileOutputStream( new File(evalDir, prefix + "A_" + suffix + "_" + fileType + ".csv"), false), "UTF8")); writers.put(fileType, writer); firstFile = true; } if (fileType.equals("KEMatrix")) { Scanner scanner = new Scanner(file); int i = 0; List<String> myGroupNames = new ArrayList<String>(); Map<String, Boolean> haveCountMap = new HashMap<String, Boolean>(); while (scanner.hasNextLine()) { String line = scanner.nextLine(); List<String> cells = CSV.getCSVCells(line); if (i == 0) { for (int j = 0; j < cells.size(); j += 5) { String groupName = cells.get(j); if (!errorMap.containsKey(groupName)) { errorMap.put(groupName, new ErrorStatistics()); statMap.put(groupName, new HashMap<String, DescriptiveStatistics>()); groupNames.add(groupName); } 
myGroupNames.add(groupName); } } else if (i == 1) { // do nothing } else { String rowName = cells.get(0); int j = 0; for (String groupName : myGroupNames) { ErrorStatistics errorStats = errorMap.get(groupName); Map<String, DescriptiveStatistics> stats = statMap.get(groupName); double correctCount = Double.parseDouble(cells.get(j * 5 + 1)); double errorCount = Double.parseDouble(cells.get(j * 5 + 2)); double totalCount = Double.parseDouble(cells.get(j * 5 + 3)); Boolean haveCount = haveCountMap.get(groupName); if (rowName.equals("known")) { errorStats.knownWordCorrectCount += correctCount; errorStats.knownWordErrorCount += errorCount; } else if (rowName.equals("unknown")) { errorStats.unknownWordCorrectCount += correctCount; errorStats.unknownWordErrorCount += errorCount; } else if (rowName.equals("goodSeg")) { errorStats.goodSegCorrectCount += correctCount; errorStats.goodSegErrorCount += errorCount; } else if (rowName.equals("badSeg")) { errorStats.badSegCorrectCount += correctCount; errorStats.badSegErrorCount += errorCount; } else if (rowName.equals("knownLetters")) { errorStats.knownWordCorrectLetterCount += correctCount; errorStats.knownWordErrorLetterCount += errorCount; } else if (rowName.equals("unknownLetters")) { errorStats.unknownWordCorrectLetterCount += correctCount; errorStats.unknownWordErrorLetterCount += errorCount; } else if (rowName.equals("goodSegLetters")) { errorStats.goodSegCorrectLetterCount += correctCount; errorStats.goodSegErrorLetterCount += errorCount; } else if (rowName.equals("badSegLetters")) { errorStats.badSegCorrectLetterCount += correctCount; errorStats.badSegErrorLetterCount += errorCount; } else if (rowName.equals("inBeam")) { errorStats.answerInBeamCorrectCount += correctCount; errorStats.answerInBeamErrorCount += errorCount; } else if (rowName.equals("total")) { haveCountMap.put(groupName, totalCount > 0); } else if (rowName.endsWith("%")) { if (haveCount) { String keyPrefix = rowName.substring(0, rowName.length() - 1); 
String key = keyPrefix + "|correct"; DescriptiveStatistics correctStat = stats.get(key); if (correctStat == null) { correctStat = new DescriptiveStatistics(); stats.put(key, correctStat); } correctStat.addValue(correctCount); key = keyPrefix + "|error"; DescriptiveStatistics errorStat = stats.get(key); if (errorStat == null) { errorStat = new DescriptiveStatistics(); stats.put(key, errorStat); } errorStat.addValue(errorCount); key = keyPrefix + "|total"; DescriptiveStatistics totalStat = stats.get(key); if (totalStat == null) { totalStat = new DescriptiveStatistics(); stats.put(key, totalStat); } totalStat.addValue(totalCount); } } j++; } } i++; } } else { Scanner scanner = new Scanner(file); boolean firstLine = true; while (scanner.hasNextLine()) { String line = scanner.nextLine(); if (firstLine) { if (firstFile) writer.write(line + "\n"); firstLine = false; } else { writer.write(line + "\n"); } writer.flush(); } } // file type } // next file Writer statsWriter = writers.get("KEMatrix"); writeStats(statsWriter, errorMap); statsWriter.write("\n"); String[] statTypes = new String[] { "known", "unknown", "goodSeg", "badSeg", "inBeam", "total", "knownLetter", "unknownLetter", "goodSegLetter", "badSegLetter", "totalLetter" }; for (String statType : statTypes) { for (String groupName : groupNames) { Map<String, DescriptiveStatistics> statsMap = statMap.get(groupName); DescriptiveStatistics correctStat = statsMap.get(statType + "|correct"); DescriptiveStatistics errorStat = statsMap.get(statType + "|error"); DescriptiveStatistics totalStat = statsMap.get(statType + "|total"); statsWriter.write(CSV.format(statType + "%Avg") + CSV.format(correctStat.getMean()) + CSV.format(errorStat.getMean()) + CSV.format(totalStat.getMean()) + CSV.getCsvSeparator()); } // next group statsWriter.write("\n"); for (String groupName : groupNames) { Map<String, DescriptiveStatistics> statsMap = statMap.get(groupName); DescriptiveStatistics correctStat = statsMap.get(statType + "|correct"); 
DescriptiveStatistics errorStat = statsMap.get(statType + "|error"); DescriptiveStatistics totalStat = statsMap.get(statType + "|total"); statsWriter.write(CSV.format(statType + "%Dev") + CSV.format(correctStat.getStandardDeviation()) + CSV.format(errorStat.getStandardDeviation()) + CSV.format(totalStat.getStandardDeviation()) + CSV.getCsvSeparator()); } // next group statsWriter.write("\n"); statsWriter.flush(); } statsWriter.close(); } catch (IOException e) { LogUtils.logError(LOG, e); throw new RuntimeException(e); } }
From source file:com.joliciel.jochre.graphics.SegmenterImpl.java
/**
 * We attempt to remove specks, where a speck is defined as
 * a relatively small shape at a relatively large distance from other shapes.
 * <p>
 * Candidate specks are shapes smaller than a size ceiling derived from the 65th
 * percentile of shape width and height. Candidates are clustered with DBSCAN and
 * each cluster is kept or removed as a whole, depending on its distance to the
 * nearest non-candidate shape. Removed specks are deleted from {@code shapes}.
 *
 * @param sourceImage not read by this method
 * @param shapes the shapes to examine; modified in place. The scan for nearby shapes
 *        stops at the first shape lying below the search window, so the list is
 *        presumably ordered top-down - TODO confirm with caller
 */
void removeSpecks(SourceImage sourceImage, List<Shape> shapes) {
    LOG.debug("########## removeSpecks #########");

    DescriptiveStatistics shapeWidthStats = new DescriptiveStatistics();
    DescriptiveStatistics shapeHeightStats = new DescriptiveStatistics();

    for (Shape shape : shapes) {
        shapeWidthStats.addValue(shape.getWidth());
        shapeHeightStats.addValue(shape.getHeight());
    }

    // NOTE(review): despite the variable names ("Median") and the log labels ("mean"),
    // these are the 65th percentiles of shape width and height.
    double shapeWidthMedian = shapeWidthStats.getPercentile(65);
    double shapeHeightMedian = shapeHeightStats.getPercentile(65);
    LOG.debug("meanShapeWidth: " + shapeWidthMedian);
    LOG.debug("meanShapeHeight: " + shapeHeightMedian);

    // Size limits: a cluster whose largest dimension is below the "floor" gets the
    // smallest distance thresholds, one above the "ceiling" (twice the floor) gets
    // the largest; in between the thresholds are interpolated linearly (see "scale").
    int maxSpeckHeightFloor = (int) Math.ceil(shapeHeightMedian / 6.0);
    int maxSpeckWidthFloor = (int) Math.ceil(shapeWidthMedian / 6.0);
    int maxSpeckHeightCeiling = maxSpeckHeightFloor * 2;
    int maxSpeckWidthCeiling = maxSpeckWidthFloor * 2;

    // Distance limits: the ceiling is twice the floor here as well.
    int speckXDistanceThresholdFloor = (int) Math.floor(shapeWidthMedian);
    int speckYDistanceThresholdFloor = (int) Math.floor(shapeHeightMedian / 4.0);
    int speckXDistanceThresholdCeiling = speckXDistanceThresholdFloor * 2;
    int speckYDistanceThresholdCeiling = speckYDistanceThresholdFloor * 2;

    LOG.debug("maxSpeckHeightFloor=" + maxSpeckHeightFloor);
    LOG.debug("maxSpeckWidthFloor=" + maxSpeckWidthFloor);
    LOG.debug("speckXDistanceThresholdFloor=" + speckXDistanceThresholdFloor);
    LOG.debug("speckYDistanceThresholdFloor=" + speckYDistanceThresholdFloor);
    LOG.debug("maxSpeckHeightCeiling=" + maxSpeckHeightCeiling);
    LOG.debug("maxSpeckWidthCeiling=" + maxSpeckWidthCeiling);
    LOG.debug("speckXDistanceThresholdCeiling=" + speckXDistanceThresholdCeiling);
    LOG.debug("speckYDistanceThresholdCeiling=" + speckYDistanceThresholdCeiling);

    // A candidate speck is strictly smaller than the ceiling in BOTH dimensions.
    List<Shape> specks = new ArrayList<Shape>();
    List<double[]> speckCoordinates = new ArrayList<double[]>();
    for (Shape shape : shapes) {
        if (shape.getHeight() < maxSpeckHeightCeiling && shape.getWidth() < maxSpeckWidthCeiling) {
            specks.add(shape);
            speckCoordinates.add(shape.getCentrePoint());
        }
    }

    // group the specks into clusters, which will be added or removed as a whole
    // Note that a cluster could be a valid diacritic that's split into a few specks
    // or just a bunch of specks off on their own
    DBSCANClusterer<Shape> clusterer = new DBSCANClusterer<Shape>(specks, speckCoordinates);
    Set<Set<Shape>> speckClusters = clusterer.cluster(speckXDistanceThresholdFloor, 2, true);
    List<Shape> specksToRemove = new ArrayList<Shape>();
    for (Set<Shape> speckCluster : speckClusters) {
        // Largest speck dimensions in the cluster, and the cluster's bounding box.
        // -1 is the "not yet set" sentinel for the bounding-box edges.
        int speckHeight = 0;
        int speckWidth = 0;
        int clusterTop = -1;
        int clusterBottom = -1;
        int clusterRight = -1;
        int clusterLeft = -1;
        for (Shape speck : speckCluster) {
            LOG.debug("Speck?, " + speck);
            if (speck.getWidth() > speckWidth)
                speckWidth = speck.getWidth();
            if (speck.getHeight() > speckHeight)
                speckHeight = speck.getHeight();

            if (clusterTop < 0 || speck.getTop() < clusterTop)
                clusterTop = speck.getTop();
            if (clusterLeft < 0 || speck.getLeft() < clusterLeft)
                clusterLeft = speck.getLeft();
            if (speck.getBottom() > clusterBottom)
                clusterBottom = speck.getBottom();
            if (speck.getRight() > clusterRight)
                clusterRight = speck.getRight();
        }

        // scale is 0.0 below the size floor, 1.0 above the size ceiling, and linear
        // in between, measured on whichever dimension (width or height) is larger.
        boolean useWidth = speckWidth > speckHeight;
        double scale = 1.0;
        if (useWidth)
            scale = speckWidth < maxSpeckWidthFloor ? 0.0
                    : (speckWidth > maxSpeckWidthCeiling ? 1.0
                            : ((double) speckWidth - maxSpeckWidthFloor)
                                    / (maxSpeckWidthCeiling - maxSpeckWidthFloor));
        else
            scale = speckHeight < maxSpeckHeightFloor ? 0.0
                    : (speckHeight > maxSpeckHeightCeiling ? 1.0
                            : ((double) speckHeight - maxSpeckHeightFloor)
                                    / (maxSpeckHeightCeiling - maxSpeckHeightFloor));

        // Distance thresholds for this cluster, interpolated between floor and ceiling.
        int speckXDistanceThreshold = (int) Math.ceil(speckXDistanceThresholdFloor
                + scale * (speckXDistanceThresholdCeiling - speckXDistanceThresholdFloor));
        int speckYDistanceThreshold = (int) Math.ceil(speckYDistanceThresholdFloor
                + scale * (speckYDistanceThresholdCeiling - speckYDistanceThresholdFloor));

        LOG.debug("speckHeight=" + speckHeight);
        LOG.debug("speckWidth=" + speckWidth);
        LOG.debug("speckXDistanceThreshold=" + speckXDistanceThreshold);
        LOG.debug("speckYDistanceThreshold=" + speckYDistanceThreshold);

        // Find the non-candidate shape closest to the cluster's bounding box.
        Shape nearestShape = null;
        double minDistance = 0.0;
        int nearestShapeXDiff = 0;
        int nearestShapeYDiff = 0;
        for (Shape otherShape : shapes) {
            // limit to nearby shapes
            // (the break stops the scan at the first shape below the search window)
            if (otherShape.getTop() > clusterBottom + speckYDistanceThreshold + 1)
                break;
            if (otherShape.getBottom() < clusterTop - speckYDistanceThreshold - 1)
                continue;
            if (otherShape.getRight() < clusterLeft - speckXDistanceThreshold - 1)
                continue;
            if (otherShape.getLeft() > clusterRight + speckXDistanceThreshold + 1)
                continue;

            // Note: tried !specks.contains(otherShape), but sometimes we have a valid case
            // where a diacritic is "split" into two specks
            // NOTE(review): the note above reads as if the check had been dropped, yet the
            // code still excludes every candidate speck here - confirm which is intended.
            if (!specks.contains(otherShape)) {
                int xDiff = 0;
                int yDiff = 0;
                int leftDiff = 0;
                int rightDiff = 0;
                int topDiff = 0;
                int botDiff = 0;

                // horizontal gap: 0 when the x ranges overlap, else the smaller edge distance
                if (otherShape.getLeft() <= clusterRight && otherShape.getRight() >= clusterLeft) {
                    xDiff = 0;
                } else {
                    leftDiff = Math.abs(clusterLeft - otherShape.getRight());
                    rightDiff = Math.abs(clusterRight - otherShape.getLeft());
                    xDiff = (leftDiff < rightDiff) ? leftDiff : rightDiff;
                }

                // vertical gap: 0 when the y ranges overlap; otherwise measured against the
                // smaller of (top, top + meanLine) and the larger of (bottom, top + baseLine)
                if (otherShape.getTop() <= clusterBottom && otherShape.getBottom() >= clusterTop) {
                    yDiff = 0;
                } else {
                    int nearestTop = (otherShape.getTop() > otherShape.getTop() + otherShape.getMeanLine())
                            ? otherShape.getTop() + otherShape.getMeanLine()
                            : otherShape.getTop();
                    int nearestBot = (otherShape.getBottom() < otherShape.getTop() + otherShape.getBaseLine())
                            ? otherShape.getTop() + otherShape.getBaseLine()
                            : otherShape.getBottom();
                    topDiff = Math.abs(clusterTop - nearestBot);
                    botDiff = Math.abs(clusterBottom - nearestTop);
                    yDiff = (topDiff < botDiff) ? topDiff : botDiff;
                }

                double distance = Math.sqrt((xDiff * xDiff) + (yDiff * yDiff));

                if (nearestShape == null || distance < minDistance) {
                    nearestShape = otherShape;
                    minDistance = distance;
                    nearestShapeXDiff = xDiff;
                    nearestShapeYDiff = yDiff;
                    LOG.trace("leftDiff=" + leftDiff + ", rightDiff=" + rightDiff);
                    LOG.trace("topDiff=" + topDiff + ", botDiff=" + botDiff);
                } // found closer shape?
            } // is this the speck?
        } // loop shapes around the reference shape

        if (nearestShape != null) {
            LOG.trace("Nearest shape, top(" + nearestShape.getTop() + ") " + "left(" + nearestShape.getLeft()
                    + ") " + "bot(" + nearestShape.getBottom() + ") " + "right(" + nearestShape.getRight()
                    + ")");
            LOG.trace("Distance=" + minDistance + ", xDiff=" + nearestShapeXDiff + ", yDiff="
                    + nearestShapeYDiff);
        }

        // With no non-candidate shape inside the search window, the cluster is removed.
        boolean removeSpecks = false;
        if (nearestShape == null)
            removeSpecks = true;
        else {
            // calculate the shortest distance from the nearest shape to the speck cluster
            // NOTE(review): minDistance still holds the bounding-box distance computed above.
            // Each speck lies inside that bounding box, so its own distance looks like it can
            // never be strictly smaller, which would make this loop a no-op - confirm intent.
            for (Shape speck : speckCluster) {
                int xDiff = 0;
                int yDiff = 0;
                int leftDiff = 0;
                int rightDiff = 0;
                int topDiff = 0;
                int botDiff = 0;

                if (nearestShape.getLeft() <= speck.getRight() && nearestShape.getRight() >= speck.getLeft()) {
                    xDiff = 0;
                } else {
                    leftDiff = Math.abs(speck.getLeft() - nearestShape.getRight());
                    rightDiff = Math.abs(speck.getRight() - nearestShape.getLeft());
                    xDiff = (leftDiff < rightDiff) ? leftDiff : rightDiff;
                }

                if (nearestShape.getTop() <= speck.getBottom() && nearestShape.getBottom() >= speck.getTop()) {
                    yDiff = 0;
                } else {
                    int nearestTop = (nearestShape.getTop() > nearestShape.getTop() + nearestShape.getMeanLine())
                            ? nearestShape.getTop() + nearestShape.getMeanLine()
                            : nearestShape.getTop();
                    int nearestBot = (nearestShape.getBottom() < nearestShape.getTop()
                            + nearestShape.getBaseLine()) ? nearestShape.getTop() + nearestShape.getBaseLine()
                                    : nearestShape.getBottom();
                    topDiff = Math.abs(speck.getTop() - nearestBot);
                    botDiff = Math.abs(speck.getBottom() - nearestTop);
                    yDiff = (topDiff < botDiff) ? topDiff : botDiff;
                }

                double distance = Math.sqrt((xDiff * xDiff) + (yDiff * yDiff));

                if (distance < minDistance) {
                    minDistance = distance;
                    nearestShapeXDiff = xDiff;
                    nearestShapeYDiff = yDiff;
                    LOG.debug("Found closer speck:");
                    LOG.debug("leftDiff=" + leftDiff + ", rightDiff=" + rightDiff);
                    LOG.debug("topDiff=" + topDiff + ", botDiff=" + botDiff);
                } // found closer shape?
            }

            // Then, for all of these specks, find the one that's closest to the nearest non-speck
            // if this distance > threshold, get rid of all of 'em
            // otherwise, keep 'em all
            // (the x and y gaps are tested separately, not the Euclidean distance)
            if (nearestShapeXDiff > speckXDistanceThreshold || nearestShapeYDiff > speckYDistanceThreshold)
                removeSpecks = true;
        }
        if (removeSpecks) {
            for (Shape otherSpeck : speckCluster) {
                LOG.debug("Removing speck " + otherSpeck);
                specksToRemove.add(otherSpeck);
            }
        }
    } // next speck

    shapes.removeAll(specksToRemove);
}
From source file:com.facebook.presto.AbstractTestQueries.java
/**
 * Runs a 50% Bernoulli table sample 100 times. Every run must be free of duplicate
 * rows, and the geometric mean of the observed sampling rates must lie strictly
 * between 0.45 and 0.55.
 */
@Test
public void testTableSampleBernoulli() throws Exception {
    DescriptiveStatistics samplingRates = new DescriptiveStatistics();

    // size of the full table, against which each sample is measured
    int totalRows = computeExpected("SELECT orderkey FROM orders", TupleInfo.SINGLE_LONG)
            .getMaterializedTuples()
            .size();

    int run = 0;
    while (run < 100) {
        List<MaterializedTuple> sampledRows = computeActual(
                "SELECT orderkey FROM ORDERS TABLESAMPLE BERNOULLI (50)").getMaterializedTuples();

        int distinctRowCount = ImmutableSet.copyOf(sampledRows).size();
        assertEquals(sampledRows.size(), distinctRowCount, "TABLESAMPLE produced duplicate rows");

        samplingRates.addValue(sampledRows.size() * 1.0 / totalRows);
        run++;
    }

    double mean = samplingRates.getGeometricMean();
    boolean closeToHalf = mean > 0.45 && mean < 0.55;
    assertTrue(closeToHalf, String.format("Expected mean sampling rate to be ~0.5, but was %s", mean));
}
From source file:com.joliciel.jochre.graphics.SegmenterImpl.java
/** * Clear out anything found in the right & left margins * @param sourceImage//from w ww. j a v a2s . c o m */ void cleanMargins(SourceImage sourceImage) { LOG.debug("########## cleanMargins #########"); int minCardinalityForMargin = 8; double averageShapeWidth = sourceImage.getAverageShapeWidth(); LOG.debug("Finding right margin"); double rightLimit = (double) sourceImage.getWidth() * 0.67; // first, create a DBScan cluster of all rows near the right-hand side List<RowOfShapes> rightHandRows = new ArrayList<RowOfShapes>(); List<double[]> rightCoordinates = new ArrayList<double[]>(); for (RowOfShapes row : sourceImage.getRows()) { double right = row.getRight(); if (right >= rightLimit) { LOG.trace(row.toString()); LOG.trace( "Right: " + right + " + " + row.getXAdjustment() + " = " + (right - row.getXAdjustment())); right -= row.getXAdjustment(); rightHandRows.add(row); rightCoordinates.add(new double[] { right }); } } DBSCANClusterer<RowOfShapes> rightMarginClusterer = new DBSCANClusterer<RowOfShapes>(rightHandRows, rightCoordinates); Set<Set<RowOfShapes>> rowClusters = rightMarginClusterer.cluster(averageShapeWidth, minCardinalityForMargin, true); TreeSet<Set<RowOfShapes>> orderedRowClusters = new TreeSet<Set<RowOfShapes>>( new CardinalityComparator<RowOfShapes>()); orderedRowClusters.addAll(rowClusters); int i = 0; // find the right-most cluster with sufficient cardinality, and assume it's the right margin DescriptiveStatistics rightMarginStats = null; for (Set<RowOfShapes> cluster : orderedRowClusters) { DescriptiveStatistics rightStats = new DescriptiveStatistics(); for (RowOfShapes row : cluster) rightStats.addValue(row.getRight() - row.getXAdjustment()); LOG.debug("Cluster " + i + ". 
Cardinality=" + cluster.size()); LOG.debug("Right mean : " + rightStats.getMean()); LOG.debug("Right std dev: " + rightStats.getStandardDeviation()); if (cluster.size() >= minCardinalityForMargin && (rightMarginStats == null || rightMarginStats.getMean() < rightStats.getMean())) { rightMarginStats = rightStats; } i++; } // see how many rows would violate this margin - if too many, assume no margin // these rows are only rows which extend across the margin if (rightMarginStats != null) { LOG.debug("Right margin mean : " + rightMarginStats.getMean()); LOG.debug("Right margin std dev: " + rightMarginStats.getStandardDeviation()); double rightMarginLimit = rightMarginStats.getMean() + sourceImage.getAverageShapeWidth(); LOG.debug("rightMarginLimit: " + rightMarginLimit); int numRowsToChop = 0; for (RowOfShapes row : sourceImage.getRows()) { if (row.getRight() >= rightLimit) { if (row.getRight() - row.getXAdjustment() >= rightMarginLimit && row.getLeft() - row.getXAdjustment() <= rightMarginLimit) { LOG.debug("Found overlapping row : " + row); LOG.debug("Adjusted right : " + (row.getRight() - row.getXAdjustment())); numRowsToChop++; } } } if (numRowsToChop >= 3) { LOG.debug("Too many overlapping rows - ignoring margin"); rightMarginStats = null; } } if (rightMarginStats != null) { double rightMarginLimit = rightMarginStats.getMean() + sourceImage.getAverageShapeWidth(); List<RowOfShapes> rowsToRemove = new ArrayList<RowOfShapes>(); for (RowOfShapes row : sourceImage.getRows()) { double right = row.getRight() - row.getXAdjustment(); LOG.trace(row.toString()); LOG.trace("Adjusted right: " + right); if (right >= rightMarginLimit) { LOG.trace("Has out-of-margin stuff!"); // need to chop off groups to the right of this threshold List<GroupOfShapes> groupsToChop = new ArrayList<GroupOfShapes>(); for (GroupOfShapes group : row.getGroups()) { if (group.getLeft() - row.getXAdjustment() > rightMarginLimit) { groupsToChop.add(group); LOG.debug("Chopping group outside of right 
margin: " + group); } } for (GroupOfShapes group : groupsToChop) { row.getShapes().removeAll(group.getShapes()); } row.getGroups().removeAll(groupsToChop); if (row.getGroups().size() == 0) { LOG.debug("Removing empty " + row); rowsToRemove.add(row); } else { row.recalculate(); row.assignGuideLines(); } } // does this row extend beyond the margin? } // next row sourceImage.getRows().removeAll(rowsToRemove); } // have a right margin LOG.debug("Finding left margin"); double leftLimit = (double) sourceImage.getWidth() * 0.33; // first, create a DBScan cluster of all rows near the left-hand side List<RowOfShapes> leftHandRows = new ArrayList<RowOfShapes>(); List<double[]> leftCoordinates = new ArrayList<double[]>(); for (RowOfShapes row : sourceImage.getRows()) { double left = row.getLeft(); if (left <= leftLimit) { LOG.trace(row.toString()); LOG.trace("Left: " + left + " - " + row.getXAdjustment() + " = " + (left - row.getXAdjustment())); left -= row.getXAdjustment(); leftHandRows.add(row); leftCoordinates.add(new double[] { left }); } } DBSCANClusterer<RowOfShapes> leftMarginClusterer = new DBSCANClusterer<RowOfShapes>(leftHandRows, leftCoordinates); Set<Set<RowOfShapes>> rowClustersLeft = leftMarginClusterer.cluster(averageShapeWidth, minCardinalityForMargin, true); TreeSet<Set<RowOfShapes>> orderedRowClustersLeft = new TreeSet<Set<RowOfShapes>>( new CardinalityComparator<RowOfShapes>()); orderedRowClustersLeft.addAll(rowClustersLeft); i = 0; // find the left-most cluster with sufficient cardinality, and assume it's the left margin DescriptiveStatistics leftMarginStats = null; for (Set<RowOfShapes> cluster : orderedRowClustersLeft) { DescriptiveStatistics leftStats = new DescriptiveStatistics(); for (RowOfShapes row : cluster) leftStats.addValue(row.getLeft() - row.getXAdjustment()); LOG.debug("Cluster " + i + ". 
Cardinality=" + cluster.size()); LOG.debug("Left mean : " + leftStats.getMean()); LOG.debug("Left std dev: " + leftStats.getStandardDeviation()); if (cluster.size() >= minCardinalityForMargin && (leftMarginStats == null || leftMarginStats.getMean() > leftStats.getMean())) { leftMarginStats = leftStats; } i++; } // see how many rows would violate this margin - if too many, assume no margin // these rows are only rows which extend across the margin if (leftMarginStats != null) { LOG.debug("Left margin mean : " + leftMarginStats.getMean()); LOG.debug("Left margin std dev: " + leftMarginStats.getStandardDeviation()); double leftMarginLimit = leftMarginStats.getMean() - sourceImage.getAverageShapeWidth(); LOG.debug("leftMarginLimit: " + leftMarginLimit); int numRowsToChop = 0; for (RowOfShapes row : sourceImage.getRows()) { if (row.getLeft() <= leftLimit) { if (row.getLeft() - row.getXAdjustment() <= leftMarginLimit && row.getRight() - row.getXAdjustment() >= leftMarginLimit) { LOG.debug("Found overlapping row : " + row); LOG.debug("Adjusted left : " + (row.getLeft() - row.getXAdjustment())); numRowsToChop++; } } } if (numRowsToChop >= 3) { LOG.debug("Too many overlapping rows - ignoring margin"); leftMarginStats = null; } } if (leftMarginStats != null) { double leftMarginLimit = leftMarginStats.getMean() - sourceImage.getAverageShapeWidth(); List<RowOfShapes> rowsToRemove = new ArrayList<RowOfShapes>(); for (RowOfShapes row : sourceImage.getRows()) { double left = row.getLeft() - row.getXAdjustment(); LOG.trace(row.toString()); LOG.trace("Adjusted left: " + left); if (left <= leftMarginLimit) { LOG.trace("Has out-of-margin stuff!"); // need to chop off groups to the left of this threshold List<GroupOfShapes> groupsToChop = new ArrayList<GroupOfShapes>(); for (GroupOfShapes group : row.getGroups()) { if (group.getRight() - row.getXAdjustment() < leftMarginLimit) { groupsToChop.add(group); LOG.debug("Chopping group outside of left margin: " + group); } } for 
(GroupOfShapes group : groupsToChop) { row.getShapes().removeAll(group.getShapes()); } row.getGroups().removeAll(groupsToChop); if (row.getGroups().size() == 0) { LOG.debug("Removing empty " + row); rowsToRemove.add(row); } else { row.recalculate(); row.assignGuideLines(); } } // does this row extend beyond the margin? } // next row sourceImage.getRows().removeAll(rowsToRemove); } // have a left margin }
From source file:com.joliciel.jochre.graphics.SegmenterImpl.java
/**
 * Detects paragraph splits and assigns each row of the page to the correct paragraph.
 * <p>
 * The method works in successive passes over {@code sourceImage.getRows()}:
 * <ol>
 * <li>Find horizontal "white rows": vertical gaps not covered by any row (top to bottom,
 * inclusive) that are taller than 1.3 times the median row x-height. A gap running to the
 * bottom of the image is kept whatever its height, and a final entry positioned at the
 * image height closes the list.</li>
 * <li>Split the rows into "areas" delimited by those white rows.</li>
 * <li>Within each area, visit the rows in {@code RowOfShapesVerticalLocationComparator} order
 * and build columns: a row joins a column when it horizontally overlaps (using slope-adjusted
 * coordinates) the last row of exactly one open column; otherwise every overlapped column is
 * closed and the row starts a new column.</li>
 * <li>Link columns of consecutive areas into column groups when they overlap one-to-one, so
 * that margin statistics can be shared between them.</li>
 * <li>For each column group, run DBSCAN on the adjusted right and adjusted left coordinates
 * of its rows (epsilon = half the average shape width, minimum cardinality 5) to derive the
 * margin and tab positions. When no margin cluster is found, the margin falls back on a
 * matching column separator, or on the image width (right) / 0 (left). The results are stored
 * on each column as {@code startMargin}, {@code startTab}, {@code endMargin} and
 * {@code hasTab}, with left and right swapped according to {@code sourceImage.isLeftToRight()}.</li>
 * <li>Count indents and outdents that follow a partial row, and consider the page indented
 * when {@code indentCount + 2 >= outdentCount}.</li>
 * <li>Sort the columns through a {@code TreeSet}, then walk each column and open a new
 * paragraph on the first row, after a previous row that stops short of the end margin, or on
 * an indent/outdent (depending on the decision above). Rows with at most two shapes are set
 * aside and each becomes its own standalone paragraph.</li>
 * </ol>
 * When {@code drawSegmentation} is set, the margin and tab thresholds of each row are also
 * drawn on {@code graphics2D}.
 * <p>
 * NOTE(review): the local {@code hasTab} is never assigned after its initialisation, so the
 * "hasTab: " debug line always logs false - confirm whether it should reflect column.hasTab.<br>
 * NOTE(review): {@code lastRowInMyColumn} is read with {@code myColumn.get(0)}, i.e. the first
 * element of the column, whereas every other "last row" lookup uses {@code size() - 1} -
 * confirm which one is intended.<br>
 * NOTE(review): in the drawing code the end-margin segment is drawn twice (blue, then red).<br>
 * NOTE(review): the "previous EOP" test combines the previous row's coordinates with the
 * overlap computed for the current row - confirm this is intended.
 *
 * @param sourceImage the image whose rows are grouped; paragraphs are created through
 *                    {@code sourceImage.newParagraph()}
 */
void groupRowsIntoParagraphs(SourceImage sourceImage) { LOG.debug("########## groupRowsIntoParagraphs #########"); // We'll use various possible indicators, including // indented start, indented end, and spacing between rows. // On pages with a single big paragraph makes it hypersensitive to differences in row-start/row-end // This means we cannot use deviation. Instead, we use the average shape width on the page. // We also adjust maxLeft & minRight to match the vertical line slope // This is now complicated by the possibility of multiple columns // Need to take into account a big horizontal space - Pietrushka page 14 // Find horizontal spaces that go all the way across and are wider than a certain threshold // simply do a boolean column and black out everything in a row, than see if there are any remaining spaces above a certain threshold // Columns are thus arranged into "areas", separated by white-space. 
boolean[] fullRows = new boolean[sourceImage.getHeight()]; for (RowOfShapes row : sourceImage.getRows()) { for (int y = row.getTop(); y <= row.getBottom(); y++) { fullRows[y] = true; } } DescriptiveStatistics rowHeightStats = new DescriptiveStatistics(); for (RowOfShapes row : sourceImage.getRows()) { int height = row.getXHeight(); rowHeightStats.addValue(height); } double avgRowHeight = rowHeightStats.getPercentile(50); LOG.debug("meanRowHeight: " + avgRowHeight); double minHeightForWhiteSpace = avgRowHeight * 1.3; LOG.debug("minHeightForWhiteSpace: " + minHeightForWhiteSpace); // find the "white rows" - any horizontal white space // in the page which is sufficiently high List<int[]> whiteRows = new ArrayList<int[]>(); boolean inWhite = false; int startWhite = 0; for (int y = 0; y < sourceImage.getHeight(); y++) { if (!inWhite && !fullRows[y]) { inWhite = true; startWhite = y; } else if (inWhite && fullRows[y]) { int length = y - startWhite; if (length > minHeightForWhiteSpace) { LOG.debug("Adding whiteRow " + startWhite + "," + (y - 1)); whiteRows.add(new int[] { startWhite, y - 1 }); } inWhite = false; } } if (inWhite) whiteRows.add(new int[] { startWhite, sourceImage.getHeight() - 1 }); whiteRows.add(new int[] { sourceImage.getHeight(), sourceImage.getHeight() }); // place rows in "areas" defined by the "white rows" found above List<List<RowOfShapes>> areas = new ArrayList<List<RowOfShapes>>(); int startY = -1; for (int[] whiteRow : whiteRows) { List<RowOfShapes> area = new ArrayList<RowOfShapes>(); for (RowOfShapes row : sourceImage.getRows()) { if (row.getTop() >= startY && row.getBottom() <= whiteRow[0]) { area.add(row); } } if (area.size() > 0) { areas.add(area); } startY = whiteRow[1]; } // break up each area into vertical columns LOG.debug("break up each area into vertical columns"); List<Column> columns = new ArrayList<Column>(); List<List<Column>> columnsPerAreaList = new ArrayList<List<Column>>(); for (List<RowOfShapes> area : areas) { LOG.debug("Next 
area"); List<Column> columnsPerArea = new ArrayList<SegmenterImpl.Column>(); columnsPerAreaList.add(columnsPerArea); TreeSet<RowOfShapes> rows = new TreeSet<RowOfShapes>(new RowOfShapesVerticalLocationComparator()); rows.addAll(area); for (RowOfShapes row : rows) { // try to place this row in one of the columns directly above it. // this means that a row which overlaps more than one column has to "close" this column, so it is no longer considered List<Column> overlappingColumns = new ArrayList<Column>(); for (Column column : columnsPerArea) { if (!column.closed) { RowOfShapes lastRowInColumn = column.get(column.size() - 1); if (row.getRight() - row.getXAdjustment() >= lastRowInColumn.getLeft() - lastRowInColumn.getXAdjustment() && row.getLeft() - row.getXAdjustment() <= lastRowInColumn.getRight() - lastRowInColumn.getXAdjustment()) { overlappingColumns.add(column); } } } if (overlappingColumns.size() == 1) { Column myColumn = overlappingColumns.get(0); RowOfShapes lastRowInMyColumn = myColumn.get(0); // close any columns that are now at a distance of more than one row for (Column column : columnsPerArea) { if (!column.closed && !column.equals(myColumn)) { RowOfShapes lastRowInColumn = column.get(column.size() - 1); if (lastRowInMyColumn.getTop() > lastRowInColumn.getBottom()) { column.closed = true; LOG.debug("Closing distant column " + lastRowInColumn); } } } myColumn.add(row); LOG.debug(row.toString()); LOG.debug(" added to column " + lastRowInMyColumn); } else { for (Column overlappingColumn : overlappingColumns) { overlappingColumn.closed = true; RowOfShapes lastRowInColumn = overlappingColumn.get(overlappingColumn.size() - 1); LOG.debug("Closing overlapping column " + lastRowInColumn); } Column myColumn = new Column(sourceImage); myColumn.add(row); LOG.debug("Found new column"); LOG.debug(row.toString()); columns.add(myColumn); columnsPerArea.add(myColumn); } } } // next area for (Column column : columns) column.recalculate(); // Intermediate step to reform 
the vertical columns, if they exist // basically the idea is that if the columns are aligned vertically, then the thresholds for paragraph indents // should be shared, to increase the statistical sample size and reduce anomalies. // We'll assume that two columns from two consecutive areas are in the same vertical group if they overlap with each other horizontally // and don't overlap with any other column in the other column's area. List<List<Column>> columnGroups = new ArrayList<List<Column>>(); List<Column> columnsInPrevArea = null; for (List<Column> columnsPerArea : columnsPerAreaList) { if (columnsInPrevArea != null) { for (Column prevColumn : columnsInPrevArea) { LOG.debug("Checking " + prevColumn); // find the column group containing the previous column List<Column> myColumnGroup = null; for (List<Column> columnGroup : columnGroups) { if (columnGroup.contains(prevColumn)) { myColumnGroup = columnGroup; break; } } if (myColumnGroup == null) { myColumnGroup = new ArrayList<SegmenterImpl.Column>(); LOG.debug("Creating column group for column " + prevColumn.toString()); columnGroups.add(myColumnGroup); myColumnGroup.add(prevColumn); } // does only one column overlap with this one? Column overlappingColumn = null; for (Column column : columnsPerArea) { if (column.adjustedRight >= prevColumn.adjustedLeft && column.adjustedLeft <= prevColumn.adjustedRight) { if (overlappingColumn == null) { LOG.debug("I overlap with " + column); overlappingColumn = column; } else { LOG.debug("But I overlap also with " + column); overlappingColumn = null; break; } } } if (overlappingColumn != null) { // does it overlap with only me? 
for (Column otherPrevColumn : columnsInPrevArea) { if (otherPrevColumn.equals(prevColumn)) continue; if (overlappingColumn.adjustedRight >= otherPrevColumn.adjustedLeft && overlappingColumn.adjustedLeft <= otherPrevColumn.adjustedRight) { LOG.debug("But it overlaps also with " + otherPrevColumn); overlappingColumn = null; break; } } } if (overlappingColumn != null) { myColumnGroup.add(overlappingColumn); LOG.debug("Adding " + overlappingColumn); LOG.debug(" to group with " + prevColumn); } } // next previous column } // have previous columns columnsInPrevArea = columnsPerArea; } // next area if (columnsInPrevArea != null) { for (Column prevColumn : columnsInPrevArea) { // find the column group containing the previous column List<Column> myColumnGroup = null; for (List<Column> columnGroup : columnGroups) { if (columnGroup.contains(prevColumn)) { myColumnGroup = columnGroup; break; } } if (myColumnGroup == null) { myColumnGroup = new ArrayList<SegmenterImpl.Column>(); LOG.debug("Creating column group for column " + prevColumn.toString()); columnGroups.add(myColumnGroup); myColumnGroup.add(prevColumn); } } } // What we really want here is, for each column (in the case of right-to-left), // two clusters on the right // and one relatively big cluster on the left. // anything outside of the cluster on the left is an EOP. 
boolean hasTab = false; for (List<Column> columnGroup : columnGroups) { LOG.debug("Next column group"); double averageShapeWidth = sourceImage.getAverageShapeWidth(); LOG.debug("averageShapeWidth: " + averageShapeWidth); double epsilon = averageShapeWidth / 2.0; LOG.debug("epsilon: " + epsilon); int columnGroupTop = sourceImage.getHeight(); int columnGroupBottom = 0; int columnGroupLeft = sourceImage.getWidth(); int columnGroupRight = 0; for (Column column : columnGroup) { if (column.top < columnGroupTop) columnGroupTop = (int) Math.round(column.top); if (column.bottom > columnGroupBottom) columnGroupBottom = (int) Math.round(column.bottom); if (column.adjustedLeft < columnGroupLeft) columnGroupLeft = (int) Math.round(column.adjustedLeft); if (column.adjustedRight > columnGroupRight) columnGroupRight = (int) Math.round(column.adjustedRight); } // right thresholds LOG.debug("Calculating right thresholds"); // first, create a DBScan cluster of all rows by their adjusted right coordinate List<RowOfShapes> rightHandRows = new ArrayList<RowOfShapes>(); List<double[]> rightCoordinates = new ArrayList<double[]>(); for (Column column : columnGroup) { for (RowOfShapes row : column) { double right = row.getRight() - row.getXAdjustment(); // double rightOverlap = this.findLargeShapeOverlapOnRight(row, column, sourceImage); // if (rightOverlap==0) { // // leave out any right-overlapping rows here // // since we need accurate statistics for margin detection // // This is questionable - especially since a long vertical bar (see Petriushka) // // tends to give all rows a left overlap. Also, because the overlap is calculated based // // on the mean right & mean left, not based on any sort of margin clusters. 
// rightHandRows.add(row); // rightCoordinates.add(new double[] {right}); // } rightHandRows.add(row); rightCoordinates.add(new double[] { right }); } } int minCardinalityForRightMargin = 5; DBSCANClusterer<RowOfShapes> rightMarginClusterer = new DBSCANClusterer<RowOfShapes>(rightHandRows, rightCoordinates); Set<Set<RowOfShapes>> rowClusters = rightMarginClusterer.cluster(epsilon, minCardinalityForRightMargin, true); TreeSet<Set<RowOfShapes>> orderedRowClusters = new TreeSet<Set<RowOfShapes>>( new CardinalityComparator<RowOfShapes>()); orderedRowClusters.addAll(rowClusters); int i = 0; // find the two right-most clusters, and assume they are the margin & the tab DescriptiveStatistics rightMarginStats = null; DescriptiveStatistics rightTabStats = null; for (Set<RowOfShapes> cluster : orderedRowClusters) { DescriptiveStatistics rightStats = new DescriptiveStatistics(); MeanAbsoluteDeviation rightDev = new MeanAbsoluteDeviation(); for (RowOfShapes row : cluster) { int rowIndex = rightHandRows.indexOf(row); double right = rightCoordinates.get(rowIndex)[0]; rightStats.addValue(right); rightDev.increment(right); } LOG.debug("Cluster " + i + ". 
Cardinality=" + cluster.size()); LOG.debug("Right mean : " + rightStats.getMean()); LOG.debug("Right dev: " + rightDev.getResult()); if (cluster.size() >= minCardinalityForRightMargin) { if (rightMarginStats == null || rightMarginStats.getMean() < rightStats.getMean()) { if (rightMarginStats != null) rightTabStats = rightMarginStats; rightMarginStats = rightStats; } else if (rightTabStats == null || rightTabStats.getMean() < rightStats.getMean()) { rightTabStats = rightStats; } } else { break; } i++; } // next right-coordinate cluster double rightMargin = sourceImage.getWidth(); double rightTab = sourceImage.getWidth(); if (rightMarginStats != null) { rightMargin = rightMarginStats.getMean(); } else { List<Rectangle> columnSeparators = sourceImage.findColumnSeparators(); for (Rectangle columnSeparator : columnSeparators) { if (columnSeparator.getTop() <= columnGroupTop && columnSeparator.getBottom() >= columnGroupBottom && columnSeparator.getLeft() >= columnGroupRight) { if (columnSeparator.getLeft() < rightMargin) rightMargin = columnSeparator.getLeft(); } } } if (rightTabStats != null) { rightTab = rightTabStats.getMean(); } LOG.debug("rightMargin: " + rightMargin); LOG.debug("rightTab: " + rightTab); // left thresholds LOG.debug("Calculating left thresholds"); // first, create a DBScan cluster of all rows by their adjusted left coordinate List<RowOfShapes> leftHandRows = new ArrayList<RowOfShapes>(); List<double[]> leftCoordinates = new ArrayList<double[]>(); for (Column column : columnGroup) { for (RowOfShapes row : column) { double left = row.getLeft() - row.getXAdjustment(); // double leftOverlap = this.findLargeShapeOverlapOnLeft(row, column, sourceImage); // if (leftOverlap == 0) { // // leave out any overlapping rows from margin calcs, // // since we need accurate statistics here // leftHandRows.add(row); // leftCoordinates.add(new double[] {left}); // } leftHandRows.add(row); leftCoordinates.add(new double[] { left }); } } int minCardinalityForLeftMargin 
= 5; DBSCANClusterer<RowOfShapes> leftMarginClusterer = new DBSCANClusterer<RowOfShapes>(leftHandRows, leftCoordinates); Set<Set<RowOfShapes>> leftRowClusters = leftMarginClusterer.cluster(epsilon, minCardinalityForLeftMargin, true); TreeSet<Set<RowOfShapes>> orderedLeftRowClusters = new TreeSet<Set<RowOfShapes>>( new CardinalityComparator<RowOfShapes>()); orderedLeftRowClusters.addAll(leftRowClusters); i = 0; // find the two left-most clusters, and assume they are the margin & the tab DescriptiveStatistics leftMarginStats = null; DescriptiveStatistics leftTabStats = null; for (Set<RowOfShapes> cluster : orderedLeftRowClusters) { DescriptiveStatistics leftStats = new DescriptiveStatistics(); MeanAbsoluteDeviation leftDev = new MeanAbsoluteDeviation(); for (RowOfShapes row : cluster) { int rowIndex = leftHandRows.indexOf(row); double left = leftCoordinates.get(rowIndex)[0]; leftStats.addValue(left); leftDev.increment(left); } LOG.debug("Cluster " + i + ". Cardinality=" + cluster.size()); LOG.debug("Left mean : " + leftStats.getMean()); LOG.debug("Left dev: " + leftDev.getResult()); if (cluster.size() >= minCardinalityForLeftMargin) { if (leftMarginStats == null || leftMarginStats.getMean() > leftStats.getMean()) { if (leftMarginStats != null) leftTabStats = leftMarginStats; leftMarginStats = leftStats; } else if (leftTabStats == null || leftTabStats.getMean() > leftStats.getMean()) { leftTabStats = leftStats; } } else { break; } i++; } // next left-coordinate cluster double leftMargin = 0; double leftTab = 0; if (leftMarginStats != null) { leftMargin = leftMarginStats.getMean(); } else { List<Rectangle> columnSeparators = sourceImage.findColumnSeparators(); for (Rectangle columnSeparator : columnSeparators) { if (columnSeparator.getTop() <= columnGroupTop && columnSeparator.getBottom() >= columnGroupBottom && columnSeparator.getRight() <= columnGroupLeft) { if (columnSeparator.getRight() > leftMargin) leftMargin = columnSeparator.getRight(); } } } if (leftTabStats 
!= null) { leftTab = leftTabStats.getMean(); } LOG.debug("leftMargin: " + leftMargin); LOG.debug("leftTab: " + leftTab); for (Column column : columnGroup) { if (sourceImage.isLeftToRight()) { column.startMargin = leftMargin; if (leftTabStats != null) { column.startTab = leftTab; column.hasTab = true; } else { LOG.debug("No left tab - setting based on left margin"); column.startTab = leftMargin + (5.0 * sourceImage.getAverageShapeWidth()); column.hasTab = false; } column.endMargin = rightMargin; } else { column.startMargin = rightMargin; if (rightTabStats != null) { column.startTab = rightTab; column.hasTab = true; } else { LOG.debug("No right tab - setting based on right margin"); column.startTab = rightMargin - (5.0 * sourceImage.getAverageShapeWidth()); column.hasTab = false; } column.endMargin = leftMargin; } LOG.debug("Margins for " + column); LOG.debug("startMargin: " + column.startMargin); LOG.debug("startTab: " + column.startTab); LOG.debug("endMargin: " + column.endMargin); } // next column } // next column group LOG.debug("hasTab: " + hasTab); double safetyMargin = 1.5 * sourceImage.getAverageShapeWidth(); // Now, paragraphs are either "indented", "outdented" or not "dented" at all (no tabs). // This applies to the entire page. // To recognise indenting vs. outdenting, we have to see if the row preceding each // indent/outdent is full or partial. In the case of indentation, partial rows will // typically be followed by an indent. In the case of outdentation, partial rows will // typically be followed by an outdent. 
boolean isIndented = true; int indentCount = 0; int outdentCount = 0; for (List<Column> columnGroup : columnGroups) { LOG.debug("Next column group"); boolean prevRowPartial = false; for (Column column : columnGroup) { if (column.hasTab) { for (RowOfShapes row : column) { if (sourceImage.isLeftToRight()) { if (prevRowPartial) { if (row.getLeft() - row.getXAdjustment() > column.startTab - safetyMargin) { indentCount++; } else if (row.getLeft() - row.getXAdjustment() < column.startMargin + safetyMargin) { outdentCount++; } } if (row.getRight() - row.getXAdjustment() < column.endMargin - safetyMargin) { prevRowPartial = true; } else { prevRowPartial = false; } } else { if (prevRowPartial) { if (row.getRight() - row.getXAdjustment() < column.startTab + safetyMargin) { indentCount++; } else if (row.getRight() - row.getXAdjustment() > column.startMargin - safetyMargin) { outdentCount++; } } if (row.getLeft() - row.getXAdjustment() > column.endMargin + safetyMargin) { prevRowPartial = true; } else { prevRowPartial = false; } } // left-to-right? 
} // next row } // column has tab } // next column } // next column group isIndented = (indentCount + 2 >= outdentCount); LOG.debug("indentCount: " + indentCount); LOG.debug("outdentCount: " + outdentCount); LOG.debug("isIndented: " + isIndented); // order the columns TreeSet<Column> orderedColumns = new TreeSet<SegmenterImpl.Column>(columns); columns.clear(); columns.addAll(orderedColumns); // find the paragraphs found in each column for (Column column : columns) { LOG.debug("--- Next column ---"); // break up the column into paragraphs Paragraph paragraph = null; RowOfShapes previousRow = null; int maxShapesForStandaloneParagraph = 2; List<RowOfShapes> rowsForStandaloneParagraphs = new ArrayList<RowOfShapes>(); Point2D previousPointStartMargin = null; Point2D previousPointStartTab = null; Point2D previousPointEndMargin = null; for (RowOfShapes row : column) { boolean rowForStandaloneParagraph = false; boolean newParagraph = false; if (row.getShapes().size() <= maxShapesForStandaloneParagraph) { rowsForStandaloneParagraphs.add(row); rowForStandaloneParagraph = true; } else { double rightOverlap = this.findLargeShapeOverlapOnRight(row, column, sourceImage); double leftOverlap = this.findLargeShapeOverlapOnLeft(row, column, sourceImage); if (drawSegmentation) { double rowVerticalMidPoint = row.getBaseLineMiddlePoint(); double startMarginX = column.startMargin + row.getXAdjustment(); double startTabX = column.startTab + row.getXAdjustment(); double endMarginX = column.endMargin + row.getXAdjustment(); if (sourceImage.isLeftToRight()) { startMarginX += safetyMargin; startTabX -= safetyMargin; endMarginX -= safetyMargin; startMarginX += leftOverlap; startTabX += leftOverlap; endMarginX -= rightOverlap; } else { startMarginX -= safetyMargin; startTabX += safetyMargin; endMarginX += safetyMargin; startMarginX -= rightOverlap; startTabX -= rightOverlap; endMarginX += leftOverlap; } Point2D.Double currentPointStartMargin = new Point2D.Double(startMarginX, 
rowVerticalMidPoint); Point2D.Double currentPointStartTab = new Point2D.Double(startTabX, rowVerticalMidPoint); Point2D.Double currentPointEndMargin = new Point2D.Double(endMarginX, rowVerticalMidPoint); if (previousPointStartMargin != null) { graphics2D.setStroke(new BasicStroke(1)); graphics2D.setPaint(Color.BLUE); graphics2D.drawLine((int) Math.round(previousPointStartMargin.getX()), (int) Math.round(previousPointStartMargin.getY()), (int) Math.round(currentPointStartMargin.getX()), (int) Math.round(currentPointStartMargin.getY())); graphics2D.drawLine((int) Math.round(previousPointEndMargin.getX()), (int) Math.round(previousPointEndMargin.getY()), (int) Math.round(currentPointEndMargin.getX()), (int) Math.round(currentPointEndMargin.getY())); graphics2D.setPaint(Color.RED); graphics2D.drawLine((int) Math.round(previousPointStartTab.getX()), (int) Math.round(previousPointStartTab.getY()), (int) Math.round(currentPointStartTab.getX()), (int) Math.round(currentPointStartTab.getY())); graphics2D.setPaint(Color.RED); graphics2D.drawLine((int) Math.round(previousPointEndMargin.getX()), (int) Math.round(previousPointEndMargin.getY()), (int) Math.round(currentPointEndMargin.getX()), (int) Math.round(currentPointEndMargin.getY())); } previousPointStartMargin = currentPointStartMargin; previousPointStartTab = currentPointStartTab; previousPointEndMargin = currentPointEndMargin; } if (previousRow == null) { LOG.debug("New paragraph (first)"); newParagraph = true; } else { if (sourceImage.isLeftToRight()) { if (previousRow.getRight() - previousRow.getXAdjustment() - rightOverlap < column.endMargin - safetyMargin) { LOG.debug("New paragraph (previous EOP)"); newParagraph = true; } else if (column.hasTab && isIndented && row.getLeft() - row.getXAdjustment() + leftOverlap > column.startTab - safetyMargin) { LOG.debug("New paragraph (indent)"); newParagraph = true; } else if (column.hasTab && !isIndented && row.getLeft() - row.getXAdjustment() + leftOverlap < 
column.startMargin + safetyMargin) { LOG.debug("New paragraph (outdent)"); newParagraph = true; } } else { if (previousRow.getLeft() - previousRow.getXAdjustment() + leftOverlap > column.endMargin + safetyMargin) { LOG.debug("New paragraph (previous EOP)"); newParagraph = true; } else if (column.hasTab && isIndented && row.getRight() - row.getXAdjustment() - rightOverlap < column.startTab + safetyMargin) { LOG.debug("New paragraph (indent)"); newParagraph = true; } else if (column.hasTab && !isIndented && row.getRight() - row.getXAdjustment() - rightOverlap > column.startMargin - safetyMargin) { LOG.debug("New paragraph (outdent)"); newParagraph = true; } } // left-to-right? } // have previous row } // standalone paragraph? if (!rowForStandaloneParagraph) LOG.debug(row.toString()); if (newParagraph) { if (rowsForStandaloneParagraphs.size() > 0) { for (RowOfShapes oneRow : rowsForStandaloneParagraphs) { LOG.debug("Standalone paragraph"); LOG.debug("Standalone row: left(" + oneRow.getLeft() + "), top(" + oneRow.getTop() + "), right(" + oneRow.getRight() + "), bottom(" + oneRow.getBottom() + ")"); Paragraph standaloneParagraph = sourceImage.newParagraph(); standaloneParagraph.getRows().add(oneRow); } rowsForStandaloneParagraphs.clear(); } paragraph = sourceImage.newParagraph(); } //LOG.debug("Row: left(" + row.getLeft() + "), right(" + row.getRight() + "), width(" + (row.getRight() - row.getLeft() + 1) + ")"); if (!rowForStandaloneParagraph) { paragraph.getRows().add(row); previousRow = row; } } // next row in column if (rowsForStandaloneParagraphs.size() > 0) { for (RowOfShapes oneRow : rowsForStandaloneParagraphs) { LOG.debug("Standalone paragraph"); LOG.debug("Standalone row: left(" + oneRow.getLeft() + "), top(" + oneRow.getTop() + "), right(" + oneRow.getRight() + "), bottom(" + oneRow.getBottom() + ")"); Paragraph standaloneParagraph = sourceImage.newParagraph(); standaloneParagraph.getRows().add(oneRow); } rowsForStandaloneParagraphs.clear(); } } // next column 
}
From source file:org.apache.eagle.service.jpm.suggestion.AbstractGCFunc.java
/**
 * Computes the ratio between the mean GC time and the mean CPU time of the given tasks.
 *
 * @param tasks the task executions whose {@code GC_MILLISECONDS} and
 *              {@code CPU_MILLISECONDS} counters are read
 * @return {@code 0} when {@code tasks} is empty; otherwise mean GC milliseconds divided by
 *         mean CPU milliseconds, where a CPU mean of exactly zero is replaced by 1
 */
private double getGcRatio(List<TaskExecutionAPIEntity> tasks) {
    if (tasks.isEmpty()) {
        return 0;
    }

    double[] gcMillis = ResourceUtils.getCounterValues(tasks, JobCounters.CounterName.GC_MILLISECONDS);
    double[] cpuMillis = ResourceUtils.getCounterValues(tasks, JobCounters.CounterName.CPU_MILLISECONDS);

    // The statistics object is only used as a provider of the mean implementation;
    // no values are ever added to it.
    DescriptiveStatistics stats = new DescriptiveStatistics();
    double meanCpu = stats.getMeanImpl().evaluate(cpuMillis);
    double meanGc = stats.getMeanImpl().evaluate(gcMillis);

    // Substitute 1 for a zero CPU mean so that the division below stays defined.
    double divisor = (meanCpu == 0) ? 1 : meanCpu;
    return meanGc / divisor;
}
From source file:org.apache.eagle.service.jpm.suggestion.AbstractInputFunc.java
@Override public JobSuggestionResponse apply(TaskGroupResponse data) { MRTaskExecutionResponse.TaskGroup taskGroup = getTasks(data); double[] smallerGroup = ResourceUtils.getCounterValues(taskGroup.shortTasks, counterName); double[] largerGroup = ResourceUtils.getCounterValues(taskGroup.longTasks, counterName); DescriptiveStatistics statistics = new DescriptiveStatistics(); double avgSmaller = statistics.getMeanImpl().evaluate(smallerGroup); double avgLarger = statistics.getMeanImpl().evaluate(largerGroup); List<MRTaskExecutionResponse.SuggestionResult> suggestionResults = getDeviationSuggest(avgSmaller, avgLarger);//from w w w . j a va 2s.co m MRTaskExecutionResponse.JobSuggestionResponse response = new MRTaskExecutionResponse.JobSuggestionResponse(); response.suggestionResults = suggestionResults; response.suggestionType = suggestType.toString(); return response; }
From source file:org.apache.jackrabbit.performance.AbstractPerformanceTest.java
private DescriptiveStatistics runTest(AbstractTest test, Repository repository) throws Exception { DescriptiveStatistics statistics = new DescriptiveStatistics(); test.setUp(repository, credentials); try {// w w w. jav a2s . co m // Run a few iterations to warm up the system long warmupEnd = System.currentTimeMillis() + warmup * 1000; while (System.currentTimeMillis() < warmupEnd) { test.execute(); } // Run test iterations, and capture the execution times long runtimeEnd = System.currentTimeMillis() + runtime * 1000; while (System.currentTimeMillis() < runtimeEnd) { statistics.addValue(test.execute()); } } finally { test.tearDown(); } return statistics; }
From source file:org.apache.sling.junit.performance.listener.StatisticsListener.java
/**
 * Starts a fresh statistics collection for the execution that is about to begin.
 *
 * @param className the name of the test class (not used here)
 * @param testName  the name of the test (not used here)
 * @throws Exception declared by the overridden method; nothing is thrown by this body
 */
@Override
public void executionStarted(String className, String testName) throws Exception {
    // Replace any previously collected values with an empty statistics object.
    statistics = new DescriptiveStatistics();
}