Usage examples for org.apache.commons.math.stat.descriptive.moment.StandardDeviation.increment(double)
@Override public void increment(final double d)
From source file:com.joliciel.jochre.graphics.SourceImageImpl.java
@Override public Set<Set<RowOfShapes>> getRowClusters() { if (rowClusters == null) { Mean heightMean = new Mean(); StandardDeviation heightStdDev = new StandardDeviation(); List<double[]> rowHeights = new ArrayList<double[]>(this.getRows().size()); for (RowOfShapes row : this.getRows()) { Shape shape = row.getShapes().iterator().next(); int height = shape.getBaseLine() - shape.getMeanLine(); rowHeights.add(new double[] { height }); heightMean.increment(height); heightStdDev.increment(height); }//w w w . java 2 s. co m double stdDevHeight = heightStdDev.getResult(); List<RowOfShapes> rows = new ArrayList<RowOfShapes>(this.getRows()); DBSCANClusterer<RowOfShapes> clusterer = new DBSCANClusterer<RowOfShapes>(rows, rowHeights); rowClusters = clusterer.cluster(stdDevHeight, 2, true); LOG.debug("Found " + rowClusters.size() + " row clusters."); } return rowClusters; }
From source file:com.joliciel.jochre.graphics.SourceImageImpl.java
@Override public double getMeanHorizontalSlope() { if (!meanHorizontalSlopeCalculated) { // Calculate the average regression to be used for analysis Mean meanForSlope = new Mean(); StandardDeviation stdDevForSlope = new StandardDeviation(); List<SimpleRegression> regressions = new ArrayList<SimpleRegression>(); for (RowOfShapes row : this.getRows()) { SimpleRegression regression = row.getRegression(); // only include rows for which regression was really calculated (more than 2 points) if (regression.getN() > 2) { meanForSlope.increment(regression.getSlope()); stdDevForSlope.increment(regression.getSlope()); regressions.add(regression); }// w ww . j av a2 s . c o m } double slopeMean = 0.0; double slopeStdDev = 0.0; if (meanForSlope.getN() > 0) { slopeMean = meanForSlope.getResult(); slopeStdDev = stdDevForSlope.getResult(); } LOG.debug("slopeMean: " + slopeMean); LOG.debug("slopeStdDev: " + slopeStdDev); if (regressions.size() > 0) { double minSlope = slopeMean - slopeStdDev; double maxSlope = slopeMean + slopeStdDev; meanForSlope = new Mean(); for (SimpleRegression regression : regressions) { if (minSlope <= regression.getSlope() && regression.getSlope() <= maxSlope) meanForSlope.increment(regression.getSlope()); } meanHorizontalSlope = meanForSlope.getResult(); } else { meanHorizontalSlope = 0.0; } LOG.debug("meanHorizontalSlope: " + meanHorizontalSlope); meanHorizontalSlopeCalculated = true; } return meanHorizontalSlope; }
From source file:com.joliciel.jochre.graphics.RowOfShapesImpl.java
/** * Assign guidelines for a certain subset of shapes, and return the x-height. * @param startShape/*from w w w. j a v a 2 s . co m*/ * @param endShape * @return */ int assignGuideLines(List<GroupOfShapes> groupsToAssign) { LOG.debug("assignGuideLines internal"); double meanHorizontalSlope = this.getContainer().getMeanHorizontalSlope(); // the base-line and mean-line will be at a fixed distance away from the midpoint // the question is, which distance! // To find this out, we count number of black pixels on each row above this line // And then start analysing from the top and the bottom until the number drops off sharply // The notion of "groupsToAssign" is used to only assign guidelines // to a subset of the groups on the line // when the line contains two different font sizes List<Shape> shapes = new ArrayList<Shape>(); if (groupsToAssign != null) { for (GroupOfShapes group : groupsToAssign) { shapes.addAll(group.getShapes()); } } else { shapes = this.getShapes(); } int i = 0; DescriptiveStatistics shapeWidthStats = new DescriptiveStatistics(); DescriptiveStatistics shapeHeightStats = new DescriptiveStatistics(); for (Shape shape : this.getShapes()) { shapeWidthStats.addValue(shape.getWidth()); shapeHeightStats.addValue(shape.getHeight()); } double minWidth = shapeWidthStats.getPercentile(25); double maxWidth = shapeWidthStats.getPercentile(75); double minHeight = shapeHeightStats.getPercentile(45); double maxHeight = shapeHeightStats.getPercentile(75); double rowMidPointX = (double) (this.getLeft() + this.getRight()) / 2.0; // calculating the Y midpoint by the shapes in the row, instead of by the top & bottom of row Mean rowMidPointYMean = new Mean(); for (Shape shape : this.getShapes()) { // only add points whose shape is of "average" width and height (to leave out commas, etc.) 
if (shape.getWidth() >= minWidth && shape.getWidth() <= maxWidth && shape.getHeight() >= minHeight && shape.getHeight() <= maxHeight) { rowMidPointYMean.increment((double) (shape.getBottom() + shape.getTop()) / 2.0); } } double rowMidPointY = (double) (this.getTop() + this.getBottom()) / 2.0; if (rowMidPointYMean.getN() > 0) rowMidPointY = rowMidPointYMean.getResult(); LOG.debug("rowMidPointX: " + rowMidPointX); LOG.debug("rowMidPointY: " + rowMidPointY); // figure out where the top-most shape starts and the bottom-most shape ends, relative to the y midline int minTop = Integer.MAX_VALUE; int maxBottom = Integer.MIN_VALUE; List<Integer> rowYMidPoints = new ArrayList<Integer>(shapes.size()); for (Shape shape : shapes) { double shapeMidPointX = (double) (shape.getLeft() + shape.getRight()) / 2.0; int shapeMidPointY = (int) Math .round(rowMidPointY + (meanHorizontalSlope * (shapeMidPointX - rowMidPointX))); rowYMidPoints.add(shapeMidPointY); int relativeTop = shape.getTop() - shapeMidPointY; int relativeBottom = shape.getBottom() - shapeMidPointY; if (relativeTop < minTop) minTop = relativeTop; if (relativeBottom > maxBottom) maxBottom = relativeBottom; } if (minTop > 0) minTop = 0; if (maxBottom < 0) maxBottom = 0; int yIntervalTop = 0 - minTop; int yIntervalBottom = maxBottom; int yInterval = yIntervalTop + 1 + yIntervalBottom; LOG.debug("yIntervalTop: " + yIntervalTop); LOG.debug("yIntervalBottom: " + yIntervalBottom); LOG.debug("yInterval: " + yInterval); int[] pixelCounts = new int[yInterval]; // Get the pixel count for each row // examining one shape at a time to limit ourselves to the pixels that are // actually considered to be in this row int blackThreshold = this.getContainer().getSeparationThreshold(); int shapeIndex = 0; int shapeCount = 0; for (Shape shape : shapes) { if (shape.getHeight() >= minHeight) { LOG.trace(shape.toString()); shapeCount++; int shapeMidPointY = rowYMidPoints.get(shapeIndex); int zeroLine = shapeMidPointY - yIntervalTop; int 
topIndex = shape.getTop() - zeroLine; for (int x = 0; x < shape.getWidth(); x++) { for (int y = 0; y < shape.getHeight(); y++) { int yIndex = topIndex + y; if (yIndex >= 0 && yIndex < pixelCounts.length && shape.isPixelBlack(x, y, blackThreshold)) { pixelCounts[yIndex]++; } } } } shapeIndex++; } LOG.debug("Got pixels from " + shapeCount + " shapes."); boolean notEnoughShapes = shapeCount < 3; LOG.debug("notEnoughShapes? " + notEnoughShapes); // We start at the top // As soon as we reach a line with more pixels than the mean, we assume this is the mean-line Mean pixelCountMeanTop = new Mean(); StandardDeviation pixelCountStdDevTop = new StandardDeviation(); for (i = 0; i <= yIntervalTop; i++) { pixelCountMeanTop.increment(pixelCounts[i]); pixelCountStdDevTop.increment(pixelCounts[i]); } LOG.debug("Top: pixel count mean: " + pixelCountMeanTop.getResult() + ", std dev: " + pixelCountStdDevTop.getResult()); double threshold = pixelCountMeanTop.getResult() * 1.1; if (notEnoughShapes) { threshold = threshold / 2.0; } double lowerThreshold = threshold / 2.0; LOG.debug("Top threshold: " + threshold); LOG.debug("Top lowerThreshold: " + lowerThreshold); int meanLine = 0; boolean findMeanLine = true; for (i = 0; i <= yIntervalTop; i++) { int pixelCount = pixelCounts[i]; if (findMeanLine && pixelCount > threshold) { meanLine = i; findMeanLine = false; } else if (!findMeanLine && pixelCount < lowerThreshold) { findMeanLine = true; } } // We start at the bottom // As soon as we reach a line with more pixels than the mean, we assume this is the base-line Mean pixelCountMeanBottom = new Mean(); StandardDeviation pixelCountStdDevBottom = new StandardDeviation(); for (i = pixelCounts.length - 1; i >= yIntervalTop; i--) { pixelCountMeanBottom.increment(pixelCounts[i]); pixelCountStdDevBottom.increment(pixelCounts[i]); } LOG.debug("Bottom: pixel count mean: " + pixelCountMeanBottom.getResult() + ", std dev: " + pixelCountStdDevBottom.getResult()); threshold = 
pixelCountMeanBottom.getResult() * 1.1; if (notEnoughShapes) { threshold = threshold / 2.0; } lowerThreshold = threshold / 2.0; LOG.debug("Bottom threshold: " + threshold); LOG.debug("Bottom lowerThreshold: " + lowerThreshold); int baseLine = meanLine; boolean findBaseLine = true; for (i = pixelCounts.length - 1; i >= yIntervalTop; i--) { int pixelCount = pixelCounts[i]; if (findBaseLine && pixelCount > threshold) { baseLine = i; findBaseLine = false; } else if (!findBaseLine && pixelCount < lowerThreshold) { findBaseLine = true; } } for (i = 0; i < yInterval; i++) { int pixelCount = pixelCounts[i]; if (i == meanLine) LOG.trace("======= MEAN LINE " + i + " =========="); LOG.trace("pixel row " + i + ". pixel count " + pixelCount); if (i == baseLine) LOG.trace("======= BASE LINE " + i + " =========="); } // assign base lines and mean lines to each shape shapeIndex = 0; for (Shape shape : shapes) { int shapeMidPointY = rowYMidPoints.get(shapeIndex); int yMeanline = (shapeMidPointY - yIntervalTop) + meanLine; int yBaseline = (shapeMidPointY - yIntervalTop) + baseLine; LOG.trace(shape.toString() + ", meanLine: " + (yMeanline - shape.getTop()) + ", baseLine: " + (yBaseline - shape.getTop())); shape.setBaseLine(yBaseline - shape.getTop()); shape.setMeanLine(yMeanline - shape.getTop()); shapeIndex++; } // next shape int xHeight = baseLine - meanLine; return xHeight; }
From source file:com.joliciel.jochre.graphics.SegmenterImpl.java
/** * Split rows if they're particularly high, and contain considerable white space in the middle. * Shapes causing the join will be removed if too high, or attached to the closest row otherwise. * @param sourceImage//from w ww. j a va 2 s . co m * @param regressions * @return */ void splitRows(SourceImage sourceImage) { LOG.debug("########## splitRows #########"); // Calculate the min row height to be considered for splitting double minHeightForSplit = sourceImage.getAverageShapeHeight(); LOG.debug("minHeightForSplit: " + minHeightForSplit); double slopeMean = sourceImage.getMeanHorizontalSlope(); List<RowOfShapes> candidateRows = new ArrayList<RowOfShapes>(); for (RowOfShapes row : sourceImage.getRows()) { if (row.getRight() == row.getLeft()) continue; int height = row.getBottom() - row.getTop(); if (height >= minHeightForSplit) { LOG.debug("Adding candidate " + row.toString()); candidateRows.add(row); } } // For each row to be considered for splitting, see if there are lines of white space inside it. 
Hashtable<RowOfShapes, List<RowOfShapes>> splitRows = new Hashtable<RowOfShapes, List<RowOfShapes>>(); for (RowOfShapes row : candidateRows) { SimpleRegression regression = new SimpleRegression(); // y = intercept + slope * x LOG.debug("Left point: (" + row.getLeft() + " , " + row.getTop() + ")"); regression.addData(row.getLeft(), row.getTop()); double rightHandY = row.getTop() + ((double) (row.getRight() - row.getLeft()) * slopeMean); LOG.debug("Right point: (" + row.getRight() + " , " + rightHandY + ")"); regression.addData(row.getRight(), rightHandY); int yDelta = (int) Math.ceil(Math.abs(rightHandY - (double) row.getTop())); int yInterval = yDelta + (row.getBottom() - row.getTop() + 1) + yDelta; LOG.debug("yDelta: " + yDelta); LOG.debug("yInterval: " + yInterval); // let's get pixel counts shape by shape, and leave out the rest (in case rows overlap vertically) int[] pixelCounts = new int[yInterval]; for (Shape shape : row.getShapes()) { LOG.trace("Shape " + shape); int yDeltaAtLeft = (int) Math.round(regression.predict(shape.getLeft())); LOG.trace("yDeltaAtLeft: " + yDeltaAtLeft); // the shape offset + the offset between the regression line and the row top // + the delta we left at the start in case the line slopes upwards to the right int topIndex = (shape.getTop() - row.getTop()) + (row.getTop() - yDeltaAtLeft) + yDelta; LOG.trace("topIndex: (" + shape.getTop() + " - " + row.getTop() + ") + (" + row.getTop() + " - " + yDeltaAtLeft + ") + " + yDelta + " = " + topIndex); for (int x = 0; x < shape.getWidth(); x++) { for (int y = 0; y < shape.getHeight(); y++) { if (shape.isPixelBlack(x, y, sourceImage.getBlackThreshold())) { pixelCounts[topIndex + y]++; } } } } Mean pixelCountMean = new Mean(); StandardDeviation pixelCountStdDev = new StandardDeviation(); for (int i = 0; i < yInterval; i++) { LOG.debug("Pixel count " + i + ": " + pixelCounts[i]); pixelCountMean.increment(pixelCounts[i]); pixelCountStdDev.increment(pixelCounts[i]); } LOG.debug("pixel count mean: 
" + pixelCountMean.getResult() + ", std dev: " + pixelCountStdDev.getResult()); // If there's a split required, we're going to go considerably above and below the mean several times double lowThreshold = pixelCountMean.getResult() / 2.0; double highThreshold = pixelCountMean.getResult() * 2.0; boolean inRow = false; List<Integer> switches = new ArrayList<Integer>(); for (int i = 0; i < yInterval; i++) { if (!inRow && pixelCounts[i] > highThreshold) { LOG.debug("In row at " + i + ", pixel count " + pixelCounts[i]); inRow = true; switches.add(i); } else if (inRow && pixelCounts[i] < lowThreshold) { LOG.debug("Out of row at " + i + ", pixel count " + pixelCounts[i]); inRow = false; switches.add(i); } } if (switches.size() > 2) { // we have more than one row List<Integer> rowSeparations = new ArrayList<Integer>(); // find the row separators for (int switchIndex = 1; switchIndex < switches.size() - 2; switchIndex = switchIndex + 2) { int outOfRow = switches.get(switchIndex); int intoRow = switches.get(switchIndex + 1); int minPixelCount = (int) Math.ceil(highThreshold); int minIndex = -1; // find the row with the lowest pixel count for (int i = outOfRow; i <= intoRow; i++) { if (pixelCounts[i] < minPixelCount) { minPixelCount = pixelCounts[i]; minIndex = i; } } rowSeparations.add(minIndex); } // separate the shapes among the rows List<RowOfShapes> newRows = new ArrayList<RowOfShapes>(rowSeparations.size() + 1); for (int i = 0; i <= rowSeparations.size(); i++) { newRows.add(graphicsService.getEmptyRow(sourceImage)); } // add a separator at the beginning and end rowSeparations.add(0, 0); rowSeparations.add(yInterval + 1); for (Shape shape : row.getShapes()) { int yDeltaAtLeft = (int) Math.round(regression.predict(shape.getLeft())); int topIndex = (shape.getTop() - row.getTop()) + (row.getTop() - yDeltaAtLeft) + yDelta; int firstSepAfterShapeBottom = rowSeparations.size(); int lastSepBeforeShapeTop = -1; for (int i = rowSeparations.size() - 1; i >= 0; i--) { int 
rowSeparation = rowSeparations.get(i); if (rowSeparation <= topIndex) { lastSepBeforeShapeTop = i; break; } } for (int i = 0; i < rowSeparations.size(); i++) { int rowSeparation = rowSeparations.get(i); if (rowSeparation >= topIndex + shape.getHeight()) { firstSepAfterShapeBottom = i; break; } } if (lastSepBeforeShapeTop == firstSepAfterShapeBottom - 1) { // shape clearly belongs to one row RowOfShapes newRow = newRows.get(lastSepBeforeShapeTop); newRow.addShape(shape); } else { // is the shape much closer to one row than another? // if yes, add it to then add it to this row int[] yPixelsPerRow = new int[newRows.size()]; for (int i = 0; i < newRows.size(); i++) { int separatorTop = rowSeparations.get(i); int separatorBottom = rowSeparations.get(i + 1); int top = topIndex < separatorTop ? separatorTop : topIndex; int bottom = topIndex + shape.getHeight() < separatorBottom ? topIndex + shape.getHeight() : separatorBottom; yPixelsPerRow[i] = bottom - top; } int pixelsInMaxRow = 0; int maxPixelRowIndex = -1; for (int i = 0; i < newRows.size(); i++) { if (yPixelsPerRow[i] > pixelsInMaxRow) { pixelsInMaxRow = yPixelsPerRow[i]; maxPixelRowIndex = i; } } double minPercentage = 0.8; if (((double) pixelsInMaxRow / (double) shape.getHeight()) >= minPercentage) { RowOfShapes newRow = newRows.get(maxPixelRowIndex); newRow.addShape(shape); } else { // otherwise, the shape needs to be got rid of // as it's causing massive confusion // do this by simply not adding it anywhere } } // is the shape in one row exactly? } // next shape splitRows.put(row, newRows); } // do we have more than one row? } // next row for (RowOfShapes row : splitRows.keySet()) { List<RowOfShapes> newRows = splitRows.get(row); sourceImage.replaceRow(row, newRows); } }
From source file:com.joliciel.jochre.graphics.SegmenterImpl.java
void removeOversizedShapes(List<Shape> shapes) { LOG.debug("########## removeOversizedShapes #########"); Mean shapeHeightMean = new Mean(); Mean shapeWidthMean = new Mean(); for (Shape shape : shapes) { shapeHeightMean.increment(shape.getHeight()); shapeWidthMean.increment(shape.getWidth()); }/*w w w . j a va2s . c om*/ double heightMean = shapeHeightMean.getResult(); double widthMean = shapeWidthMean.getResult(); LOG.debug("heightMean: " + heightMean); LOG.debug("widthMean: " + widthMean); shapeHeightMean = new Mean(); shapeWidthMean = new Mean(); StandardDeviation shapeHeightStdDev = new StandardDeviation(); for (Shape shape : shapes) { if (shape.getHeight() > heightMean && shape.getHeight() < (heightMean * 2.0) && shape.getWidth() > widthMean && shape.getWidth() < (widthMean * 2.0)) { shapeHeightMean.increment(shape.getHeight()); shapeHeightStdDev.increment(shape.getHeight()); shapeWidthMean.increment(shape.getWidth()); } } heightMean = shapeHeightMean.getResult(); widthMean = shapeWidthMean.getResult(); LOG.debug("average shape heightMean: " + heightMean); LOG.debug("average shape widthMean: " + widthMean); double minHeightBigShape = heightMean * 6; double minWidthWideShape = widthMean * 6; double minHeightWideShape = heightMean * 1.5; double minHeightTallShape = heightMean * 2.5; double maxWidthTallShape = widthMean / 2; LOG.debug("minHeightBigShape: " + minHeightBigShape); LOG.debug("minWidthWideShape: " + minWidthWideShape); LOG.debug("minHeightWideShape: " + minHeightWideShape); LOG.debug("minHeightTallShape: " + minHeightTallShape); LOG.debug("maxWidthTallShape: " + maxWidthTallShape); List<Shape> largeShapes = new ArrayList<Shape>(); List<Shape> horizontalRules = new ArrayList<Shape>(); for (Shape shape : shapes) { if (shape.getHeight() > minHeightBigShape) { LOG.debug("Removing " + shape + " (height)"); largeShapes.add(shape); } else if (shape.getWidth() > minWidthWideShape && shape.getHeight() > minHeightWideShape) { // we don't want to remove 
horizontal bars, but we do want to remove other shapes. // why not? I suppose horizontal bars are easily represented as characters? LOG.debug("Removing " + shape + " (width)"); largeShapes.add(shape); } else if (shape.getWidth() > minWidthWideShape) { // ok, we will remove horizontal rules after all LOG.debug("Removing " + shape + " (horizontal rule)"); largeShapes.add(shape); horizontalRules.add(shape); } else if (shape.getWidth() <= maxWidthTallShape && shape.getHeight() > minHeightTallShape) { LOG.debug("Removing " + shape + " (narrow)"); largeShapes.add(shape); } } // Only want to remove enclosed shapes if the large shape isn't a frame/grid // A) first reduce the shape by 5 percent and see it's cardinality reduces vastly (in which case it's a frame) // if so, don't remove enclosed shapes // B) next, detect white rectangles within the shape - if they're big enough, don't remove enclosed shapes LOG.debug("Are large shapes frames or illustrations?"); double maxFrameCardinalityRatio = 0.5; double minFrameWhiteAreaSizeRatio = 0.9; List<Shape> illustrations = new ArrayList<Shape>(largeShapes); for (Shape largeShape : largeShapes) { LOG.debug(largeShape.toString()); int xOrigin = largeShape.getStartingPoint()[0] - largeShape.getLeft(); int yOrigin = largeShape.getStartingPoint()[1] - largeShape.getTop(); Shape dummyShape = graphicsService.getDot(sourceImage, xOrigin, yOrigin); // We want to fill up a mirror of the contiguous pixels within this shape, // which is what we'll use for further analysis to know // if it's a frame or not. 
WritableImageGrid mirror = graphicsService.getEmptyMirror(largeShape); this.findContiguousPixels(largeShape, mirror, dummyShape, xOrigin, yOrigin, sourceImage.getSeparationThreshold()); int adjustedLeft = (int) Math.round((double) mirror.getWidth() * 0.05); int adjustedRight = (int) Math.round((double) mirror.getWidth() * 0.95); int adjustedTop = (int) Math.round((double) mirror.getHeight() * 0.05); int adjustedBottom = (int) Math.round((double) mirror.getHeight() * 0.95); int cardinality = 0; int innerCardinality = 0; for (int x = 0; x < mirror.getWidth(); x++) { for (int y = 0; y < mirror.getHeight(); y++) { if (mirror.getPixel(x, y) > 0) { cardinality++; if (x >= adjustedLeft && x <= adjustedRight && y >= adjustedTop && y <= adjustedBottom) innerCardinality++; } } } LOG.debug("cardinality: " + cardinality); LOG.debug("innerCardinality: " + innerCardinality); double ratio = (double) innerCardinality / (double) cardinality; LOG.debug("ratio: " + ratio); if (ratio <= maxFrameCardinalityRatio) { LOG.debug("maxFrameCardinalityRatio: " + maxFrameCardinalityRatio); LOG.debug("Frame by cardinality! Removing from illustrations"); illustrations.remove(largeShape); } else { // Now, it could still be a grid // to find this out we need to detect white areas inside the shape. 
WhiteAreaFinder whiteAreaFinder = new WhiteAreaFinder(); double minWhiteAreaWidth = widthMean * 10; double minWhiteAreaHeight = heightMean * 4; List<Rectangle> whiteAreas = whiteAreaFinder.getWhiteAreas(mirror, 0, 0, 0, mirror.getWidth() - 1, mirror.getHeight() - 1, minWhiteAreaWidth, minWhiteAreaHeight); int whiteAreaSize = 0; for (Rectangle whiteArea : whiteAreas) { whiteAreaSize += (whiteArea.getWidth() * whiteArea.getHeight()); } int totalSize = mirror.getWidth() * mirror.getHeight(); LOG.debug("whiteAreaSize: " + whiteAreaSize); LOG.debug("totalSize: " + totalSize); double sizeRatio = (double) whiteAreaSize / (double) totalSize; LOG.debug("sizeRatio: " + sizeRatio); if (sizeRatio >= minFrameWhiteAreaSizeRatio) { LOG.debug("minFrameWhiteAreaSizeRatio: " + minFrameWhiteAreaSizeRatio); LOG.debug("Frame by white area size! Removing from illustrations"); illustrations.remove(largeShape); } } } for (Shape largeShape : illustrations) { // Add this to large shapes if it's not a "frame" // large shapes are used for paragraph detection sourceImage.getLargeShapes().add(largeShape); } // remove shapes that are enclosed inside illustrations List<Shape> enclosedShapesToDelete = new ArrayList<Shape>(); int extension = 5; for (Shape shape : shapes) { for (Shape shapeToDelete : illustrations) { if (shape.getLeft() >= shapeToDelete.getLeft() - extension && shape.getRight() <= shapeToDelete.getRight() + extension && shape.getTop() >= shapeToDelete.getTop() - extension && shape.getBottom() <= shapeToDelete.getBottom() + extension) { LOG.debug("Enclosed shape: " + shape); LOG.debug(" enclosed by " + shapeToDelete); enclosedShapesToDelete.add(shape); } } } shapes.removeAll(largeShapes); shapes.removeAll(enclosedShapesToDelete); // remove shapes that are practically touching horizontal rules (probably segments of the rule that got split) extension = 3; List<Shape> listToTestAgainst = horizontalRules; for (int i = 0; i < 3; i++) { List<Shape> horizontalRuleSegments = new 
ArrayList<Shape>(); for (Shape horizontalRule : listToTestAgainst) { for (Shape shape : shapes) { if ((shape.getLeft() <= horizontalRule.getRight() + extension || shape.getRight() >= horizontalRule.getLeft() - extension) && shape.getTop() >= horizontalRule.getTop() - extension && shape.getBottom() <= horizontalRule.getBottom() + extension) { LOG.debug("Horizontal rule segment: " + shape); LOG.debug(" touching " + horizontalRule); horizontalRuleSegments.add(shape); enclosedShapesToDelete.add(shape); } } } shapes.removeAll(horizontalRuleSegments); listToTestAgainst = horizontalRuleSegments; if (listToTestAgainst.size() == 0) break; } }
From source file:com.joliciel.jochre.graphics.SegmenterImpl.java
void groupShapesIntoWords(Set<RowOfShapes> rowCluster) { LOG.debug("Next row cluster of size " + rowCluster.size()); // group the shapes together into words Mean spaceMean = new Mean(); StandardDeviation spaceStdDev = new StandardDeviation(); int maxSpaceLog = 120; int[] spaceCounts = new int[maxSpaceLog]; List<Integer> spaces = new ArrayList<Integer>(); for (RowOfShapes row : rowCluster) { Shape previousShape = null; for (Shape shape : row.getShapes()) { if (previousShape != null) { int space = 0; if (sourceImage.isLeftToRight()) space = shape.getLeft() - previousShape.getRight(); else space = previousShape.getLeft() - shape.getRight(); LOG.trace(shape);/*from w w w. java 2s . c o m*/ LOG.trace("Space : " + space); if (space < maxSpaceLog && space >= 0) spaceCounts[space]++; if (space >= 0) { spaces.add(space); spaceMean.increment(space); spaceStdDev.increment(space); } } previousShape = shape; } // next shape } for (int i = 0; i < maxSpaceLog; i++) { //LOG.debug("Space count " + i + ": " + spaceCounts[i]); } double spaceMeanVal = spaceMean.getResult(); double spaceStdDevVal = spaceStdDev.getResult(); LOG.debug("Space mean: " + spaceMeanVal); LOG.debug("Space std dev: " + spaceStdDevVal); // If however there is only a single word on the row, the // standard deviation will be very low. boolean singleWord = false; if (spaceStdDevVal * 2 < spaceMeanVal) { LOG.debug("Assuming a single word per row"); singleWord = true; } // Since there should be two groups, one for letters and one for words, // the mean should be somewhere in between. We now look for the mean on the // lesser group and will use it as the basis for comparison. 
spaceMean = new Mean(); spaceStdDev = new StandardDeviation(); for (int space : spaces) { if (space < spaceMeanVal && space >= 0) { spaceMean.increment(space); spaceStdDev.increment(space); } } spaceMeanVal = spaceMean.getResult(); spaceStdDevVal = spaceStdDev.getResult(); LOG.debug("Letter space mean: " + spaceMeanVal); LOG.debug("Letter space std dev: " + spaceStdDevVal); int letterSpaceThreshold = 0; if (singleWord) letterSpaceThreshold = Integer.MAX_VALUE; else letterSpaceThreshold = (int) Math.round(spaceMeanVal + (4.0 * spaceStdDevVal)); for (RowOfShapes row : rowCluster) { LOG.debug(row.toString()); //row.getGroups().clear(); row.organiseShapesInGroups(letterSpaceThreshold); } // next row }