List of usage examples for org.apache.commons.math.stat.descriptive.moment.Mean#clear()
@Override public void clear()
From source file: com/joliciel/jochre/graphics/SegmenterImpl.java
void splitShapes(SourceImage sourceImage, int fillFactor) { LOG.debug("########## splitShapes #########"); // Cluster rows into rows of a similar height // Once we have this, we look for any shapes that are wider than average // and attempt to split them by looking for any bridges that are considerable thinner // than the stroke thickness and yet have big pixel counts on either side. // In order to split, we need four parameters // 1) minShapeWidth: the minimum shape width to consider for a split // 2) maxBridgeWidth: the maximum bridge width to use as a dividing bridge between two shapes when splitting // 3) minLetterWeight: the minimum pixel count that can represent a separate letter when splitting // 4) maxHorizontalOverlap: the maximum horizontal overlap between the left-hand and right-hand shape // These parameters are different for different font sizes // Therefore, we first need to group the rows on the image into clusters by height double imageShapeMean = sourceImage.getAverageShapeWidth(); double maxWidthForSplit = imageShapeMean * 6.0; // avoid splitting horizontal rules! Set<Set<RowOfShapes>> rowClusters = sourceImage.getRowClusters(); for (Set<RowOfShapes> rowCluster : rowClusters) { LOG.debug("Analysing row cluster"); // 1) minShapeWidth: calculate the minimum shape width to be considered for splitting // first get the mean Mean meanWidth = new Mean(); List<Shape> shapes = new ArrayList<Shape>(); for (RowOfShapes row : rowCluster) { for (Shape shape : row.getShapes()) { meanWidth.increment(shape.getWidth()); shapes.add(shape);/*from w w w. j av a 2 s. 
c o m*/ } } double shapeWidthMean = meanWidth.getResult(); LOG.debug("Mean width: " + shapeWidthMean); meanWidth.clear(); // Note: there is much trial and error for these numbers // but the general guideline is that it is easier to deal downstream // with bad joins than with bad splits // so we prefer to err on the upper side double fillFactorScale = 0.15 * fillFactor; double widthForSplittingLower = shapeWidthMean * (1.6 + fillFactorScale); double widthForSplittingUpper = shapeWidthMean * (2.2 + fillFactorScale); LOG.debug("widthForSplittingLower: " + widthForSplittingLower); LOG.debug("widthForSplittingUpper: " + widthForSplittingUpper); LOG.debug("maxWidthForSplit: " + maxWidthForSplit); List<Shape> candidates = new ArrayList<Shape>(); for (RowOfShapes row : rowCluster) { LOG.debug("Next row " + row.getIndex()); for (Shape shape : row.getShapes()) { LOG.trace("Shape width " + shape.getWidth()); if (shape.getWidth() > widthForSplittingLower && shape.getWidth() < maxWidthForSplit) { candidates.add(shape); LOG.debug("Found candidate with width " + shape.getWidth() + ": " + shape); } } } if (candidates.size() > 0) { // we'll take a random sampling of shapes for the next parameters int sampleSize = 30; List<Shape> sample = this.getSample(rowCluster, sampleSize, true); Mean meanPixelCount = new Mean(); Vectorizer vectorizer = this.graphicsService.getVectorizer(); List<Integer> thicknesses = new ArrayList<Integer>(); for (Shape shape : sample) { BitSet bitset = shape.getBlackAndWhiteBitSet(sourceImage.getSeparationThreshold(), 0); meanPixelCount.increment(bitset.cardinality()); List<LineSegment> vectors = vectorizer.vectorize(shape); int height = shape.getHeight(); int sampleStep = (int) Math.ceil(height / 8); for (LineSegment vector : vectors) { List<Integer> vectorThickness = vector.getLineDefinition().findArrayListThickness(shape, vector.getStartX(), vector.getStartY(), vector.getLength(), sourceImage.getSeparationThreshold(), 0, sampleStep); 
thicknesses.addAll(vectorThickness); } } double pixelCountMean = meanPixelCount.getResult(); Mean meanThickness = new Mean(); for (int thickness : thicknesses) { meanThickness.increment(thickness); } double thicknessMean = meanThickness.getResult(); meanThickness = new Mean(); for (int thickness : thicknesses) { if (thickness < thicknessMean) meanThickness.increment(thickness); } thicknessMean = meanThickness.getResult(); LOG.debug("thicknessMean: " + thicknessMean); // 2) maxBridgeWidth: the maximum bridge width to use as a dividing bridge between two shapes when splitting double maxBridgeWidthLower = thicknessMean * 0.5; double maxBridgeWidthUpper = thicknessMean * 0.8; LOG.debug("maxBridgeWidthLower: " + maxBridgeWidthLower); LOG.debug("maxBridgeWidthUpper: " + maxBridgeWidthUpper); // 3) minLetterWeight: the minimum pixel count that can represent a separate letter when splitting int minLetterWeight = (int) Math.floor(pixelCountMean / 4.0); LOG.debug("minLetterWeight: " + minLetterWeight); // 4) maxHorizontalOverlap: the maximum horizontal overlap between the left-hand and right-hand shape int maxOverlap = (int) Math.ceil(shapeWidthMean / 8.0); LOG.debug("maxOverlap: " + maxOverlap); Map<Shape, List<Shape>> shapesToSplit = new Hashtable<Shape, List<Shape>>(); for (Shape candidate : candidates) { LOG.debug("Trying to split candidate " + candidate); for (int y = 0; y < candidate.getHeight(); y++) { String line = ""; if (y == candidate.getMeanLine()) line += "M"; else if (y == candidate.getBaseLine()) line += "B"; else line += y; for (int x = 0; x < candidate.getWidth(); x++) { if (candidate.isPixelBlack(x, y, sourceImage.getBlackThreshold())) line += "x"; else line += "o"; } LOG.debug(line); } if (candidate.getHeight() < 3.0 * maxBridgeWidthUpper) { LOG.debug("Shape too narrow - probably a long dash."); continue; } int maxBridgeWidth; if (candidate.getWidth() > widthForSplittingUpper) maxBridgeWidth = (int) Math.ceil(maxBridgeWidthUpper); else { // since many 
bridges are thicker than expected // add a rule that the thicker the bridge is, the wider the image needs to be maxBridgeWidth = (int) Math.ceil( maxBridgeWidthLower + (((double) candidate.getWidth() - widthForSplittingLower) / (widthForSplittingUpper - widthForSplittingLower) * (maxBridgeWidthUpper - maxBridgeWidthLower))); } List<Shape> splitShapes = this.splitShape(candidate, sourceImage, maxBridgeWidth, minLetterWeight, maxOverlap); if (splitShapes.size() > 1) { LOG.debug("Split found"); for (Shape splitShape : splitShapes) { splitShape.setRow(candidate.getRow()); } shapesToSplit.put(candidate, splitShapes); } } LOG.debug("Replacing shapes with split shapes"); List<RowOfShapes> rowsToReorder = new ArrayList<RowOfShapes>(); for (Shape shape : shapesToSplit.keySet()) { List<Shape> newShapes = shapesToSplit.get(shape); RowOfShapes row = shape.getRow(); row.removeShape(shape); row.addShapes(newShapes); rowsToReorder.add(row); } for (RowOfShapes row : rowsToReorder) row.reorderShapes(); } } LOG.debug("splitShapes complete"); }