Example usage for org.apache.commons.math.stat.descriptive DescriptiveStatistics getPercentile

Introduction

On this page you can find example usages of org.apache.commons.math.stat.descriptive DescriptiveStatistics getPercentile.

Prototype

public double getPercentile(double p)

Source Link

Document

Returns an estimate for the pth percentile of the stored values.

Usage

From source file:com.joliciel.jochre.graphics.SourceImageImpl.java

/**
 * Creates a source image and derives its brightness-normalisation parameters
 * (black limit, white limit, greyscale multiplier, black threshold and
 * separation threshold) from the image's brightness histogram.
 *
 * <p>Degenerate histograms (e.g. a blank page, where no "black" bin stands out)
 * leave {@code startBlack} and/or {@code endWhite} at -1; the diagnostic scans
 * below are bounded at index 0 so that such images no longer trigger an
 * {@link ArrayIndexOutOfBoundsException}.
 *
 * @param graphicsService the graphics service to store on this image
 * @param name the name to store on this image
 * @param image the image to analyse; handed to the superclass and stored as the original image
 */
public SourceImageImpl(GraphicsServiceInternal graphicsService, String name, BufferedImage image) {
    super(image);
    this.name = name;
    this.setOriginalImage(image);
    this.setGraphicsService(graphicsService);

    this.setWidth(this.getPixelGrabber().getWidth());
    this.setHeight(this.getPixelGrabber().getHeight());

    // to normalise the image, we need to figure out where black and white are
    // we want to leave out anomalies (ink blots!)
    // pixelSpread[b] counts the pixels whose brightness is b
    // (assumes getPixelBrightness returns a value in 0..255 - TODO confirm)
    int[] pixelSpread = new int[256];

    for (int y = 0; y < this.getHeight(); y++) {
        for (int x = 0; x < this.getWidth(); x++) {
            int pixel = this.getPixelGrabber().getPixelBrightness(x, y);
            pixelSpread[pixel]++;
        }
    }

    if (LOG.isTraceEnabled()) {
        for (int i = 0; i < 256; i++)
            LOG.trace("Brightness " + i + ": " + pixelSpread[i]);
    }

    DescriptiveStatistics countStats = new DescriptiveStatistics();
    for (int i = 0; i < 256; i++) {
        countStats.addValue(pixelSpread[i]);
    }
    // the statistics are complete from here on, so the mean is loop-invariant
    double meanCount = countStats.getMean();

    // scan from the bright end: startWhite is the first bin above the mean count,
    // endWhite the first bin after it that drops below the mean count
    int startWhite = -1;
    int endWhite = -1;
    for (int i = 255; i >= 0; i--) {
        if (startWhite < 0 && pixelSpread[i] > meanCount)
            startWhite = i;
        if (startWhite >= 0 && endWhite < 0 && pixelSpread[i] < meanCount) {
            endWhite = i;
            break;
        }
    }

    LOG.debug("Start white: " + startWhite);
    LOG.debug("End white: " + endWhite);

    // blackCountStats: one value per bin (the bin's count), for bins 0..endWhite
    // blackSpread: one value per pixel (the pixel's brightness), for the same bins
    DescriptiveStatistics blackCountStats = new DescriptiveStatistics();
    DescriptiveStatistics blackSpread = new DescriptiveStatistics();
    for (int i = 0; i <= endWhite; i++) {
        blackCountStats.addValue(pixelSpread[i]);
        for (int j = 0; j < pixelSpread[i]; j++) {
            blackSpread.addValue(i);
        }
    }
    double meanBlackCount = blackCountStats.getMean();
    double blackCountStdDev = blackCountStats.getStandardDeviation();

    LOG.debug("mean counts: " + meanCount);
    LOG.debug("mean black counts: " + meanBlackCount);
    LOG.debug("std dev black counts: " + blackCountStdDev);

    // first bin (from the dark end) whose count exceeds the mean black count;
    // stays at -1 when no bin does (e.g. all black bins are empty or equal)
    int startBlack = -1;
    for (int i = 0; i < 256; i++) {
        if (pixelSpread[i] > meanBlackCount) {
            startBlack = i;
            break;
        }
    }
    LOG.debug("Start black: " + startBlack);

    this.setBlackLimit(startBlack);
    this.setWhiteLimit(startWhite);

    this.greyscaleMultiplier = (255.0 / (double) (whiteLimit - blackLimit));

    // lowest index the diagnostic scans below may visit: startBlack can be -1,
    // which must never be used as an index into pixelSpread
    int scanFloor = Math.max(startBlack, 0);

    // use mean + 2 sigma to find the black threshold
    // we make the threshold high (darker) to put more pixels in the letter when analysing
    double blackthresholdCount = meanBlackCount + (2.0 * blackCountStdDev);
    LOG.debug("blackthresholdCount: " + blackthresholdCount);

    // "old" threshold: computed and logged for comparison only, it is overwritten below
    int blackThresholdValue = endWhite;
    for (int i = endWhite; i >= scanFloor; i--) {
        if (pixelSpread[i] < blackthresholdCount) {
            blackThresholdValue = i;
            break;
        }
    }
    LOG.debug("Black threshold value (old): " + blackThresholdValue);
    blackThreshold = (int) Math.round((blackThresholdValue - blackLimit) * greyscaleMultiplier);
    LOG.debug("Black threshold (old): " + blackThreshold);

    // "new" threshold: 60th percentile of the per-pixel brightness below endWhite
    blackThresholdValue = (int) Math.round(blackSpread.getPercentile(60.0));
    LOG.debug("Black threshold value (new): " + blackThresholdValue);
    LOG.debug("Black spread 25 percentile: " + (int) Math.round(blackSpread.getPercentile(25.0)));
    LOG.debug("Black spread 50 percentile: " + (int) Math.round(blackSpread.getPercentile(50.0)));
    LOG.debug("Black spread 75 percentile: " + (int) Math.round(blackSpread.getPercentile(75.0)));

    blackThreshold = (int) Math.round((blackThresholdValue - blackLimit) * greyscaleMultiplier);
    LOG.debug("Black threshold (new): " + blackThreshold);

    // use mean + 1 sigma to find the separation threshold
    // we keep threshold low (1 sigma) to encourage letter breaks
    double separationthresholdCount = meanBlackCount + (1.0 * blackCountStdDev);
    LOG.debug("Separation threshold value: " + separationthresholdCount);

    // "old" separation threshold: logged for comparison only, overwritten below
    int separationThresholdValue = endWhite;
    for (int i = endWhite; i >= scanFloor; i--) {
        if (pixelSpread[i] < separationthresholdCount) {
            separationThresholdValue = i;
            break;
        }
    }
    LOG.debug("Separation threshold value (old): " + separationThresholdValue);

    // "new" separation threshold: 75th percentile of the per-pixel brightness below endWhite
    separationThresholdValue = (int) Math.round(blackSpread.getPercentile(75.0));
    LOG.debug("Separation threshold value (new): " + separationThresholdValue);
    LOG.debug("Black spread 25 percentile: " + (int) Math.round(blackSpread.getPercentile(25.0)));
    LOG.debug("Black spread 50 percentile: " + (int) Math.round(blackSpread.getPercentile(50.0)));
    LOG.debug("Black spread 75 percentile: " + (int) Math.round(blackSpread.getPercentile(75.0)));

    separationThreshold = (int) Math.round((separationThresholdValue - blackLimit) * greyscaleMultiplier);
    LOG.debug("Separation threshold: " + separationThreshold);

    if (drawPixelSpread)
        this.drawChart(pixelSpread, countStats, blackCountStats, blackSpread, startWhite, endWhite, startBlack,
                blackThresholdValue);
}

From source file:com.joliciel.jochre.graphics.SegmenterImpl.java

/**
 * Groups shapes into rows by searching, for each shape, a horizontal band to its
 * left and right for other shapes with sufficient vertical overlap.
 *
 * <p>Side effects: every shape's row is reset to {@code null} first, and shapes
 * that are completely enclosed by a white area are removed from {@code shapes}.
 *
 * @param sourceImage the image the shapes belong to; used to create rows and, when
 *        {@code useSlope} is set, to obtain the x-adjustment for a shape's top
 * @param shapes the shapes to group; modified in place (see above). The inner search
 *        stops at the first shape whose top is below the current shape's bottom, so
 *        the list is presumably ordered from the top down - verify against caller
 * @param whiteAreas white rectangles that block the horizontal search
 * @param useSlope whether to shift white-area x-coordinates by the image's x-adjustment
 * @return the rows that were built
 */
List<RowOfShapes> groupShapesIntoRows(SourceImage sourceImage, List<Shape> shapes, List<Rectangle> whiteAreas,
        boolean useSlope) {
    LOG.debug("########## groupShapesIntoRows #########");
    LOG.debug("useSlope? " + useSlope);

    List<RowOfShapes> rows = new ArrayList<RowOfShapes>();
    for (Shape shape : shapes)
        shape.setRow(null);

    // drop shapes that lie strictly inside a white area
    List<Shape> shapesToRemove = new ArrayList<Shape>();
    for (Shape shape : shapes) {
        for (Rectangle whiteArea : whiteAreas) {
            double whiteAreaRight = whiteArea.getRight();
            double whiteAreaLeft = whiteArea.getLeft();
            if (useSlope) {
                double xAdjustment = sourceImage.getXAdjustment(shape.getTop());

                whiteAreaRight += xAdjustment;
                whiteAreaLeft += xAdjustment;
            }

            if (whiteAreaRight > shape.getRight() && whiteAreaLeft < shape.getLeft()
                    && whiteArea.getTop() < shape.getTop() && whiteArea.getBottom() > shape.getBottom()) {
                // shape is surrounded
                shapesToRemove.add(shape);
                LOG.debug("Removing shape " + shape);
                LOG.debug("Surrounded by white area: " + whiteArea);
            }
        }
    }
    shapes.removeAll(shapesToRemove);

    // calculate the means
    // get average shape width (actually the median, i.e. the 50th percentile)
    DescriptiveStatistics shapeWidthStats = new DescriptiveStatistics();
    for (Shape shape : shapes) {
        shapeWidthStats.addValue(shape.getWidth());
    }
    double averageShapeWidth = shapeWidthStats.getPercentile(50);
    LOG.debug("averageShapeWidth: " + averageShapeWidth);

    // now, arrange the shapes in rows
    // we're guaranteed that no two shapes overlap at this point.
    // Now, it's possible that two shapes in the same line have no vertical overlap (e.g. a comma and an apostrophe)
    // so we have to go searching a bit further afield, say five shapes in each direction
    // but if we go too far, we may end up joining two lines together if the page isn't quite straight

    // let's begin with any old shape and find the shapes closest to it horizontally
    // e.g. up to 8 horizontal means to the right and left
    // as we find shapes that go with it, we add them to the same line
    int i = 0;
    int j = 0;
    int numberOfMeanWidthsForSearch = 8;
    LOG.debug("numberOfMeanWidthsForSearch: " + numberOfMeanWidthsForSearch);
    LOG.debug("search distance: " + averageShapeWidth * numberOfMeanWidthsForSearch);

    for (Shape shape : shapes) {
        if (shape.getRow() == null) {
            RowOfShapes row = graphicsService.getEmptyRow(sourceImage);
            row.addShape(shape);
            row.setIndex(j++);
            rows.add(row);
            LOG.trace("========= New row " + row.getIndex() + "============");
            LOG.trace("Adding " + shape + " to row " + row.getIndex());
        }
        int searchLeft = (int) ((double) shape.getLeft() - (numberOfMeanWidthsForSearch * averageShapeWidth));
        int searchRight = (int) ((double) shape.getRight() + (numberOfMeanWidthsForSearch * averageShapeWidth));
        LOG.trace("Shape " + i++ + ": " + shape + "(row " + shape.getRow().getIndex() + ")");
        LOG.trace("searchLeft: " + searchLeft);
        LOG.trace("searchRight: " + searchRight);

        // construct an array to represent where white areas overlap with the search area:
        // for each x-column of the search band, [0] is the top and [1] the bottom of the
        // vertical span that is still unobstructed by white areas.
        // leftSearchArea[k] is the column at x = searchLeft + k,
        // rightSearchArea[k] is the column at x = shape.getRight() + k
        int[][] leftSearchArea = new int[shape.getLeft() - searchLeft][2];
        int[][] rightSearchArea = new int[searchRight - shape.getRight()][2];
        for (int k = 0; k < leftSearchArea.length; k++) {
            leftSearchArea[k][0] = shape.getTop();
            leftSearchArea[k][1] = shape.getBottom();
        }
        for (int k = 0; k < rightSearchArea.length; k++) {
            rightSearchArea[k][0] = shape.getTop();
            rightSearchArea[k][1] = shape.getBottom();
        }

        int newSearchLeft = searchLeft;
        int newSearchRight = searchRight;
        for (Rectangle whiteArea : whiteAreas) {
            double whiteAreaRight = whiteArea.getRight();
            double whiteAreaLeft = whiteArea.getLeft();
            if (useSlope) {
                double xAdjustment = sourceImage.getXAdjustment(shape.getTop());

                whiteAreaRight += xAdjustment;
                whiteAreaLeft += xAdjustment;
                LOG.trace(whiteArea + ", xAdjustment=" + xAdjustment + " , whiteAreaLeft=" + whiteAreaLeft
                        + " , whiteAreaRight=" + whiteAreaRight);
            }

            if (whiteAreaRight > newSearchLeft && whiteAreaLeft < shape.getLeft()
                    && whiteArea.getTop() <= shape.getBottom() && whiteArea.getBottom() >= shape.getTop()) {

                LOG.trace("overlap on left with: " + whiteArea.toString());

                if (whiteArea.getTop() <= shape.getTop() && whiteArea.getBottom() >= shape.getBottom()
                        && whiteAreaRight > newSearchLeft) {
                    // the white area covers the shape's full height: nothing beyond it is reachable
                    newSearchLeft = (int) Math.round(whiteAreaRight);
                    LOG.trace("Complete, newSearchLeft = " + newSearchLeft);
                } else {
                    // the white area covers only part of the height: narrow the free span
                    // of every column it shadows, and stop where the span closes completely
                    LOG.trace("Partial, starting at " + whiteArea.getRight());
                    for (int k = whiteArea.getRight() - searchLeft; k >= 0; k--) {
                        if (k < leftSearchArea.length) {
                            if (whiteArea.getBottom() < shape.getBottom()
                                    && leftSearchArea[k][0] < whiteArea.getBottom())
                                leftSearchArea[k][0] = whiteArea.getBottom() + 1;
                            else if (whiteArea.getTop() > shape.getTop()
                                    && leftSearchArea[k][1] > whiteArea.getTop())
                                leftSearchArea[k][1] = whiteArea.getTop() - 1;

                            if (leftSearchArea[k][0] >= leftSearchArea[k][1]
                                    && searchLeft + k > newSearchLeft) {
                                newSearchLeft = searchLeft + k;
                                LOG.trace("Complete from " + newSearchLeft);
                                break;
                            }
                        }
                    }
                }
            } else if (whiteAreaLeft < newSearchRight && whiteAreaRight > shape.getRight()
                    && whiteArea.getTop() <= shape.getBottom() && whiteArea.getBottom() >= shape.getTop()) {
                LOG.trace("overlap on right with: " + whiteArea.toString());

                if (whiteArea.getTop() <= shape.getTop() && whiteArea.getBottom() >= shape.getBottom()
                        && newSearchRight > whiteAreaLeft) {
                    newSearchRight = (int) Math.round(whiteAreaLeft);
                    LOG.trace("Complete, newSearchRight = " + newSearchRight);

                } else {
                    LOG.trace("Partial, starting at " + whiteArea.getLeft());
                    for (int k = whiteArea.getLeft() - shape.getRight(); k < rightSearchArea.length; k++) {
                        // The conditions must inspect the RIGHT search area - the one being
                        // narrowed here. Previously they read leftSearchArea[k] (a copy-paste
                        // slip from the branch above), so the decision was based on unrelated
                        // columns on the other side of the shape.
                        // k may start negative when the white area begins left of the shape's
                        // right edge; column 0 is the shape's own edge and is left untouched.
                        if (k > 0) {
                            if (whiteArea.getBottom() < shape.getBottom()
                                    && rightSearchArea[k][0] < whiteArea.getBottom())
                                rightSearchArea[k][0] = whiteArea.getBottom() + 1;
                            else if (whiteArea.getTop() > shape.getTop()
                                    && rightSearchArea[k][1] > whiteArea.getTop())
                                rightSearchArea[k][1] = whiteArea.getTop() - 1;

                            if (rightSearchArea[k][0] >= rightSearchArea[k][1]
                                    && newSearchRight > shape.getRight() + k) {
                                newSearchRight = shape.getRight() + k;
                                LOG.trace("Complete from " + newSearchRight);
                                break;
                            }
                        }
                    }
                }
            }
        }
        LOG.trace("searchLeft adjusted for white columns: " + newSearchLeft);
        LOG.trace("searchRight adjusted for white columns: " + newSearchRight);

        // min 10% overlap to assume same row
        double minOverlap = 0.10;

        for (Shape otherShape : shapes) {
            boolean haveSomeOverlap = false;
            if (!shape.getRow().equals(otherShape.getRow()) && !otherShape.equals(shape)) {

                // shapes are arranged from the top down
                if (otherShape.getTop() > shape.getBottom()) {
                    break;
                }

                if (otherShape.getRight() > newSearchLeft && otherShape.getRight() < shape.getLeft()
                        && otherShape.getTop() <= shape.getBottom()
                        && otherShape.getBottom() >= shape.getTop()) {
                    // candidate on the left: check the free span of the column at its right edge
                    int k = otherShape.getRight() - searchLeft;
                    if (otherShape.getTop() <= leftSearchArea[k][1]
                            && otherShape.getBottom() >= leftSearchArea[k][0])
                        haveSomeOverlap = true;
                } else if (otherShape.getLeft() < newSearchRight && otherShape.getLeft() > shape.getRight()
                        && otherShape.getTop() <= shape.getBottom()
                        && otherShape.getBottom() >= shape.getTop()) {
                    // candidate on the right: check the free span of the column at its left edge
                    int k = otherShape.getLeft() - shape.getRight();
                    if (otherShape.getTop() <= rightSearchArea[k][1]
                            && otherShape.getBottom() >= rightSearchArea[k][0])
                        haveSomeOverlap = true;
                }
                if (haveSomeOverlap) {
                    int overlap1 = shape.getBottom() - otherShape.getTop() + 1;
                    int overlap2 = otherShape.getBottom() - shape.getTop() + 1;
                    int overlap = overlap1 < overlap2 ? overlap1 : overlap2;
                    boolean addShapeToRow = false;
                    if ((((double) overlap / (double) shape.getHeight()) > minOverlap)
                            || (((double) overlap / (double) otherShape.getHeight()) > minOverlap)) {
                        addShapeToRow = true;
                    }

                    if (addShapeToRow) {
                        LOG.debug("Adding " + otherShape + " to row " + shape.getRow().getIndex());
                        if (otherShape.getRow() == null) {
                            shape.getRow().addShape(otherShape);
                        } else {
                            // two rows need to be merged
                            LOG.debug("========= Merge rows " + shape.getRow().getIndex() + " with "
                                    + otherShape.getRow().getIndex() + "==========");
                            RowOfShapes otherRow = otherShape.getRow();
                            shape.getRow().addShapes(otherRow.getShapes());
                            rows.remove(otherRow);
                        }
                    }
                } // add shape to row ?
            } // should shape be considered?
        } // next other shape
    } // next shape

    return rows;
}

From source file:com.joliciel.jochre.graphics.SegmenterImpl.java

/**
 * We attempt to remove specks, where a speck is defined as
 * a relatively small shape at a relatively large distance from other shapes.
 * <p>
 * Small shapes are clustered, and each cluster is kept or removed as a whole
 * depending on how far it lies from the nearest non-speck shape.
 * Removed specks are deleted from {@code shapes} in place.
 *
 * @param sourceImage not referenced in this method body
 * @param shapes the shapes to examine; specks judged to be isolated are removed from this list
 */
void removeSpecks(SourceImage sourceImage, List<Shape> shapes) {
    LOG.debug("########## removeSpecks #########");

    DescriptiveStatistics shapeWidthStats = new DescriptiveStatistics();
    DescriptiveStatistics shapeHeightStats = new DescriptiveStatistics();

    for (Shape shape : shapes) {
        shapeWidthStats.addValue(shape.getWidth());
        shapeHeightStats.addValue(shape.getHeight());
    }

    // NOTE: despite the "Median" variable names and the "mean" log labels,
    // these are the 65th percentiles of shape width and height
    double shapeWidthMedian = shapeWidthStats.getPercentile(65);
    double shapeHeightMedian = shapeHeightStats.getPercentile(65);
    LOG.debug("meanShapeWidth: " + shapeWidthMedian);
    LOG.debug("meanShapeHeight: " + shapeHeightMedian);

    // speck size limits: floor = 1/6 of the reference size (rounded up), ceiling = twice the floor
    int maxSpeckHeightFloor = (int) Math.ceil(shapeHeightMedian / 6.0);
    int maxSpeckWidthFloor = (int) Math.ceil(shapeWidthMedian / 6.0);
    int maxSpeckHeightCeiling = maxSpeckHeightFloor * 2;
    int maxSpeckWidthCeiling = maxSpeckWidthFloor * 2;

    // distance limits: floor = reference width (x) or a quarter of the reference height (y),
    // ceiling = twice the floor
    int speckXDistanceThresholdFloor = (int) Math.floor(shapeWidthMedian);
    int speckYDistanceThresholdFloor = (int) Math.floor(shapeHeightMedian / 4.0);
    int speckXDistanceThresholdCeiling = speckXDistanceThresholdFloor * 2;
    int speckYDistanceThresholdCeiling = speckYDistanceThresholdFloor * 2;

    LOG.debug("maxSpeckHeightFloor=" + maxSpeckHeightFloor);
    LOG.debug("maxSpeckWidthFloor=" + maxSpeckWidthFloor);
    LOG.debug("speckXDistanceThresholdFloor=" + speckXDistanceThresholdFloor);
    LOG.debug("speckYDistanceThresholdFloor=" + speckYDistanceThresholdFloor);
    LOG.debug("maxSpeckHeightCeiling=" + maxSpeckHeightCeiling);
    LOG.debug("maxSpeckWidthCeiling=" + maxSpeckWidthCeiling);
    LOG.debug("speckXDistanceThresholdCeiling=" + speckXDistanceThresholdCeiling);
    LOG.debug("speckYDistanceThresholdCeiling=" + speckYDistanceThresholdCeiling);

    // candidate specks: shapes strictly smaller than BOTH size ceilings
    List<Shape> specks = new ArrayList<Shape>();
    List<double[]> speckCoordinates = new ArrayList<double[]>();
    for (Shape shape : shapes) {
        if (shape.getHeight() < maxSpeckHeightCeiling && shape.getWidth() < maxSpeckWidthCeiling) {
            specks.add(shape);
            speckCoordinates.add(shape.getCentrePoint());
        }
    }

    // group the specks into clusters, which will be added or removed as a whole
    // Note that a cluster could be a valid diacritic that's split into a few specks
    // or just a bunch of specks off on their own
    // NOTE(review): presumably the cluster() arguments are (epsilon, minPoints, flag) -
    // verify against DBSCANClusterer, which is not visible here
    DBSCANClusterer<Shape> clusterer = new DBSCANClusterer<Shape>(specks, speckCoordinates);
    Set<Set<Shape>> speckClusters = clusterer.cluster(speckXDistanceThresholdFloor, 2, true);
    List<Shape> specksToRemove = new ArrayList<Shape>();
    for (Set<Shape> speckCluster : speckClusters) {

        // largest single-speck width/height in the cluster, and the cluster's bounding box
        // (-1 means "not set yet" for the bounding-box coordinates)
        int speckHeight = 0;
        int speckWidth = 0;
        int clusterTop = -1;
        int clusterBottom = -1;
        int clusterRight = -1;
        int clusterLeft = -1;
        for (Shape speck : speckCluster) {
            LOG.debug("Speck?, " + speck);
            if (speck.getWidth() > speckWidth)
                speckWidth = speck.getWidth();
            if (speck.getHeight() > speckHeight)
                speckHeight = speck.getHeight();

            if (clusterTop < 0 || speck.getTop() < clusterTop)
                clusterTop = speck.getTop();
            if (clusterLeft < 0 || speck.getLeft() < clusterLeft)
                clusterLeft = speck.getLeft();
            if (speck.getBottom() > clusterBottom)
                clusterBottom = speck.getBottom();
            if (speck.getRight() > clusterRight)
                clusterRight = speck.getRight();

        }

        // scale in [0, 1]: where the cluster's larger dimension sits between the
        // size floor (0.0) and the size ceiling (1.0)
        boolean useWidth = speckWidth > speckHeight;
        double scale = 1.0;
        if (useWidth)
            scale = speckWidth < maxSpeckWidthFloor ? 0.0
                    : (speckWidth > maxSpeckWidthCeiling ? 1.0
                            : ((double) speckWidth - maxSpeckWidthFloor)
                                    / (maxSpeckWidthCeiling - maxSpeckWidthFloor));
        else
            scale = speckHeight < maxSpeckHeightFloor ? 0.0
                    : (speckHeight > maxSpeckHeightCeiling ? 1.0
                            : ((double) speckHeight - maxSpeckHeightFloor)
                                    / (maxSpeckHeightCeiling - maxSpeckHeightFloor));

        // larger specks are allowed a larger distance: interpolate between the
        // distance floor and ceiling by the scale, rounding up
        int speckXDistanceThreshold = (int) Math.ceil(speckXDistanceThresholdFloor
                + scale * (speckXDistanceThresholdCeiling - speckXDistanceThresholdFloor));
        int speckYDistanceThreshold = (int) Math.ceil(speckYDistanceThresholdFloor
                + scale * (speckYDistanceThresholdCeiling - speckYDistanceThresholdFloor));

        LOG.debug("speckHeight=" + speckHeight);
        LOG.debug("speckWidth=" + speckWidth);
        LOG.debug("speckXDistanceThreshold=" + speckXDistanceThreshold);
        LOG.debug("speckYDistanceThreshold=" + speckYDistanceThreshold);

        // first pass: find the non-speck shape nearest to the cluster's bounding box
        Shape nearestShape = null;
        double minDistance = 0.0;
        int nearestShapeXDiff = 0;
        int nearestShapeYDiff = 0;

        for (Shape otherShape : shapes) {
            // limit to nearby shapes
            // (the break presumably relies on shapes being ordered top-down - verify against caller)
            if (otherShape.getTop() > clusterBottom + speckYDistanceThreshold + 1)
                break;
            if (otherShape.getBottom() < clusterTop - speckYDistanceThreshold - 1)
                continue;
            if (otherShape.getRight() < clusterLeft - speckXDistanceThreshold - 1)
                continue;
            if (otherShape.getLeft() > clusterRight + speckXDistanceThreshold + 1)
                continue;

            // Note: tried !specks.contains(otherShape), but sometimes we have a valid case
            // where a diacritic is "split" into two specks
            // NOTE(review): the note above reads as if the check had been abandoned, yet the
            // code below does exclude every candidate speck - confirm which is intended
            if (!specks.contains(otherShape)) {
                int xDiff = 0;
                int yDiff = 0;
                int leftDiff = 0;
                int rightDiff = 0;
                int topDiff = 0;
                int botDiff = 0;

                // horizontal gap: 0 when the x-ranges overlap, otherwise the smaller edge-to-edge gap
                if (otherShape.getLeft() <= clusterRight && otherShape.getRight() >= clusterLeft) {
                    xDiff = 0;
                } else {
                    leftDiff = Math.abs(clusterLeft - otherShape.getRight());
                    rightDiff = Math.abs(clusterRight - otherShape.getLeft());
                    xDiff = (leftDiff < rightDiff) ? leftDiff : rightDiff;
                }

                // vertical gap: 0 when the y-ranges overlap; otherwise measured against
                // nearestTop = the smaller of top and top + meanLine, and
                // nearestBot = the larger of bottom and top + baseLine
                if (otherShape.getTop() <= clusterBottom && otherShape.getBottom() >= clusterTop) {
                    yDiff = 0;
                } else {
                    int nearestTop = (otherShape.getTop() > otherShape.getTop() + otherShape.getMeanLine())
                            ? otherShape.getTop() + otherShape.getMeanLine()
                            : otherShape.getTop();
                    int nearestBot = (otherShape.getBottom() < otherShape.getTop() + otherShape.getBaseLine())
                            ? otherShape.getTop() + otherShape.getBaseLine()
                            : otherShape.getBottom();
                    topDiff = Math.abs(clusterTop - nearestBot);
                    botDiff = Math.abs(clusterBottom - nearestTop);
                    yDiff = (topDiff < botDiff) ? topDiff : botDiff;
                }

                double distance = Math.sqrt((xDiff * xDiff) + (yDiff * yDiff));

                if (nearestShape == null || distance < minDistance) {
                    nearestShape = otherShape;
                    minDistance = distance;
                    nearestShapeXDiff = xDiff;
                    nearestShapeYDiff = yDiff;
                    LOG.trace("leftDiff=" + leftDiff + ", rightDiff=" + rightDiff);
                    LOG.trace("topDiff=" + topDiff + ", botDiff=" + botDiff);
                } // found closer shape?
            } // is this the speck?
        } // loop shapes around the reference shape

        if (nearestShape != null) {
            LOG.trace("Nearest shape, top(" + nearestShape.getTop() + ") " + "left(" + nearestShape.getLeft()
                    + ") " + "bot(" + nearestShape.getBottom() + ") " + "right(" + nearestShape.getRight()
                    + ")");
            LOG.trace("Distance=" + minDistance + ", xDiff=" + nearestShapeXDiff + ", yDiff="
                    + nearestShapeYDiff);
        }
        // no non-speck shape within range at all: the cluster is isolated
        boolean removeSpecks = false;
        if (nearestShape == null)
            removeSpecks = true;
        else {
            // calculate the shortest distance from the nearest shape to the speck cluster
            // second pass: same gap computation as above, but per individual speck instead of
            // the cluster's bounding box; only a strictly smaller distance replaces the
            // values found in the first pass
            for (Shape speck : speckCluster) {
                int xDiff = 0;
                int yDiff = 0;
                int leftDiff = 0;
                int rightDiff = 0;
                int topDiff = 0;
                int botDiff = 0;

                if (nearestShape.getLeft() <= speck.getRight() && nearestShape.getRight() >= speck.getLeft()) {
                    xDiff = 0;
                } else {
                    leftDiff = Math.abs(speck.getLeft() - nearestShape.getRight());
                    rightDiff = Math.abs(speck.getRight() - nearestShape.getLeft());
                    xDiff = (leftDiff < rightDiff) ? leftDiff : rightDiff;
                }

                if (nearestShape.getTop() <= speck.getBottom() && nearestShape.getBottom() >= speck.getTop()) {
                    yDiff = 0;
                } else {
                    int nearestTop = (nearestShape.getTop() > nearestShape.getTop()
                            + nearestShape.getMeanLine()) ? nearestShape.getTop() + nearestShape.getMeanLine()
                                    : nearestShape.getTop();
                    int nearestBot = (nearestShape.getBottom() < nearestShape.getTop()
                            + nearestShape.getBaseLine()) ? nearestShape.getTop() + nearestShape.getBaseLine()
                                    : nearestShape.getBottom();
                    topDiff = Math.abs(speck.getTop() - nearestBot);
                    botDiff = Math.abs(speck.getBottom() - nearestTop);
                    yDiff = (topDiff < botDiff) ? topDiff : botDiff;
                }

                double distance = Math.sqrt((xDiff * xDiff) + (yDiff * yDiff));

                if (distance < minDistance) {
                    minDistance = distance;
                    nearestShapeXDiff = xDiff;
                    nearestShapeYDiff = yDiff;
                    LOG.debug("Found closer speck:");
                    LOG.debug("leftDiff=" + leftDiff + ", rightDiff=" + rightDiff);
                    LOG.debug("topDiff=" + topDiff + ", botDiff=" + botDiff);
                } // found closer shape?
            }
            // Then, for all of these specks, find the one that's closest to the nearest non-speck
            // if this distance > threshold, get rid of all of 'em
            // otherwise, keep 'em all
            // (the x and y gaps are compared to their thresholds separately,
            // not the Euclidean distance)
            if (nearestShapeXDiff > speckXDistanceThreshold || nearestShapeYDiff > speckYDistanceThreshold)
                removeSpecks = true;
        }
        if (removeSpecks) {
            for (Shape otherSpeck : speckCluster) {
                LOG.debug("Removing speck " + otherSpeck);
                specksToRemove.add(otherSpeck);
            }
        }
    } // next speck

    shapes.removeAll(specksToRemove);
}

From source file:com.joliciel.jochre.graphics.SegmenterImpl.java

/**
 * Detects paragraph splits and assign rows to correct paragraphs.
 * @param sourceImage the image whose rows are grouped into paragraphs
 */
void groupRowsIntoParagraphs(SourceImage sourceImage) {
    // Overview of the steps performed below:
    //  1. mark every y-coordinate covered by a row and collect the horizontal white bands
    //     that are taller than 1.3 x the median row x-height;
    //  2. split the rows into "areas" delimited by those white bands;
    //  3. inside each area, stack the rows into columns by horizontal overlap;
    //  4. link columns of consecutive areas into column groups when they overlap one-to-one;
    //  5. per column group, cluster the adjusted left/right row edges to estimate the
    //     margins and tabs, and store them on every column of the group;
    //  6. decide whether the page as a whole is indented or outdented;
    //  7. walk each column row by row, opening a new paragraph at every detected break.
    LOG.debug("########## groupRowsIntoParagraphs #########");
    // We'll use various possible indicators, including
    // indented start, indented end, and spacing between rows.

    // On pages with a single big paragraph, using deviation makes detection hypersensitive
    // to differences in row-start/row-end.
    // This means we cannot use deviation. Instead, we use the average shape width on the page.
    // We also adjust maxLeft & minRight to match the vertical line slope

    // This is now complicated by the possibility of multiple columns

    // Need to take into account a big horizontal space - Pietrushka page 14
    // Find horizontal spaces that go all the way across and are wider than a certain threshold:
    // simply keep a boolean per pixel row, black out every y covered by a row of shapes,
    // then see if there are any remaining spaces above a certain threshold.
    // Columns are thus arranged into "areas", separated by white-space.
    boolean[] fullRows = new boolean[sourceImage.getHeight()];
    for (RowOfShapes row : sourceImage.getRows()) {
        // NOTE(review): assumes row.getBottom() < sourceImage.getHeight(); otherwise this
        // index would be out of bounds - confirm against RowOfShapes.
        for (int y = row.getTop(); y <= row.getBottom(); y++) {
            fullRows[y] = true;
        }
    }
    DescriptiveStatistics rowHeightStats = new DescriptiveStatistics();

    for (RowOfShapes row : sourceImage.getRows()) {
        int height = row.getXHeight();
        rowHeightStats.addValue(height);
    }
    // Despite the variable name and the log label, this is the 50th percentile (median)
    // of the rows' x-heights, not their mean.
    double avgRowHeight = rowHeightStats.getPercentile(50);
    LOG.debug("meanRowHeight: " + avgRowHeight);
    double minHeightForWhiteSpace = avgRowHeight * 1.3;
    LOG.debug("minHeightForWhiteSpace: " + minHeightForWhiteSpace);

    // find the "white rows" - any horizontal white space
    // in the page which is sufficiently high
    // Each entry is a two-element array: { first white y, last white y }.
    List<int[]> whiteRows = new ArrayList<int[]>();
    boolean inWhite = false;
    int startWhite = 0;
    for (int y = 0; y < sourceImage.getHeight(); y++) {
        if (!inWhite && !fullRows[y]) {
            inWhite = true;
            startWhite = y;
        } else if (inWhite && fullRows[y]) {
            int length = y - startWhite;
            if (length > minHeightForWhiteSpace) {
                LOG.debug("Adding whiteRow " + startWhite + "," + (y - 1));
                whiteRows.add(new int[] { startWhite, y - 1 });
            }
            inWhite = false;
        }
    }
    // A white band still open at the bottom of the page is added whatever its height
    // (the threshold above is not applied to it).
    if (inWhite)
        whiteRows.add(new int[] { startWhite, sourceImage.getHeight() - 1 });
    // Sentinel band located just below the image, so that the loop below also collects
    // the rows that come after the last real white band.
    whiteRows.add(new int[] { sourceImage.getHeight(), sourceImage.getHeight() });

    // place rows in "areas" defined by the "white rows" found above
    List<List<RowOfShapes>> areas = new ArrayList<List<RowOfShapes>>();
    // -1 so that the first area starts at the very top of the page
    int startY = -1;
    for (int[] whiteRow : whiteRows) {
        List<RowOfShapes> area = new ArrayList<RowOfShapes>();
        for (RowOfShapes row : sourceImage.getRows()) {
            // the row must lie entirely between the previous white band and this one
            if (row.getTop() >= startY && row.getBottom() <= whiteRow[0]) {
                area.add(row);
            }
        }
        if (area.size() > 0) {
            areas.add(area);
        }
        startY = whiteRow[1];
    }

    // break up each area into vertical columns
    LOG.debug("break up each area into vertical columns");
    // "columns" holds every column on the page; "columnsPerAreaList" holds the same columns
    // grouped by area, in the same order as "areas".
    List<Column> columns = new ArrayList<Column>();
    List<List<Column>> columnsPerAreaList = new ArrayList<List<Column>>();
    for (List<RowOfShapes> area : areas) {
        LOG.debug("Next area");
        List<Column> columnsPerArea = new ArrayList<SegmenterImpl.Column>();
        columnsPerAreaList.add(columnsPerArea);
        // Iterate in the order defined by RowOfShapesVerticalLocationComparator
        // (presumably top to bottom - comparator not visible here, confirm).
        TreeSet<RowOfShapes> rows = new TreeSet<RowOfShapes>(new RowOfShapesVerticalLocationComparator());
        rows.addAll(area);
        for (RowOfShapes row : rows) {
            // try to place this row in one of the columns directly above it.
            // this means that a row which overlaps more than one column has to "close" this column, so it is no longer considered
            List<Column> overlappingColumns = new ArrayList<Column>();
            for (Column column : columnsPerArea) {
                if (!column.closed) {
                    RowOfShapes lastRowInColumn = column.get(column.size() - 1);
                    // horizontal overlap test, using edges corrected by each row's x-adjustment
                    if (row.getRight() - row.getXAdjustment() >= lastRowInColumn.getLeft()
                            - lastRowInColumn.getXAdjustment()
                            && row.getLeft() - row.getXAdjustment() <= lastRowInColumn.getRight()
                                    - lastRowInColumn.getXAdjustment()) {
                        overlappingColumns.add(column);
                    }
                }
            }
            if (overlappingColumns.size() == 1) {
                // exactly one open column lies above this row: the row joins it
                Column myColumn = overlappingColumns.get(0);
                // NOTE(review): despite its name this is the FIRST row of the column (index 0),
                // whereas every other "lastRowIn..." variable in this method uses size() - 1.
                // Confirm whether index 0 is intended for the distance test below.
                RowOfShapes lastRowInMyColumn = myColumn.get(0);

                // close any columns that are now at a distance of more than one row
                for (Column column : columnsPerArea) {
                    if (!column.closed && !column.equals(myColumn)) {
                        RowOfShapes lastRowInColumn = column.get(column.size() - 1);
                        if (lastRowInMyColumn.getTop() > lastRowInColumn.getBottom()) {
                            column.closed = true;
                            LOG.debug("Closing distant column " + lastRowInColumn);
                        }
                    }
                }

                myColumn.add(row);
                LOG.debug(row.toString());
                LOG.debug("  added to column " + lastRowInMyColumn);
            } else {
                // zero or several open columns overlap this row: close every overlapped
                // column and let this row start a new column
                for (Column overlappingColumn : overlappingColumns) {
                    overlappingColumn.closed = true;
                    RowOfShapes lastRowInColumn = overlappingColumn.get(overlappingColumn.size() - 1);
                    LOG.debug("Closing overlapping column " + lastRowInColumn);
                }
                Column myColumn = new Column(sourceImage);
                myColumn.add(row);
                LOG.debug("Found new column");
                LOG.debug(row.toString());
                columns.add(myColumn);
                columnsPerArea.add(myColumn);
            }
        }
    } // next area

    // Presumably refreshes the column fields read further down (top, bottom, adjustedLeft,
    // adjustedRight) now that all rows have been assigned - recalculate() is not visible
    // here, confirm.
    for (Column column : columns)
        column.recalculate();

    // Intermediate step to reform the vertical columns, if they exist
    // basically the idea is that if the columns are aligned vertically, then the thresholds for paragraph indents
    // should be shared, to increase the statistical sample size and reduce anomalies.
    // We'll assume that two columns from two consecutive areas are in the same vertical group if they overlap with each other horizontally
    // and don't overlap with any other column in the other column's area.
    List<List<Column>> columnGroups = new ArrayList<List<Column>>();
    List<Column> columnsInPrevArea = null;
    for (List<Column> columnsPerArea : columnsPerAreaList) {
        if (columnsInPrevArea != null) {
            for (Column prevColumn : columnsInPrevArea) {
                LOG.debug("Checking " + prevColumn);
                // find the column group containing the previous column
                List<Column> myColumnGroup = null;
                for (List<Column> columnGroup : columnGroups) {
                    if (columnGroup.contains(prevColumn)) {
                        myColumnGroup = columnGroup;
                        break;
                    }
                }
                // not in any group yet: open a new group containing only this column
                if (myColumnGroup == null) {
                    myColumnGroup = new ArrayList<SegmenterImpl.Column>();
                    LOG.debug("Creating column group for column " + prevColumn.toString());
                    columnGroups.add(myColumnGroup);
                    myColumnGroup.add(prevColumn);
                }

                // does only one column overlap with this one?
                // (overlappingColumn is reset to null as soon as a second overlap is found)
                Column overlappingColumn = null;
                for (Column column : columnsPerArea) {
                    if (column.adjustedRight >= prevColumn.adjustedLeft
                            && column.adjustedLeft <= prevColumn.adjustedRight) {
                        if (overlappingColumn == null) {
                            LOG.debug("I overlap with " + column);

                            overlappingColumn = column;
                        } else {
                            LOG.debug("But I overlap also with " + column);

                            overlappingColumn = null;
                            break;
                        }
                    }
                }
                if (overlappingColumn != null) {
                    // does it overlap with only me?
                    for (Column otherPrevColumn : columnsInPrevArea) {
                        if (otherPrevColumn.equals(prevColumn))
                            continue;
                        if (overlappingColumn.adjustedRight >= otherPrevColumn.adjustedLeft
                                && overlappingColumn.adjustedLeft <= otherPrevColumn.adjustedRight) {
                            LOG.debug("But it overlaps also with " + otherPrevColumn);
                            overlappingColumn = null;
                            break;
                        }
                    }
                }
                // the overlap is one-to-one in both directions: same vertical group
                if (overlappingColumn != null) {
                    myColumnGroup.add(overlappingColumn);
                    LOG.debug("Adding " + overlappingColumn);
                    LOG.debug(" to group with " + prevColumn);
                }

            } // next previous column
        } // have previous columns
        columnsInPrevArea = columnsPerArea;
    } // next area
    // The columns of the final area were never visited as "previous" columns in the loop
    // above, so make sure each of them belongs to a group (a new single-column group if
    // none contains it yet).
    if (columnsInPrevArea != null) {
        for (Column prevColumn : columnsInPrevArea) {
            // find the column group containing the previous column
            List<Column> myColumnGroup = null;
            for (List<Column> columnGroup : columnGroups) {
                if (columnGroup.contains(prevColumn)) {
                    myColumnGroup = columnGroup;
                    break;
                }
            }
            if (myColumnGroup == null) {
                myColumnGroup = new ArrayList<SegmenterImpl.Column>();
                LOG.debug("Creating column group for column " + prevColumn.toString());
                columnGroups.add(myColumnGroup);
                myColumnGroup.add(prevColumn);
            }
        }
    }

    // What we really want here is, for each column (in the case of right-to-left),
    // two clusters on the right
    // and one relatively big cluster on the left.
    // anything outside of the cluster on the left is an EOP.
    // NOTE(review): this local flag is never assigned again, so the "hasTab: " debug line
    // after the loop always prints false; the per-column flag column.hasTab is what the
    // rest of the method actually reads.
    boolean hasTab = false;
    for (List<Column> columnGroup : columnGroups) {
        LOG.debug("Next column group");
        // epsilon (half the average shape width) is the distance passed to the clusterers below
        double averageShapeWidth = sourceImage.getAverageShapeWidth();
        LOG.debug("averageShapeWidth: " + averageShapeWidth);
        double epsilon = averageShapeWidth / 2.0;
        LOG.debug("epsilon: " + epsilon);

        // Bounding box of the column group, seeded with the opposite image extremes so
        // that any column tightens it.
        int columnGroupTop = sourceImage.getHeight();
        int columnGroupBottom = 0;
        int columnGroupLeft = sourceImage.getWidth();
        int columnGroupRight = 0;
        for (Column column : columnGroup) {
            if (column.top < columnGroupTop)
                columnGroupTop = (int) Math.round(column.top);
            if (column.bottom > columnGroupBottom)
                columnGroupBottom = (int) Math.round(column.bottom);
            if (column.adjustedLeft < columnGroupLeft)
                columnGroupLeft = (int) Math.round(column.adjustedLeft);
            if (column.adjustedRight > columnGroupRight)
                columnGroupRight = (int) Math.round(column.adjustedRight);
        }

        // right thresholds
        LOG.debug("Calculating right thresholds");

        // first, create a DBScan cluster of all rows by their adjusted right coordinate
        // (the two lists are parallel: rightCoordinates.get(i) belongs to rightHandRows.get(i))
        List<RowOfShapes> rightHandRows = new ArrayList<RowOfShapes>();
        List<double[]> rightCoordinates = new ArrayList<double[]>();

        for (Column column : columnGroup) {
            for (RowOfShapes row : column) {
                double right = row.getRight() - row.getXAdjustment();
                //               double rightOverlap = this.findLargeShapeOverlapOnRight(row, column, sourceImage);
                //               if (rightOverlap==0) {
                //                  // leave out any right-overlapping rows here
                //                  // since we need accurate statistics for margin detection
                //               // This is questionable - especially since a long vertical bar (see Petriushka)
                //               // tends to give all rows a left overlap. Also, because the overlap is calculated based
                //               // on the mean right & mean left, not based on any sort of margin clusters.
                //                  rightHandRows.add(row);
                //                  rightCoordinates.add(new double[] {right});
                //               }
                rightHandRows.add(row);
                rightCoordinates.add(new double[] { right });

            }
        }

        int minCardinalityForRightMargin = 5;
        DBSCANClusterer<RowOfShapes> rightMarginClusterer = new DBSCANClusterer<RowOfShapes>(rightHandRows,
                rightCoordinates);
        // NOTE(review): the meaning of the final boolean argument is not visible here.
        Set<Set<RowOfShapes>> rowClusters = rightMarginClusterer.cluster(epsilon, minCardinalityForRightMargin,
                true);

        // Presumably orders the clusters from largest to smallest, given that the loop
        // below stops at the first cluster that is too small - confirm against
        // CardinalityComparator.
        TreeSet<Set<RowOfShapes>> orderedRowClusters = new TreeSet<Set<RowOfShapes>>(
                new CardinalityComparator<RowOfShapes>());
        orderedRowClusters.addAll(rowClusters);

        // cluster counter, used only in the debug output
        int i = 0;

        // find the two right-most clusters, and assume they are the margin & the tab
        DescriptiveStatistics rightMarginStats = null;
        DescriptiveStatistics rightTabStats = null;
        for (Set<RowOfShapes> cluster : orderedRowClusters) {
            DescriptiveStatistics rightStats = new DescriptiveStatistics();
            // the deviation is only reported in the debug log below
            MeanAbsoluteDeviation rightDev = new MeanAbsoluteDeviation();
            for (RowOfShapes row : cluster) {
                // look the coordinate back up through the parallel lists
                int rowIndex = rightHandRows.indexOf(row);
                double right = rightCoordinates.get(rowIndex)[0];
                rightStats.addValue(right);
                rightDev.increment(right);
            }

            LOG.debug("Cluster " + i + ". Cardinality=" + cluster.size());
            LOG.debug("Right mean : " + rightStats.getMean());
            LOG.debug("Right dev: " + rightDev.getResult());

            if (cluster.size() >= minCardinalityForRightMargin) {
                // keep the cluster with the greatest mean as the margin and the one with
                // the second greatest mean as the tab
                if (rightMarginStats == null || rightMarginStats.getMean() < rightStats.getMean()) {
                    if (rightMarginStats != null)
                        rightTabStats = rightMarginStats;
                    rightMarginStats = rightStats;
                } else if (rightTabStats == null || rightTabStats.getMean() < rightStats.getMean()) {
                    rightTabStats = rightStats;
                }
            } else {
                break;
            }
            i++;
        } // next right-coordinate cluster

        // Defaults when no cluster was found: the right edge of the image.
        double rightMargin = sourceImage.getWidth();
        double rightTab = sourceImage.getWidth();
        if (rightMarginStats != null) {
            rightMargin = rightMarginStats.getMean();
        } else {
            // No margin cluster: fall back to the left edge of the nearest column separator
            // that spans the whole group vertically and lies to the right of the group.
            List<Rectangle> columnSeparators = sourceImage.findColumnSeparators();
            for (Rectangle columnSeparator : columnSeparators) {
                if (columnSeparator.getTop() <= columnGroupTop
                        && columnSeparator.getBottom() >= columnGroupBottom
                        && columnSeparator.getLeft() >= columnGroupRight) {
                    if (columnSeparator.getLeft() < rightMargin)
                        rightMargin = columnSeparator.getLeft();
                }
            }
        }
        if (rightTabStats != null) {
            rightTab = rightTabStats.getMean();
        }

        LOG.debug("rightMargin: " + rightMargin);
        LOG.debug("rightTab: " + rightTab);

        // left thresholds
        LOG.debug("Calculating left thresholds");

        // first, create a DBScan cluster of all rows by their adjusted left coordinate
        // (parallel lists again, mirroring the right-hand side above)
        List<RowOfShapes> leftHandRows = new ArrayList<RowOfShapes>();
        List<double[]> leftCoordinates = new ArrayList<double[]>();

        for (Column column : columnGroup) {
            for (RowOfShapes row : column) {
                double left = row.getLeft() - row.getXAdjustment();
                //               double leftOverlap = this.findLargeShapeOverlapOnLeft(row, column, sourceImage);
                //               if (leftOverlap == 0) {
                //                  // leave out any overlapping rows from margin calcs,
                //                  // since we need accurate statistics here
                //                  leftHandRows.add(row);
                //                  leftCoordinates.add(new double[] {left});
                //               }
                leftHandRows.add(row);
                leftCoordinates.add(new double[] { left });
            }
        }

        int minCardinalityForLeftMargin = 5;
        DBSCANClusterer<RowOfShapes> leftMarginClusterer = new DBSCANClusterer<RowOfShapes>(leftHandRows,
                leftCoordinates);
        Set<Set<RowOfShapes>> leftRowClusters = leftMarginClusterer.cluster(epsilon,
                minCardinalityForLeftMargin, true);

        TreeSet<Set<RowOfShapes>> orderedLeftRowClusters = new TreeSet<Set<RowOfShapes>>(
                new CardinalityComparator<RowOfShapes>());
        orderedLeftRowClusters.addAll(leftRowClusters);

        i = 0;

        // find the two left-most clusters, and assume they are the margin & the tab
        DescriptiveStatistics leftMarginStats = null;
        DescriptiveStatistics leftTabStats = null;
        for (Set<RowOfShapes> cluster : orderedLeftRowClusters) {
            DescriptiveStatistics leftStats = new DescriptiveStatistics();
            // the deviation is only reported in the debug log below
            MeanAbsoluteDeviation leftDev = new MeanAbsoluteDeviation();
            for (RowOfShapes row : cluster) {
                int rowIndex = leftHandRows.indexOf(row);
                double left = leftCoordinates.get(rowIndex)[0];
                leftStats.addValue(left);
                leftDev.increment(left);
            }

            LOG.debug("Cluster " + i + ". Cardinality=" + cluster.size());
            LOG.debug("Left mean : " + leftStats.getMean());
            LOG.debug("Left dev: " + leftDev.getResult());

            if (cluster.size() >= minCardinalityForLeftMargin) {
                // keep the cluster with the smallest mean as the margin and the one with
                // the second smallest mean as the tab
                if (leftMarginStats == null || leftMarginStats.getMean() > leftStats.getMean()) {
                    if (leftMarginStats != null)
                        leftTabStats = leftMarginStats;
                    leftMarginStats = leftStats;
                } else if (leftTabStats == null || leftTabStats.getMean() > leftStats.getMean()) {
                    leftTabStats = leftStats;
                }
            } else {
                break;
            }
            i++;
        } // next left-coordinate cluster

        // Defaults when no cluster was found: the left edge of the image.
        double leftMargin = 0;
        double leftTab = 0;
        if (leftMarginStats != null) {
            leftMargin = leftMarginStats.getMean();
        } else {
            // No margin cluster: fall back to the right edge of the nearest column separator
            // that spans the whole group vertically and lies to the left of the group.
            List<Rectangle> columnSeparators = sourceImage.findColumnSeparators();
            for (Rectangle columnSeparator : columnSeparators) {
                if (columnSeparator.getTop() <= columnGroupTop
                        && columnSeparator.getBottom() >= columnGroupBottom
                        && columnSeparator.getRight() <= columnGroupLeft) {
                    if (columnSeparator.getRight() > leftMargin)
                        leftMargin = columnSeparator.getRight();
                }
            }
        }
        if (leftTabStats != null) {
            leftTab = leftTabStats.getMean();
        }

        LOG.debug("leftMargin: " + leftMargin);
        LOG.debug("leftTab: " + leftTab);

        // Store the thresholds on every column of the group. "start" is the side where
        // reading begins: left for left-to-right images, right otherwise.
        for (Column column : columnGroup) {
            if (sourceImage.isLeftToRight()) {
                column.startMargin = leftMargin;
                if (leftTabStats != null) {
                    column.startTab = leftTab;
                    column.hasTab = true;
                } else {
                    // no tab cluster: place a nominal tab 5 average shape widths inside the margin
                    LOG.debug("No left tab - setting based on left margin");
                    column.startTab = leftMargin + (5.0 * sourceImage.getAverageShapeWidth());
                    column.hasTab = false;
                }

                column.endMargin = rightMargin;
            } else {
                column.startMargin = rightMargin;
                if (rightTabStats != null) {
                    column.startTab = rightTab;
                    column.hasTab = true;
                } else {
                    // no tab cluster: place a nominal tab 5 average shape widths inside the margin
                    LOG.debug("No right tab - setting based on right margin");
                    column.startTab = rightMargin - (5.0 * sourceImage.getAverageShapeWidth());
                    column.hasTab = false;
                }

                column.endMargin = leftMargin;
            }
            LOG.debug("Margins for " + column);
            LOG.debug("startMargin: " + column.startMargin);
            LOG.debug("startTab: " + column.startTab);
            LOG.debug("endMargin: " + column.endMargin);
        } // next column
    } // next column group
    LOG.debug("hasTab: " + hasTab);

    // tolerance (1.5 average shape widths) applied to every margin/tab comparison below
    double safetyMargin = 1.5 * sourceImage.getAverageShapeWidth();

    // Now, paragraphs are either "indented", "outdented" or not "dented" at all (no tabs).
    // This applies to the entire page.
    // To recognise indenting vs. outdenting, we have to see if the row preceding each
    // indent/outdent is full or partial. In the case of indentation, partial rows will
    // typically be followed by an indent. In the case of outdentation, partial rows will
    // typically be followed by an outdent.
    // (initial value only; overwritten once the counts below are known)
    boolean isIndented = true;

    int indentCount = 0;
    int outdentCount = 0;
    for (List<Column> columnGroup : columnGroups) {
        LOG.debug("Next column group");
        // reset per column group, not per column: the state carries over from the last
        // row of one column to the first row of the next column in the same group
        boolean prevRowPartial = false;
        for (Column column : columnGroup) {
            if (column.hasTab) {
                for (RowOfShapes row : column) {
                    if (sourceImage.isLeftToRight()) {
                        // after a partial row: a start near the tab counts as an indent,
                        // a start near the margin counts as an outdent
                        if (prevRowPartial) {
                            if (row.getLeft() - row.getXAdjustment() > column.startTab - safetyMargin) {
                                indentCount++;
                            } else if (row.getLeft() - row.getXAdjustment() < column.startMargin
                                    + safetyMargin) {
                                outdentCount++;
                            }
                        }
                        // a row is "partial" when it stops short of the end margin
                        if (row.getRight() - row.getXAdjustment() < column.endMargin - safetyMargin) {
                            prevRowPartial = true;
                        } else {
                            prevRowPartial = false;
                        }
                    } else {
                        // right-to-left: same tests, mirrored
                        if (prevRowPartial) {
                            if (row.getRight() - row.getXAdjustment() < column.startTab + safetyMargin) {
                                indentCount++;
                            } else if (row.getRight() - row.getXAdjustment() > column.startMargin
                                    - safetyMargin) {
                                outdentCount++;
                            }
                        }
                        if (row.getLeft() - row.getXAdjustment() > column.endMargin + safetyMargin) {
                            prevRowPartial = true;
                        } else {
                            prevRowPartial = false;
                        }
                    } // left-to-right?
                } // next row
            } // column has tab
        } // next column
    } // next column group
    // Indentation is assumed unless outdents outnumber indents by more than 2.
    isIndented = (indentCount + 2 >= outdentCount);
    LOG.debug("indentCount: " + indentCount);
    LOG.debug("outdentCount: " + outdentCount);
    LOG.debug("isIndented: " + isIndented);

    // order the columns
    // (relies on Column's natural ordering, which is not visible here; as with any TreeSet,
    // columns that compare as equal would be collapsed into one - TODO confirm this cannot happen)
    TreeSet<Column> orderedColumns = new TreeSet<SegmenterImpl.Column>(columns);
    columns.clear();
    columns.addAll(orderedColumns);

    // find the paragraphs found in each column
    for (Column column : columns) {
        LOG.debug("--- Next column ---");

        // break up the column into paragraphs
        Paragraph paragraph = null;
        RowOfShapes previousRow = null;
        // Rows with at most this many shapes are set aside and each becomes a paragraph of
        // its own, created when the next paragraph starts or when the column ends.
        int maxShapesForStandaloneParagraph = 2;
        List<RowOfShapes> rowsForStandaloneParagraphs = new ArrayList<RowOfShapes>();
        // end points of the guide lines drawn for the previous row (debug drawing only)
        Point2D previousPointStartMargin = null;
        Point2D previousPointStartTab = null;
        Point2D previousPointEndMargin = null;

        for (RowOfShapes row : column) {
            boolean rowForStandaloneParagraph = false;
            boolean newParagraph = false;
            if (row.getShapes().size() <= maxShapesForStandaloneParagraph) {
                rowsForStandaloneParagraphs.add(row);
                rowForStandaloneParagraph = true;
            } else {
                // Horizontal corrections applied to the thresholds below; how they are
                // computed is not visible here (helpers defined elsewhere in the class).
                double rightOverlap = this.findLargeShapeOverlapOnRight(row, column, sourceImage);
                double leftOverlap = this.findLargeShapeOverlapOnLeft(row, column, sourceImage);

                // Optional debug rendering of the thresholds used for this row.
                // drawSegmentation and graphics2D are not declared in this method
                // (presumably fields of the enclosing class).
                if (drawSegmentation) {
                    double rowVerticalMidPoint = row.getBaseLineMiddlePoint();
                    double startMarginX = column.startMargin + row.getXAdjustment();
                    double startTabX = column.startTab + row.getXAdjustment();
                    double endMarginX = column.endMargin + row.getXAdjustment();

                    // shift each guide by the same safety margin and overlap that the
                    // paragraph tests below apply
                    if (sourceImage.isLeftToRight()) {
                        startMarginX += safetyMargin;
                        startTabX -= safetyMargin;
                        endMarginX -= safetyMargin;

                        startMarginX += leftOverlap;
                        startTabX += leftOverlap;
                        endMarginX -= rightOverlap;
                    } else {
                        startMarginX -= safetyMargin;
                        startTabX += safetyMargin;
                        endMarginX += safetyMargin;

                        startMarginX -= rightOverlap;
                        startTabX -= rightOverlap;
                        endMarginX += leftOverlap;
                    }

                    Point2D.Double currentPointStartMargin = new Point2D.Double(startMarginX,
                            rowVerticalMidPoint);
                    Point2D.Double currentPointStartTab = new Point2D.Double(startTabX, rowVerticalMidPoint);
                    Point2D.Double currentPointEndMargin = new Point2D.Double(endMarginX, rowVerticalMidPoint);

                    if (previousPointStartMargin != null) {
                        // blue: start margin and end margin guides
                        graphics2D.setStroke(new BasicStroke(1));
                        graphics2D.setPaint(Color.BLUE);
                        graphics2D.drawLine((int) Math.round(previousPointStartMargin.getX()),
                                (int) Math.round(previousPointStartMargin.getY()),
                                (int) Math.round(currentPointStartMargin.getX()),
                                (int) Math.round(currentPointStartMargin.getY()));
                        graphics2D.drawLine((int) Math.round(previousPointEndMargin.getX()),
                                (int) Math.round(previousPointEndMargin.getY()),
                                (int) Math.round(currentPointEndMargin.getX()),
                                (int) Math.round(currentPointEndMargin.getY()));

                        // red: start tab guide
                        graphics2D.setPaint(Color.RED);
                        graphics2D.drawLine((int) Math.round(previousPointStartTab.getX()),
                                (int) Math.round(previousPointStartTab.getY()),
                                (int) Math.round(currentPointStartTab.getX()),
                                (int) Math.round(currentPointStartTab.getY()));

                        // NOTE(review): this redraws, in red, the same end-margin segment
                        // drawn in blue just above - confirm whether that is intended.
                        graphics2D.setPaint(Color.RED);
                        graphics2D.drawLine((int) Math.round(previousPointEndMargin.getX()),
                                (int) Math.round(previousPointEndMargin.getY()),
                                (int) Math.round(currentPointEndMargin.getX()),
                                (int) Math.round(currentPointEndMargin.getY()));
                    }
                    previousPointStartMargin = currentPointStartMargin;
                    previousPointStartTab = currentPointStartTab;
                    previousPointEndMargin = currentPointEndMargin;
                }

                if (previousRow == null) {
                    // the first non-standalone row of a column always opens a paragraph
                    LOG.debug("New paragraph (first)");
                    newParagraph = true;
                } else {
                    // A new paragraph starts when the previous row stopped short of the end
                    // margin, or (for columns with a tab) when this row starts at the tab on
                    // an indented page, or at the margin on an outdented page.
                    // NOTE(review): rightOverlap/leftOverlap were computed for the current
                    // row but are applied to previousRow's edge in the "previous EOP" tests
                    // - confirm intended.
                    if (sourceImage.isLeftToRight()) {
                        if (previousRow.getRight() - previousRow.getXAdjustment()
                                - rightOverlap < column.endMargin - safetyMargin) {
                            LOG.debug("New paragraph (previous EOP)");
                            newParagraph = true;
                        } else if (column.hasTab && isIndented && row.getLeft() - row.getXAdjustment()
                                + leftOverlap > column.startTab - safetyMargin) {
                            LOG.debug("New paragraph (indent)");
                            newParagraph = true;
                        } else if (column.hasTab && !isIndented && row.getLeft() - row.getXAdjustment()
                                + leftOverlap < column.startMargin + safetyMargin) {
                            LOG.debug("New paragraph (outdent)");
                            newParagraph = true;
                        }
                    } else {
                        if (previousRow.getLeft() - previousRow.getXAdjustment()
                                + leftOverlap > column.endMargin + safetyMargin) {
                            LOG.debug("New paragraph (previous EOP)");
                            newParagraph = true;
                        } else if (column.hasTab && isIndented && row.getRight() - row.getXAdjustment()
                                - rightOverlap < column.startTab + safetyMargin) {
                            LOG.debug("New paragraph (indent)");
                            newParagraph = true;
                        } else if (column.hasTab && !isIndented && row.getRight() - row.getXAdjustment()
                                - rightOverlap > column.startMargin - safetyMargin) {
                            LOG.debug("New paragraph (outdent)");
                            newParagraph = true;
                        }
                    } // left-to-right?
                } // have previous row
            } // standalone paragraph?

            if (!rowForStandaloneParagraph)
                LOG.debug(row.toString());

            if (newParagraph) {
                // create the pending standalone paragraphs first, then the paragraph for
                // the current row
                if (rowsForStandaloneParagraphs.size() > 0) {
                    for (RowOfShapes oneRow : rowsForStandaloneParagraphs) {
                        LOG.debug("Standalone paragraph");
                        LOG.debug("Standalone row: left(" + oneRow.getLeft() + "), top(" + oneRow.getTop()
                                + "), right(" + oneRow.getRight() + "), bottom(" + oneRow.getBottom() + ")");
                        Paragraph standaloneParagraph = sourceImage.newParagraph();
                        standaloneParagraph.getRows().add(oneRow);
                    }
                    rowsForStandaloneParagraphs.clear();
                }
                paragraph = sourceImage.newParagraph();
            }
            //LOG.debug("Row: left(" + row.getLeft() + "), right(" + row.getRight() + "), width(" + (row.getRight() - row.getLeft() + 1) + ")");

            if (!rowForStandaloneParagraph) {
                // paragraph is non-null here: the first non-standalone row of the column
                // always sets newParagraph (previousRow == null) and so creates one
                paragraph.getRows().add(row);
                previousRow = row;
            }
        } // next row in column
        // standalone rows still pending at the end of the column
        if (rowsForStandaloneParagraphs.size() > 0) {
            for (RowOfShapes oneRow : rowsForStandaloneParagraphs) {
                LOG.debug("Standalone paragraph");
                LOG.debug("Standalone row: left(" + oneRow.getLeft() + "), top(" + oneRow.getTop() + "), right("
                        + oneRow.getRight() + "), bottom(" + oneRow.getBottom() + ")");
                Paragraph standaloneParagraph = sourceImage.newParagraph();
                standaloneParagraph.getRows().add(oneRow);
            }
            rowsForStandaloneParagraphs.clear();
        }
    } // next column

}

From source file:org.a3badran.platform.logging.writer.MetricsWriter.java

/**
 * Collects every metric currently tracked by this writer into one flat map.
 *
 * <p>Each entry of {@code sampleMetrics} contributes a {@code .sampleCount} value plus the
 * distribution values ({@code .max}, {@code .min}, {@code .avg}, {@code .50p}, {@code .90p},
 * {@code .99p}); each entry of {@code sampleCounterMetrics} contributes the distribution values
 * only. The totals in {@code scopeTotalMetrics} and {@code appTotalMetrics} are copied under
 * their own keys. All statistic values are narrowed to {@code long} with a plain cast.
 *
 * @return a newly created map from metric name to value
 */
private Map<String, Long> getAllMetrics() {
    final Map<String, Long> snapshot = new HashMap<String, Long>();

    for (Entry<String, DescriptiveStatistics> sample : sampleMetrics.entrySet()) {
        final String prefix = sample.getKey();
        // read from a copy to reduce locking
        final DescriptiveStatistics frozen = sample.getValue().copy();
        snapshot.put(prefix + ".sampleCount", (long) frozen.getN());
        putDistribution(snapshot, prefix, frozen);
    }

    for (Entry<String, DescriptiveStatistics> counter : sampleCounterMetrics.entrySet()) {
        // read from a copy to reduce locking
        putDistribution(snapshot, counter.getKey(), counter.getValue().copy());
    }

    for (Entry<String, AtomicLong> scopeTotal : scopeTotalMetrics.entrySet()) {
        snapshot.put(scopeTotal.getKey(), scopeTotal.getValue().longValue());
    }

    for (Entry<String, AtomicLong> appTotal : appTotalMetrics.entrySet()) {
        snapshot.put(appTotal.getKey(), appTotal.getValue().longValue());
    }

    return snapshot;
}

/**
 * Writes max, min, mean and the 50th/90th/99th percentiles of {@code stats} into
 * {@code target}, each under {@code prefix} plus a fixed suffix.
 */
private void putDistribution(Map<String, Long> target, String prefix, DescriptiveStatistics stats) {
    target.put(prefix + ".max", (long) stats.getMax());
    target.put(prefix + ".min", (long) stats.getMin());
    target.put(prefix + ".avg", (long) stats.getMean());
    target.put(prefix + ".50p", (long) stats.getPercentile(50));
    target.put(prefix + ".90p", (long) stats.getPercentile(90));
    target.put(prefix + ".99p", (long) stats.getPercentile(99));
}

From source file:org.apache.jackrabbit.oak.benchmark.AbstractTest.java

/**
 * Runs this benchmark against one repository: sets up, warms up, then measures once per
 * concurrency level and prints a summary line for every level that produced samples.
 *
 * <p>Each summary line holds the fixture name, the concurrency level, min, 10th/50th/90th
 * percentile, max and the sample count. It is written to {@code System.out} and, when the
 * {@code out} field is non-null, to {@code out} as comma-separated values.
 * {@code tearDown()} runs even if the warm-up or a measurement throws.
 *
 * @param fixture           fixture whose {@code toString()} labels the output lines
 * @param repository        repository handed to {@code setUp}
 * @param concurrencyLevels levels to measure; {@code null} or empty means a single run at level 1
 * @throws Exception if set-up, execution or tear-down fails
 */
private void runTest(RepositoryFixture fixture, Repository repository, List<Integer> concurrencyLevels)
        throws Exception {

    setUp(repository, CREDENTIALS);
    try {

        // Warm-up: keep executing until the warm-up window has elapsed or a halt was requested.
        final long warmupDeadline = System.currentTimeMillis() + WARMUP;
        boolean halted = false;
        while (System.currentTimeMillis() < warmupDeadline && !halted) {
            // The halt flag is sampled before the iteration, so the iteration that observes
            // the request still executes and the loop ends afterwards.
            halted = haltRequested;
            execute();
        }

        final List<Integer> levels;
        if (concurrencyLevels == null || concurrencyLevels.isEmpty()) {
            levels = Arrays.asList(1);
        } else {
            levels = concurrencyLevels;
        }

        for (Integer concurrency : levels) {
            // Measured run for this concurrency level
            final DescriptiveStatistics statistics = runTest(concurrency);
            if (statistics.getN() <= 0) {
                // nothing was recorded, so there is nothing to report
                continue;
            }

            final String label = fixture.toString();
            final double min = statistics.getMin();
            final double p10 = statistics.getPercentile(10.0);
            final double p50 = statistics.getPercentile(50.0);
            final double p90 = statistics.getPercentile(90.0);
            final double max = statistics.getMax();
            final long samples = statistics.getN();

            System.out.format("%-28.28s  %6d  %6.0f  %6.0f  %6.0f  %6.0f  %6.0f  %6d%n", label,
                    concurrency, min, p10, p50, p90, max, samples);
            if (out != null) {
                out.format("%-28.28s, %6d, %6.0f, %6.0f, %6.0f, %6.0f, %6.0f, %6d%n", label,
                        concurrency, min, p10, p50, p90, max, samples);
            }
        }
    } finally {
        tearDown();
    }
}

From source file:org.apache.jackrabbit.performance.AbstractPerformanceTest.java

/**
 * Appends one result line for {@code name} to the report file {@code target/<test>.txt}.
 *
 * <p>If the file did not exist before this call, a header line naming the columns
 * (min, 10%, 50%, 90%, max) is written first. The result line holds the minimum, the
 * 10th/50th/90th percentiles and the maximum of {@code statistics}.
 *
 * @param test       test name; determines the report file name and appears in the header
 * @param name       label written at the start of the result line
 * @param statistics samples to summarise
 * @throws IOException if the report file cannot be opened
 */
private void writeReport(String test, String name, DescriptiveStatistics statistics) throws IOException {
    final File reportFile = new File("target", test + ".txt");

    // Checked before the writer is opened, since opening the writer may create the file.
    final boolean headerMissing = !reportFile.exists();

    final PrintWriter reportWriter = new PrintWriter(new FileWriterWithEncoding(reportFile, "UTF-8", true));
    try {
        if (headerMissing) {
            reportWriter.format("# %-34.34s     min     10%%     50%%     90%%     max%n", test);
        }

        final double min = statistics.getMin();
        final double p10 = statistics.getPercentile(10.0);
        final double p50 = statistics.getPercentile(50.0);
        final double p90 = statistics.getPercentile(90.0);
        final double max = statistics.getMax();
        reportWriter.format("%-36.36s  %6.0f  %6.0f  %6.0f  %6.0f  %6.0f%n", name, min, p10, p50, p90, max);
    } finally {
        reportWriter.close();
    }
}

From source file:org.apache.sling.performance.PerformanceRecord.java

/**
 * Compares this record's statistics with <code>reference</code> using the ratio of the two
 * 50th percentiles (this record's value divided by the reference's value).
 *
 * <p>No check is made when <code>threshold</code> is <code>null</code> or not positive.
 *
 * @param reference Reference statistics
 * @return An error string if the ratio is greater than the threshold, <code>null</code> otherwise
 */
public String checkThreshold(DescriptiveStatistics reference) {
    final boolean checkDisabled = threshold == null || threshold.doubleValue() <= 0;
    if (checkDisabled) {
        return null;
    }

    final double limit = threshold.doubleValue();
    final double ownMedian = this.statistics.getPercentile(50);
    final double referenceMedian = reference.getPercentile(50);
    final double ratio = ownMedian / referenceMedian;

    return ratio > limit
            ? String.format("Threshold exceeded! Expected <%6.2f, actual %6.2f", limit, ratio)
            : null;
}

From source file:org.datacleaner.beans.NumberAnalyzer.java

/**
 * Builds the analyzer result as a crosstab of input columns against measures.
 *
 * <p>Every column receives its null count and row count. The remaining measures are only
 * filled in when the column's statistics contain at least one value. Median, 25th/75th
 * percentile, skewness and kurtosis are registered and filled in only when the
 * {@code descriptiveStatistics} flag is set, in which case the column statistics are cast to
 * {@link DescriptiveStatistics}; otherwise they are cast to {@link SummaryStatistics}.
 *
 * @return the result wrapping the analysed columns and the populated crosstab
 */
@Override
public NumberAnalyzerResult getResult() {
    // Measures registered regardless of the statistics implementation in use.
    final CrosstabDimension measures = new CrosstabDimension(DIMENSION_MEASURE);
    measures.addCategory(MEASURE_ROW_COUNT);
    measures.addCategory(MEASURE_NULL_COUNT);
    measures.addCategory(MEASURE_HIGHEST_VALUE);
    measures.addCategory(MEASURE_LOWEST_VALUE);
    measures.addCategory(MEASURE_SUM);
    measures.addCategory(MEASURE_MEAN);
    measures.addCategory(MEASURE_GEOMETRIC_MEAN);
    measures.addCategory(MEASURE_STANDARD_DEVIATION);
    measures.addCategory(MEASURE_VARIANCE);
    measures.addCategory(MEASURE_SECOND_MOMENT);
    measures.addCategory(MEASURE_SUM_OF_SQUARES);

    // Measures registered only when the descriptiveStatistics flag is set.
    if (descriptiveStatistics) {
        measures.addCategory(MEASURE_MEDIAN);
        measures.addCategory(MEASURE_PERCENTILE25);
        measures.addCategory(MEASURE_PERCENTILE75);
        measures.addCategory(MEASURE_SKEWNESS);
        measures.addCategory(MEASURE_KURTOSIS);
    }

    final CrosstabDimension columns = new CrosstabDimension(DIMENSION_COLUMN);
    for (InputColumn<? extends Number> inputColumn : _columns) {
        columns.addCategory(inputColumn.getName());
    }

    final Crosstab<Number> crosstab = new Crosstab<Number>(Number.class, columns, measures);
    for (InputColumn<? extends Number> inputColumn : _columns) {
        final CrosstabNavigator<Number> nav = crosstab.navigate().where(columns, inputColumn.getName());
        final NumberAnalyzerColumnDelegate delegate = _columnDelegates.get(inputColumn);
        final StatisticalSummary summary = delegate.getStatistics();

        // NOTE(review): addAttachment presumably attaches to the cell the navigator currently
        // points at, so each attachment stays directly after its where(...).put(...) call.
        final int nulls = delegate.getNullCount();
        nav.where(measures, MEASURE_NULL_COUNT).put(nulls);
        if (nulls > 0) {
            addAttachment(nav, delegate.getNullAnnotation(), inputColumn);
        }

        final int rows = delegate.getNumRows();
        nav.where(measures, MEASURE_ROW_COUNT).put(rows);

        if (summary.getN() <= 0) {
            // no non-null values: the value-based measures stay empty for this column
            continue;
        }

        // Geometric mean, second moment and sum of squares are not read through the
        // StatisticalSummary reference; they come from the concrete implementation.
        final DescriptiveStatistics detailed;
        final double geometricMean;
        final double secondMoment;
        final double sumOfSquares;
        if (descriptiveStatistics) {
            detailed = (DescriptiveStatistics) summary;
            geometricMean = detailed.getGeometricMean();
            sumOfSquares = detailed.getSumsq();
            secondMoment = new SecondMoment().evaluate(detailed.getValues());
        } else {
            detailed = null;
            final SummaryStatistics streaming = (SummaryStatistics) summary;
            geometricMean = streaming.getGeometricMean();
            secondMoment = streaming.getSecondMoment();
            sumOfSquares = streaming.getSumsq();
        }

        nav.where(measures, MEASURE_HIGHEST_VALUE).put(summary.getMax());
        addAttachment(nav, delegate.getMaxAnnotation(), inputColumn);

        nav.where(measures, MEASURE_LOWEST_VALUE).put(summary.getMin());
        addAttachment(nav, delegate.getMinAnnotation(), inputColumn);

        nav.where(measures, MEASURE_SUM).put(summary.getSum());
        nav.where(measures, MEASURE_MEAN).put(summary.getMean());
        nav.where(measures, MEASURE_GEOMETRIC_MEAN).put(geometricMean);
        nav.where(measures, MEASURE_STANDARD_DEVIATION).put(summary.getStandardDeviation());
        nav.where(measures, MEASURE_VARIANCE).put(summary.getVariance());
        nav.where(measures, MEASURE_SUM_OF_SQUARES).put(sumOfSquares);
        nav.where(measures, MEASURE_SECOND_MOMENT).put(secondMoment);

        if (detailed != null) {
            nav.where(measures, MEASURE_MEDIAN).put(detailed.getPercentile(50.0));
            nav.where(measures, MEASURE_PERCENTILE25).put(detailed.getPercentile(25.0));
            nav.where(measures, MEASURE_PERCENTILE75).put(detailed.getPercentile(75.0));
            nav.where(measures, MEASURE_SKEWNESS).put(detailed.getSkewness());
            nav.where(measures, MEASURE_KURTOSIS).put(detailed.getKurtosis());
        }
    }
    return new NumberAnalyzerResult(_columns, crosstab);
}

From source file:org.fusesource.eca.processor.StatisticsCalculator.java

/**
 * Records {@code value} in the event cache and writes the statistic selected by {@code type}
 * into {@code statsNode}.
 *
 * <p>Nothing happens when {@code value} is {@code null} or no event cache is set. For
 * {@link StatisticsType#RATE} only {@code calculateRate} is invoked. For every other type the
 * statistic is computed over the cache's current window; nothing is written when the window
 * is {@code null} or empty. Types without a dedicated case write the full set of statistics,
 * including the rate and the count.
 *
 * @param type      statistic to compute; must not be {@code null}
 * @param value     new observation to add to the cache; ignored when {@code null}
 * @param statsNode JSON node that receives the computed value(s)
 * @throws Exception declared for subclasses and helpers
 */
protected void process(StatisticsType type, Number value, ObjectNode statsNode) throws Exception {
    EventCache<Number> cache = this.eventCache;
    if (value == null || cache == null) {
        return;
    }

    cache.add(value);
    if (type.equals(StatisticsType.RATE)) {
        calculateRate(statsNode);
        return;
    }

    // Read the window from the same cache instance that was null-checked and updated above,
    // rather than re-reading the field, which may have been reassigned in the meantime.
    List<Number> list = cache.getWindow();
    if (list == null || list.isEmpty()) {
        return;
    }

    DescriptiveStatistics descriptiveStatistics = new DescriptiveStatistics();
    for (Number number : list) {
        descriptiveStatistics.addValue(number.doubleValue());
    }

    // NOTE(review): the "gemetric mean" key is misspelled but is kept unchanged because
    // consumers of the JSON output may depend on it.
    switch (type) {
    case MEAN:
        statsNode.put("mean", descriptiveStatistics.getMean());
        break;
    case GEOMETRIC_MEAN:
        statsNode.put("gemetric mean", descriptiveStatistics.getGeometricMean());
        break;
    case STDDEV:
        statsNode.put("std-dev", descriptiveStatistics.getStandardDeviation());
        break;
    case MIN:
        statsNode.put("minimum", descriptiveStatistics.getMin());
        break;
    case MAX:
        statsNode.put("maximum", descriptiveStatistics.getMax());
        break;
    case SKEWNESS:
        statsNode.put("skewness", descriptiveStatistics.getSkewness());
        break;
    case KUTOSIS:
        statsNode.put("kurtosis", descriptiveStatistics.getKurtosis());
        break;
    case VARIANCE:
        statsNode.put("variance", descriptiveStatistics.getVariance());
        break;
    case COUNT:
        statsNode.put("count", list.size());
        // Without this break COUNT fell through into the default branch and emitted every
        // statistic instead of only the count.
        break;
    default:
        statsNode.put("number", descriptiveStatistics.getN());
        statsNode.put("mean", descriptiveStatistics.getMean());
        statsNode.put("gemetric mean", descriptiveStatistics.getGeometricMean());
        statsNode.put("minimum", descriptiveStatistics.getMin());
        statsNode.put("maximum", descriptiveStatistics.getMax());
        statsNode.put("std-dev", descriptiveStatistics.getStandardDeviation());
        statsNode.put("median", descriptiveStatistics.getPercentile(50));
        statsNode.put("skewness", descriptiveStatistics.getSkewness());
        statsNode.put("kurtosis", descriptiveStatistics.getKurtosis());
        statsNode.put("variance", descriptiveStatistics.getVariance());
        calculateRate(statsNode);
        statsNode.put("count", list.size());
        break;
    }
}