Example usage of org.apache.commons.math.stat.descriptive.moment.StandardDeviation.getResult()

List of usage examples for org.apache.commons.math.stat.descriptive.moment.StandardDeviation.getResult()

Introduction

On this page you can find usage examples for the method org.apache.commons.math.stat.descriptive.moment.StandardDeviation.getResult().

Prototype

@Override
public double getResult() 

Source Link

Usage

From source file:com.joliciel.jochre.graphics.SourceImageImpl.java

/**
 * Returns the clusters of rows for this image, computing and caching them on the first call.
 * <p>
 * Each row is represented by a single value: the distance between the base-line and the
 * mean-line of the first shape in the row. The standard deviation of these values is
 * passed as the first argument to {@code DBSCANClusterer.cluster} (presumably the
 * neighbourhood radius - confirm against the clusterer's documentation).
 *
 * @return the row clusters, never recomputed once a non-null result has been stored
 */
@Override
public Set<Set<RowOfShapes>> getRowClusters() {
    if (rowClusters == null) {
        StandardDeviation heightStdDev = new StandardDeviation();

        // Build the row list once and derive the heights from that same list, so that
        // rows.get(i) and rowHeights.get(i) are guaranteed to describe the same row.
        List<RowOfShapes> rows = new ArrayList<RowOfShapes>(this.getRows());
        List<double[]> rowHeights = new ArrayList<double[]>(rows.size());
        for (RowOfShapes row : rows) {
            // NOTE(review): assumes every row has at least one shape; an empty row
            // makes iterator().next() throw NoSuchElementException.
            Shape shape = row.getShapes().iterator().next();
            int height = shape.getBaseLine() - shape.getMeanLine();
            rowHeights.add(new double[] { height });
            heightStdDev.increment(height);
        }

        double stdDevHeight = heightStdDev.getResult();
        DBSCANClusterer<RowOfShapes> clusterer = new DBSCANClusterer<RowOfShapes>(rows, rowHeights);
        rowClusters = clusterer.cluster(stdDevHeight, 2, true);
        LOG.debug("Found " + rowClusters.size() + " row clusters.");
    }
    return rowClusters;
}

From source file:com.joliciel.jochre.graphics.RowOfShapesImpl.java

/**
 * Assign guidelines for a certain subset of shapes, and return the x-height.
 * @param startShape/* ww w.  j  a va  2s . c o  m*/
 * @param endShape
 * @return
 */
int assignGuideLines(List<GroupOfShapes> groupsToAssign) {
    LOG.debug("assignGuideLines internal");
    double meanHorizontalSlope = this.getContainer().getMeanHorizontalSlope();

    // the base-line and mean-line will be at a fixed distance away from the midpoint
    // the question is, which distance!
    // To find this out, we count number of black pixels on each row above this line
    // And then start analysing from the top and the bottom until the number drops off sharply

    // The notion of "groupsToAssign" is used to only assign guidelines
    // to a subset of the groups on the line
    // when the line contains two different font sizes
    List<Shape> shapes = new ArrayList<Shape>();
    if (groupsToAssign != null) {
        for (GroupOfShapes group : groupsToAssign) {
            shapes.addAll(group.getShapes());
        }
    } else {
        shapes = this.getShapes();
    }

    int i = 0;
    DescriptiveStatistics shapeWidthStats = new DescriptiveStatistics();
    DescriptiveStatistics shapeHeightStats = new DescriptiveStatistics();

    for (Shape shape : this.getShapes()) {
        shapeWidthStats.addValue(shape.getWidth());
        shapeHeightStats.addValue(shape.getHeight());
    }

    double minWidth = shapeWidthStats.getPercentile(25);
    double maxWidth = shapeWidthStats.getPercentile(75);
    double minHeight = shapeHeightStats.getPercentile(45);
    double maxHeight = shapeHeightStats.getPercentile(75);

    double rowMidPointX = (double) (this.getLeft() + this.getRight()) / 2.0;

    // calculating the Y midpoint by the shapes in the row, instead of by the top & bottom of row
    Mean rowMidPointYMean = new Mean();
    for (Shape shape : this.getShapes()) {
        // only add points whose shape is of "average" width and height (to leave out commas, etc.)
        if (shape.getWidth() >= minWidth && shape.getWidth() <= maxWidth && shape.getHeight() >= minHeight
                && shape.getHeight() <= maxHeight) {
            rowMidPointYMean.increment((double) (shape.getBottom() + shape.getTop()) / 2.0);
        }
    }

    double rowMidPointY = (double) (this.getTop() + this.getBottom()) / 2.0;
    if (rowMidPointYMean.getN() > 0)
        rowMidPointY = rowMidPointYMean.getResult();
    LOG.debug("rowMidPointX: " + rowMidPointX);
    LOG.debug("rowMidPointY: " + rowMidPointY);

    // figure out where the top-most shape starts and the bottom-most shape ends, relative to the y midline
    int minTop = Integer.MAX_VALUE;
    int maxBottom = Integer.MIN_VALUE;
    List<Integer> rowYMidPoints = new ArrayList<Integer>(shapes.size());
    for (Shape shape : shapes) {
        double shapeMidPointX = (double) (shape.getLeft() + shape.getRight()) / 2.0;
        int shapeMidPointY = (int) Math
                .round(rowMidPointY + (meanHorizontalSlope * (shapeMidPointX - rowMidPointX)));
        rowYMidPoints.add(shapeMidPointY);

        int relativeTop = shape.getTop() - shapeMidPointY;
        int relativeBottom = shape.getBottom() - shapeMidPointY;

        if (relativeTop < minTop)
            minTop = relativeTop;
        if (relativeBottom > maxBottom)
            maxBottom = relativeBottom;
    }
    if (minTop > 0)
        minTop = 0;
    if (maxBottom < 0)
        maxBottom = 0;

    int yIntervalTop = 0 - minTop;
    int yIntervalBottom = maxBottom;
    int yInterval = yIntervalTop + 1 + yIntervalBottom;
    LOG.debug("yIntervalTop: " + yIntervalTop);
    LOG.debug("yIntervalBottom: " + yIntervalBottom);
    LOG.debug("yInterval: " + yInterval);
    int[] pixelCounts = new int[yInterval];

    // Get the pixel count for each row
    // examining one shape at a time to limit ourselves to the pixels that are
    // actually considered to be in this row
    int blackThreshold = this.getContainer().getSeparationThreshold();
    int shapeIndex = 0;
    int shapeCount = 0;
    for (Shape shape : shapes) {
        if (shape.getHeight() >= minHeight) {
            LOG.trace(shape.toString());
            shapeCount++;
            int shapeMidPointY = rowYMidPoints.get(shapeIndex);
            int zeroLine = shapeMidPointY - yIntervalTop;
            int topIndex = shape.getTop() - zeroLine;
            for (int x = 0; x < shape.getWidth(); x++) {
                for (int y = 0; y < shape.getHeight(); y++) {
                    int yIndex = topIndex + y;
                    if (yIndex >= 0 && yIndex < pixelCounts.length
                            && shape.isPixelBlack(x, y, blackThreshold)) {
                        pixelCounts[yIndex]++;
                    }
                }
            }
        }
        shapeIndex++;
    }
    LOG.debug("Got pixels from " + shapeCount + " shapes.");

    boolean notEnoughShapes = shapeCount < 3;
    LOG.debug("notEnoughShapes? " + notEnoughShapes);

    // We start at the top
    // As soon as we reach a line with more pixels than the mean, we assume this is the mean-line
    Mean pixelCountMeanTop = new Mean();
    StandardDeviation pixelCountStdDevTop = new StandardDeviation();
    for (i = 0; i <= yIntervalTop; i++) {
        pixelCountMeanTop.increment(pixelCounts[i]);
        pixelCountStdDevTop.increment(pixelCounts[i]);
    }
    LOG.debug("Top: pixel count mean: " + pixelCountMeanTop.getResult() + ", std dev: "
            + pixelCountStdDevTop.getResult());

    double threshold = pixelCountMeanTop.getResult() * 1.1;
    if (notEnoughShapes) {
        threshold = threshold / 2.0;
    }
    double lowerThreshold = threshold / 2.0;

    LOG.debug("Top threshold: " + threshold);
    LOG.debug("Top lowerThreshold: " + lowerThreshold);

    int meanLine = 0;
    boolean findMeanLine = true;
    for (i = 0; i <= yIntervalTop; i++) {
        int pixelCount = pixelCounts[i];
        if (findMeanLine && pixelCount > threshold) {
            meanLine = i;
            findMeanLine = false;
        } else if (!findMeanLine && pixelCount < lowerThreshold) {
            findMeanLine = true;
        }
    }

    // We start at the bottom
    // As soon as we reach a line with more pixels than the mean, we assume this is the base-line

    Mean pixelCountMeanBottom = new Mean();
    StandardDeviation pixelCountStdDevBottom = new StandardDeviation();
    for (i = pixelCounts.length - 1; i >= yIntervalTop; i--) {
        pixelCountMeanBottom.increment(pixelCounts[i]);
        pixelCountStdDevBottom.increment(pixelCounts[i]);
    }
    LOG.debug("Bottom: pixel count mean: " + pixelCountMeanBottom.getResult() + ", std dev: "
            + pixelCountStdDevBottom.getResult());

    threshold = pixelCountMeanBottom.getResult() * 1.1;
    if (notEnoughShapes) {
        threshold = threshold / 2.0;
    }
    lowerThreshold = threshold / 2.0;

    LOG.debug("Bottom threshold: " + threshold);
    LOG.debug("Bottom lowerThreshold: " + lowerThreshold);
    int baseLine = meanLine;
    boolean findBaseLine = true;
    for (i = pixelCounts.length - 1; i >= yIntervalTop; i--) {
        int pixelCount = pixelCounts[i];
        if (findBaseLine && pixelCount > threshold) {
            baseLine = i;
            findBaseLine = false;
        } else if (!findBaseLine && pixelCount < lowerThreshold) {
            findBaseLine = true;
        }
    }

    for (i = 0; i < yInterval; i++) {
        int pixelCount = pixelCounts[i];
        if (i == meanLine)
            LOG.trace("======= MEAN LINE " + i + " ==========");
        LOG.trace("pixel row " + i + ". pixel count " + pixelCount);
        if (i == baseLine)
            LOG.trace("======= BASE LINE " + i + " ==========");
    }

    // assign base lines and mean lines to each shape
    shapeIndex = 0;
    for (Shape shape : shapes) {
        int shapeMidPointY = rowYMidPoints.get(shapeIndex);
        int yMeanline = (shapeMidPointY - yIntervalTop) + meanLine;
        int yBaseline = (shapeMidPointY - yIntervalTop) + baseLine;
        LOG.trace(shape.toString() + ", meanLine: " + (yMeanline - shape.getTop()) + ", baseLine: "
                + (yBaseline - shape.getTop()));
        shape.setBaseLine(yBaseline - shape.getTop());
        shape.setMeanLine(yMeanline - shape.getTop());
        shapeIndex++;
    } // next shape

    int xHeight = baseLine - meanLine;
    return xHeight;
}

From source file:com.joliciel.jochre.graphics.SourceImageImpl.java

/**
 * Returns the mean horizontal slope of the rows of this image, computing and caching
 * it on the first call.
 * <p>
 * Only rows whose regression has more than 2 points and a defined (non-NaN) slope
 * are considered. The result is the mean of those slopes lying within one standard
 * deviation of their overall mean. If no slope survives that filter, the overall
 * mean is returned instead; if no row qualifies at all, the result is 0.0.
 *
 * @return the mean horizontal slope, or 0.0 if no row provided a usable regression
 */
@Override
public double getMeanHorizontalSlope() {
    if (!meanHorizontalSlopeCalculated) {
        // Calculate the average regression to be used for analysis
        Mean meanForSlope = new Mean();
        StandardDeviation stdDevForSlope = new StandardDeviation();
        List<Double> slopes = new ArrayList<Double>();
        for (RowOfShapes row : this.getRows()) {
            SimpleRegression regression = row.getRegression();
            // only include rows for which regression was really calculated (more than 2 points)
            if (regression.getN() > 2) {
                double slope = regression.getSlope();
                // A degenerate regression yields a NaN slope; a single NaN would turn the
                // mean, the standard deviation and the cached result into NaN.
                if (!Double.isNaN(slope)) {
                    meanForSlope.increment(slope);
                    stdDevForSlope.increment(slope);
                    slopes.add(slope);
                }
            }
        }

        double slopeMean = 0.0;
        double slopeStdDev = 0.0;

        if (meanForSlope.getN() > 0) {
            slopeMean = meanForSlope.getResult();
            slopeStdDev = stdDevForSlope.getResult();
        }
        LOG.debug("slopeMean: " + slopeMean);
        LOG.debug("slopeStdDev: " + slopeStdDev);

        if (!slopes.isEmpty()) {
            // second pass: drop outliers further than one standard deviation from the mean
            double minSlope = slopeMean - slopeStdDev;
            double maxSlope = slopeMean + slopeStdDev;
            Mean trimmedMean = new Mean();
            for (double slope : slopes) {
                if (minSlope <= slope && slope <= maxSlope) {
                    trimmedMean.increment(slope);
                }
            }

            // Mean.getResult() is NaN when nothing was added; use the untrimmed mean then.
            meanHorizontalSlope = trimmedMean.getN() > 0 ? trimmedMean.getResult() : slopeMean;
        } else {
            meanHorizontalSlope = 0.0;
        }
        LOG.debug("meanHorizontalSlope: " + meanHorizontalSlope);
        meanHorizontalSlopeCalculated = true;
    }
    return meanHorizontalSlope;
}

From source file:com.joliciel.jochre.graphics.SegmenterImpl.java

/**
 * Split rows if they're particularly high, and contain considerable white space in the middle.
 * Shapes causing the join will be removed if too high, or attached to the closest row otherwise.
 * <p>
 * For each candidate row, black pixels are counted per pixel row along a line with the
 * image's mean horizontal slope. Where the counts alternate between dense and sparse
 * more than once, separators are placed at the sparsest pixel rows and the shapes are
 * redistributed among new rows, which then replace the original row in the image.
 *
 * @param sourceImage the image whose rows are examined and, where needed, replaced
 */
void splitRows(SourceImage sourceImage) {
    LOG.debug("########## splitRows #########");

    // Calculate the min row height to be considered for splitting
    double minHeightForSplit = sourceImage.getAverageShapeHeight();
    LOG.debug("minHeightForSplit: " + minHeightForSplit);

    double slopeMean = sourceImage.getMeanHorizontalSlope();

    // Candidates: rows with a non-zero width that are at least as high as the threshold
    List<RowOfShapes> candidateRows = new ArrayList<RowOfShapes>();
    for (RowOfShapes row : sourceImage.getRows()) {
        if (row.getRight() == row.getLeft())
            continue;
        int height = row.getBottom() - row.getTop();
        if (height >= minHeightForSplit) {
            LOG.debug("Adding candidate " + row.toString());
            candidateRows.add(row);
        }
    }

    // For each row to be considered for splitting, see if there are lines of white space inside it.
    Hashtable<RowOfShapes, List<RowOfShapes>> splitRows = new Hashtable<RowOfShapes, List<RowOfShapes>>();
    for (RowOfShapes row : candidateRows) {
        // Two-point regression: a line through the row's top-left corner with the mean slope
        SimpleRegression regression = new SimpleRegression();
        // y = intercept + slope * x
        LOG.debug("Left point: (" + row.getLeft() + " , " + row.getTop() + ")");
        regression.addData(row.getLeft(), row.getTop());
        double rightHandY = row.getTop() + ((double) (row.getRight() - row.getLeft()) * slopeMean);
        LOG.debug("Right point: (" + row.getRight() + " , " + rightHandY + ")");
        regression.addData(row.getRight(), rightHandY);

        // yDelta is the vertical drift of the sloped line across the row's width;
        // it is added as a margin both above and below the row's own height
        int yDelta = (int) Math.ceil(Math.abs(rightHandY - (double) row.getTop()));
        int yInterval = yDelta + (row.getBottom() - row.getTop() + 1) + yDelta;

        LOG.debug("yDelta: " + yDelta);
        LOG.debug("yInterval: " + yInterval);
        // let's get pixel counts shape by shape, and leave out the rest (in case rows overlap vertically)
        int[] pixelCounts = new int[yInterval];
        for (Shape shape : row.getShapes()) {
            LOG.trace("Shape " + shape);
            // despite the name, this is the regression line's predicted Y at the shape's left edge
            int yDeltaAtLeft = (int) Math.round(regression.predict(shape.getLeft()));
            LOG.trace("yDeltaAtLeft: " + yDeltaAtLeft);
            // the shape offset + the offset between the regression line and the row top
            // + the delta we left at the start in case the line slopes upwards to the right
            int topIndex = (shape.getTop() - row.getTop()) + (row.getTop() - yDeltaAtLeft) + yDelta;
            LOG.trace("topIndex: (" + shape.getTop() + " - " + row.getTop() + ") + (" + row.getTop() + " - "
                    + yDeltaAtLeft + ") + " + yDelta + " = " + topIndex);
            // NOTE(review): no bounds check here; relies on every shape lying inside the
            // row's bounding box so that topIndex + y stays within pixelCounts - confirm
            for (int x = 0; x < shape.getWidth(); x++) {
                for (int y = 0; y < shape.getHeight(); y++) {
                    if (shape.isPixelBlack(x, y, sourceImage.getBlackThreshold())) {
                        pixelCounts[topIndex + y]++;
                    }
                }
            }
        }

        // The standard deviation is only used for the debug output below
        Mean pixelCountMean = new Mean();
        StandardDeviation pixelCountStdDev = new StandardDeviation();
        for (int i = 0; i < yInterval; i++) {
            LOG.debug("Pixel count " + i + ": " + pixelCounts[i]);
            pixelCountMean.increment(pixelCounts[i]);
            pixelCountStdDev.increment(pixelCounts[i]);
        }
        LOG.debug("pixel count mean: " + pixelCountMean.getResult() + ", std dev: "
                + pixelCountStdDev.getResult());

        // If there's a split required, we're going to go considerably above and below the mean several times
        double lowThreshold = pixelCountMean.getResult() / 2.0;
        double highThreshold = pixelCountMean.getResult() * 2.0;
        boolean inRow = false;
        // Alternating transitions: even positions are "into row", odd positions are "out of row"
        List<Integer> switches = new ArrayList<Integer>();
        for (int i = 0; i < yInterval; i++) {
            if (!inRow && pixelCounts[i] > highThreshold) {
                LOG.debug("In row at " + i + ", pixel count " + pixelCounts[i]);
                inRow = true;
                switches.add(i);
            } else if (inRow && pixelCounts[i] < lowThreshold) {
                LOG.debug("Out of row at " + i + ", pixel count " + pixelCounts[i]);
                inRow = false;
                switches.add(i);
            }
        }
        if (switches.size() > 2) {
            // we have more than one row
            List<Integer> rowSeparations = new ArrayList<Integer>();

            // find the row separators
            // each iteration takes one (out-of-row, into-row) pair, i.e. one gap between dense runs
            for (int switchIndex = 1; switchIndex < switches.size() - 2; switchIndex = switchIndex + 2) {
                int outOfRow = switches.get(switchIndex);
                int intoRow = switches.get(switchIndex + 1);
                int minPixelCount = (int) Math.ceil(highThreshold);
                int minIndex = -1;
                // find the row with the lowest pixel count
                for (int i = outOfRow; i <= intoRow; i++) {
                    if (pixelCounts[i] < minPixelCount) {
                        minPixelCount = pixelCounts[i];
                        minIndex = i;
                    }
                }
                rowSeparations.add(minIndex);
            }

            // separate the shapes among the rows
            // n separators delimit n + 1 new rows
            List<RowOfShapes> newRows = new ArrayList<RowOfShapes>(rowSeparations.size() + 1);
            for (int i = 0; i <= rowSeparations.size(); i++) {
                newRows.add(graphicsService.getEmptyRow(sourceImage));
            }

            // add a separator at the beginning and end
            // so that new row i is delimited by rowSeparations[i] and rowSeparations[i + 1]
            rowSeparations.add(0, 0);
            rowSeparations.add(yInterval + 1);
            for (Shape shape : row.getShapes()) {
                // same index computation as in the pixel counting loop above
                int yDeltaAtLeft = (int) Math.round(regression.predict(shape.getLeft()));
                int topIndex = (shape.getTop() - row.getTop()) + (row.getTop() - yDeltaAtLeft) + yDelta;
                int firstSepAfterShapeBottom = rowSeparations.size();
                int lastSepBeforeShapeTop = -1;

                // scan backwards for the last separator at or above the shape's top
                for (int i = rowSeparations.size() - 1; i >= 0; i--) {
                    int rowSeparation = rowSeparations.get(i);
                    if (rowSeparation <= topIndex) {
                        lastSepBeforeShapeTop = i;
                        break;
                    }
                }

                // scan forwards for the first separator at or below the shape's bottom
                for (int i = 0; i < rowSeparations.size(); i++) {
                    int rowSeparation = rowSeparations.get(i);
                    if (rowSeparation >= topIndex + shape.getHeight()) {
                        firstSepAfterShapeBottom = i;
                        break;
                    }
                }

                if (lastSepBeforeShapeTop == firstSepAfterShapeBottom - 1) {
                    // shape clearly belongs to one row
                    RowOfShapes newRow = newRows.get(lastSepBeforeShapeTop);
                    newRow.addShape(shape);
                } else {
                    // is the shape much closer to one row than another?
                    // if yes, add it to this row
                    // yPixelsPerRow[i] is the vertical overlap between the shape and new row i
                    // (zero or negative when they do not overlap)
                    int[] yPixelsPerRow = new int[newRows.size()];
                    for (int i = 0; i < newRows.size(); i++) {
                        int separatorTop = rowSeparations.get(i);
                        int separatorBottom = rowSeparations.get(i + 1);
                        int top = topIndex < separatorTop ? separatorTop : topIndex;
                        int bottom = topIndex + shape.getHeight() < separatorBottom
                                ? topIndex + shape.getHeight()
                                : separatorBottom;
                        yPixelsPerRow[i] = bottom - top;
                    }

                    int pixelsInMaxRow = 0;
                    int maxPixelRowIndex = -1;
                    for (int i = 0; i < newRows.size(); i++) {
                        if (yPixelsPerRow[i] > pixelsInMaxRow) {
                            pixelsInMaxRow = yPixelsPerRow[i];
                            maxPixelRowIndex = i;
                        }
                    }
                    // the shape is kept only if at least 80% of its height lies in a single new row
                    double minPercentage = 0.8;
                    if (((double) pixelsInMaxRow / (double) shape.getHeight()) >= minPercentage) {
                        RowOfShapes newRow = newRows.get(maxPixelRowIndex);
                        newRow.addShape(shape);
                    } else {
                        // otherwise, the shape needs to be got rid of
                        // as it's causing massive confusion
                        // do this by simply not adding it anywhere
                    }
                } // is the shape in one row exactly?
            } // next shape
            splitRows.put(row, newRows);
        } // do we have more than one row?
    } // next row

    // Replacement is deferred until all candidates have been analysed
    for (RowOfShapes row : splitRows.keySet()) {
        List<RowOfShapes> newRows = splitRows.get(row);
        sourceImage.replaceRow(row, newRows);
    }
}

From source file:com.joliciel.jochre.graphics.SegmenterImpl.java

/**
 * Groups the shapes of every row in a cluster into words.
 * <p>
 * The horizontal spaces between consecutive shapes are collected over the whole
 * cluster. The mean and standard deviation of the spaces smaller than the overall
 * mean (assumed to be the spaces between letters) give the threshold handed to
 * {@code organiseShapesInGroups}: mean + 4 standard deviations. If the overall
 * standard deviation is less than half the overall mean, a single word per row is
 * assumed and the threshold is {@code Integer.MAX_VALUE}.
 *
 * @param rowCluster the rows whose shapes should be organised into groups
 */
void groupShapesIntoWords(Set<RowOfShapes> rowCluster) {
    LOG.debug("Next row cluster of size " + rowCluster.size());
    // group the shapes together into words
    Mean spaceMean = new Mean();
    StandardDeviation spaceStdDev = new StandardDeviation();
    List<Integer> spaces = new ArrayList<Integer>();

    for (RowOfShapes row : rowCluster) {
        Shape previousShape = null;
        for (Shape shape : row.getShapes()) {
            if (previousShape != null) {
                int space;
                if (sourceImage.isLeftToRight()) {
                    space = shape.getLeft() - previousShape.getRight();
                } else {
                    space = previousShape.getLeft() - shape.getRight();
                }
                LOG.trace(shape);
                LOG.trace("Space : " + space);
                // negative spaces (horizontally overlapping shapes) are ignored
                if (space >= 0) {
                    spaces.add(space);
                    spaceMean.increment(space);
                    spaceStdDev.increment(space);
                }
            }
            previousShape = shape;
        } // next shape
    }

    double spaceMeanVal = spaceMean.getResult();
    double spaceStdDevVal = spaceStdDev.getResult();
    LOG.debug("Space mean: " + spaceMeanVal);
    LOG.debug("Space std dev: " + spaceStdDevVal);

    // If however there is only a single word on the row, the
    // standard deviation will be very low.
    boolean singleWord = false;
    if (spaceStdDevVal * 2 < spaceMeanVal) {
        LOG.debug("Assuming a single word per row");
        singleWord = true;
    }

    // Since there should be two groups, one for letters and one for words,
    // the mean should be somewhere in between. We now look for the mean on the
    // lesser group and will use it as the basis for comparison.
    spaceMean = new Mean();
    spaceStdDev = new StandardDeviation();
    for (int space : spaces) {
        // every collected space is already known to be non-negative
        if (space < spaceMeanVal) {
            spaceMean.increment(space);
            spaceStdDev.increment(space);
        }
    }
    spaceMeanVal = spaceMean.getResult();
    spaceStdDevVal = spaceStdDev.getResult();
    LOG.debug("Letter space mean: " + spaceMeanVal);
    LOG.debug("Letter space std dev: " + spaceStdDevVal);

    int letterSpaceThreshold;
    if (singleWord) {
        letterSpaceThreshold = Integer.MAX_VALUE;
    } else {
        // NOTE: when no space fell below the mean, both statistics are NaN and
        // Math.round(NaN) yields 0, so the threshold is 0 in that case.
        letterSpaceThreshold = (int) Math.round(spaceMeanVal + (4.0 * spaceStdDevVal));
    }

    for (RowOfShapes row : rowCluster) {
        LOG.debug(row.toString());
        row.organiseShapesInGroups(letterSpaceThreshold);
    } // next row
}