Example usage of the org.apache.commons.math.stat.descriptive.DescriptiveStatistics constructor DescriptiveStatistics()

Introduction

On this page you can find example usages of the no-argument constructor DescriptiveStatistics() of the class org.apache.commons.math.stat.descriptive.DescriptiveStatistics.

Prototype

public DescriptiveStatistics()

Source Link

Document

Constructs a DescriptiveStatistics instance with an infinite window.

Usage

From source file:com.joliciel.jochre.graphics.SegmenterImpl.java

/**
 * Groups the given shapes into rows.
 * <p>
 * The method first resets the row of every shape and removes from <code>shapes</code> any shape
 * lying strictly inside one of the white areas. It then takes the median shape width and, for each
 * remaining shape, searches up to 8 median widths to its left and right. The search range is
 * shortened wherever a white area blocks it. Any other shape found inside the remaining range
 * whose vertical overlap exceeds 10% of either shape's height is placed in the same row, merging
 * two existing rows when necessary.
 *
 * @param sourceImage the image the shapes belong to; used to obtain empty rows and, when
 *        <code>useSlope</code> is true, the horizontal adjustment for a given vertical position
 * @param shapes the shapes to group; surrounded shapes are removed from this list in place.
 *        The inner search stops at the first shape starting below the current one, so the list
 *        is expected to be ordered from the top down
 * @param whiteAreas rectangles of white space which limit the horizontal search
 * @param useSlope if true, the left and right limits of each white area are shifted by
 *        <code>sourceImage.getXAdjustment(shape.getTop())</code> before being compared
 * @return the rows that were built
 */
List<RowOfShapes> groupShapesIntoRows(SourceImage sourceImage, List<Shape> shapes, List<Rectangle> whiteAreas,
        boolean useSlope) {
    LOG.debug("########## groupShapesIntoRows #########");
    LOG.debug("useSlope? " + useSlope);

    List<RowOfShapes> rows = new ArrayList<RowOfShapes>();
    for (Shape shape : shapes)
        shape.setRow(null);

    // drop every shape that lies strictly inside a white area
    List<Shape> shapesToRemove = new ArrayList<Shape>();
    for (Shape shape : shapes) {
        for (Rectangle whiteArea : whiteAreas) {
            double whiteAreaRight = whiteArea.getRight();
            double whiteAreaLeft = whiteArea.getLeft();
            if (useSlope) {
                double xAdjustment = sourceImage.getXAdjustment(shape.getTop());

                whiteAreaRight += xAdjustment;
                whiteAreaLeft += xAdjustment;
            }

            if (whiteAreaRight > shape.getRight() && whiteAreaLeft < shape.getLeft()
                    && whiteArea.getTop() < shape.getTop() && whiteArea.getBottom() > shape.getBottom()) {
                // shape is surrounded
                shapesToRemove.add(shape);
                LOG.debug("Removing shape " + shape);
                LOG.debug("Surrounded by white area: " + whiteArea);
            }
        }
    }
    shapes.removeAll(shapesToRemove);

    // the median (50th percentile) shape width is the unit of the horizontal search distance
    DescriptiveStatistics shapeWidthStats = new DescriptiveStatistics();
    for (Shape shape : shapes) {
        shapeWidthStats.addValue(shape.getWidth());
    }
    double averageShapeWidth = shapeWidthStats.getPercentile(50);
    LOG.debug("averageShapeWidth: " + averageShapeWidth);

    // now, arrange the shapes in rows
    // we're guaranteed that no two shapes overlap at this point.
    // Now, it's possible that two shapes in the same line have no vertical overlap (e.g. a comma and an apostrophe)
    // so we have to go searching a bit further afield, say five shapes in each direction
    // but if we go too far, we may end up joining two lines together if the page isn't quite straight

    // let's begin with any old shape and find the shapes closest to it horizontally
    // e.g. up to 8 horizontal means to the right and left
    // as we find shapes that go with it, we add them to the same line
    int i = 0;
    int j = 0;
    int numberOfMeanWidthsForSearch = 8;
    LOG.debug("numberOfMeanWidthsForSearch: " + numberOfMeanWidthsForSearch);
    LOG.debug("search distance: " + averageShapeWidth * numberOfMeanWidthsForSearch);

    for (Shape shape : shapes) {
        if (shape.getRow() == null) {
            RowOfShapes row = graphicsService.getEmptyRow(sourceImage);
            row.addShape(shape);
            row.setIndex(j++);
            rows.add(row);
            LOG.trace("========= New row " + row.getIndex() + "============");
            LOG.trace("Adding " + shape + " to row " + row.getIndex());
        }
        int searchLeft = (int) ((double) shape.getLeft() - (numberOfMeanWidthsForSearch * averageShapeWidth));
        int searchRight = (int) ((double) shape.getRight() + (numberOfMeanWidthsForSearch * averageShapeWidth));
        LOG.trace("Shape " + i++ + ": " + shape + "(row " + shape.getRow().getIndex() + ")");
        LOG.trace("searchLeft: " + searchLeft);
        LOG.trace("searchRight: " + searchRight);

        // construct an array to represent where white areas overlap with the search area:
        // one entry per pixel column, holding the {top, bottom} of the vertical span still open to the search.
        // leftSearchArea[k] describes x = searchLeft + k, rightSearchArea[k] describes x = shape.getRight() + k
        int[][] leftSearchArea = new int[shape.getLeft() - searchLeft][2];
        int[][] rightSearchArea = new int[searchRight - shape.getRight()][2];
        for (int k = 0; k < leftSearchArea.length; k++) {
            leftSearchArea[k][0] = shape.getTop();
            leftSearchArea[k][1] = shape.getBottom();
        }
        for (int k = 0; k < rightSearchArea.length; k++) {
            rightSearchArea[k][0] = shape.getTop();
            rightSearchArea[k][1] = shape.getBottom();
        }

        int newSearchLeft = searchLeft;
        int newSearchRight = searchRight;
        for (Rectangle whiteArea : whiteAreas) {
            double whiteAreaRight = whiteArea.getRight();
            double whiteAreaLeft = whiteArea.getLeft();
            if (useSlope) {
                double xAdjustment = sourceImage.getXAdjustment(shape.getTop());

                whiteAreaRight += xAdjustment;
                whiteAreaLeft += xAdjustment;
                LOG.trace(whiteArea + ", xAdjustment=" + xAdjustment + " , whiteAreaLeft=" + whiteAreaLeft
                        + " , whiteAreaRight=" + whiteAreaRight);
            }

            if (whiteAreaRight > newSearchLeft && whiteAreaLeft < shape.getLeft()
                    && whiteArea.getTop() <= shape.getBottom() && whiteArea.getBottom() >= shape.getTop()) {

                LOG.trace("overlap on left with: " + whiteArea.toString());

                if (whiteArea.getTop() <= shape.getTop() && whiteArea.getBottom() >= shape.getBottom()
                        && whiteAreaRight > newSearchLeft) {
                    // the white area spans the full height of the shape: nothing beyond it can be reached
                    newSearchLeft = (int) Math.round(whiteAreaRight);
                    LOG.trace("Complete, newSearchLeft = " + newSearchLeft);
                } else {
                    // the white area covers only part of the shape's height: shrink the open span
                    // of each column it covers, and stop the search where the span closes completely
                    LOG.trace("Partial, starting at " + whiteArea.getRight());
                    for (int k = whiteArea.getRight() - searchLeft; k >= 0; k--) {
                        if (k < leftSearchArea.length) {
                            if (whiteArea.getBottom() < shape.getBottom()
                                    && leftSearchArea[k][0] < whiteArea.getBottom())
                                leftSearchArea[k][0] = whiteArea.getBottom() + 1;
                            else if (whiteArea.getTop() > shape.getTop()
                                    && leftSearchArea[k][1] > whiteArea.getTop())
                                leftSearchArea[k][1] = whiteArea.getTop() - 1;

                            if (leftSearchArea[k][0] >= leftSearchArea[k][1]
                                    && searchLeft + k > newSearchLeft) {
                                newSearchLeft = searchLeft + k;
                                LOG.trace("Complete from " + newSearchLeft);
                                break;
                            }
                        }
                    }
                }
            } else if (whiteAreaLeft < newSearchRight && whiteAreaRight > shape.getRight()
                    && whiteArea.getTop() <= shape.getBottom() && whiteArea.getBottom() >= shape.getTop()) {
                LOG.trace("overlap on right with: " + whiteArea.toString());

                if (whiteArea.getTop() <= shape.getTop() && whiteArea.getBottom() >= shape.getBottom()
                        && newSearchRight > whiteAreaLeft) {
                    // the white area spans the full height of the shape: nothing beyond it can be reached
                    newSearchRight = (int) Math.round(whiteAreaLeft);
                    LOG.trace("Complete, newSearchRight = " + newSearchRight);

                } else {
                    LOG.trace("Partial, starting at " + whiteArea.getLeft());
                    // mirror image of the left-hand case; the open span is read from and written to
                    // rightSearchArea only (it was previously read from leftSearchArea by mistake)
                    for (int k = whiteArea.getLeft() - shape.getRight(); k < rightSearchArea.length; k++) {
                        if (k > 0) {
                            if (whiteArea.getBottom() < shape.getBottom()
                                    && rightSearchArea[k][0] < whiteArea.getBottom())
                                rightSearchArea[k][0] = whiteArea.getBottom() + 1;
                            else if (whiteArea.getTop() > shape.getTop()
                                    && rightSearchArea[k][1] > whiteArea.getTop())
                                rightSearchArea[k][1] = whiteArea.getTop() - 1;

                            if (rightSearchArea[k][0] >= rightSearchArea[k][1]
                                    && newSearchRight > shape.getRight() + k) {
                                newSearchRight = shape.getRight() + k;
                                LOG.trace("Complete from " + newSearchRight);
                                break;
                            }
                        }
                    }
                }
            }
        }
        LOG.trace("searchLeft adjusted for white columns: " + newSearchLeft);
        LOG.trace("searchRight adjusted for white columns: " + newSearchRight);

        // min 10% overlap to assume same row
        double minOverlap = 0.10;

        for (Shape otherShape : shapes) {
            boolean haveSomeOverlap = false;
            if (!shape.getRow().equals(otherShape.getRow()) && !otherShape.equals(shape)) {

                // shapes are arranged from the top down
                if (otherShape.getTop() > shape.getBottom()) {
                    break;
                }

                if (otherShape.getRight() > newSearchLeft && otherShape.getRight() < shape.getLeft()
                        && otherShape.getTop() <= shape.getBottom()
                        && otherShape.getBottom() >= shape.getTop()) {
                    // candidate on the left: check it against the open span of the column at its right edge
                    int k = otherShape.getRight() - searchLeft;
                    if (otherShape.getTop() <= leftSearchArea[k][1]
                            && otherShape.getBottom() >= leftSearchArea[k][0])
                        haveSomeOverlap = true;
                } else if (otherShape.getLeft() < newSearchRight && otherShape.getLeft() > shape.getRight()
                        && otherShape.getTop() <= shape.getBottom()
                        && otherShape.getBottom() >= shape.getTop()) {
                    // candidate on the right: check it against the open span of the column at its left edge
                    int k = otherShape.getLeft() - shape.getRight();
                    if (otherShape.getTop() <= rightSearchArea[k][1]
                            && otherShape.getBottom() >= rightSearchArea[k][0])
                        haveSomeOverlap = true;
                }
                if (haveSomeOverlap) {
                    // vertical overlap in pixels, compared with the height of each of the two shapes
                    int overlap1 = shape.getBottom() - otherShape.getTop() + 1;
                    int overlap2 = otherShape.getBottom() - shape.getTop() + 1;
                    int overlap = overlap1 < overlap2 ? overlap1 : overlap2;
                    boolean addShapeToRow = false;
                    if ((((double) overlap / (double) shape.getHeight()) > minOverlap)
                            || (((double) overlap / (double) otherShape.getHeight()) > minOverlap)) {
                        addShapeToRow = true;
                    }

                    if (addShapeToRow) {
                        LOG.debug("Adding " + otherShape + " to row " + shape.getRow().getIndex());
                        if (otherShape.getRow() == null) {
                            shape.getRow().addShape(otherShape);
                        } else {
                            // two rows need to be merged
                            LOG.debug("========= Merge rows " + shape.getRow().getIndex() + " with "
                                    + otherShape.getRow().getIndex() + "==========");
                            RowOfShapes otherRow = otherShape.getRow();
                            shape.getRow().addShapes(otherRow.getShapes());
                            rows.remove(otherRow);
                        }
                    }
                } // add shape to row ?
            } // should shape be considered?
        } // next other shape
    } // next shape

    return rows;
}

From source file:com.joliciel.jochre.lexicon.LexiconErrorWriter.java

/**
 * Merges the per-fold CSV files of a cross-validation run into combined files.
 * <p>
 * Every <code>.csv</code> file in <code>evalDir</code> is expected to be named
 * <code>prefix + index + separator + suffix + "_" + fileType + ".csv"</code>, where the index and
 * the separator are one character each. For each file type a merged file named
 * <code>prefix + "A_" + suffix + "_" + fileType + ".csv"</code> is created in the same directory
 * (overwriting any existing file), using the suffix of the first file met for that type.
 * <ul>
 * <li>Files of type <code>KEMatrix</code> are parsed: counts are summed per group, percentage rows
 * are collected per group, and the totals followed by the average and standard deviation of the
 * percentages are written to the merged file.</li>
 * <li>Files of any other type are concatenated, keeping the header line of the first file only.</li>
 * </ul>
 *
 * @param evalDir the directory containing the files to merge, and receiving the merged files
 * @param prefix the common start of the file names, immediately followed by the fold index
 * @throws RuntimeException wrapping any {@link IOException} raised while reading or writing
 */
static void mergeCrossValidation(File evalDir, String prefix) {
    // declared outside the try block so that every writer opened can be closed in the finally clause
    Map<String, Writer> writers = new HashMap<String, Writer>();
    try {
        File[] files = evalDir.listFiles(new FilenameFilter() {
            @Override
            public boolean accept(File dir, String name) {
                return name.endsWith(".csv");
            }
        });
        List<String> groupNames = new ArrayList<String>();
        Map<String, ErrorStatistics> errorMap = new LinkedHashMap<String, ErrorStatistics>();
        Map<String, Map<String, DescriptiveStatistics>> statMap = new HashMap<String, Map<String, DescriptiveStatistics>>();
        for (File file : files) {
            String filename = file.getName();
            LOG.debug("Processing " + filename);
            int index = Integer.parseInt(filename.substring(prefix.length(), prefix.length() + 1));
            String suffix = filename.substring(prefix.length() + 2, filename.lastIndexOf('_'));
            String fileType = filename.substring(filename.lastIndexOf('_') + 1, filename.lastIndexOf('.'));
            LOG.debug("index: " + index);
            LOG.debug("suffix: " + suffix);
            LOG.debug("fileType: " + fileType);
            Writer writer = writers.get(fileType);
            boolean firstFile = false;
            if (writer == null) {
                writer = new BufferedWriter(new OutputStreamWriter(
                        new FileOutputStream(
                                new File(evalDir, prefix + "A_" + suffix + "_" + fileType + ".csv"), false),
                        "UTF8"));
                writers.put(fileType, writer);
                firstFile = true;
            }
            if (fileType.equals("KEMatrix")) {
                Scanner scanner = new Scanner(file);
                try {
                    int i = 0;
                    List<String> myGroupNames = new ArrayList<String>();
                    Map<String, Boolean> haveCountMap = new HashMap<String, Boolean>();
                    while (scanner.hasNextLine()) {
                        String line = scanner.nextLine();
                        List<String> cells = CSV.getCSVCells(line);
                        if (i == 0) {
                            // first line: one group name every 5 cells
                            for (int j = 0; j < cells.size(); j += 5) {
                                String groupName = cells.get(j);
                                if (!errorMap.containsKey(groupName)) {
                                    errorMap.put(groupName, new ErrorStatistics());
                                    statMap.put(groupName, new HashMap<String, DescriptiveStatistics>());
                                    groupNames.add(groupName);
                                }
                                myGroupNames.add(groupName);
                            }
                        } else if (i == 1) {
                            // second line is skipped
                        } else {
                            // data line: row name in the first cell, then for each group
                            // the correct, error and total values at offsets 1, 2 and 3 of its 5 cells
                            String rowName = cells.get(0);
                            int j = 0;
                            for (String groupName : myGroupNames) {
                                ErrorStatistics errorStats = errorMap.get(groupName);
                                Map<String, DescriptiveStatistics> stats = statMap.get(groupName);
                                double correctCount = Double.parseDouble(cells.get(j * 5 + 1));
                                double errorCount = Double.parseDouble(cells.get(j * 5 + 2));
                                double totalCount = Double.parseDouble(cells.get(j * 5 + 3));
                                Boolean haveCount = haveCountMap.get(groupName);

                                if (rowName.equals("known")) {
                                    errorStats.knownWordCorrectCount += correctCount;
                                    errorStats.knownWordErrorCount += errorCount;
                                } else if (rowName.equals("unknown")) {
                                    errorStats.unknownWordCorrectCount += correctCount;
                                    errorStats.unknownWordErrorCount += errorCount;
                                } else if (rowName.equals("goodSeg")) {
                                    errorStats.goodSegCorrectCount += correctCount;
                                    errorStats.goodSegErrorCount += errorCount;
                                } else if (rowName.equals("badSeg")) {
                                    errorStats.badSegCorrectCount += correctCount;
                                    errorStats.badSegErrorCount += errorCount;
                                } else if (rowName.equals("knownLetters")) {
                                    errorStats.knownWordCorrectLetterCount += correctCount;
                                    errorStats.knownWordErrorLetterCount += errorCount;
                                } else if (rowName.equals("unknownLetters")) {
                                    errorStats.unknownWordCorrectLetterCount += correctCount;
                                    errorStats.unknownWordErrorLetterCount += errorCount;
                                } else if (rowName.equals("goodSegLetters")) {
                                    errorStats.goodSegCorrectLetterCount += correctCount;
                                    errorStats.goodSegErrorLetterCount += errorCount;
                                } else if (rowName.equals("badSegLetters")) {
                                    errorStats.badSegCorrectLetterCount += correctCount;
                                    errorStats.badSegErrorLetterCount += errorCount;
                                } else if (rowName.equals("inBeam")) {
                                    errorStats.answerInBeamCorrectCount += correctCount;
                                    errorStats.answerInBeamErrorCount += errorCount;
                                } else if (rowName.equals("total")) {
                                    haveCountMap.put(groupName, totalCount > 0);
                                } else if (rowName.endsWith("%")) {
                                    // percentages are only recorded for a group whose "total" row has been
                                    // read with a positive total; a missing entry counts as "no count"
                                    // rather than failing on unboxing null
                                    if (haveCount != null && haveCount.booleanValue()) {
                                        String keyPrefix = rowName.substring(0, rowName.length() - 1);
                                        String key = keyPrefix + "|correct";
                                        DescriptiveStatistics correctStat = stats.get(key);
                                        if (correctStat == null) {
                                            correctStat = new DescriptiveStatistics();
                                            stats.put(key, correctStat);
                                        }
                                        correctStat.addValue(correctCount);
                                        key = keyPrefix + "|error";
                                        DescriptiveStatistics errorStat = stats.get(key);
                                        if (errorStat == null) {
                                            errorStat = new DescriptiveStatistics();
                                            stats.put(key, errorStat);
                                        }
                                        errorStat.addValue(errorCount);
                                        key = keyPrefix + "|total";
                                        DescriptiveStatistics totalStat = stats.get(key);
                                        if (totalStat == null) {
                                            totalStat = new DescriptiveStatistics();
                                            stats.put(key, totalStat);
                                        }
                                        totalStat.addValue(totalCount);
                                    }
                                }

                                j++;
                            }
                        }
                        i++;
                    }
                } finally {
                    // release the file handle even if parsing fails
                    scanner.close();
                }
            } else {
                Scanner scanner = new Scanner(file);
                try {
                    boolean firstLine = true;
                    while (scanner.hasNextLine()) {
                        String line = scanner.nextLine();
                        if (firstLine) {
                            // the header line is copied from the first file of each type only
                            if (firstFile)
                                writer.write(line + "\n");
                            firstLine = false;
                        } else {
                            writer.write(line + "\n");
                        }
                        writer.flush();
                    }
                } finally {
                    // release the file handle even if writing fails
                    scanner.close();
                }
            } // file type
        } // next file

        Writer statsWriter = writers.get("KEMatrix");
        writeStats(statsWriter, errorMap);
        statsWriter.write("\n");
        String[] statTypes = new String[] { "known", "unknown", "goodSeg", "badSeg", "inBeam", "total",
                "knownLetter", "unknownLetter", "goodSegLetter", "badSegLetter", "totalLetter" };
        // NOTE(review): the lookups below return null for a stat type and group for which no
        // percentage row was recorded above, which would fail here - confirm the inputs always supply them
        for (String statType : statTypes) {
            for (String groupName : groupNames) {
                Map<String, DescriptiveStatistics> statsMap = statMap.get(groupName);
                DescriptiveStatistics correctStat = statsMap.get(statType + "|correct");
                DescriptiveStatistics errorStat = statsMap.get(statType + "|error");
                DescriptiveStatistics totalStat = statsMap.get(statType + "|total");

                statsWriter.write(CSV.format(statType + "%Avg") + CSV.format(correctStat.getMean())
                        + CSV.format(errorStat.getMean()) + CSV.format(totalStat.getMean())
                        + CSV.getCsvSeparator());

            } // next group
            statsWriter.write("\n");
            for (String groupName : groupNames) {
                Map<String, DescriptiveStatistics> statsMap = statMap.get(groupName);
                DescriptiveStatistics correctStat = statsMap.get(statType + "|correct");
                DescriptiveStatistics errorStat = statsMap.get(statType + "|error");
                DescriptiveStatistics totalStat = statsMap.get(statType + "|total");

                statsWriter.write(CSV.format(statType + "%Dev") + CSV.format(correctStat.getStandardDeviation())
                        + CSV.format(errorStat.getStandardDeviation())
                        + CSV.format(totalStat.getStandardDeviation()) + CSV.getCsvSeparator());

            } // next group
            statsWriter.write("\n");
            statsWriter.flush();
        }
        // closed here so that a failure to close the statistics file is still reported to the caller
        statsWriter.close();

    } catch (IOException e) {
        LogUtils.logError(LOG, e);
        throw new RuntimeException(e);
    } finally {
        // close every merged file, on success and on failure alike;
        // closing a writer that is already closed has no effect
        for (Writer openWriter : writers.values()) {
            try {
                openWriter.close();
            } catch (IOException closeError) {
                LogUtils.logError(LOG, closeError);
            }
        }
    }
}

From source file:com.joliciel.jochre.graphics.SegmenterImpl.java

/**
 * Removes specks from the list of shapes, a speck being a comparatively small shape
 * lying at a comparatively large distance from the other shapes.
 * Small shapes are first grouped into clusters; each cluster is then kept or removed as a whole,
 * depending on its distance to the nearest shape which is not itself a speck candidate.
 * @param sourceImage the image being processed (not read by this method)
 * @param shapes the shapes to examine; the specks to discard are removed from this list in place
 */
void removeSpecks(SourceImage sourceImage, List<Shape> shapes) {
    LOG.debug("########## removeSpecks #########");

    DescriptiveStatistics widthStats = new DescriptiveStatistics();
    DescriptiveStatistics heightStats = new DescriptiveStatistics();
    for (Shape measured : shapes) {
        widthStats.addValue(measured.getWidth());
        heightStats.addValue(measured.getHeight());
    }

    // reference dimensions: the 65th percentile of the shape widths and heights
    double referenceWidth = widthStats.getPercentile(65);
    double referenceHeight = heightStats.getPercentile(65);
    LOG.debug("meanShapeWidth: " + referenceWidth);
    LOG.debug("meanShapeHeight: " + referenceHeight);

    // size limits for a speck: the ceiling is twice the floor
    int maxSpeckHeightFloor = (int) Math.ceil(referenceHeight / 6.0);
    int maxSpeckWidthFloor = (int) Math.ceil(referenceWidth / 6.0);
    int maxSpeckHeightCeiling = 2 * maxSpeckHeightFloor;
    int maxSpeckWidthCeiling = 2 * maxSpeckWidthFloor;

    // distance limits: again, the ceiling is twice the floor
    int speckXDistanceThresholdFloor = (int) Math.floor(referenceWidth);
    int speckYDistanceThresholdFloor = (int) Math.floor(referenceHeight / 4.0);
    int speckXDistanceThresholdCeiling = 2 * speckXDistanceThresholdFloor;
    int speckYDistanceThresholdCeiling = 2 * speckYDistanceThresholdFloor;

    LOG.debug("maxSpeckHeightFloor=" + maxSpeckHeightFloor);
    LOG.debug("maxSpeckWidthFloor=" + maxSpeckWidthFloor);
    LOG.debug("speckXDistanceThresholdFloor=" + speckXDistanceThresholdFloor);
    LOG.debug("speckYDistanceThresholdFloor=" + speckYDistanceThresholdFloor);
    LOG.debug("maxSpeckHeightCeiling=" + maxSpeckHeightCeiling);
    LOG.debug("maxSpeckWidthCeiling=" + maxSpeckWidthCeiling);
    LOG.debug("speckXDistanceThresholdCeiling=" + speckXDistanceThresholdCeiling);
    LOG.debug("speckYDistanceThresholdCeiling=" + speckYDistanceThresholdCeiling);

    // every shape below both size ceilings is a speck candidate
    List<Shape> specks = new ArrayList<Shape>();
    List<double[]> speckCoordinates = new ArrayList<double[]>();
    for (Shape candidate : shapes) {
        if (candidate.getHeight() >= maxSpeckHeightCeiling || candidate.getWidth() >= maxSpeckWidthCeiling) {
            continue;
        }
        specks.add(candidate);
        speckCoordinates.add(candidate.getCentrePoint());
    }

    // the candidates are clustered, and each cluster is kept or removed in one go:
    // a cluster may be a genuine diacritic broken into several pieces,
    // or simply a group of isolated specks
    DBSCANClusterer<Shape> clusterer = new DBSCANClusterer<Shape>(specks, speckCoordinates);
    Set<Set<Shape>> speckClusters = clusterer.cluster(speckXDistanceThresholdFloor, 2, true);
    List<Shape> specksToRemove = new ArrayList<Shape>();
    for (Set<Shape> cluster : speckClusters) {

        // largest member dimensions and bounding box of the cluster
        int speckHeight = 0;
        int speckWidth = 0;
        int clusterTop = -1;
        int clusterBottom = -1;
        int clusterRight = -1;
        int clusterLeft = -1;
        for (Shape member : cluster) {
            LOG.debug("Speck?, " + member);
            speckWidth = Math.max(speckWidth, member.getWidth());
            speckHeight = Math.max(speckHeight, member.getHeight());

            if (clusterTop < 0 || member.getTop() < clusterTop) {
                clusterTop = member.getTop();
            }
            if (clusterLeft < 0 || member.getLeft() < clusterLeft) {
                clusterLeft = member.getLeft();
            }
            clusterBottom = Math.max(clusterBottom, member.getBottom());
            clusterRight = Math.max(clusterRight, member.getRight());
        }

        // position of the larger dimension between its floor and ceiling, as a value from 0 to 1
        boolean useWidth = speckWidth > speckHeight;
        int dominantSize = useWidth ? speckWidth : speckHeight;
        int sizeFloor = useWidth ? maxSpeckWidthFloor : maxSpeckHeightFloor;
        int sizeCeiling = useWidth ? maxSpeckWidthCeiling : maxSpeckHeightCeiling;
        double scale;
        if (dominantSize < sizeFloor) {
            scale = 0.0;
        } else if (dominantSize > sizeCeiling) {
            scale = 1.0;
        } else {
            scale = ((double) dominantSize - sizeFloor) / (sizeCeiling - sizeFloor);
        }

        // the allowed distance grows with the size of the speck
        int speckXDistanceThreshold = (int) Math.ceil(speckXDistanceThresholdFloor
                + scale * (speckXDistanceThresholdCeiling - speckXDistanceThresholdFloor));
        int speckYDistanceThreshold = (int) Math.ceil(speckYDistanceThresholdFloor
                + scale * (speckYDistanceThresholdCeiling - speckYDistanceThresholdFloor));

        LOG.debug("speckHeight=" + speckHeight);
        LOG.debug("speckWidth=" + speckWidth);
        LOG.debug("speckXDistanceThreshold=" + speckXDistanceThreshold);
        LOG.debug("speckYDistanceThreshold=" + speckYDistanceThreshold);

        // first pass: nearest non-speck shape, measured from the bounding box of the cluster
        Shape nearestShape = null;
        double minDistance = 0.0;
        int nearestShapeXDiff = 0;
        int nearestShapeYDiff = 0;

        for (Shape otherShape : shapes) {
            // once a shape starts below the search zone, the scan over the shapes ends
            if (otherShape.getTop() > clusterBottom + speckYDistanceThreshold + 1) {
                break;
            }
            // shapes outside the search zone on the three other sides are ignored
            if (otherShape.getBottom() < clusterTop - speckYDistanceThreshold - 1
                    || otherShape.getRight() < clusterLeft - speckXDistanceThreshold - 1
                    || otherShape.getLeft() > clusterRight + speckXDistanceThreshold + 1) {
                continue;
            }
            // speck candidates are never taken as the nearest shape
            if (specks.contains(otherShape)) {
                continue;
            }

            int leftDiff = 0;
            int rightDiff = 0;
            int topDiff = 0;
            int botDiff = 0;
            int xDiff = 0;
            int yDiff = 0;

            boolean sharesColumns = otherShape.getLeft() <= clusterRight
                    && otherShape.getRight() >= clusterLeft;
            if (!sharesColumns) {
                leftDiff = Math.abs(clusterLeft - otherShape.getRight());
                rightDiff = Math.abs(clusterRight - otherShape.getLeft());
                xDiff = Math.min(leftDiff, rightDiff);
            }

            boolean sharesLines = otherShape.getTop() <= clusterBottom
                    && otherShape.getBottom() >= clusterTop;
            if (!sharesLines) {
                // vertical extent of the shape, stretched to its mean line and base line
                // whenever these fall outside of the shape itself
                int nearestTop = Math.min(otherShape.getTop(), otherShape.getTop() + otherShape.getMeanLine());
                int nearestBot = Math.max(otherShape.getBottom(),
                        otherShape.getTop() + otherShape.getBaseLine());
                topDiff = Math.abs(clusterTop - nearestBot);
                botDiff = Math.abs(clusterBottom - nearestTop);
                yDiff = Math.min(topDiff, botDiff);
            }

            double distance = Math.sqrt((xDiff * xDiff) + (yDiff * yDiff));

            if (nearestShape == null || distance < minDistance) {
                nearestShape = otherShape;
                minDistance = distance;
                nearestShapeXDiff = xDiff;
                nearestShapeYDiff = yDiff;
                LOG.trace("leftDiff=" + leftDiff + ", rightDiff=" + rightDiff);
                LOG.trace("topDiff=" + topDiff + ", botDiff=" + botDiff);
            }
        } // next shape around the cluster

        boolean discardCluster;
        if (nearestShape == null) {
            // nothing but specks in the neighbourhood
            discardCluster = true;
        } else {
            LOG.trace("Nearest shape, top(" + nearestShape.getTop() + ") " + "left(" + nearestShape.getLeft()
                    + ") " + "bot(" + nearestShape.getBottom() + ") " + "right(" + nearestShape.getRight()
                    + ")");
            LOG.trace("Distance=" + minDistance + ", xDiff=" + nearestShapeXDiff + ", yDiff="
                    + nearestShapeYDiff);

            // second pass: refine the distance using each member of the cluster in turn
            for (Shape member : cluster) {
                int leftDiff = 0;
                int rightDiff = 0;
                int topDiff = 0;
                int botDiff = 0;
                int xDiff = 0;
                int yDiff = 0;

                boolean sharesColumns = nearestShape.getLeft() <= member.getRight()
                        && nearestShape.getRight() >= member.getLeft();
                if (!sharesColumns) {
                    leftDiff = Math.abs(member.getLeft() - nearestShape.getRight());
                    rightDiff = Math.abs(member.getRight() - nearestShape.getLeft());
                    xDiff = Math.min(leftDiff, rightDiff);
                }

                boolean sharesLines = nearestShape.getTop() <= member.getBottom()
                        && nearestShape.getBottom() >= member.getTop();
                if (!sharesLines) {
                    int nearestTop = Math.min(nearestShape.getTop(),
                            nearestShape.getTop() + nearestShape.getMeanLine());
                    int nearestBot = Math.max(nearestShape.getBottom(),
                            nearestShape.getTop() + nearestShape.getBaseLine());
                    topDiff = Math.abs(member.getTop() - nearestBot);
                    botDiff = Math.abs(member.getBottom() - nearestTop);
                    yDiff = Math.min(topDiff, botDiff);
                }

                double distance = Math.sqrt((xDiff * xDiff) + (yDiff * yDiff));

                if (distance < minDistance) {
                    minDistance = distance;
                    nearestShapeXDiff = xDiff;
                    nearestShapeYDiff = yDiff;
                    LOG.debug("Found closer speck:");
                    LOG.debug("leftDiff=" + leftDiff + ", rightDiff=" + rightDiff);
                    LOG.debug("topDiff=" + topDiff + ", botDiff=" + botDiff);
                }
            } // next member

            // the whole cluster goes if the closest distance found exceeds the threshold
            // on either axis; otherwise the whole cluster stays
            discardCluster = nearestShapeXDiff > speckXDistanceThreshold
                    || nearestShapeYDiff > speckYDistanceThreshold;
        }

        if (discardCluster) {
            for (Shape otherSpeck : cluster) {
                LOG.debug("Removing speck " + otherSpeck);
                specksToRemove.add(otherSpeck);
            }
        }
    } // next cluster

    shapes.removeAll(specksToRemove);
}

From source file:com.facebook.presto.AbstractTestQueries.java

/**
 * Checks that {@code TABLESAMPLE BERNOULLI (50)} never yields duplicate rows and that,
 * averaged over 100 independent runs, roughly half of the rows of {@code orders} are returned.
 *
 * @throws Exception if computing the expected or actual results fails
 */
@Test
public void testTableSampleBernoulli() throws Exception {
    DescriptiveStatistics stats = new DescriptiveStatistics();

    // size of the un-sampled table; used as the denominator of every observed sampling rate
    int total = computeExpected("SELECT orderkey FROM orders", TupleInfo.SINGLE_LONG).getMaterializedTuples()
            .size();

    for (int i = 0; i < 100; i++) {
        List<MaterializedTuple> values = computeActual("SELECT orderkey FROM ORDERS TABLESAMPLE BERNOULLI (50)")
                .getMaterializedTuples();

        assertEquals(values.size(), ImmutableSet.copyOf(values).size(), "TABLESAMPLE produced duplicate rows");
        stats.addValue(values.size() * 1.0 / total);
    }

    // Use the arithmetic mean: it is the estimator of the expected sampling rate.
    // The geometric mean is biased low and becomes 0 as soon as a single run returns no rows.
    double mean = stats.getMean();
    assertTrue(mean > 0.45 && mean < 0.55,
            String.format("Expected mean sampling rate to be ~0.5, but was %s", mean));
}

From source file:com.joliciel.jochre.graphics.SegmenterImpl.java

/**
 * Clear out anything found in the right and left margins.
 * <p>
 * Each side is handled independently: first the right margin, then the left margin.
 * Rows of the image may lose groups and shapes, and rows left without any group are
 * removed from {@code sourceImage.getRows()}.
 *
 * @param sourceImage the image whose rows are inspected and, where needed, trimmed or removed
 */
void cleanMargins(SourceImage sourceImage) {
    LOG.debug("########## cleanMargins #########");

    // a cluster needs at least this many rows before it is accepted as a margin
    int minCardinalityForMargin = 8;
    // read once, before any row is modified; used as the clustering radius for both sides
    double averageShapeWidth = sourceImage.getAverageShapeWidth();

    cleanRightMargin(sourceImage, averageShapeWidth, minCardinalityForMargin);
    cleanLeftMargin(sourceImage, averageShapeWidth, minCardinalityForMargin);
}

/**
 * Detects the right margin and chops off every group lying entirely beyond it.
 *
 * @param sourceImage the image being cleaned
 * @param averageShapeWidth clustering radius handed to the DBSCAN clusterer
 * @param minCardinalityForMargin minimum number of rows a cluster needs to qualify as the margin
 */
private void cleanRightMargin(SourceImage sourceImage, double averageShapeWidth, int minCardinalityForMargin) {
    LOG.debug("Finding right margin");
    // only rows whose right edge lies in the right-hand third of the image are considered
    double rightLimit = (double) sourceImage.getWidth() * 0.67;

    // first, create a DBScan cluster of all rows near the right-hand side
    List<RowOfShapes> rightHandRows = new ArrayList<RowOfShapes>();
    List<double[]> rightCoordinates = new ArrayList<double[]>();

    for (RowOfShapes row : sourceImage.getRows()) {
        double right = row.getRight();
        if (right >= rightLimit) {
            LOG.trace(row.toString());
            // the adjustment is subtracted, so the message must show "-" (as the left-hand side does)
            LOG.trace(
                    "Right: " + right + " - " + row.getXAdjustment() + " = " + (right - row.getXAdjustment()));
            right -= row.getXAdjustment();
            rightHandRows.add(row);
            rightCoordinates.add(new double[] { right });
        }
    }

    DBSCANClusterer<RowOfShapes> rightMarginClusterer = new DBSCANClusterer<RowOfShapes>(rightHandRows,
            rightCoordinates);
    Set<Set<RowOfShapes>> rowClusters = rightMarginClusterer.cluster(averageShapeWidth, minCardinalityForMargin,
            true);

    // NOTE(review): if CardinalityComparator returns 0 for two clusters of equal size, the TreeSet
    // will silently keep only one of them - confirm against the comparator's implementation.
    TreeSet<Set<RowOfShapes>> orderedRowClusters = new TreeSet<Set<RowOfShapes>>(
            new CardinalityComparator<RowOfShapes>());
    orderedRowClusters.addAll(rowClusters);

    int i = 0;

    // find the right-most cluster with sufficient cardinality, and assume it's the right margin
    DescriptiveStatistics rightMarginStats = null;
    for (Set<RowOfShapes> cluster : orderedRowClusters) {
        DescriptiveStatistics rightStats = new DescriptiveStatistics();
        for (RowOfShapes row : cluster)
            rightStats.addValue(row.getRight() - row.getXAdjustment());

        LOG.debug("Cluster " + i + ". Cardinality=" + cluster.size());
        LOG.debug("Right mean : " + rightStats.getMean());
        LOG.debug("Right std dev: " + rightStats.getStandardDeviation());

        if (cluster.size() >= minCardinalityForMargin
                && (rightMarginStats == null || rightMarginStats.getMean() < rightStats.getMean())) {
            rightMarginStats = rightStats;
        }
        i++;
    }

    if (rightMarginStats == null)
        return; // no cluster qualified - nothing to clean on this side

    LOG.debug("Right margin mean : " + rightMarginStats.getMean());
    LOG.debug("Right margin std dev: " + rightMarginStats.getStandardDeviation());

    // tolerate one average shape width beyond the cluster mean
    double rightMarginLimit = rightMarginStats.getMean() + sourceImage.getAverageShapeWidth();
    LOG.debug("rightMarginLimit: " + rightMarginLimit);

    // see how many rows would violate this margin - if too many, assume no margin
    // these rows are only rows which extend across the margin
    int numRowsToChop = 0;
    for (RowOfShapes row : sourceImage.getRows()) {
        if (row.getRight() >= rightLimit) {
            if (row.getRight() - row.getXAdjustment() >= rightMarginLimit
                    && row.getLeft() - row.getXAdjustment() <= rightMarginLimit) {
                LOG.debug("Found overlapping row : " + row);
                LOG.debug("Adjusted right : " + (row.getRight() - row.getXAdjustment()));
                numRowsToChop++;
            }
        }
    }
    if (numRowsToChop >= 3) {
        LOG.debug("Too many overlapping rows - ignoring margin");
        return;
    }

    List<RowOfShapes> rowsToRemove = new ArrayList<RowOfShapes>();
    for (RowOfShapes row : sourceImage.getRows()) {
        double right = row.getRight() - row.getXAdjustment();
        LOG.trace(row.toString());
        LOG.trace("Adjusted right: " + right);

        if (right >= rightMarginLimit) {
            LOG.trace("Has out-of-margin stuff!");
            // need to chop off groups to the right of this threshold
            List<GroupOfShapes> groupsToChop = new ArrayList<GroupOfShapes>();
            for (GroupOfShapes group : row.getGroups()) {
                if (group.getLeft() - row.getXAdjustment() > rightMarginLimit) {
                    groupsToChop.add(group);
                    LOG.debug("Chopping group outside of right margin: " + group);
                }
            }
            chopGroupsFromRow(row, groupsToChop, rowsToRemove);
        } // does this row extend beyond the margin?
    } // next row
    sourceImage.getRows().removeAll(rowsToRemove);
}

/**
 * Detects the left margin and chops off every group lying entirely beyond it.
 *
 * @param sourceImage the image being cleaned
 * @param averageShapeWidth clustering radius handed to the DBSCAN clusterer
 * @param minCardinalityForMargin minimum number of rows a cluster needs to qualify as the margin
 */
private void cleanLeftMargin(SourceImage sourceImage, double averageShapeWidth, int minCardinalityForMargin) {
    LOG.debug("Finding left margin");
    // only rows whose left edge lies in the left-hand third of the image are considered
    double leftLimit = (double) sourceImage.getWidth() * 0.33;

    // first, create a DBScan cluster of all rows near the left-hand side
    List<RowOfShapes> leftHandRows = new ArrayList<RowOfShapes>();
    List<double[]> leftCoordinates = new ArrayList<double[]>();

    for (RowOfShapes row : sourceImage.getRows()) {
        double left = row.getLeft();
        if (left <= leftLimit) {
            LOG.trace(row.toString());
            LOG.trace("Left: " + left + " - " + row.getXAdjustment() + " = " + (left - row.getXAdjustment()));
            left -= row.getXAdjustment();
            leftHandRows.add(row);
            leftCoordinates.add(new double[] { left });
        }
    }

    DBSCANClusterer<RowOfShapes> leftMarginClusterer = new DBSCANClusterer<RowOfShapes>(leftHandRows,
            leftCoordinates);
    Set<Set<RowOfShapes>> rowClustersLeft = leftMarginClusterer.cluster(averageShapeWidth,
            minCardinalityForMargin, true);

    // NOTE(review): if CardinalityComparator returns 0 for two clusters of equal size, the TreeSet
    // will silently keep only one of them - confirm against the comparator's implementation.
    TreeSet<Set<RowOfShapes>> orderedRowClustersLeft = new TreeSet<Set<RowOfShapes>>(
            new CardinalityComparator<RowOfShapes>());
    orderedRowClustersLeft.addAll(rowClustersLeft);

    int i = 0;

    // find the left-most cluster with sufficient cardinality, and assume it's the left margin
    DescriptiveStatistics leftMarginStats = null;
    for (Set<RowOfShapes> cluster : orderedRowClustersLeft) {
        DescriptiveStatistics leftStats = new DescriptiveStatistics();
        for (RowOfShapes row : cluster)
            leftStats.addValue(row.getLeft() - row.getXAdjustment());

        LOG.debug("Cluster " + i + ". Cardinality=" + cluster.size());
        LOG.debug("Left mean : " + leftStats.getMean());
        LOG.debug("Left std dev: " + leftStats.getStandardDeviation());

        if (cluster.size() >= minCardinalityForMargin
                && (leftMarginStats == null || leftMarginStats.getMean() > leftStats.getMean())) {
            leftMarginStats = leftStats;
        }
        i++;
    }

    if (leftMarginStats == null)
        return; // no cluster qualified - nothing to clean on this side

    LOG.debug("Left margin mean : " + leftMarginStats.getMean());
    LOG.debug("Left margin std dev: " + leftMarginStats.getStandardDeviation());

    // tolerate one average shape width beyond the cluster mean
    double leftMarginLimit = leftMarginStats.getMean() - sourceImage.getAverageShapeWidth();
    LOG.debug("leftMarginLimit: " + leftMarginLimit);

    // see how many rows would violate this margin - if too many, assume no margin
    // these rows are only rows which extend across the margin
    int numRowsToChop = 0;
    for (RowOfShapes row : sourceImage.getRows()) {
        if (row.getLeft() <= leftLimit) {
            if (row.getLeft() - row.getXAdjustment() <= leftMarginLimit
                    && row.getRight() - row.getXAdjustment() >= leftMarginLimit) {
                LOG.debug("Found overlapping row : " + row);
                LOG.debug("Adjusted left : " + (row.getLeft() - row.getXAdjustment()));
                numRowsToChop++;
            }
        }
    }
    if (numRowsToChop >= 3) {
        LOG.debug("Too many overlapping rows - ignoring margin");
        return;
    }

    List<RowOfShapes> rowsToRemove = new ArrayList<RowOfShapes>();
    for (RowOfShapes row : sourceImage.getRows()) {
        double left = row.getLeft() - row.getXAdjustment();
        LOG.trace(row.toString());
        LOG.trace("Adjusted left: " + left);

        if (left <= leftMarginLimit) {
            LOG.trace("Has out-of-margin stuff!");
            // need to chop off groups to the left of this threshold
            List<GroupOfShapes> groupsToChop = new ArrayList<GroupOfShapes>();
            for (GroupOfShapes group : row.getGroups()) {
                if (group.getRight() - row.getXAdjustment() < leftMarginLimit) {
                    groupsToChop.add(group);
                    LOG.debug("Chopping group outside of left margin: " + group);
                }
            }
            chopGroupsFromRow(row, groupsToChop, rowsToRemove);
        } // does this row extend beyond the margin?
    } // next row
    sourceImage.getRows().removeAll(rowsToRemove);
}

/**
 * Removes the given groups (and their shapes) from a row, then either schedules the row for
 * removal if it has no groups left, or recalculates it and re-assigns its guide lines.
 *
 * @param row the row to trim
 * @param groupsToChop the groups to take out of the row; may be empty
 * @param rowsToRemove collects rows that ended up without any group
 */
private void chopGroupsFromRow(RowOfShapes row, List<GroupOfShapes> groupsToChop,
        List<RowOfShapes> rowsToRemove) {
    for (GroupOfShapes group : groupsToChop) {
        row.getShapes().removeAll(group.getShapes());
    }
    row.getGroups().removeAll(groupsToChop);

    if (row.getGroups().size() == 0) {
        LOG.debug("Removing empty " + row);
        rowsToRemove.add(row);
    } else {
        row.recalculate();
        row.assignGuideLines();
    }
}

From source file:com.joliciel.jochre.graphics.SegmenterImpl.java

/**
 * Detects paragraph splits and assign rows to correct paragraphs.
 * @param sourceImage the source image whose rows are grouped into paragraphs
 */
void groupRowsIntoParagraphs(SourceImage sourceImage) {
    LOG.debug("########## groupRowsIntoParagraphs #########");
    // We'll use various possible indicators, including
    // indented start, indented end, and spacing between rows.

    // On pages with a single big paragraph makes it hypersensitive to differences in row-start/row-end
    // This means we cannot use deviation. Instead, we use the average shape width on the page.
    // We also adjust maxLeft & minRight to match the vertical line slope

    // This is now complicated by the possibility of multiple columns

    // Need to take into account a big horizontal space - Pietrushka page 14
    // Find horizontal spaces that go all the way across and are wider than a certain threshold
    // simply do a boolean column and black out everything in a row, than see if there are any remaining spaces above a certain threshold
    // Columns are thus arranged into "areas", separated by white-space.
    boolean[] fullRows = new boolean[sourceImage.getHeight()];
    for (RowOfShapes row : sourceImage.getRows()) {
        for (int y = row.getTop(); y <= row.getBottom(); y++) {
            fullRows[y] = true;
        }
    }
    DescriptiveStatistics rowHeightStats = new DescriptiveStatistics();

    for (RowOfShapes row : sourceImage.getRows()) {
        int height = row.getXHeight();
        rowHeightStats.addValue(height);
    }
    double avgRowHeight = rowHeightStats.getPercentile(50);
    LOG.debug("meanRowHeight: " + avgRowHeight);
    double minHeightForWhiteSpace = avgRowHeight * 1.3;
    LOG.debug("minHeightForWhiteSpace: " + minHeightForWhiteSpace);

    // find the "white rows" - any horizontal white space
    // in the page which is sufficiently high
    List<int[]> whiteRows = new ArrayList<int[]>();
    boolean inWhite = false;
    int startWhite = 0;
    for (int y = 0; y < sourceImage.getHeight(); y++) {
        if (!inWhite && !fullRows[y]) {
            inWhite = true;
            startWhite = y;
        } else if (inWhite && fullRows[y]) {
            int length = y - startWhite;
            if (length > minHeightForWhiteSpace) {
                LOG.debug("Adding whiteRow " + startWhite + "," + (y - 1));
                whiteRows.add(new int[] { startWhite, y - 1 });
            }
            inWhite = false;
        }
    }
    if (inWhite)
        whiteRows.add(new int[] { startWhite, sourceImage.getHeight() - 1 });
    whiteRows.add(new int[] { sourceImage.getHeight(), sourceImage.getHeight() });

    // place rows in "areas" defined by the "white rows" found above
    List<List<RowOfShapes>> areas = new ArrayList<List<RowOfShapes>>();
    int startY = -1;
    for (int[] whiteRow : whiteRows) {
        List<RowOfShapes> area = new ArrayList<RowOfShapes>();
        for (RowOfShapes row : sourceImage.getRows()) {
            if (row.getTop() >= startY && row.getBottom() <= whiteRow[0]) {
                area.add(row);
            }
        }
        if (area.size() > 0) {
            areas.add(area);
        }
        startY = whiteRow[1];
    }

    // break up each area into vertical columns
    LOG.debug("break up each area into vertical columns");
    List<Column> columns = new ArrayList<Column>();
    List<List<Column>> columnsPerAreaList = new ArrayList<List<Column>>();
    for (List<RowOfShapes> area : areas) {
        LOG.debug("Next area");
        List<Column> columnsPerArea = new ArrayList<SegmenterImpl.Column>();
        columnsPerAreaList.add(columnsPerArea);
        TreeSet<RowOfShapes> rows = new TreeSet<RowOfShapes>(new RowOfShapesVerticalLocationComparator());
        rows.addAll(area);
        for (RowOfShapes row : rows) {
            // try to place this row in one of the columns directly above it.
            // this means that a row which overlaps more than one column has to "close" this column, so it is no longer considered
            List<Column> overlappingColumns = new ArrayList<Column>();
            for (Column column : columnsPerArea) {
                if (!column.closed) {
                    RowOfShapes lastRowInColumn = column.get(column.size() - 1);
                    if (row.getRight() - row.getXAdjustment() >= lastRowInColumn.getLeft()
                            - lastRowInColumn.getXAdjustment()
                            && row.getLeft() - row.getXAdjustment() <= lastRowInColumn.getRight()
                                    - lastRowInColumn.getXAdjustment()) {
                        overlappingColumns.add(column);
                    }
                }
            }
            if (overlappingColumns.size() == 1) {
                Column myColumn = overlappingColumns.get(0);
                RowOfShapes lastRowInMyColumn = myColumn.get(0);

                // close any columns that are now at a distance of more than one row
                for (Column column : columnsPerArea) {
                    if (!column.closed && !column.equals(myColumn)) {
                        RowOfShapes lastRowInColumn = column.get(column.size() - 1);
                        if (lastRowInMyColumn.getTop() > lastRowInColumn.getBottom()) {
                            column.closed = true;
                            LOG.debug("Closing distant column " + lastRowInColumn);
                        }
                    }
                }

                myColumn.add(row);
                LOG.debug(row.toString());
                LOG.debug("  added to column " + lastRowInMyColumn);
            } else {
                for (Column overlappingColumn : overlappingColumns) {
                    overlappingColumn.closed = true;
                    RowOfShapes lastRowInColumn = overlappingColumn.get(overlappingColumn.size() - 1);
                    LOG.debug("Closing overlapping column " + lastRowInColumn);
                }
                Column myColumn = new Column(sourceImage);
                myColumn.add(row);
                LOG.debug("Found new column");
                LOG.debug(row.toString());
                columns.add(myColumn);
                columnsPerArea.add(myColumn);
            }
        }
    } // next area

    for (Column column : columns)
        column.recalculate();

    // Intermediate step to reform the vertical columns, if they exist
    // basically the idea is that if the columns are aligned vertically, then the thresholds for paragraph indents
    // should be shared, to increase the statistical sample size and reduce anomalies.
    // We'll assume that two columns from two consecutive areas are in the same vertical group if they overlap with each other horizontally
    // and don't overlap with any other column in the other column's area.
    List<List<Column>> columnGroups = new ArrayList<List<Column>>();
    List<Column> columnsInPrevArea = null;
    for (List<Column> columnsPerArea : columnsPerAreaList) {
        if (columnsInPrevArea != null) {
            for (Column prevColumn : columnsInPrevArea) {
                LOG.debug("Checking " + prevColumn);
                // find the column group containing the previous column
                List<Column> myColumnGroup = null;
                for (List<Column> columnGroup : columnGroups) {
                    if (columnGroup.contains(prevColumn)) {
                        myColumnGroup = columnGroup;
                        break;
                    }
                }
                if (myColumnGroup == null) {
                    myColumnGroup = new ArrayList<SegmenterImpl.Column>();
                    LOG.debug("Creating column group for column " + prevColumn.toString());
                    columnGroups.add(myColumnGroup);
                    myColumnGroup.add(prevColumn);
                }

                // does only one column overlap with this one?
                Column overlappingColumn = null;
                for (Column column : columnsPerArea) {
                    if (column.adjustedRight >= prevColumn.adjustedLeft
                            && column.adjustedLeft <= prevColumn.adjustedRight) {
                        if (overlappingColumn == null) {
                            LOG.debug("I overlap with " + column);

                            overlappingColumn = column;
                        } else {
                            LOG.debug("But I overlap also with " + column);

                            overlappingColumn = null;
                            break;
                        }
                    }
                }
                if (overlappingColumn != null) {
                    // does it overlap with only me?
                    for (Column otherPrevColumn : columnsInPrevArea) {
                        if (otherPrevColumn.equals(prevColumn))
                            continue;
                        if (overlappingColumn.adjustedRight >= otherPrevColumn.adjustedLeft
                                && overlappingColumn.adjustedLeft <= otherPrevColumn.adjustedRight) {
                            LOG.debug("But it overlaps also with " + otherPrevColumn);
                            overlappingColumn = null;
                            break;
                        }
                    }
                }
                if (overlappingColumn != null) {
                    myColumnGroup.add(overlappingColumn);
                    LOG.debug("Adding " + overlappingColumn);
                    LOG.debug(" to group with " + prevColumn);
                }

            } // next previous column
        } // have previous columns
        columnsInPrevArea = columnsPerArea;
    } // next area
    if (columnsInPrevArea != null) {
        for (Column prevColumn : columnsInPrevArea) {
            // find the column group containing the previous column
            List<Column> myColumnGroup = null;
            for (List<Column> columnGroup : columnGroups) {
                if (columnGroup.contains(prevColumn)) {
                    myColumnGroup = columnGroup;
                    break;
                }
            }
            if (myColumnGroup == null) {
                myColumnGroup = new ArrayList<SegmenterImpl.Column>();
                LOG.debug("Creating column group for column " + prevColumn.toString());
                columnGroups.add(myColumnGroup);
                myColumnGroup.add(prevColumn);
            }
        }
    }

    // What we really want here is, for each column (in the case of right-to-left),
    // two clusters on the right
    // and one relatively big cluster on the left.
    // anything outside of the cluster on the left is an EOP.
    boolean hasTab = false;
    for (List<Column> columnGroup : columnGroups) {
        LOG.debug("Next column group");
        double averageShapeWidth = sourceImage.getAverageShapeWidth();
        LOG.debug("averageShapeWidth: " + averageShapeWidth);
        double epsilon = averageShapeWidth / 2.0;
        LOG.debug("epsilon: " + epsilon);

        int columnGroupTop = sourceImage.getHeight();
        int columnGroupBottom = 0;
        int columnGroupLeft = sourceImage.getWidth();
        int columnGroupRight = 0;
        for (Column column : columnGroup) {
            if (column.top < columnGroupTop)
                columnGroupTop = (int) Math.round(column.top);
            if (column.bottom > columnGroupBottom)
                columnGroupBottom = (int) Math.round(column.bottom);
            if (column.adjustedLeft < columnGroupLeft)
                columnGroupLeft = (int) Math.round(column.adjustedLeft);
            if (column.adjustedRight > columnGroupRight)
                columnGroupRight = (int) Math.round(column.adjustedRight);
        }

        // right thresholds
        LOG.debug("Calculating right thresholds");

        // first, create a DBScan cluster of all rows by their adjusted right coordinate
        List<RowOfShapes> rightHandRows = new ArrayList<RowOfShapes>();
        List<double[]> rightCoordinates = new ArrayList<double[]>();

        for (Column column : columnGroup) {
            for (RowOfShapes row : column) {
                double right = row.getRight() - row.getXAdjustment();
                //               double rightOverlap = this.findLargeShapeOverlapOnRight(row, column, sourceImage);
                //               if (rightOverlap==0) {
                //                  // leave out any right-overlapping rows here
                //                  // since we need accurate statistics for margin detection
                //               // This is questionable - especially since a long vertical bar (see Petriushka)
                //               // tends to give all rows a left overlap. Also, because the overlap is calculated based
                //               // on the mean right & mean left, not based on any sort of margin clusters.
                //                  rightHandRows.add(row);
                //                  rightCoordinates.add(new double[] {right});
                //               }
                rightHandRows.add(row);
                rightCoordinates.add(new double[] { right });

            }
        }

        int minCardinalityForRightMargin = 5;
        DBSCANClusterer<RowOfShapes> rightMarginClusterer = new DBSCANClusterer<RowOfShapes>(rightHandRows,
                rightCoordinates);
        Set<Set<RowOfShapes>> rowClusters = rightMarginClusterer.cluster(epsilon, minCardinalityForRightMargin,
                true);

        TreeSet<Set<RowOfShapes>> orderedRowClusters = new TreeSet<Set<RowOfShapes>>(
                new CardinalityComparator<RowOfShapes>());
        orderedRowClusters.addAll(rowClusters);

        int i = 0;

        // find the two right-most clusters, and assume they are the margin & the tab
        DescriptiveStatistics rightMarginStats = null;
        DescriptiveStatistics rightTabStats = null;
        for (Set<RowOfShapes> cluster : orderedRowClusters) {
            DescriptiveStatistics rightStats = new DescriptiveStatistics();
            MeanAbsoluteDeviation rightDev = new MeanAbsoluteDeviation();
            for (RowOfShapes row : cluster) {
                int rowIndex = rightHandRows.indexOf(row);
                double right = rightCoordinates.get(rowIndex)[0];
                rightStats.addValue(right);
                rightDev.increment(right);
            }

            LOG.debug("Cluster " + i + ". Cardinality=" + cluster.size());
            LOG.debug("Right mean : " + rightStats.getMean());
            LOG.debug("Right dev: " + rightDev.getResult());

            if (cluster.size() >= minCardinalityForRightMargin) {
                if (rightMarginStats == null || rightMarginStats.getMean() < rightStats.getMean()) {
                    if (rightMarginStats != null)
                        rightTabStats = rightMarginStats;
                    rightMarginStats = rightStats;
                } else if (rightTabStats == null || rightTabStats.getMean() < rightStats.getMean()) {
                    rightTabStats = rightStats;
                }
            } else {
                break;
            }
            i++;
        } // next right-coordinate cluster

        double rightMargin = sourceImage.getWidth();
        double rightTab = sourceImage.getWidth();
        if (rightMarginStats != null) {
            rightMargin = rightMarginStats.getMean();
        } else {
            List<Rectangle> columnSeparators = sourceImage.findColumnSeparators();
            for (Rectangle columnSeparator : columnSeparators) {
                if (columnSeparator.getTop() <= columnGroupTop
                        && columnSeparator.getBottom() >= columnGroupBottom
                        && columnSeparator.getLeft() >= columnGroupRight) {
                    if (columnSeparator.getLeft() < rightMargin)
                        rightMargin = columnSeparator.getLeft();
                }
            }
        }
        if (rightTabStats != null) {
            rightTab = rightTabStats.getMean();
        }

        LOG.debug("rightMargin: " + rightMargin);
        LOG.debug("rightTab: " + rightTab);

        // left thresholds
        LOG.debug("Calculating left thresholds");

        // first, create a DBScan cluster of all rows by their adjusted left coordinate
        List<RowOfShapes> leftHandRows = new ArrayList<RowOfShapes>();
        List<double[]> leftCoordinates = new ArrayList<double[]>();

        for (Column column : columnGroup) {
            for (RowOfShapes row : column) {
                double left = row.getLeft() - row.getXAdjustment();
                //               double leftOverlap = this.findLargeShapeOverlapOnLeft(row, column, sourceImage);
                //               if (leftOverlap == 0) {
                //                  // leave out any overlapping rows from margin calcs,
                //                  // since we need accurate statistics here
                //                  leftHandRows.add(row);
                //                  leftCoordinates.add(new double[] {left});
                //               }
                leftHandRows.add(row);
                leftCoordinates.add(new double[] { left });
            }
        }

        int minCardinalityForLeftMargin = 5;
        DBSCANClusterer<RowOfShapes> leftMarginClusterer = new DBSCANClusterer<RowOfShapes>(leftHandRows,
                leftCoordinates);
        Set<Set<RowOfShapes>> leftRowClusters = leftMarginClusterer.cluster(epsilon,
                minCardinalityForLeftMargin, true);

        TreeSet<Set<RowOfShapes>> orderedLeftRowClusters = new TreeSet<Set<RowOfShapes>>(
                new CardinalityComparator<RowOfShapes>());
        orderedLeftRowClusters.addAll(leftRowClusters);

        i = 0;

        // find the two left-most clusters, and assume they are the margin & the tab
        DescriptiveStatistics leftMarginStats = null;
        DescriptiveStatistics leftTabStats = null;
        for (Set<RowOfShapes> cluster : orderedLeftRowClusters) {
            DescriptiveStatistics leftStats = new DescriptiveStatistics();
            MeanAbsoluteDeviation leftDev = new MeanAbsoluteDeviation();
            for (RowOfShapes row : cluster) {
                int rowIndex = leftHandRows.indexOf(row);
                double left = leftCoordinates.get(rowIndex)[0];
                leftStats.addValue(left);
                leftDev.increment(left);
            }

            LOG.debug("Cluster " + i + ". Cardinality=" + cluster.size());
            LOG.debug("Left mean : " + leftStats.getMean());
            LOG.debug("Left dev: " + leftDev.getResult());

            if (cluster.size() >= minCardinalityForLeftMargin) {
                if (leftMarginStats == null || leftMarginStats.getMean() > leftStats.getMean()) {
                    if (leftMarginStats != null)
                        leftTabStats = leftMarginStats;
                    leftMarginStats = leftStats;
                } else if (leftTabStats == null || leftTabStats.getMean() > leftStats.getMean()) {
                    leftTabStats = leftStats;
                }
            } else {
                break;
            }
            i++;
        } // next left-coordinate cluster

        double leftMargin = 0;
        double leftTab = 0;
        if (leftMarginStats != null) {
            leftMargin = leftMarginStats.getMean();
        } else {
            List<Rectangle> columnSeparators = sourceImage.findColumnSeparators();
            for (Rectangle columnSeparator : columnSeparators) {
                if (columnSeparator.getTop() <= columnGroupTop
                        && columnSeparator.getBottom() >= columnGroupBottom
                        && columnSeparator.getRight() <= columnGroupLeft) {
                    if (columnSeparator.getRight() > leftMargin)
                        leftMargin = columnSeparator.getRight();
                }
            }
        }
        if (leftTabStats != null) {
            leftTab = leftTabStats.getMean();
        }

        LOG.debug("leftMargin: " + leftMargin);
        LOG.debug("leftTab: " + leftTab);

        for (Column column : columnGroup) {
            if (sourceImage.isLeftToRight()) {
                column.startMargin = leftMargin;
                if (leftTabStats != null) {
                    column.startTab = leftTab;
                    column.hasTab = true;
                } else {
                    LOG.debug("No left tab - setting based on left margin");
                    column.startTab = leftMargin + (5.0 * sourceImage.getAverageShapeWidth());
                    column.hasTab = false;
                }

                column.endMargin = rightMargin;
            } else {
                column.startMargin = rightMargin;
                if (rightTabStats != null) {
                    column.startTab = rightTab;
                    column.hasTab = true;
                } else {
                    LOG.debug("No right tab - setting based on right margin");
                    column.startTab = rightMargin - (5.0 * sourceImage.getAverageShapeWidth());
                    column.hasTab = false;
                }

                column.endMargin = leftMargin;
            }
            LOG.debug("Margins for " + column);
            LOG.debug("startMargin: " + column.startMargin);
            LOG.debug("startTab: " + column.startTab);
            LOG.debug("endMargin: " + column.endMargin);
        } // next column
    } // next column group
    LOG.debug("hasTab: " + hasTab);

    double safetyMargin = 1.5 * sourceImage.getAverageShapeWidth();

    // Now, paragraphs are either "indented", "outdented" or not "dented" at all (no tabs).
    // This applies to the entire page.
    // To recognise indenting vs. outdenting, we have to see if the row preceding each
    // indent/outdent is full or partial. In the case of indentation, partial rows will
    // typically be followed by an indent. In the case of outdentation, partial rows will
    // typically be followed by an outdent.
    boolean isIndented = true;

    int indentCount = 0;
    int outdentCount = 0;
    for (List<Column> columnGroup : columnGroups) {
        LOG.debug("Next column group");
        boolean prevRowPartial = false;
        for (Column column : columnGroup) {
            if (column.hasTab) {
                for (RowOfShapes row : column) {
                    if (sourceImage.isLeftToRight()) {
                        if (prevRowPartial) {
                            if (row.getLeft() - row.getXAdjustment() > column.startTab - safetyMargin) {
                                indentCount++;
                            } else if (row.getLeft() - row.getXAdjustment() < column.startMargin
                                    + safetyMargin) {
                                outdentCount++;
                            }
                        }
                        if (row.getRight() - row.getXAdjustment() < column.endMargin - safetyMargin) {
                            prevRowPartial = true;
                        } else {
                            prevRowPartial = false;
                        }
                    } else {
                        if (prevRowPartial) {
                            if (row.getRight() - row.getXAdjustment() < column.startTab + safetyMargin) {
                                indentCount++;
                            } else if (row.getRight() - row.getXAdjustment() > column.startMargin
                                    - safetyMargin) {
                                outdentCount++;
                            }
                        }
                        if (row.getLeft() - row.getXAdjustment() > column.endMargin + safetyMargin) {
                            prevRowPartial = true;
                        } else {
                            prevRowPartial = false;
                        }
                    } // left-to-right?
                } // next row  
            } // column has tab
        } // next column
    } // next column group
    isIndented = (indentCount + 2 >= outdentCount);
    LOG.debug("indentCount: " + indentCount);
    LOG.debug("outdentCount: " + outdentCount);
    LOG.debug("isIndented: " + isIndented);

    // order the columns
    TreeSet<Column> orderedColumns = new TreeSet<SegmenterImpl.Column>(columns);
    columns.clear();
    columns.addAll(orderedColumns);

    // find the paragraphs found in each column
    for (Column column : columns) {
        LOG.debug("--- Next column ---");

        // break up the column into paragraphs 
        Paragraph paragraph = null;
        RowOfShapes previousRow = null;
        int maxShapesForStandaloneParagraph = 2;
        List<RowOfShapes> rowsForStandaloneParagraphs = new ArrayList<RowOfShapes>();
        Point2D previousPointStartMargin = null;
        Point2D previousPointStartTab = null;
        Point2D previousPointEndMargin = null;

        for (RowOfShapes row : column) {
            boolean rowForStandaloneParagraph = false;
            boolean newParagraph = false;
            if (row.getShapes().size() <= maxShapesForStandaloneParagraph) {
                rowsForStandaloneParagraphs.add(row);
                rowForStandaloneParagraph = true;
            } else {
                double rightOverlap = this.findLargeShapeOverlapOnRight(row, column, sourceImage);
                double leftOverlap = this.findLargeShapeOverlapOnLeft(row, column, sourceImage);

                if (drawSegmentation) {
                    double rowVerticalMidPoint = row.getBaseLineMiddlePoint();
                    double startMarginX = column.startMargin + row.getXAdjustment();
                    double startTabX = column.startTab + row.getXAdjustment();
                    double endMarginX = column.endMargin + row.getXAdjustment();

                    if (sourceImage.isLeftToRight()) {
                        startMarginX += safetyMargin;
                        startTabX -= safetyMargin;
                        endMarginX -= safetyMargin;

                        startMarginX += leftOverlap;
                        startTabX += leftOverlap;
                        endMarginX -= rightOverlap;
                    } else {
                        startMarginX -= safetyMargin;
                        startTabX += safetyMargin;
                        endMarginX += safetyMargin;

                        startMarginX -= rightOverlap;
                        startTabX -= rightOverlap;
                        endMarginX += leftOverlap;
                    }

                    Point2D.Double currentPointStartMargin = new Point2D.Double(startMarginX,
                            rowVerticalMidPoint);
                    Point2D.Double currentPointStartTab = new Point2D.Double(startTabX, rowVerticalMidPoint);
                    Point2D.Double currentPointEndMargin = new Point2D.Double(endMarginX, rowVerticalMidPoint);

                    if (previousPointStartMargin != null) {
                        graphics2D.setStroke(new BasicStroke(1));
                        graphics2D.setPaint(Color.BLUE);
                        graphics2D.drawLine((int) Math.round(previousPointStartMargin.getX()),
                                (int) Math.round(previousPointStartMargin.getY()),
                                (int) Math.round(currentPointStartMargin.getX()),
                                (int) Math.round(currentPointStartMargin.getY()));
                        graphics2D.drawLine((int) Math.round(previousPointEndMargin.getX()),
                                (int) Math.round(previousPointEndMargin.getY()),
                                (int) Math.round(currentPointEndMargin.getX()),
                                (int) Math.round(currentPointEndMargin.getY()));

                        graphics2D.setPaint(Color.RED);
                        graphics2D.drawLine((int) Math.round(previousPointStartTab.getX()),
                                (int) Math.round(previousPointStartTab.getY()),
                                (int) Math.round(currentPointStartTab.getX()),
                                (int) Math.round(currentPointStartTab.getY()));

                        graphics2D.setPaint(Color.RED);
                        graphics2D.drawLine((int) Math.round(previousPointEndMargin.getX()),
                                (int) Math.round(previousPointEndMargin.getY()),
                                (int) Math.round(currentPointEndMargin.getX()),
                                (int) Math.round(currentPointEndMargin.getY()));
                    }
                    previousPointStartMargin = currentPointStartMargin;
                    previousPointStartTab = currentPointStartTab;
                    previousPointEndMargin = currentPointEndMargin;
                }

                if (previousRow == null) {
                    LOG.debug("New paragraph (first)");
                    newParagraph = true;
                } else {
                    if (sourceImage.isLeftToRight()) {
                        if (previousRow.getRight() - previousRow.getXAdjustment()
                                - rightOverlap < column.endMargin - safetyMargin) {
                            LOG.debug("New paragraph (previous EOP)");
                            newParagraph = true;
                        } else if (column.hasTab && isIndented && row.getLeft() - row.getXAdjustment()
                                + leftOverlap > column.startTab - safetyMargin) {
                            LOG.debug("New paragraph (indent)");
                            newParagraph = true;
                        } else if (column.hasTab && !isIndented && row.getLeft() - row.getXAdjustment()
                                + leftOverlap < column.startMargin + safetyMargin) {
                            LOG.debug("New paragraph (outdent)");
                            newParagraph = true;
                        }
                    } else {
                        if (previousRow.getLeft() - previousRow.getXAdjustment()
                                + leftOverlap > column.endMargin + safetyMargin) {
                            LOG.debug("New paragraph (previous EOP)");
                            newParagraph = true;
                        } else if (column.hasTab && isIndented && row.getRight() - row.getXAdjustment()
                                - rightOverlap < column.startTab + safetyMargin) {
                            LOG.debug("New paragraph (indent)");
                            newParagraph = true;
                        } else if (column.hasTab && !isIndented && row.getRight() - row.getXAdjustment()
                                - rightOverlap > column.startMargin - safetyMargin) {
                            LOG.debug("New paragraph (outdent)");
                            newParagraph = true;
                        }
                    } // left-to-right?
                } // have previous row
            } // standalone paragraph?

            if (!rowForStandaloneParagraph)
                LOG.debug(row.toString());

            if (newParagraph) {
                if (rowsForStandaloneParagraphs.size() > 0) {
                    for (RowOfShapes oneRow : rowsForStandaloneParagraphs) {
                        LOG.debug("Standalone paragraph");
                        LOG.debug("Standalone row: left(" + oneRow.getLeft() + "), top(" + oneRow.getTop()
                                + "), right(" + oneRow.getRight() + "), bottom(" + oneRow.getBottom() + ")");
                        Paragraph standaloneParagraph = sourceImage.newParagraph();
                        standaloneParagraph.getRows().add(oneRow);
                    }
                    rowsForStandaloneParagraphs.clear();
                }
                paragraph = sourceImage.newParagraph();
            }
            //LOG.debug("Row: left(" + row.getLeft() + "), right(" + row.getRight() + "), width(" + (row.getRight() - row.getLeft() + 1) + ")");

            if (!rowForStandaloneParagraph) {
                paragraph.getRows().add(row);
                previousRow = row;
            }
        } // next row in column
        if (rowsForStandaloneParagraphs.size() > 0) {
            for (RowOfShapes oneRow : rowsForStandaloneParagraphs) {
                LOG.debug("Standalone paragraph");
                LOG.debug("Standalone row: left(" + oneRow.getLeft() + "), top(" + oneRow.getTop() + "), right("
                        + oneRow.getRight() + "), bottom(" + oneRow.getBottom() + ")");
                Paragraph standaloneParagraph = sourceImage.newParagraph();
                standaloneParagraph.getRows().add(oneRow);
            }
            rowsForStandaloneParagraphs.clear();
        }
    } // next column

}

From source file:org.apache.eagle.service.jpm.suggestion.AbstractGCFunc.java

/**
 * Computes the ratio between the averaged GC_MILLISECONDS counter and the averaged
 * CPU_MILLISECONDS counter of the supplied tasks.
 *
 * @param tasks the task executions whose counters are read
 * @return {@code 0} when {@code tasks} is empty; otherwise the GC average divided by the
 *         CPU average, where a CPU average of zero is replaced by {@code 1} so the
 *         division never has a zero denominator
 */
private double getGcRatio(List<TaskExecutionAPIEntity> tasks) {
    if (tasks.isEmpty()) {
        return 0;
    }

    final double[] gcSamples = ResourceUtils.getCounterValues(tasks, JobCounters.CounterName.GC_MILLISECONDS);
    final double[] cpuSamples = ResourceUtils.getCounterValues(tasks, JobCounters.CounterName.CPU_MILLISECONDS);

    // The statistics object is used only as a source of the mean implementation;
    // no values are ever added to it.
    final DescriptiveStatistics stats = new DescriptiveStatistics();
    final double cpuAverage = stats.getMeanImpl().evaluate(cpuSamples);
    final double gcAverage = stats.getMeanImpl().evaluate(gcSamples);

    // Guard against a zero denominator by falling back to 1.
    final double denominator = (cpuAverage == 0) ? 1 : cpuAverage;
    return gcAverage / denominator;
}

From source file:org.apache.eagle.service.jpm.suggestion.AbstractInputFunc.java

/**
 * Produces a suggestion response by averaging the {@code counterName} counter over the
 * task group's {@code shortTasks} and {@code longTasks} and handing both averages to
 * {@code getDeviationSuggest}.
 *
 * @param data the task group response from which the task group is extracted via
 *             {@code getTasks}
 * @return a response carrying the deviation suggestions and the string form of
 *         {@code suggestType}
 */
@Override
public JobSuggestionResponse apply(TaskGroupResponse data) {
    final MRTaskExecutionResponse.TaskGroup group = getTasks(data);
    final double[] shortValues = ResourceUtils.getCounterValues(group.shortTasks, counterName);
    final double[] longValues = ResourceUtils.getCounterValues(group.longTasks, counterName);

    // The statistics object only supplies the mean implementation; it holds no values itself.
    final DescriptiveStatistics stats = new DescriptiveStatistics();
    final double shortAverage = stats.getMeanImpl().evaluate(shortValues);
    final double longAverage = stats.getMeanImpl().evaluate(longValues);

    final List<MRTaskExecutionResponse.SuggestionResult> suggestions =
            getDeviationSuggest(shortAverage, longAverage);

    final MRTaskExecutionResponse.JobSuggestionResponse result =
            new MRTaskExecutionResponse.JobSuggestionResponse();
    result.suggestionResults = suggestions;
    result.suggestionType = suggestType.toString();
    return result;
}

From source file:org.apache.jackrabbit.performance.AbstractPerformanceTest.java

/**
 * Executes the given test repeatedly against the repository and records the value
 * returned by each measured {@code execute()} call.
 *
 * <p>The test is set up once, executed without recording until the warm-up deadline
 * passes, then executed with recording until the run-time deadline passes. The test is
 * always torn down once set-up has succeeded, even if an iteration throws.
 *
 * @param test       the test to set up, execute and tear down
 * @param repository the repository passed to the test's set-up together with the
 *                   {@code credentials} field
 * @return the statistics holding one value per measured iteration
 * @throws Exception if set-up, an iteration, or tear-down fails
 */
private DescriptiveStatistics runTest(AbstractTest test, Repository repository) throws Exception {
    DescriptiveStatistics statistics = new DescriptiveStatistics();

    test.setUp(repository, credentials);
    try {
        // Run a few iterations to warm up the system.
        // The 1000L literal forces the seconds-to-milliseconds conversion into long
        // arithmetic, so a large configured duration cannot overflow before widening.
        long warmupEnd = System.currentTimeMillis() + warmup * 1000L;
        while (System.currentTimeMillis() < warmupEnd) {
            test.execute();
        }

        // Run test iterations, and capture the execution times
        long runtimeEnd = System.currentTimeMillis() + runtime * 1000L;
        while (System.currentTimeMillis() < runtimeEnd) {
            statistics.addValue(test.execute());
        }
    } finally {
        test.tearDown();
    }

    return statistics;
}

From source file:org.apache.sling.junit.performance.listener.StatisticsListener.java

/**
 * Replaces the current statistics holder with a fresh, empty
 * {@code DescriptiveStatistics} instance.
 *
 * @param className not used by this implementation
 * @param testName  not used by this implementation
 * @throws Exception declared by the overridden method; this body itself throws nothing checked
 */
@Override
public void executionStarted(String className, String testName) throws Exception {
    // Start from a clean accumulator; any previously collected values are discarded.
    this.statistics = new DescriptiveStatistics();
}