Example usage for the org.apache.commons.math.stat.descriptive.DescriptiveStatistics constructor DescriptiveStatistics()

List of usage examples for the org.apache.commons.math.stat.descriptive.DescriptiveStatistics constructor DescriptiveStatistics()

Introduction

On this page you can find example usage of the org.apache.commons.math.stat.descriptive.DescriptiveStatistics constructor DescriptiveStatistics().

Prototype

public DescriptiveStatistics() 

Source Link

Document

Constructs a DescriptiveStatistics instance with an infinite window.

Usage

From source file:com.joliciel.jochre.graphics.SegmenterImpl.java

/**
 * Groups shapes into rows.
 * <p>
 * Every shape's row is first reset to {@code null}. Shapes lying strictly inside one of the white areas are
 * removed from {@code shapes} (the caller's list is modified). For each remaining shape a horizontal search
 * window of {@code numberOfMeanWidthsForSearch} median shape widths is opened on either side of the shape
 * and narrowed wherever a white area blocks it. Any other shape found inside the window that overlaps this
 * shape vertically by more than 10% of either shape's height joins this shape's row; if the other shape
 * already has a row, the two rows are merged.
 *
 * @param sourceImage image used to create empty rows and, when {@code useSlope} is set, to obtain the
 *        horizontal adjustment for a given vertical position
 * @param shapes shapes to group; the inner search stops early on the assumption that they are ordered
 *        from the top down
 * @param whiteAreas white rectangles acting as barriers to the horizontal search
 * @param useSlope whether white-area edges are shifted by {@code sourceImage.getXAdjustment(...)}
 * @return the rows that were built
 */
List<RowOfShapes> groupShapesIntoRows(SourceImage sourceImage, List<Shape> shapes, List<Rectangle> whiteAreas,
        boolean useSlope) {
    LOG.debug("########## groupShapesIntoRows #########");
    LOG.debug("useSlope? " + useSlope);

    List<RowOfShapes> rows = new ArrayList<RowOfShapes>();
    for (Shape shape : shapes) {
        shape.setRow(null);
    }

    // drop any shape that is completely enclosed by a white area
    List<Shape> shapesToRemove = new ArrayList<Shape>();
    for (Shape shape : shapes) {
        for (Rectangle whiteArea : whiteAreas) {
            double whiteAreaRight = whiteArea.getRight();
            double whiteAreaLeft = whiteArea.getLeft();
            if (useSlope) {
                double xAdjustment = sourceImage.getXAdjustment(shape.getTop());

                whiteAreaRight += xAdjustment;
                whiteAreaLeft += xAdjustment;
            }

            if (whiteAreaRight > shape.getRight() && whiteAreaLeft < shape.getLeft()
                    && whiteArea.getTop() < shape.getTop() && whiteArea.getBottom() > shape.getBottom()) {
                // shape is surrounded
                shapesToRemove.add(shape);
                LOG.debug("Removing shape " + shape);
                LOG.debug("Surrounded by white area: " + whiteArea);
            }
        }
    }
    shapes.removeAll(shapesToRemove);

    // the "average" shape width is actually the median (50th percentile)
    DescriptiveStatistics shapeWidthStats = new DescriptiveStatistics();
    for (Shape shape : shapes) {
        shapeWidthStats.addValue(shape.getWidth());
    }
    double averageShapeWidth = shapeWidthStats.getPercentile(50);
    LOG.debug("averageShapeWidth: " + averageShapeWidth);

    // now, arrange the shapes in rows
    // we're guaranteed that no two shapes overlap at this point.
    // Now, it's possible that two shapes in the same line have no vertical overlap (e.g. a comma and an apostrophe)
    // so we have to go searching a bit further afield, say five shapes in each direction
    // but if we go too far, we may end up joining two lines together if the page isn't quite straight

    // let's begin with any old shape and find the shapes closest to it horizontally
    // e.g. up to 8 horizontal means to the right and left
    // as we find shapes that go with it, we add them to the same line
    int i = 0;
    int j = 0;
    int numberOfMeanWidthsForSearch = 8;
    LOG.debug("numberOfMeanWidthsForSearch: " + numberOfMeanWidthsForSearch);
    LOG.debug("search distance: " + averageShapeWidth * numberOfMeanWidthsForSearch);

    for (Shape shape : shapes) {
        if (shape.getRow() == null) {
            RowOfShapes row = graphicsService.getEmptyRow(sourceImage);
            row.addShape(shape);
            row.setIndex(j++);
            rows.add(row);
            LOG.trace("========= New row " + row.getIndex() + "============");
            LOG.trace("Adding " + shape + " to row " + row.getIndex());
        }
        int searchLeft = (int) ((double) shape.getLeft() - (numberOfMeanWidthsForSearch * averageShapeWidth));
        int searchRight = (int) ((double) shape.getRight() + (numberOfMeanWidthsForSearch * averageShapeWidth));
        LOG.trace("Shape " + i++ + ": " + shape + "(row " + shape.getRow().getIndex() + ")");
        LOG.trace("searchLeft: " + searchLeft);
        LOG.trace("searchRight: " + searchRight);

        // For every x position of the search window, the vertical span [top, bottom] that is still
        // reachable from the shape, i.e. not cut off by a white area.
        // leftSearchArea[k] describes x = searchLeft + k; rightSearchArea[k] describes x = shape.getRight() + k.
        int[][] leftSearchArea = new int[shape.getLeft() - searchLeft][2];
        int[][] rightSearchArea = new int[searchRight - shape.getRight()][2];
        for (int k = 0; k < leftSearchArea.length; k++) {
            leftSearchArea[k][0] = shape.getTop();
            leftSearchArea[k][1] = shape.getBottom();
        }
        for (int k = 0; k < rightSearchArea.length; k++) {
            rightSearchArea[k][0] = shape.getTop();
            rightSearchArea[k][1] = shape.getBottom();
        }

        int newSearchLeft = searchLeft;
        int newSearchRight = searchRight;
        for (Rectangle whiteArea : whiteAreas) {
            double whiteAreaRight = whiteArea.getRight();
            double whiteAreaLeft = whiteArea.getLeft();
            if (useSlope) {
                double xAdjustment = sourceImage.getXAdjustment(shape.getTop());

                whiteAreaRight += xAdjustment;
                whiteAreaLeft += xAdjustment;
                LOG.trace(whiteArea + ", xAdjustment=" + xAdjustment + " , whiteAreaLeft=" + whiteAreaLeft
                        + " , whiteAreaRight=" + whiteAreaRight);
            }

            if (whiteAreaRight > newSearchLeft && whiteAreaLeft < shape.getLeft()
                    && whiteArea.getTop() <= shape.getBottom() && whiteArea.getBottom() >= shape.getTop()) {

                LOG.trace("overlap on left with: " + whiteArea.toString());

                if (whiteArea.getTop() <= shape.getTop() && whiteArea.getBottom() >= shape.getBottom()
                        && whiteAreaRight > newSearchLeft) {
                    // the white area spans the full height of the shape: nothing beyond it is reachable
                    newSearchLeft = (int) Math.round(whiteAreaRight);
                    LOG.trace("Complete, newSearchLeft = " + newSearchLeft);
                } else {
                    // the white area covers only part of the shape's height: shrink the reachable
                    // vertical span column by column, moving left from the white area's right edge
                    LOG.trace("Partial, starting at " + whiteArea.getRight());
                    for (int k = whiteArea.getRight() - searchLeft; k >= 0; k--) {
                        if (k < leftSearchArea.length) {
                            if (whiteArea.getBottom() < shape.getBottom()
                                    && leftSearchArea[k][0] < whiteArea.getBottom())
                                leftSearchArea[k][0] = whiteArea.getBottom() + 1;
                            else if (whiteArea.getTop() > shape.getTop()
                                    && leftSearchArea[k][1] > whiteArea.getTop())
                                leftSearchArea[k][1] = whiteArea.getTop() - 1;

                            // the span has collapsed: this column is fully blocked
                            if (leftSearchArea[k][0] >= leftSearchArea[k][1]
                                    && searchLeft + k > newSearchLeft) {
                                newSearchLeft = searchLeft + k;
                                LOG.trace("Complete from " + newSearchLeft);
                                break;
                            }
                        }
                    }
                }
            } else if (whiteAreaLeft < newSearchRight && whiteAreaRight > shape.getRight()
                    && whiteArea.getTop() <= shape.getBottom() && whiteArea.getBottom() >= shape.getTop()) {
                LOG.trace("overlap on right with: " + whiteArea.toString());

                if (whiteArea.getTop() <= shape.getTop() && whiteArea.getBottom() >= shape.getBottom()
                        && newSearchRight > whiteAreaLeft) {
                    newSearchRight = (int) Math.round(whiteAreaLeft);
                    LOG.trace("Complete, newSearchRight = " + newSearchRight);

                } else {
                    LOG.trace("Partial, starting at " + whiteArea.getLeft());
                    for (int k = whiteArea.getLeft() - shape.getRight(); k < rightSearchArea.length; k++) {
                        // k can start out negative when the white area begins left of the shape's right edge
                        if (k > 0) {
                            // Fixed: these tests used to read leftSearchArea, so the right-hand span was
                            // narrowed according to the state of the left-hand search window.
                            if (whiteArea.getBottom() < shape.getBottom()
                                    && rightSearchArea[k][0] < whiteArea.getBottom())
                                rightSearchArea[k][0] = whiteArea.getBottom() + 1;
                            else if (whiteArea.getTop() > shape.getTop()
                                    && rightSearchArea[k][1] > whiteArea.getTop())
                                rightSearchArea[k][1] = whiteArea.getTop() - 1;

                            if (rightSearchArea[k][0] >= rightSearchArea[k][1]
                                    && newSearchRight > shape.getRight() + k) {
                                newSearchRight = shape.getRight() + k;
                                LOG.trace("Complete from " + newSearchRight);
                                break;
                            }
                        }
                    }
                }
            }
        }
        LOG.trace("searchLeft adjusted for white columns: " + newSearchLeft);
        LOG.trace("searchRight adjusted for white columns: " + newSearchRight);

        // min 10% overlap to assume same row
        double minOverlap = 0.10;

        for (Shape otherShape : shapes) {
            boolean haveSomeOverlap = false;
            if (!shape.getRow().equals(otherShape.getRow()) && !otherShape.equals(shape)) {

                // shapes are arranged from the top down
                if (otherShape.getTop() > shape.getBottom()) {
                    break;
                }

                if (otherShape.getRight() > newSearchLeft && otherShape.getRight() < shape.getLeft()
                        && otherShape.getTop() <= shape.getBottom()
                        && otherShape.getBottom() >= shape.getTop()) {
                    // other shape is to the left: check the reachable span at its right edge
                    int k = otherShape.getRight() - searchLeft;
                    if (otherShape.getTop() <= leftSearchArea[k][1]
                            && otherShape.getBottom() >= leftSearchArea[k][0])
                        haveSomeOverlap = true;
                } else if (otherShape.getLeft() < newSearchRight && otherShape.getLeft() > shape.getRight()
                        && otherShape.getTop() <= shape.getBottom()
                        && otherShape.getBottom() >= shape.getTop()) {
                    // other shape is to the right: check the reachable span at its left edge
                    int k = otherShape.getLeft() - shape.getRight();
                    if (otherShape.getTop() <= rightSearchArea[k][1]
                            && otherShape.getBottom() >= rightSearchArea[k][0])
                        haveSomeOverlap = true;
                }
                if (haveSomeOverlap) {
                    int overlap1 = shape.getBottom() - otherShape.getTop() + 1;
                    int overlap2 = otherShape.getBottom() - shape.getTop() + 1;
                    int overlap = overlap1 < overlap2 ? overlap1 : overlap2;
                    boolean addShapeToRow = false;
                    if ((((double) overlap / (double) shape.getHeight()) > minOverlap)
                            || (((double) overlap / (double) otherShape.getHeight()) > minOverlap)) {
                        addShapeToRow = true;
                    }

                    if (addShapeToRow) {
                        LOG.debug("Adding " + otherShape + " to row " + shape.getRow().getIndex());
                        if (otherShape.getRow() == null) {
                            shape.getRow().addShape(otherShape);
                        } else {
                            // two rows need to be merged
                            LOG.debug("========= Merge rows " + shape.getRow().getIndex() + " with "
                                    + otherShape.getRow().getIndex() + "==========");
                            RowOfShapes otherRow = otherShape.getRow();
                            shape.getRow().addShapes(otherRow.getShapes());
                            rows.remove(otherRow);
                        }
                    }
                } // add shape to row ?
            } // should shape be considered?
        } // next other shape
    } // next shape

    return rows;
}

From source file:com.joliciel.jochre.lexicon.LexiconErrorWriter.java

/**
 * Merges the per-fold cross-validation CSV files found in {@code evalDir}.
 * <p>
 * File names are expected to look like {@code <prefix><digit>_<suffix>_<fileType>.csv}. For each file type a
 * merged file {@code <prefix>A_<suffix>_<fileType>.csv} is written into the same directory:
 * <ul>
 * <li>for {@code KEMatrix} files the counts are accumulated per group and written through
 * {@code writeStats}, followed by the mean and standard deviation of the percentage rows;</li>
 * <li>for every other file type the lines are concatenated, keeping the header line of the first file
 * only.</li>
 * </ul>
 * Files whose name starts with {@code <prefix>A_} are skipped, so that output left behind by a previous
 * run is not mistaken for a fold. All readers and writers are closed before the method returns, whether
 * or not it succeeds.
 *
 * @param evalDir directory containing the per-fold CSV files; also receives the merged files
 * @param prefix file name prefix that precedes the single-digit fold index
 * @throws RuntimeException wrapping any {@link IOException} raised while reading or writing
 */
static void mergeCrossValidation(File evalDir, String prefix) {
    Map<String, Writer> writers = new HashMap<String, Writer>();
    boolean merged = false;
    try {
        try {
            File[] files = evalDir.listFiles(new FilenameFilter() {
                @Override
                public boolean accept(File dir, String name) {
                    return name.endsWith(".csv");
                }
            });
            String mergedPrefix = prefix + "A_";
            List<String> groupNames = new ArrayList<String>();
            Map<String, ErrorStatistics> errorMap = new LinkedHashMap<String, ErrorStatistics>();
            Map<String, Map<String, DescriptiveStatistics>> statMap = new HashMap<String, Map<String, DescriptiveStatistics>>();
            for (File file : files) {
                String filename = file.getName();
                if (filename.startsWith(mergedPrefix)) {
                    // output of an earlier merge: "A" is not a fold index and would fail Integer.parseInt
                    LOG.debug("Skipping previously merged file " + filename);
                    continue;
                }
                int index = Integer.parseInt(filename.substring(prefix.length(), prefix.length() + 1));
                String suffix = filename.substring(prefix.length() + 2, filename.lastIndexOf('_'));
                String fileType = filename.substring(filename.lastIndexOf('_') + 1, filename.lastIndexOf('.'));
                LOG.debug("Processing " + filename);
                LOG.debug("index: " + index);
                LOG.debug("suffix: " + suffix);
                LOG.debug("fileType: " + fileType);
                Writer writer = writers.get(fileType);
                boolean firstFile = false;
                if (writer == null) {
                    writer = new BufferedWriter(new OutputStreamWriter(
                            new FileOutputStream(
                                    new File(evalDir, prefix + "A_" + suffix + "_" + fileType + ".csv"), false),
                            "UTF8"));
                    writers.put(fileType, writer);
                    firstFile = true;
                }
                // Read with the same charset the merged file is written in, instead of the platform default.
                // NOTE(review): assumes the per-fold files were themselves written as UTF-8 - confirm.
                Scanner scanner = new Scanner(file, "UTF-8");
                try {
                    if (fileType.equals("KEMatrix")) {
                        int i = 0;
                        List<String> myGroupNames = new ArrayList<String>();
                        Map<String, Boolean> haveCountMap = new HashMap<String, Boolean>();
                        while (scanner.hasNextLine()) {
                            String line = scanner.nextLine();
                            List<String> cells = CSV.getCSVCells(line);
                            if (i == 0) {
                                // header line: one group name every five cells
                                for (int j = 0; j < cells.size(); j += 5) {
                                    String groupName = cells.get(j);
                                    if (!errorMap.containsKey(groupName)) {
                                        errorMap.put(groupName, new ErrorStatistics());
                                        statMap.put(groupName, new HashMap<String, DescriptiveStatistics>());
                                        groupNames.add(groupName);
                                    }
                                    myGroupNames.add(groupName);
                                }
                            } else if (i == 1) {
                                // second line is ignored
                            } else {
                                String rowName = cells.get(0);
                                int j = 0;
                                for (String groupName : myGroupNames) {
                                    ErrorStatistics errorStats = errorMap.get(groupName);
                                    Map<String, DescriptiveStatistics> stats = statMap.get(groupName);
                                    double correctCount = Double.parseDouble(cells.get(j * 5 + 1));
                                    double errorCount = Double.parseDouble(cells.get(j * 5 + 2));
                                    double totalCount = Double.parseDouble(cells.get(j * 5 + 3));
                                    Boolean haveCount = haveCountMap.get(groupName);

                                    if (rowName.equals("known")) {
                                        errorStats.knownWordCorrectCount += correctCount;
                                        errorStats.knownWordErrorCount += errorCount;
                                    } else if (rowName.equals("unknown")) {
                                        errorStats.unknownWordCorrectCount += correctCount;
                                        errorStats.unknownWordErrorCount += errorCount;
                                    } else if (rowName.equals("goodSeg")) {
                                        errorStats.goodSegCorrectCount += correctCount;
                                        errorStats.goodSegErrorCount += errorCount;
                                    } else if (rowName.equals("badSeg")) {
                                        errorStats.badSegCorrectCount += correctCount;
                                        errorStats.badSegErrorCount += errorCount;
                                    } else if (rowName.equals("knownLetters")) {
                                        errorStats.knownWordCorrectLetterCount += correctCount;
                                        errorStats.knownWordErrorLetterCount += errorCount;
                                    } else if (rowName.equals("unknownLetters")) {
                                        errorStats.unknownWordCorrectLetterCount += correctCount;
                                        errorStats.unknownWordErrorLetterCount += errorCount;
                                    } else if (rowName.equals("goodSegLetters")) {
                                        errorStats.goodSegCorrectLetterCount += correctCount;
                                        errorStats.goodSegErrorLetterCount += errorCount;
                                    } else if (rowName.equals("badSegLetters")) {
                                        errorStats.badSegCorrectLetterCount += correctCount;
                                        errorStats.badSegErrorLetterCount += errorCount;
                                    } else if (rowName.equals("inBeam")) {
                                        errorStats.answerInBeamCorrectCount += correctCount;
                                        errorStats.answerInBeamErrorCount += errorCount;
                                    } else if (rowName.equals("total")) {
                                        haveCountMap.put(groupName, totalCount > 0);
                                    } else if (rowName.endsWith("%")) {
                                        // haveCount is null when no "total" row preceded this one;
                                        // treat that as "no count" rather than failing on unboxing
                                        if (Boolean.TRUE.equals(haveCount)) {
                                            String keyPrefix = rowName.substring(0, rowName.length() - 1);
                                            String key = keyPrefix + "|correct";
                                            DescriptiveStatistics correctStat = stats.get(key);
                                            if (correctStat == null) {
                                                correctStat = new DescriptiveStatistics();
                                                stats.put(key, correctStat);
                                            }
                                            correctStat.addValue(correctCount);
                                            key = keyPrefix + "|error";
                                            DescriptiveStatistics errorStat = stats.get(key);
                                            if (errorStat == null) {
                                                errorStat = new DescriptiveStatistics();
                                                stats.put(key, errorStat);
                                            }
                                            errorStat.addValue(errorCount);
                                            key = keyPrefix + "|total";
                                            DescriptiveStatistics totalStat = stats.get(key);
                                            if (totalStat == null) {
                                                totalStat = new DescriptiveStatistics();
                                                stats.put(key, totalStat);
                                            }
                                            totalStat.addValue(totalCount);
                                        }
                                    }

                                    j++;
                                }
                            }
                            i++;
                        }
                    } else {
                        // plain concatenation: keep the header line of the first file of this type only
                        boolean firstLine = true;
                        while (scanner.hasNextLine()) {
                            String line = scanner.nextLine();
                            if (firstLine) {
                                if (firstFile)
                                    writer.write(line + "\n");
                                firstLine = false;
                            } else {
                                writer.write(line + "\n");
                            }
                        }
                        writer.flush();
                    } // file type
                } finally {
                    scanner.close();
                }
            } // next file

            Writer statsWriter = writers.get("KEMatrix");
            // no KEMatrix file was found: there are no statistics to write
            if (statsWriter != null) {
                writeStats(statsWriter, errorMap);
                statsWriter.write("\n");
                String[] statTypes = new String[] { "known", "unknown", "goodSeg", "badSeg", "inBeam", "total",
                        "knownLetter", "unknownLetter", "goodSegLetter", "badSegLetter", "totalLetter" };
                // NOTE(review): a stat type with no "%" row recorded for a group yields a null
                // DescriptiveStatistics below - confirm every stat type is always present in the input.
                for (String statType : statTypes) {
                    for (String groupName : groupNames) {
                        Map<String, DescriptiveStatistics> statsMap = statMap.get(groupName);
                        DescriptiveStatistics correctStat = statsMap.get(statType + "|correct");
                        DescriptiveStatistics errorStat = statsMap.get(statType + "|error");
                        DescriptiveStatistics totalStat = statsMap.get(statType + "|total");

                        statsWriter.write(CSV.format(statType + "%Avg") + CSV.format(correctStat.getMean())
                                + CSV.format(errorStat.getMean()) + CSV.format(totalStat.getMean())
                                + CSV.getCsvSeparator());

                    } // next group
                    statsWriter.write("\n");
                    for (String groupName : groupNames) {
                        Map<String, DescriptiveStatistics> statsMap = statMap.get(groupName);
                        DescriptiveStatistics correctStat = statsMap.get(statType + "|correct");
                        DescriptiveStatistics errorStat = statsMap.get(statType + "|error");
                        DescriptiveStatistics totalStat = statsMap.get(statType + "|total");

                        statsWriter.write(CSV.format(statType + "%Dev")
                                + CSV.format(correctStat.getStandardDeviation())
                                + CSV.format(errorStat.getStandardDeviation())
                                + CSV.format(totalStat.getStandardDeviation()) + CSV.getCsvSeparator());

                    } // next group
                    statsWriter.write("\n");
                    statsWriter.flush();
                }
            }
            merged = true;
        } finally {
            // close every merged file, not only the KEMatrix one, and also when the merge failed
            IOException closeFailure = null;
            for (Writer openWriter : writers.values()) {
                try {
                    openWriter.close();
                } catch (IOException e) {
                    LogUtils.logError(LOG, e);
                    if (closeFailure == null)
                        closeFailure = e;
                }
            }
            // only surface a close failure when nothing else went wrong, so it cannot mask the real cause
            if (merged && closeFailure != null)
                throw closeFailure;
        }
    } catch (IOException e) {
        LogUtils.logError(LOG, e);
        throw new RuntimeException(e);
    }
}

From source file:com.joliciel.jochre.graphics.SegmenterImpl.java

/**
 * Attempts to remove specks, where a speck is a relatively small shape at a relatively large
 * distance from other shapes.
 * <p>
 * Small shapes are clustered together; each cluster is kept or removed as a whole, depending on how far
 * it lies from the nearest shape that is not itself a speck. The permitted distance grows with the size
 * of the largest speck in the cluster. Removed specks are deleted from {@code shapes} in place.
 *
 * @param sourceImage the image the shapes belong to (not read by this method)
 * @param shapes the shapes to clean; the neighbour scan stops early on the assumption that they are
 *        ordered from the top down
 */
void removeSpecks(SourceImage sourceImage, List<Shape> shapes) {
    LOG.debug("########## removeSpecks #########");

    DescriptiveStatistics widthStats = new DescriptiveStatistics();
    DescriptiveStatistics heightStats = new DescriptiveStatistics();
    for (Shape candidate : shapes) {
        widthStats.addValue(candidate.getWidth());
        heightStats.addValue(candidate.getHeight());
    }

    // the reference size is the 65th percentile, whatever the log labels say
    double referenceWidth = widthStats.getPercentile(65);
    double referenceHeight = heightStats.getPercentile(65);
    LOG.debug("meanShapeWidth: " + referenceWidth);
    LOG.debug("meanShapeHeight: " + referenceHeight);

    // size limits: anything below the ceiling is a speck candidate
    int maxSpeckHeightFloor = (int) Math.ceil(referenceHeight / 6.0);
    int maxSpeckHeightCeiling = maxSpeckHeightFloor * 2;
    int maxSpeckWidthFloor = (int) Math.ceil(referenceWidth / 6.0);
    int maxSpeckWidthCeiling = maxSpeckWidthFloor * 2;

    // distance limits: the actual threshold is interpolated between floor and ceiling per cluster
    int speckXDistanceThresholdFloor = (int) Math.floor(referenceWidth);
    int speckXDistanceThresholdCeiling = speckXDistanceThresholdFloor * 2;
    int speckYDistanceThresholdFloor = (int) Math.floor(referenceHeight / 4.0);
    int speckYDistanceThresholdCeiling = speckYDistanceThresholdFloor * 2;

    LOG.debug("maxSpeckHeightFloor=" + maxSpeckHeightFloor);
    LOG.debug("maxSpeckWidthFloor=" + maxSpeckWidthFloor);
    LOG.debug("speckXDistanceThresholdFloor=" + speckXDistanceThresholdFloor);
    LOG.debug("speckYDistanceThresholdFloor=" + speckYDistanceThresholdFloor);
    LOG.debug("maxSpeckHeightCeiling=" + maxSpeckHeightCeiling);
    LOG.debug("maxSpeckWidthCeiling=" + maxSpeckWidthCeiling);
    LOG.debug("speckXDistanceThresholdCeiling=" + speckXDistanceThresholdCeiling);
    LOG.debug("speckYDistanceThresholdCeiling=" + speckYDistanceThresholdCeiling);

    List<Shape> specks = new ArrayList<Shape>();
    List<double[]> speckCoordinates = new ArrayList<double[]>();
    for (Shape candidate : shapes) {
        if (candidate.getHeight() >= maxSpeckHeightCeiling || candidate.getWidth() >= maxSpeckWidthCeiling) {
            continue;
        }
        specks.add(candidate);
        speckCoordinates.add(candidate.getCentrePoint());
    }

    // Specks are grouped into clusters that are kept or dropped together: a cluster may be
    // a genuine diacritic broken into several pieces, or just stray marks off on their own.
    DBSCANClusterer<Shape> clusterer = new DBSCANClusterer<Shape>(specks, speckCoordinates);
    Set<Set<Shape>> speckClusters = clusterer.cluster(speckXDistanceThresholdFloor, 2, true);
    List<Shape> specksToRemove = new ArrayList<Shape>();
    for (Set<Shape> speckCluster : speckClusters) {

        // largest speck dimensions and bounding box of the cluster (-1 = not yet set)
        int speckHeight = 0;
        int speckWidth = 0;
        int clusterTop = -1;
        int clusterBottom = -1;
        int clusterRight = -1;
        int clusterLeft = -1;
        for (Shape speck : speckCluster) {
            LOG.debug("Speck?, " + speck);
            speckWidth = Math.max(speckWidth, speck.getWidth());
            speckHeight = Math.max(speckHeight, speck.getHeight());

            clusterTop = (clusterTop < 0) ? speck.getTop() : Math.min(clusterTop, speck.getTop());
            clusterLeft = (clusterLeft < 0) ? speck.getLeft() : Math.min(clusterLeft, speck.getLeft());
            clusterBottom = Math.max(clusterBottom, speck.getBottom());
            clusterRight = Math.max(clusterRight, speck.getRight());
        }

        // position of the dominant dimension between its floor (0.0) and ceiling (1.0)
        boolean useWidth = speckWidth > speckHeight;
        int dominantSize = useWidth ? speckWidth : speckHeight;
        int sizeFloor = useWidth ? maxSpeckWidthFloor : maxSpeckHeightFloor;
        int sizeCeiling = useWidth ? maxSpeckWidthCeiling : maxSpeckHeightCeiling;
        double scale;
        if (dominantSize < sizeFloor) {
            scale = 0.0;
        } else if (dominantSize > sizeCeiling) {
            scale = 1.0;
        } else {
            scale = ((double) dominantSize - sizeFloor) / (sizeCeiling - sizeFloor);
        }

        int speckXDistanceThreshold = (int) Math.ceil(speckXDistanceThresholdFloor
                + scale * (speckXDistanceThresholdCeiling - speckXDistanceThresholdFloor));
        int speckYDistanceThreshold = (int) Math.ceil(speckYDistanceThresholdFloor
                + scale * (speckYDistanceThresholdCeiling - speckYDistanceThresholdFloor));

        LOG.debug("speckHeight=" + speckHeight);
        LOG.debug("speckWidth=" + speckWidth);
        LOG.debug("speckXDistanceThreshold=" + speckXDistanceThreshold);
        LOG.debug("speckYDistanceThreshold=" + speckYDistanceThreshold);

        // find the non-speck shape closest to the cluster's bounding box
        Shape nearestShape = null;
        double minDistance = 0.0;
        int nearestShapeXDiff = 0;
        int nearestShapeYDiff = 0;

        for (Shape otherShape : shapes) {
            // everything from here on lies too far below the cluster
            if (otherShape.getTop() > clusterBottom + speckYDistanceThreshold + 1) {
                break;
            }
            // too far above, to the left or to the right of the cluster
            if (otherShape.getBottom() < clusterTop - speckYDistanceThreshold - 1
                    || otherShape.getRight() < clusterLeft - speckXDistanceThreshold - 1
                    || otherShape.getLeft() > clusterRight + speckXDistanceThreshold + 1) {
                continue;
            }
            // specks themselves are never candidates for the nearest shape
            if (specks.contains(otherShape)) {
                continue;
            }

            int xDiff = 0;
            int yDiff = 0;
            int leftDiff = 0;
            int rightDiff = 0;
            int topDiff = 0;
            int botDiff = 0;

            boolean overlapsHorizontally = otherShape.getLeft() <= clusterRight
                    && otherShape.getRight() >= clusterLeft;
            if (!overlapsHorizontally) {
                leftDiff = Math.abs(clusterLeft - otherShape.getRight());
                rightDiff = Math.abs(clusterRight - otherShape.getLeft());
                xDiff = Math.min(leftDiff, rightDiff);
            }

            boolean overlapsVertically = otherShape.getTop() <= clusterBottom
                    && otherShape.getBottom() >= clusterTop;
            if (!overlapsVertically) {
                int nearestTop = Math.min(otherShape.getTop(), otherShape.getTop() + otherShape.getMeanLine());
                int nearestBot = Math.max(otherShape.getBottom(),
                        otherShape.getTop() + otherShape.getBaseLine());
                topDiff = Math.abs(clusterTop - nearestBot);
                botDiff = Math.abs(clusterBottom - nearestTop);
                yDiff = Math.min(topDiff, botDiff);
            }

            double distance = Math.sqrt((xDiff * xDiff) + (yDiff * yDiff));

            if (nearestShape == null || distance < minDistance) {
                nearestShape = otherShape;
                minDistance = distance;
                nearestShapeXDiff = xDiff;
                nearestShapeYDiff = yDiff;
                LOG.trace("leftDiff=" + leftDiff + ", rightDiff=" + rightDiff);
                LOG.trace("topDiff=" + topDiff + ", botDiff=" + botDiff);
            }
        } // next candidate neighbour

        boolean discardCluster;
        if (nearestShape == null) {
            // nothing but specks nearby
            discardCluster = true;
        } else {
            LOG.trace("Nearest shape, top(" + nearestShape.getTop() + ") " + "left(" + nearestShape.getLeft()
                    + ") " + "bot(" + nearestShape.getBottom() + ") " + "right(" + nearestShape.getRight()
                    + ")");
            LOG.trace("Distance=" + minDistance + ", xDiff=" + nearestShapeXDiff + ", yDiff="
                    + nearestShapeYDiff);

            // refine: shortest distance from the nearest shape to any individual speck of the cluster
            for (Shape speck : speckCluster) {
                int xDiff = 0;
                int yDiff = 0;
                int leftDiff = 0;
                int rightDiff = 0;
                int topDiff = 0;
                int botDiff = 0;

                boolean overlapsHorizontally = nearestShape.getLeft() <= speck.getRight()
                        && nearestShape.getRight() >= speck.getLeft();
                if (!overlapsHorizontally) {
                    leftDiff = Math.abs(speck.getLeft() - nearestShape.getRight());
                    rightDiff = Math.abs(speck.getRight() - nearestShape.getLeft());
                    xDiff = Math.min(leftDiff, rightDiff);
                }

                boolean overlapsVertically = nearestShape.getTop() <= speck.getBottom()
                        && nearestShape.getBottom() >= speck.getTop();
                if (!overlapsVertically) {
                    int nearestTop = Math.min(nearestShape.getTop(),
                            nearestShape.getTop() + nearestShape.getMeanLine());
                    int nearestBot = Math.max(nearestShape.getBottom(),
                            nearestShape.getTop() + nearestShape.getBaseLine());
                    topDiff = Math.abs(speck.getTop() - nearestBot);
                    botDiff = Math.abs(speck.getBottom() - nearestTop);
                    yDiff = Math.min(topDiff, botDiff);
                }

                double distance = Math.sqrt((xDiff * xDiff) + (yDiff * yDiff));

                if (distance < minDistance) {
                    minDistance = distance;
                    nearestShapeXDiff = xDiff;
                    nearestShapeYDiff = yDiff;
                    LOG.debug("Found closer speck:");
                    LOG.debug("leftDiff=" + leftDiff + ", rightDiff=" + rightDiff);
                    LOG.debug("topDiff=" + topDiff + ", botDiff=" + botDiff);
                }
            }

            // the whole cluster goes when even its closest speck is beyond either threshold
            discardCluster = nearestShapeXDiff > speckXDistanceThreshold
                    || nearestShapeYDiff > speckYDistanceThreshold;
        }

        if (discardCluster) {
            for (Shape otherSpeck : speckCluster) {
                LOG.debug("Removing speck " + otherSpeck);
                specksToRemove.add(otherSpeck);
            }
        }
    } // next cluster

    shapes.removeAll(specksToRemove);
}

From source file:com.facebook.presto.AbstractTestQueries.java

/**
 * Runs {@code TABLESAMPLE BERNOULLI (50)} over the orders table 100 times and checks that
 * no run yields duplicate rows and that the aggregated sampling ratio lies strictly between
 * 0.45 and 0.55.
 */
@Test
public void testTableSampleBernoulli() throws Exception {
    // Size of the unsampled result; the denominator of every observed sampling ratio.
    final int totalRows = computeExpected("SELECT orderkey FROM orders", TupleInfo.SINGLE_LONG)
            .getMaterializedTuples().size();

    final DescriptiveStatistics samplingRatios = new DescriptiveStatistics();
    final int runs = 100;
    for (int run = 1; run <= runs; run++) {
        List<MaterializedTuple> sampled = computeActual(
                "SELECT orderkey FROM ORDERS TABLESAMPLE BERNOULLI (50)").getMaterializedTuples();
        int sampledCount = sampled.size();

        // A de-duplicated copy must be exactly as large as the raw sample.
        assertEquals(sampledCount, ImmutableSet.copyOf(sampled).size(), "TABLESAMPLE produced duplicate rows");
        samplingRatios.addValue((double) sampledCount / totalRows);
    }

    // NOTE(review): the geometric mean is used here although the message speaks of a plain
    // "mean"; confirm whether getMean() was intended.
    double mean = samplingRatios.getGeometricMean();
    boolean closeToHalf = mean > 0.45 && mean < 0.55;
    assertTrue(closeToHalf, String.format("Expected mean sampling rate to be ~0.5, but was %s", mean));
}

From source file:com.joliciel.jochre.graphics.SegmenterImpl.java

/**
 * Clear out anything found in the right &amp; left margins.
 * <p>
 * Each side is handled independently, right first. Rows whose edge lies beyond 67% (right) or
 * within 33% (left) of the image width are clustered by their slope-adjusted edge coordinate.
 * The outermost cluster with at least 8 rows is taken as the margin, and the margin limit is its
 * mean edge shifted outwards by one average shape width. If three or more candidate rows straddle
 * that limit the margin is ignored. Otherwise every group lying entirely beyond the limit is
 * removed from its row, and rows left without groups are removed from the image.
 *
 * @param sourceImage the image whose rows are cleaned in place
 */
void cleanMargins(SourceImage sourceImage) {
    LOG.debug("########## cleanMargins #########");

    int minCardinalityForMargin = 8;
    // Read once up front: both clustering passes use the width as it was before any chopping.
    double averageShapeWidth = sourceImage.getAverageShapeWidth();

    cleanRightMargin(sourceImage, averageShapeWidth, minCardinalityForMargin);
    cleanLeftMargin(sourceImage, averageShapeWidth, minCardinalityForMargin);
}

/**
 * Detects the right margin and chops off every group lying entirely to the right of it.
 *
 * @param sourceImage the image whose rows are cleaned in place
 * @param clusterEpsilon distance passed to the clusterer (the average shape width)
 * @param minCardinalityForMargin minimum number of rows a cluster needs to qualify as a margin
 */
private void cleanRightMargin(SourceImage sourceImage, double clusterEpsilon, int minCardinalityForMargin) {
    LOG.debug("Finding right margin");
    double rightLimit = (double) sourceImage.getWidth() * 0.67;

    // first, create a DBScan cluster of all rows near the right-hand side
    List<RowOfShapes> rightHandRows = new ArrayList<RowOfShapes>();
    List<double[]> rightCoordinates = new ArrayList<double[]>();

    for (RowOfShapes row : sourceImage.getRows()) {
        double right = row.getRight();
        if (right >= rightLimit) {
            double adjustedRight = right - row.getXAdjustment();
            LOG.trace(row.toString());
            LOG.trace("Right: " + right + " - " + row.getXAdjustment() + " = " + adjustedRight);
            rightHandRows.add(row);
            rightCoordinates.add(new double[] { adjustedRight });
        }
    }

    DBSCANClusterer<RowOfShapes> rightMarginClusterer = new DBSCANClusterer<RowOfShapes>(rightHandRows,
            rightCoordinates);
    Set<Set<RowOfShapes>> rowClusters = rightMarginClusterer.cluster(clusterEpsilon, minCardinalityForMargin,
            true);

    TreeSet<Set<RowOfShapes>> orderedRowClusters = new TreeSet<Set<RowOfShapes>>(
            new CardinalityComparator<RowOfShapes>());
    orderedRowClusters.addAll(rowClusters);

    // find the right-most cluster with sufficient cardinality, and assume it's the right margin
    DescriptiveStatistics rightMarginStats = null;
    int i = 0;
    for (Set<RowOfShapes> cluster : orderedRowClusters) {
        DescriptiveStatistics rightStats = new DescriptiveStatistics();
        for (RowOfShapes row : cluster) {
            rightStats.addValue(row.getRight() - row.getXAdjustment());
        }

        LOG.debug("Cluster " + i + ". Cardinality=" + cluster.size());
        LOG.debug("Right mean : " + rightStats.getMean());
        LOG.debug("Right std dev: " + rightStats.getStandardDeviation());

        if (cluster.size() >= minCardinalityForMargin
                && (rightMarginStats == null || rightMarginStats.getMean() < rightStats.getMean())) {
            rightMarginStats = rightStats;
        }
        i++;
    }

    if (rightMarginStats == null) {
        // no cluster was big enough: no right margin to enforce
        return;
    }

    LOG.debug("Right margin mean : " + rightMarginStats.getMean());
    LOG.debug("Right margin std dev: " + rightMarginStats.getStandardDeviation());

    double rightMarginLimit = rightMarginStats.getMean() + sourceImage.getAverageShapeWidth();
    LOG.debug("rightMarginLimit: " + rightMarginLimit);

    // see how many rows would violate this margin - if too many, assume no margin
    // these rows are only rows which extend across the margin
    int numRowsToChop = 0;
    for (RowOfShapes row : sourceImage.getRows()) {
        if (row.getRight() >= rightLimit
                && row.getRight() - row.getXAdjustment() >= rightMarginLimit
                && row.getLeft() - row.getXAdjustment() <= rightMarginLimit) {
            LOG.debug("Found overlapping row : " + row);
            LOG.debug("Adjusted right : " + (row.getRight() - row.getXAdjustment()));
            numRowsToChop++;
        }
    }
    if (numRowsToChop >= 3) {
        LOG.debug("Too many overlapping rows - ignoring margin");
        return;
    }

    // Rows are only queued here and removed after the loop, so the list being iterated is never modified.
    List<RowOfShapes> rowsToRemove = new ArrayList<RowOfShapes>();
    for (RowOfShapes row : sourceImage.getRows()) {
        double right = row.getRight() - row.getXAdjustment();
        LOG.trace(row.toString());
        LOG.trace("Adjusted right: " + right);

        if (right >= rightMarginLimit) {
            LOG.trace("Has out-of-margin stuff!");
            // need to chop off groups to the right of this threshold
            List<GroupOfShapes> groupsToChop = new ArrayList<GroupOfShapes>();
            for (GroupOfShapes group : row.getGroups()) {
                if (group.getLeft() - row.getXAdjustment() > rightMarginLimit) {
                    groupsToChop.add(group);
                    LOG.debug("Chopping group outside of right margin: " + group);
                }
            }
            if (removeGroupsFromRow(row, groupsToChop)) {
                rowsToRemove.add(row);
            }
        } // does this row extend beyond the margin?
    } // next row
    sourceImage.getRows().removeAll(rowsToRemove);
}

/**
 * Detects the left margin and chops off every group lying entirely to the left of it.
 *
 * @param sourceImage the image whose rows are cleaned in place
 * @param clusterEpsilon distance passed to the clusterer (the average shape width)
 * @param minCardinalityForMargin minimum number of rows a cluster needs to qualify as a margin
 */
private void cleanLeftMargin(SourceImage sourceImage, double clusterEpsilon, int minCardinalityForMargin) {
    LOG.debug("Finding left margin");
    double leftLimit = (double) sourceImage.getWidth() * 0.33;

    // first, create a DBScan cluster of all rows near the left-hand side
    List<RowOfShapes> leftHandRows = new ArrayList<RowOfShapes>();
    List<double[]> leftCoordinates = new ArrayList<double[]>();

    for (RowOfShapes row : sourceImage.getRows()) {
        double left = row.getLeft();
        if (left <= leftLimit) {
            double adjustedLeft = left - row.getXAdjustment();
            LOG.trace(row.toString());
            LOG.trace("Left: " + left + " - " + row.getXAdjustment() + " = " + adjustedLeft);
            leftHandRows.add(row);
            leftCoordinates.add(new double[] { adjustedLeft });
        }
    }

    DBSCANClusterer<RowOfShapes> leftMarginClusterer = new DBSCANClusterer<RowOfShapes>(leftHandRows,
            leftCoordinates);
    Set<Set<RowOfShapes>> rowClustersLeft = leftMarginClusterer.cluster(clusterEpsilon,
            minCardinalityForMargin, true);

    TreeSet<Set<RowOfShapes>> orderedRowClustersLeft = new TreeSet<Set<RowOfShapes>>(
            new CardinalityComparator<RowOfShapes>());
    orderedRowClustersLeft.addAll(rowClustersLeft);

    // find the left-most cluster with sufficient cardinality, and assume it's the left margin
    DescriptiveStatistics leftMarginStats = null;
    int i = 0;
    for (Set<RowOfShapes> cluster : orderedRowClustersLeft) {
        DescriptiveStatistics leftStats = new DescriptiveStatistics();
        for (RowOfShapes row : cluster) {
            leftStats.addValue(row.getLeft() - row.getXAdjustment());
        }

        LOG.debug("Cluster " + i + ". Cardinality=" + cluster.size());
        LOG.debug("Left mean : " + leftStats.getMean());
        LOG.debug("Left std dev: " + leftStats.getStandardDeviation());

        if (cluster.size() >= minCardinalityForMargin
                && (leftMarginStats == null || leftMarginStats.getMean() > leftStats.getMean())) {
            leftMarginStats = leftStats;
        }
        i++;
    }

    if (leftMarginStats == null) {
        // no cluster was big enough: no left margin to enforce
        return;
    }

    LOG.debug("Left margin mean : " + leftMarginStats.getMean());
    LOG.debug("Left margin std dev: " + leftMarginStats.getStandardDeviation());

    double leftMarginLimit = leftMarginStats.getMean() - sourceImage.getAverageShapeWidth();
    LOG.debug("leftMarginLimit: " + leftMarginLimit);

    // see how many rows would violate this margin - if too many, assume no margin
    // these rows are only rows which extend across the margin
    int numRowsToChop = 0;
    for (RowOfShapes row : sourceImage.getRows()) {
        if (row.getLeft() <= leftLimit
                && row.getLeft() - row.getXAdjustment() <= leftMarginLimit
                && row.getRight() - row.getXAdjustment() >= leftMarginLimit) {
            LOG.debug("Found overlapping row : " + row);
            LOG.debug("Adjusted left : " + (row.getLeft() - row.getXAdjustment()));
            numRowsToChop++;
        }
    }
    if (numRowsToChop >= 3) {
        LOG.debug("Too many overlapping rows - ignoring margin");
        return;
    }

    // Rows are only queued here and removed after the loop, so the list being iterated is never modified.
    List<RowOfShapes> rowsToRemove = new ArrayList<RowOfShapes>();
    for (RowOfShapes row : sourceImage.getRows()) {
        double left = row.getLeft() - row.getXAdjustment();
        LOG.trace(row.toString());
        LOG.trace("Adjusted left: " + left);

        if (left <= leftMarginLimit) {
            LOG.trace("Has out-of-margin stuff!");
            // need to chop off groups to the left of this threshold
            List<GroupOfShapes> groupsToChop = new ArrayList<GroupOfShapes>();
            for (GroupOfShapes group : row.getGroups()) {
                if (group.getRight() - row.getXAdjustment() < leftMarginLimit) {
                    groupsToChop.add(group);
                    LOG.debug("Chopping group outside of left margin: " + group);
                }
            }
            if (removeGroupsFromRow(row, groupsToChop)) {
                rowsToRemove.add(row);
            }
        } // does this row extend beyond the margin?
    } // next row
    sourceImage.getRows().removeAll(rowsToRemove);
}

/**
 * Removes the given groups and their shapes from the row. If groups remain, the row's
 * {@code recalculate()} and {@code assignGuideLines()} are invoked.
 *
 * @param row the row to strip
 * @param groupsToChop the groups of that row which lie outside the margin
 * @return {@code true} if the row has no groups left and should be removed from the image
 */
private boolean removeGroupsFromRow(RowOfShapes row, List<GroupOfShapes> groupsToChop) {
    for (GroupOfShapes group : groupsToChop) {
        row.getShapes().removeAll(group.getShapes());
    }
    row.getGroups().removeAll(groupsToChop);

    if (row.getGroups().isEmpty()) {
        LOG.debug("Removing empty " + row);
        return true;
    }
    row.recalculate();
    row.assignGuideLines();
    return false;
}

From source file:com.joliciel.jochre.graphics.SegmenterImpl.java

/**
 * Detects paragraph splits and assign rows to correct paragraphs.
 * @param sourceImage//from  w w w  .  jav a 2 s  .  c o m
 */
void groupRowsIntoParagraphs(SourceImage sourceImage) {
    LOG.debug("########## groupRowsIntoParagraphs #########");
    // We'll use various possible indicators, including
    // indented start, indented end, and spacing between rows.

    // On pages with a single big paragraph makes it hypersensitive to differences in row-start/row-end
    // This means we cannot use deviation. Instead, we use the average shape width on the page.
    // We also adjust maxLeft & minRight to match the vertical line slope

    // This is now complicated by the possibility of multiple columns

    // Need to take into account a big horizontal space - Pietrushka page 14
    // Find horizontal spaces that go all the way across and are wider than a certain threshold
    // simply do a boolean column and black out everything in a row, than see if there are any remaining spaces above a certain threshold
    // Columns are thus arranged into "areas", separated by white-space.
    boolean[] fullRows = new boolean[sourceImage.getHeight()];
    for (RowOfShapes row : sourceImage.getRows()) {
        for (int y = row.getTop(); y <= row.getBottom(); y++) {
            fullRows[y] = true;
        }
    }
    DescriptiveStatistics rowHeightStats = new DescriptiveStatistics();

    for (RowOfShapes row : sourceImage.getRows()) {
        int height = row.getXHeight();
        rowHeightStats.addValue(height);
    }
    double avgRowHeight = rowHeightStats.getPercentile(50);
    LOG.debug("meanRowHeight: " + avgRowHeight);
    double minHeightForWhiteSpace = avgRowHeight * 1.3;
    LOG.debug("minHeightForWhiteSpace: " + minHeightForWhiteSpace);

    // find the "white rows" - any horizontal white space
    // in the page which is sufficiently high
    List<int[]> whiteRows = new ArrayList<int[]>();
    boolean inWhite = false;
    int startWhite = 0;
    for (int y = 0; y < sourceImage.getHeight(); y++) {
        if (!inWhite && !fullRows[y]) {
            inWhite = true;
            startWhite = y;
        } else if (inWhite && fullRows[y]) {
            int length = y - startWhite;
            if (length > minHeightForWhiteSpace) {
                LOG.debug("Adding whiteRow " + startWhite + "," + (y - 1));
                whiteRows.add(new int[] { startWhite, y - 1 });
            }
            inWhite = false;
        }
    }
    if (inWhite)
        whiteRows.add(new int[] { startWhite, sourceImage.getHeight() - 1 });
    whiteRows.add(new int[] { sourceImage.getHeight(), sourceImage.getHeight() });

    // place rows in "areas" defined by the "white rows" found above
    List<List<RowOfShapes>> areas = new ArrayList<List<RowOfShapes>>();
    int startY = -1;
    for (int[] whiteRow : whiteRows) {
        List<RowOfShapes> area = new ArrayList<RowOfShapes>();
        for (RowOfShapes row : sourceImage.getRows()) {
            if (row.getTop() >= startY && row.getBottom() <= whiteRow[0]) {
                area.add(row);
            }
        }
        if (area.size() > 0) {
            areas.add(area);
        }
        startY = whiteRow[1];
    }

    // break up each area into vertical columns
    LOG.debug("break up each area into vertical columns");
    List<Column> columns = new ArrayList<Column>();
    List<List<Column>> columnsPerAreaList = new ArrayList<List<Column>>();
    for (List<RowOfShapes> area : areas) {
        LOG.debug("Next area");
        List<Column> columnsPerArea = new ArrayList<SegmenterImpl.Column>();
        columnsPerAreaList.add(columnsPerArea);
        TreeSet<RowOfShapes> rows = new TreeSet<RowOfShapes>(new RowOfShapesVerticalLocationComparator());
        rows.addAll(area);
        for (RowOfShapes row : rows) {
            // try to place this row in one of the columns directly above it.
            // this means that a row which overlaps more than one column has to "close" this column, so it is no longer considered
            List<Column> overlappingColumns = new ArrayList<Column>();
            for (Column column : columnsPerArea) {
                if (!column.closed) {
                    RowOfShapes lastRowInColumn = column.get(column.size() - 1);
                    if (row.getRight() - row.getXAdjustment() >= lastRowInColumn.getLeft()
                            - lastRowInColumn.getXAdjustment()
                            && row.getLeft() - row.getXAdjustment() <= lastRowInColumn.getRight()
                                    - lastRowInColumn.getXAdjustment()) {
                        overlappingColumns.add(column);
                    }
                }
            }
            if (overlappingColumns.size() == 1) {
                Column myColumn = overlappingColumns.get(0);
                RowOfShapes lastRowInMyColumn = myColumn.get(0);

                // close any columns that are now at a distance of more than one row
                for (Column column : columnsPerArea) {
                    if (!column.closed && !column.equals(myColumn)) {
                        RowOfShapes lastRowInColumn = column.get(column.size() - 1);
                        if (lastRowInMyColumn.getTop() > lastRowInColumn.getBottom()) {
                            column.closed = true;
                            LOG.debug("Closing distant column " + lastRowInColumn);
                        }
                    }
                }

                myColumn.add(row);
                LOG.debug(row.toString());
                LOG.debug("  added to column " + lastRowInMyColumn);
            } else {
                for (Column overlappingColumn : overlappingColumns) {
                    overlappingColumn.closed = true;
                    RowOfShapes lastRowInColumn = overlappingColumn.get(overlappingColumn.size() - 1);
                    LOG.debug("Closing overlapping column " + lastRowInColumn);
                }
                Column myColumn = new Column(sourceImage);
                myColumn.add(row);
                LOG.debug("Found new column");
                LOG.debug(row.toString());
                columns.add(myColumn);
                columnsPerArea.add(myColumn);
            }
        }
    } // next area

    for (Column column : columns)
        column.recalculate();

    // Intermediate step to reform the vertical columns, if they exist
    // basically the idea is that if the columns are aligned vertically, then the thresholds for paragraph indents
    // should be shared, to increase the statistical sample size and reduce anomalies.
    // We'll assume that two columns from two consecutive areas are in the same vertical group if they overlap with each other horizontally
    // and don't overlap with any other column in the other column's area.
    List<List<Column>> columnGroups = new ArrayList<List<Column>>();
    List<Column> columnsInPrevArea = null;
    for (List<Column> columnsPerArea : columnsPerAreaList) {
        if (columnsInPrevArea != null) {
            for (Column prevColumn : columnsInPrevArea) {
                LOG.debug("Checking " + prevColumn);
                // find the column group containing the previous column
                List<Column> myColumnGroup = null;
                for (List<Column> columnGroup : columnGroups) {
                    if (columnGroup.contains(prevColumn)) {
                        myColumnGroup = columnGroup;
                        break;
                    }
                }
                if (myColumnGroup == null) {
                    myColumnGroup = new ArrayList<SegmenterImpl.Column>();
                    LOG.debug("Creating column group for column " + prevColumn.toString());
                    columnGroups.add(myColumnGroup);
                    myColumnGroup.add(prevColumn);
                }

                // does only one column overlap with this one?
                Column overlappingColumn = null;
                for (Column column : columnsPerArea) {
                    if (column.adjustedRight >= prevColumn.adjustedLeft
                            && column.adjustedLeft <= prevColumn.adjustedRight) {
                        if (overlappingColumn == null) {
                            LOG.debug("I overlap with " + column);

                            overlappingColumn = column;
                        } else {
                            LOG.debug("But I overlap also with " + column);

                            overlappingColumn = null;
                            break;
                        }
                    }
                }
                if (overlappingColumn != null) {
                    // does it overlap with only me?
                    for (Column otherPrevColumn : columnsInPrevArea) {
                        if (otherPrevColumn.equals(prevColumn))
                            continue;
                        if (overlappingColumn.adjustedRight >= otherPrevColumn.adjustedLeft
                                && overlappingColumn.adjustedLeft <= otherPrevColumn.adjustedRight) {
                            LOG.debug("But it overlaps also with " + otherPrevColumn);
                            overlappingColumn = null;
                            break;
                        }
                    }
                }
                if (overlappingColumn != null) {
                    myColumnGroup.add(overlappingColumn);
                    LOG.debug("Adding " + overlappingColumn);
                    LOG.debug(" to group with " + prevColumn);
                }

            } // next previous column
        } // have previous columns
        columnsInPrevArea = columnsPerArea;
    } // next area
    if (columnsInPrevArea != null) {
        for (Column prevColumn : columnsInPrevArea) {
            // find the column group containing the previous column
            List<Column> myColumnGroup = null;
            for (List<Column> columnGroup : columnGroups) {
                if (columnGroup.contains(prevColumn)) {
                    myColumnGroup = columnGroup;
                    break;
                }
            }
            if (myColumnGroup == null) {
                myColumnGroup = new ArrayList<SegmenterImpl.Column>();
                LOG.debug("Creating column group for column " + prevColumn.toString());
                columnGroups.add(myColumnGroup);
                myColumnGroup.add(prevColumn);
            }
        }
    }

    // What we really want here is, for each column (in the case of right-to-left),
    // two clusters on the right
    // and one relatively big cluster on the left.
    // anything outside of the cluster on the left is an EOP.
    boolean hasTab = false;
    for (List<Column> columnGroup : columnGroups) {
        LOG.debug("Next column group");
        double averageShapeWidth = sourceImage.getAverageShapeWidth();
        LOG.debug("averageShapeWidth: " + averageShapeWidth);
        double epsilon = averageShapeWidth / 2.0;
        LOG.debug("epsilon: " + epsilon);

        int columnGroupTop = sourceImage.getHeight();
        int columnGroupBottom = 0;
        int columnGroupLeft = sourceImage.getWidth();
        int columnGroupRight = 0;
        for (Column column : columnGroup) {
            if (column.top < columnGroupTop)
                columnGroupTop = (int) Math.round(column.top);
            if (column.bottom > columnGroupBottom)
                columnGroupBottom = (int) Math.round(column.bottom);
            if (column.adjustedLeft < columnGroupLeft)
                columnGroupLeft = (int) Math.round(column.adjustedLeft);
            if (column.adjustedRight > columnGroupRight)
                columnGroupRight = (int) Math.round(column.adjustedRight);
        }

        // right thresholds
        LOG.debug("Calculating right thresholds");

        // first, create a DBScan cluster of all rows by their adjusted right coordinate
        List<RowOfShapes> rightHandRows = new ArrayList<RowOfShapes>();
        List<double[]> rightCoordinates = new ArrayList<double[]>();

        for (Column column : columnGroup) {
            for (RowOfShapes row : column) {
                double right = row.getRight() - row.getXAdjustment();
                //               double rightOverlap = this.findLargeShapeOverlapOnRight(row, column, sourceImage);
                //               if (rightOverlap==0) {
                //                  // leave out any right-overlapping rows here
                //                  // since we need accurate statistics for margin detection
                //               // This is questionable - especially since a long vertical bar (see Petriushka)
                //               // tends to give all rows a left overlap. Also, because the overlap is calculated based
                //               // on the mean right & mean left, not based on any sort of margin clusters.
                //                  rightHandRows.add(row);
                //                  rightCoordinates.add(new double[] {right});
                //               }
                rightHandRows.add(row);
                rightCoordinates.add(new double[] { right });

            }
        }

        int minCardinalityForRightMargin = 5;
        DBSCANClusterer<RowOfShapes> rightMarginClusterer = new DBSCANClusterer<RowOfShapes>(rightHandRows,
                rightCoordinates);
        Set<Set<RowOfShapes>> rowClusters = rightMarginClusterer.cluster(epsilon, minCardinalityForRightMargin,
                true);

        TreeSet<Set<RowOfShapes>> orderedRowClusters = new TreeSet<Set<RowOfShapes>>(
                new CardinalityComparator<RowOfShapes>());
        orderedRowClusters.addAll(rowClusters);

        int i = 0;

        // find the two right-most clusters, and assume they are the margin & the tab
        DescriptiveStatistics rightMarginStats = null;
        DescriptiveStatistics rightTabStats = null;
        for (Set<RowOfShapes> cluster : orderedRowClusters) {
            DescriptiveStatistics rightStats = new DescriptiveStatistics();
            MeanAbsoluteDeviation rightDev = new MeanAbsoluteDeviation();
            for (RowOfShapes row : cluster) {
                int rowIndex = rightHandRows.indexOf(row);
                double right = rightCoordinates.get(rowIndex)[0];
                rightStats.addValue(right);
                rightDev.increment(right);
            }

            LOG.debug("Cluster " + i + ". Cardinality=" + cluster.size());
            LOG.debug("Right mean : " + rightStats.getMean());
            LOG.debug("Right dev: " + rightDev.getResult());

            if (cluster.size() >= minCardinalityForRightMargin) {
                if (rightMarginStats == null || rightMarginStats.getMean() < rightStats.getMean()) {
                    if (rightMarginStats != null)
                        rightTabStats = rightMarginStats;
                    rightMarginStats = rightStats;
                } else if (rightTabStats == null || rightTabStats.getMean() < rightStats.getMean()) {
                    rightTabStats = rightStats;
                }
            } else {
                break;
            }
            i++;
        } // next right-coordinate cluster

        double rightMargin = sourceImage.getWidth();
        double rightTab = sourceImage.getWidth();
        if (rightMarginStats != null) {
            rightMargin = rightMarginStats.getMean();
        } else {
            List<Rectangle> columnSeparators = sourceImage.findColumnSeparators();
            for (Rectangle columnSeparator : columnSeparators) {
                if (columnSeparator.getTop() <= columnGroupTop
                        && columnSeparator.getBottom() >= columnGroupBottom
                        && columnSeparator.getLeft() >= columnGroupRight) {
                    if (columnSeparator.getLeft() < rightMargin)
                        rightMargin = columnSeparator.getLeft();
                }
            }
        }
        if (rightTabStats != null) {
            rightTab = rightTabStats.getMean();
        }

        LOG.debug("rightMargin: " + rightMargin);
        LOG.debug("rightTab: " + rightTab);

        // left thresholds
        LOG.debug("Calculating left thresholds");

        // first, create a DBScan cluster of all rows by their adjusted left coordinate
        List<RowOfShapes> leftHandRows = new ArrayList<RowOfShapes>();
        List<double[]> leftCoordinates = new ArrayList<double[]>();

        for (Column column : columnGroup) {
            for (RowOfShapes row : column) {
                double left = row.getLeft() - row.getXAdjustment();
                //               double leftOverlap = this.findLargeShapeOverlapOnLeft(row, column, sourceImage);
                //               if (leftOverlap == 0) {
                //                  // leave out any overlapping rows from margin calcs,
                //                  // since we need accurate statistics here
                //                  leftHandRows.add(row);
                //                  leftCoordinates.add(new double[] {left});
                //               }
                leftHandRows.add(row);
                leftCoordinates.add(new double[] { left });
            }
        }

        int minCardinalityForLeftMargin = 5;
        DBSCANClusterer<RowOfShapes> leftMarginClusterer = new DBSCANClusterer<RowOfShapes>(leftHandRows,
                leftCoordinates);
        Set<Set<RowOfShapes>> leftRowClusters = leftMarginClusterer.cluster(epsilon,
                minCardinalityForLeftMargin, true);

        TreeSet<Set<RowOfShapes>> orderedLeftRowClusters = new TreeSet<Set<RowOfShapes>>(
                new CardinalityComparator<RowOfShapes>());
        orderedLeftRowClusters.addAll(leftRowClusters);

        i = 0;

        // find the two left-most clusters, and assume they are the margin & the tab
        DescriptiveStatistics leftMarginStats = null;
        DescriptiveStatistics leftTabStats = null;
        for (Set<RowOfShapes> cluster : orderedLeftRowClusters) {
            DescriptiveStatistics leftStats = new DescriptiveStatistics();
            MeanAbsoluteDeviation leftDev = new MeanAbsoluteDeviation();
            for (RowOfShapes row : cluster) {
                int rowIndex = leftHandRows.indexOf(row);
                double left = leftCoordinates.get(rowIndex)[0];
                leftStats.addValue(left);
                leftDev.increment(left);
            }

            LOG.debug("Cluster " + i + ". Cardinality=" + cluster.size());
            LOG.debug("Left mean : " + leftStats.getMean());
            LOG.debug("Left dev: " + leftDev.getResult());

            if (cluster.size() >= minCardinalityForLeftMargin) {
                if (leftMarginStats == null || leftMarginStats.getMean() > leftStats.getMean()) {
                    if (leftMarginStats != null)
                        leftTabStats = leftMarginStats;
                    leftMarginStats = leftStats;
                } else if (leftTabStats == null || leftTabStats.getMean() > leftStats.getMean()) {
                    leftTabStats = leftStats;
                }
            } else {
                break;
            }
            i++;
        } // next left-coordinate cluster

        double leftMargin = 0;
        double leftTab = 0;
        if (leftMarginStats != null) {
            leftMargin = leftMarginStats.getMean();
        } else {
            List<Rectangle> columnSeparators = sourceImage.findColumnSeparators();
            for (Rectangle columnSeparator : columnSeparators) {
                if (columnSeparator.getTop() <= columnGroupTop
                        && columnSeparator.getBottom() >= columnGroupBottom
                        && columnSeparator.getRight() <= columnGroupLeft) {
                    if (columnSeparator.getRight() > leftMargin)
                        leftMargin = columnSeparator.getRight();
                }
            }
        }
        if (leftTabStats != null) {
            leftTab = leftTabStats.getMean();
        }

        LOG.debug("leftMargin: " + leftMargin);
        LOG.debug("leftTab: " + leftTab);

        for (Column column : columnGroup) {
            if (sourceImage.isLeftToRight()) {
                column.startMargin = leftMargin;
                if (leftTabStats != null) {
                    column.startTab = leftTab;
                    column.hasTab = true;
                } else {
                    LOG.debug("No left tab - setting based on left margin");
                    column.startTab = leftMargin + (5.0 * sourceImage.getAverageShapeWidth());
                    column.hasTab = false;
                }

                column.endMargin = rightMargin;
            } else {
                column.startMargin = rightMargin;
                if (rightTabStats != null) {
                    column.startTab = rightTab;
                    column.hasTab = true;
                } else {
                    LOG.debug("No right tab - setting based on right margin");
                    column.startTab = rightMargin - (5.0 * sourceImage.getAverageShapeWidth());
                    column.hasTab = false;
                }

                column.endMargin = leftMargin;
            }
            LOG.debug("Margins for " + column);
            LOG.debug("startMargin: " + column.startMargin);
            LOG.debug("startTab: " + column.startTab);
            LOG.debug("endMargin: " + column.endMargin);
        } // next column
    } // next column group
    LOG.debug("hasTab: " + hasTab);

    double safetyMargin = 1.5 * sourceImage.getAverageShapeWidth();

    // Now, paragraphs are either "indented", "outdented" or not "dented" at all (no tabs).
    // This applies to the entire page.
    // To recognise indenting vs. outdenting, we have to see if the row preceding each
    // indent/outdent is full or partial. In the case of indentation, partial rows will
    // typically be followed by an indent. In the case of outdentation, partial rows will
    // typically be followed by an outdent.
    boolean isIndented = true;

    int indentCount = 0;
    int outdentCount = 0;
    for (List<Column> columnGroup : columnGroups) {
        LOG.debug("Next column group");
        boolean prevRowPartial = false;
        for (Column column : columnGroup) {
            if (column.hasTab) {
                for (RowOfShapes row : column) {
                    if (sourceImage.isLeftToRight()) {
                        if (prevRowPartial) {
                            if (row.getLeft() - row.getXAdjustment() > column.startTab - safetyMargin) {
                                indentCount++;
                            } else if (row.getLeft() - row.getXAdjustment() < column.startMargin
                                    + safetyMargin) {
                                outdentCount++;
                            }
                        }
                        if (row.getRight() - row.getXAdjustment() < column.endMargin - safetyMargin) {
                            prevRowPartial = true;
                        } else {
                            prevRowPartial = false;
                        }
                    } else {
                        if (prevRowPartial) {
                            if (row.getRight() - row.getXAdjustment() < column.startTab + safetyMargin) {
                                indentCount++;
                            } else if (row.getRight() - row.getXAdjustment() > column.startMargin
                                    - safetyMargin) {
                                outdentCount++;
                            }
                        }
                        if (row.getLeft() - row.getXAdjustment() > column.endMargin + safetyMargin) {
                            prevRowPartial = true;
                        } else {
                            prevRowPartial = false;
                        }
                    } // left-to-right?
                } // next row  
            } // column has tab
        } // next column
    } // next column group
    isIndented = (indentCount + 2 >= outdentCount);
    LOG.debug("indentCount: " + indentCount);
    LOG.debug("outdentCount: " + outdentCount);
    LOG.debug("isIndented: " + isIndented);

    // order the columns
    TreeSet<Column> orderedColumns = new TreeSet<SegmenterImpl.Column>(columns);
    columns.clear();
    columns.addAll(orderedColumns);

    // find the paragraphs found in each column
    for (Column column : columns) {
        LOG.debug("--- Next column ---");

        // break up the column into paragraphs 
        Paragraph paragraph = null;
        RowOfShapes previousRow = null;
        int maxShapesForStandaloneParagraph = 2;
        List<RowOfShapes> rowsForStandaloneParagraphs = new ArrayList<RowOfShapes>();
        Point2D previousPointStartMargin = null;
        Point2D previousPointStartTab = null;
        Point2D previousPointEndMargin = null;

        for (RowOfShapes row : column) {
            boolean rowForStandaloneParagraph = false;
            boolean newParagraph = false;
            if (row.getShapes().size() <= maxShapesForStandaloneParagraph) {
                rowsForStandaloneParagraphs.add(row);
                rowForStandaloneParagraph = true;
            } else {
                double rightOverlap = this.findLargeShapeOverlapOnRight(row, column, sourceImage);
                double leftOverlap = this.findLargeShapeOverlapOnLeft(row, column, sourceImage);

                if (drawSegmentation) {
                    double rowVerticalMidPoint = row.getBaseLineMiddlePoint();
                    double startMarginX = column.startMargin + row.getXAdjustment();
                    double startTabX = column.startTab + row.getXAdjustment();
                    double endMarginX = column.endMargin + row.getXAdjustment();

                    if (sourceImage.isLeftToRight()) {
                        startMarginX += safetyMargin;
                        startTabX -= safetyMargin;
                        endMarginX -= safetyMargin;

                        startMarginX += leftOverlap;
                        startTabX += leftOverlap;
                        endMarginX -= rightOverlap;
                    } else {
                        startMarginX -= safetyMargin;
                        startTabX += safetyMargin;
                        endMarginX += safetyMargin;

                        startMarginX -= rightOverlap;
                        startTabX -= rightOverlap;
                        endMarginX += leftOverlap;
                    }

                    Point2D.Double currentPointStartMargin = new Point2D.Double(startMarginX,
                            rowVerticalMidPoint);
                    Point2D.Double currentPointStartTab = new Point2D.Double(startTabX, rowVerticalMidPoint);
                    Point2D.Double currentPointEndMargin = new Point2D.Double(endMarginX, rowVerticalMidPoint);

                    if (previousPointStartMargin != null) {
                        graphics2D.setStroke(new BasicStroke(1));
                        graphics2D.setPaint(Color.BLUE);
                        graphics2D.drawLine((int) Math.round(previousPointStartMargin.getX()),
                                (int) Math.round(previousPointStartMargin.getY()),
                                (int) Math.round(currentPointStartMargin.getX()),
                                (int) Math.round(currentPointStartMargin.getY()));
                        graphics2D.drawLine((int) Math.round(previousPointEndMargin.getX()),
                                (int) Math.round(previousPointEndMargin.getY()),
                                (int) Math.round(currentPointEndMargin.getX()),
                                (int) Math.round(currentPointEndMargin.getY()));

                        graphics2D.setPaint(Color.RED);
                        graphics2D.drawLine((int) Math.round(previousPointStartTab.getX()),
                                (int) Math.round(previousPointStartTab.getY()),
                                (int) Math.round(currentPointStartTab.getX()),
                                (int) Math.round(currentPointStartTab.getY()));

                        graphics2D.setPaint(Color.RED);
                        graphics2D.drawLine((int) Math.round(previousPointEndMargin.getX()),
                                (int) Math.round(previousPointEndMargin.getY()),
                                (int) Math.round(currentPointEndMargin.getX()),
                                (int) Math.round(currentPointEndMargin.getY()));
                    }
                    previousPointStartMargin = currentPointStartMargin;
                    previousPointStartTab = currentPointStartTab;
                    previousPointEndMargin = currentPointEndMargin;
                }

                if (previousRow == null) {
                    LOG.debug("New paragraph (first)");
                    newParagraph = true;
                } else {
                    if (sourceImage.isLeftToRight()) {
                        if (previousRow.getRight() - previousRow.getXAdjustment()
                                - rightOverlap < column.endMargin - safetyMargin) {
                            LOG.debug("New paragraph (previous EOP)");
                            newParagraph = true;
                        } else if (column.hasTab && isIndented && row.getLeft() - row.getXAdjustment()
                                + leftOverlap > column.startTab - safetyMargin) {
                            LOG.debug("New paragraph (indent)");
                            newParagraph = true;
                        } else if (column.hasTab && !isIndented && row.getLeft() - row.getXAdjustment()
                                + leftOverlap < column.startMargin + safetyMargin) {
                            LOG.debug("New paragraph (outdent)");
                            newParagraph = true;
                        }
                    } else {
                        if (previousRow.getLeft() - previousRow.getXAdjustment()
                                + leftOverlap > column.endMargin + safetyMargin) {
                            LOG.debug("New paragraph (previous EOP)");
                            newParagraph = true;
                        } else if (column.hasTab && isIndented && row.getRight() - row.getXAdjustment()
                                - rightOverlap < column.startTab + safetyMargin) {
                            LOG.debug("New paragraph (indent)");
                            newParagraph = true;
                        } else if (column.hasTab && !isIndented && row.getRight() - row.getXAdjustment()
                                - rightOverlap > column.startMargin - safetyMargin) {
                            LOG.debug("New paragraph (outdent)");
                            newParagraph = true;
                        }
                    } // left-to-right?
                } // have previous row
            } // standalone paragraph?

            if (!rowForStandaloneParagraph)
                LOG.debug(row.toString());

            if (newParagraph) {
                if (rowsForStandaloneParagraphs.size() > 0) {
                    for (RowOfShapes oneRow : rowsForStandaloneParagraphs) {
                        LOG.debug("Standalone paragraph");
                        LOG.debug("Standalone row: left(" + oneRow.getLeft() + "), top(" + oneRow.getTop()
                                + "), right(" + oneRow.getRight() + "), bottom(" + oneRow.getBottom() + ")");
                        Paragraph standaloneParagraph = sourceImage.newParagraph();
                        standaloneParagraph.getRows().add(oneRow);
                    }
                    rowsForStandaloneParagraphs.clear();
                }
                paragraph = sourceImage.newParagraph();
            }
            //LOG.debug("Row: left(" + row.getLeft() + "), right(" + row.getRight() + "), width(" + (row.getRight() - row.getLeft() + 1) + ")");

            if (!rowForStandaloneParagraph) {
                paragraph.getRows().add(row);
                previousRow = row;
            }
        } // next row in column
        if (rowsForStandaloneParagraphs.size() > 0) {
            for (RowOfShapes oneRow : rowsForStandaloneParagraphs) {
                LOG.debug("Standalone paragraph");
                LOG.debug("Standalone row: left(" + oneRow.getLeft() + "), top(" + oneRow.getTop() + "), right("
                        + oneRow.getRight() + "), bottom(" + oneRow.getBottom() + ")");
                Paragraph standaloneParagraph = sourceImage.newParagraph();
                standaloneParagraph.getRows().add(oneRow);
            }
            rowsForStandaloneParagraphs.clear();
        }
    } // next column

}

From source file:org.apache.eagle.service.jpm.suggestion.AbstractGCFunc.java

/**
 * Returns the ratio of the mean {@code GC_MILLISECONDS} counter to the mean
 * {@code CPU_MILLISECONDS} counter over the given tasks.
 *
 * @param tasks the tasks whose counters are read
 * @return {@code 0} when {@code tasks} is empty; otherwise the mean GC value divided by the
 *         mean CPU value, with a CPU mean of zero replaced by {@code 1} so the division
 *         never has a zero divisor
 */
private double getGcRatio(List<TaskExecutionAPIEntity> tasks) {
    if (tasks.isEmpty()) {
        return 0;
    }

    double[] gcValues = ResourceUtils.getCounterValues(tasks, JobCounters.CounterName.GC_MILLISECONDS);
    double[] cpuValues = ResourceUtils.getCounterValues(tasks, JobCounters.CounterName.CPU_MILLISECONDS);

    // The instance is used only to obtain its mean implementation; no values are added to it.
    DescriptiveStatistics stats = new DescriptiveStatistics();
    double meanCpu = stats.getMeanImpl().evaluate(cpuValues);
    double meanGc = stats.getMeanImpl().evaluate(gcValues);

    // Guard against dividing by zero when no CPU time was recorded.
    double divisor = (meanCpu == 0) ? 1 : meanCpu;
    return meanGc / divisor;
}

From source file:org.apache.eagle.service.jpm.suggestion.AbstractInputFunc.java

/**
 * Compares the mean of {@code counterName} between the short and long task groups obtained
 * from {@code data} and wraps the resulting deviation suggestions in a response.
 *
 * @param data the task group response passed to {@code getTasks}
 * @return a response whose {@code suggestionResults} come from {@code getDeviationSuggest}
 *         and whose {@code suggestionType} is {@code suggestType.toString()}
 */
@Override
public JobSuggestionResponse apply(TaskGroupResponse data) {
    MRTaskExecutionResponse.TaskGroup group = getTasks(data);
    double[] shortTaskValues = ResourceUtils.getCounterValues(group.shortTasks, counterName);
    double[] longTaskValues = ResourceUtils.getCounterValues(group.longTasks, counterName);

    // The instance is used only to obtain its mean implementation; no values are added to it.
    DescriptiveStatistics stats = new DescriptiveStatistics();
    double shortTaskMean = stats.getMeanImpl().evaluate(shortTaskValues);
    double longTaskMean = stats.getMeanImpl().evaluate(longTaskValues);

    // Suggestions are computed before the response object is created.
    List<MRTaskExecutionResponse.SuggestionResult> deviationSuggestions =
            getDeviationSuggest(shortTaskMean, longTaskMean);

    MRTaskExecutionResponse.JobSuggestionResponse result =
            new MRTaskExecutionResponse.JobSuggestionResponse();
    result.suggestionResults = deviationSuggestions;
    result.suggestionType = suggestType.toString();
    return result;
}

From source file:org.apache.jackrabbit.performance.AbstractPerformanceTest.java

/**
 * Runs the given test against the repository and records the value returned by each
 * measured execution.
 * <p>
 * After {@code setUp}, the test is executed repeatedly until {@code warmup} seconds have
 * elapsed (results discarded), then repeatedly until {@code runtime} seconds have elapsed
 * (each result added to the returned statistics). {@code tearDown} is always invoked once
 * {@code setUp} has returned, even if an execution throws.
 *
 * @param test the test to set up, execute and tear down
 * @param repository the repository handed to the test's {@code setUp}
 * @return the statistics holding one value per measured execution
 * @throws Exception propagated from the test's {@code setUp}, {@code execute} or
 *         {@code tearDown}
 */
private DescriptiveStatistics runTest(AbstractTest test, Repository repository) throws Exception {
    DescriptiveStatistics statistics = new DescriptiveStatistics();

    test.setUp(repository, credentials);
    try {
        // Run a few iterations to warm up the system.
        // 1000L forces the seconds-to-milliseconds conversion into long arithmetic, so the
        // product cannot overflow before it is added to the timestamp if the configured
        // duration is an int.
        long warmupEnd = System.currentTimeMillis() + warmup * 1000L;
        while (System.currentTimeMillis() < warmupEnd) {
            test.execute();
        }

        // Run test iterations, and capture the execution times
        long runtimeEnd = System.currentTimeMillis() + runtime * 1000L;
        while (System.currentTimeMillis() < runtimeEnd) {
            statistics.addValue(test.execute());
        }
    } finally {
        test.tearDown();
    }

    return statistics;
}

From source file:org.apache.sling.junit.performance.listener.StatisticsListener.java

/**
 * Replaces {@code statistics} with a new, empty {@link DescriptiveStatistics} when an
 * execution starts.
 *
 * @param className not used by this implementation
 * @param testName not used by this implementation
 * @throws Exception declared by the signature; nothing in this body throws a checked exception
 */
@Override
public void executionStarted(String className, String testName) throws Exception {
    // The no-argument constructor creates an instance with an infinite window.
    statistics = new DescriptiveStatistics();
}