Example usage for org.apache.commons.math.stat.descriptive DescriptiveStatistics getMean

Introduction

On this page you can find example usages of the getMean() method of org.apache.commons.math.stat.descriptive.DescriptiveStatistics.

Prototype

public double getMean()

Source Link

Document

Returns the <a href="http://www.xycoon.com/arithmetic_mean.htm">arithmetic mean</a> of the available values.

Usage

From source file:com.joliciel.jochre.graphics.SegmenterImpl.java

/**
 * Detects paragraph splits and assign rows to correct paragraphs.
 * @param sourceImage/*ww w  .j  a  va 2s  .  c  om*/
 */
void groupRowsIntoParagraphs(SourceImage sourceImage) {
    LOG.debug("########## groupRowsIntoParagraphs #########");
    // We'll use various possible indicators, including
    // indented start, indented end, and spacing between rows.

    // On pages with a single big paragraph makes it hypersensitive to differences in row-start/row-end
    // This means we cannot use deviation. Instead, we use the average shape width on the page.
    // We also adjust maxLeft & minRight to match the vertical line slope

    // This is now complicated by the possibility of multiple columns

    // Need to take into account a big horizontal space - Pietrushka page 14
    // Find horizontal spaces that go all the way across and are wider than a certain threshold
    // simply do a boolean column and black out everything in a row, than see if there are any remaining spaces above a certain threshold
    // Columns are thus arranged into "areas", separated by white-space.
    boolean[] fullRows = new boolean[sourceImage.getHeight()];
    for (RowOfShapes row : sourceImage.getRows()) {
        for (int y = row.getTop(); y <= row.getBottom(); y++) {
            fullRows[y] = true;
        }
    }
    DescriptiveStatistics rowHeightStats = new DescriptiveStatistics();

    for (RowOfShapes row : sourceImage.getRows()) {
        int height = row.getXHeight();
        rowHeightStats.addValue(height);
    }
    double avgRowHeight = rowHeightStats.getPercentile(50);
    LOG.debug("meanRowHeight: " + avgRowHeight);
    double minHeightForWhiteSpace = avgRowHeight * 1.3;
    LOG.debug("minHeightForWhiteSpace: " + minHeightForWhiteSpace);

    // find the "white rows" - any horizontal white space
    // in the page which is sufficiently high
    List<int[]> whiteRows = new ArrayList<int[]>();
    boolean inWhite = false;
    int startWhite = 0;
    for (int y = 0; y < sourceImage.getHeight(); y++) {
        if (!inWhite && !fullRows[y]) {
            inWhite = true;
            startWhite = y;
        } else if (inWhite && fullRows[y]) {
            int length = y - startWhite;
            if (length > minHeightForWhiteSpace) {
                LOG.debug("Adding whiteRow " + startWhite + "," + (y - 1));
                whiteRows.add(new int[] { startWhite, y - 1 });
            }
            inWhite = false;
        }
    }
    if (inWhite)
        whiteRows.add(new int[] { startWhite, sourceImage.getHeight() - 1 });
    whiteRows.add(new int[] { sourceImage.getHeight(), sourceImage.getHeight() });

    // place rows in "areas" defined by the "white rows" found above
    List<List<RowOfShapes>> areas = new ArrayList<List<RowOfShapes>>();
    int startY = -1;
    for (int[] whiteRow : whiteRows) {
        List<RowOfShapes> area = new ArrayList<RowOfShapes>();
        for (RowOfShapes row : sourceImage.getRows()) {
            if (row.getTop() >= startY && row.getBottom() <= whiteRow[0]) {
                area.add(row);
            }
        }
        if (area.size() > 0) {
            areas.add(area);
        }
        startY = whiteRow[1];
    }

    // break up each area into vertical columns
    LOG.debug("break up each area into vertical columns");
    List<Column> columns = new ArrayList<Column>();
    List<List<Column>> columnsPerAreaList = new ArrayList<List<Column>>();
    for (List<RowOfShapes> area : areas) {
        LOG.debug("Next area");
        List<Column> columnsPerArea = new ArrayList<SegmenterImpl.Column>();
        columnsPerAreaList.add(columnsPerArea);
        TreeSet<RowOfShapes> rows = new TreeSet<RowOfShapes>(new RowOfShapesVerticalLocationComparator());
        rows.addAll(area);
        for (RowOfShapes row : rows) {
            // try to place this row in one of the columns directly above it.
            // this means that a row which overlaps more than one column has to "close" this column, so it is no longer considered
            List<Column> overlappingColumns = new ArrayList<Column>();
            for (Column column : columnsPerArea) {
                if (!column.closed) {
                    RowOfShapes lastRowInColumn = column.get(column.size() - 1);
                    if (row.getRight() - row.getXAdjustment() >= lastRowInColumn.getLeft()
                            - lastRowInColumn.getXAdjustment()
                            && row.getLeft() - row.getXAdjustment() <= lastRowInColumn.getRight()
                                    - lastRowInColumn.getXAdjustment()) {
                        overlappingColumns.add(column);
                    }
                }
            }
            if (overlappingColumns.size() == 1) {
                Column myColumn = overlappingColumns.get(0);
                RowOfShapes lastRowInMyColumn = myColumn.get(0);

                // close any columns that are now at a distance of more than one row
                for (Column column : columnsPerArea) {
                    if (!column.closed && !column.equals(myColumn)) {
                        RowOfShapes lastRowInColumn = column.get(column.size() - 1);
                        if (lastRowInMyColumn.getTop() > lastRowInColumn.getBottom()) {
                            column.closed = true;
                            LOG.debug("Closing distant column " + lastRowInColumn);
                        }
                    }
                }

                myColumn.add(row);
                LOG.debug(row.toString());
                LOG.debug("  added to column " + lastRowInMyColumn);
            } else {
                for (Column overlappingColumn : overlappingColumns) {
                    overlappingColumn.closed = true;
                    RowOfShapes lastRowInColumn = overlappingColumn.get(overlappingColumn.size() - 1);
                    LOG.debug("Closing overlapping column " + lastRowInColumn);
                }
                Column myColumn = new Column(sourceImage);
                myColumn.add(row);
                LOG.debug("Found new column");
                LOG.debug(row.toString());
                columns.add(myColumn);
                columnsPerArea.add(myColumn);
            }
        }
    } // next area

    for (Column column : columns)
        column.recalculate();

    // Intermediate step to reform the vertical columns, if they exist
    // basically the idea is that if the columns are aligned vertically, then the thresholds for paragraph indents
    // should be shared, to increase the statistical sample size and reduce anomalies.
    // We'll assume that two columns from two consecutive areas are in the same vertical group if they overlap with each other horizontally
    // and don't overlap with any other column in the other column's area.
    List<List<Column>> columnGroups = new ArrayList<List<Column>>();
    List<Column> columnsInPrevArea = null;
    for (List<Column> columnsPerArea : columnsPerAreaList) {
        if (columnsInPrevArea != null) {
            for (Column prevColumn : columnsInPrevArea) {
                LOG.debug("Checking " + prevColumn);
                // find the column group containing the previous column
                List<Column> myColumnGroup = null;
                for (List<Column> columnGroup : columnGroups) {
                    if (columnGroup.contains(prevColumn)) {
                        myColumnGroup = columnGroup;
                        break;
                    }
                }
                if (myColumnGroup == null) {
                    myColumnGroup = new ArrayList<SegmenterImpl.Column>();
                    LOG.debug("Creating column group for column " + prevColumn.toString());
                    columnGroups.add(myColumnGroup);
                    myColumnGroup.add(prevColumn);
                }

                // does only one column overlap with this one?
                Column overlappingColumn = null;
                for (Column column : columnsPerArea) {
                    if (column.adjustedRight >= prevColumn.adjustedLeft
                            && column.adjustedLeft <= prevColumn.adjustedRight) {
                        if (overlappingColumn == null) {
                            LOG.debug("I overlap with " + column);

                            overlappingColumn = column;
                        } else {
                            LOG.debug("But I overlap also with " + column);

                            overlappingColumn = null;
                            break;
                        }
                    }
                }
                if (overlappingColumn != null) {
                    // does it overlap with only me?
                    for (Column otherPrevColumn : columnsInPrevArea) {
                        if (otherPrevColumn.equals(prevColumn))
                            continue;
                        if (overlappingColumn.adjustedRight >= otherPrevColumn.adjustedLeft
                                && overlappingColumn.adjustedLeft <= otherPrevColumn.adjustedRight) {
                            LOG.debug("But it overlaps also with " + otherPrevColumn);
                            overlappingColumn = null;
                            break;
                        }
                    }
                }
                if (overlappingColumn != null) {
                    myColumnGroup.add(overlappingColumn);
                    LOG.debug("Adding " + overlappingColumn);
                    LOG.debug(" to group with " + prevColumn);
                }

            } // next previous column
        } // have previous columns
        columnsInPrevArea = columnsPerArea;
    } // next area
    if (columnsInPrevArea != null) {
        for (Column prevColumn : columnsInPrevArea) {
            // find the column group containing the previous column
            List<Column> myColumnGroup = null;
            for (List<Column> columnGroup : columnGroups) {
                if (columnGroup.contains(prevColumn)) {
                    myColumnGroup = columnGroup;
                    break;
                }
            }
            if (myColumnGroup == null) {
                myColumnGroup = new ArrayList<SegmenterImpl.Column>();
                LOG.debug("Creating column group for column " + prevColumn.toString());
                columnGroups.add(myColumnGroup);
                myColumnGroup.add(prevColumn);
            }
        }
    }

    // What we really want here is, for each column (in the case of right-to-left),
    // two clusters on the right
    // and one relatively big cluster on the left.
    // anything outside of the cluster on the left is an EOP.
    boolean hasTab = false;
    for (List<Column> columnGroup : columnGroups) {
        LOG.debug("Next column group");
        double averageShapeWidth = sourceImage.getAverageShapeWidth();
        LOG.debug("averageShapeWidth: " + averageShapeWidth);
        double epsilon = averageShapeWidth / 2.0;
        LOG.debug("epsilon: " + epsilon);

        int columnGroupTop = sourceImage.getHeight();
        int columnGroupBottom = 0;
        int columnGroupLeft = sourceImage.getWidth();
        int columnGroupRight = 0;
        for (Column column : columnGroup) {
            if (column.top < columnGroupTop)
                columnGroupTop = (int) Math.round(column.top);
            if (column.bottom > columnGroupBottom)
                columnGroupBottom = (int) Math.round(column.bottom);
            if (column.adjustedLeft < columnGroupLeft)
                columnGroupLeft = (int) Math.round(column.adjustedLeft);
            if (column.adjustedRight > columnGroupRight)
                columnGroupRight = (int) Math.round(column.adjustedRight);
        }

        // right thresholds
        LOG.debug("Calculating right thresholds");

        // first, create a DBScan cluster of all rows by their adjusted right coordinate
        List<RowOfShapes> rightHandRows = new ArrayList<RowOfShapes>();
        List<double[]> rightCoordinates = new ArrayList<double[]>();

        for (Column column : columnGroup) {
            for (RowOfShapes row : column) {
                double right = row.getRight() - row.getXAdjustment();
                //               double rightOverlap = this.findLargeShapeOverlapOnRight(row, column, sourceImage);
                //               if (rightOverlap==0) {
                //                  // leave out any right-overlapping rows here
                //                  // since we need accurate statistics for margin detection
                //               // This is questionable - especially since a long vertical bar (see Petriushka)
                //               // tends to give all rows a left overlap. Also, because the overlap is calculated based
                //               // on the mean right & mean left, not based on any sort of margin clusters.
                //                  rightHandRows.add(row);
                //                  rightCoordinates.add(new double[] {right});
                //               }
                rightHandRows.add(row);
                rightCoordinates.add(new double[] { right });

            }
        }

        int minCardinalityForRightMargin = 5;
        DBSCANClusterer<RowOfShapes> rightMarginClusterer = new DBSCANClusterer<RowOfShapes>(rightHandRows,
                rightCoordinates);
        Set<Set<RowOfShapes>> rowClusters = rightMarginClusterer.cluster(epsilon, minCardinalityForRightMargin,
                true);

        TreeSet<Set<RowOfShapes>> orderedRowClusters = new TreeSet<Set<RowOfShapes>>(
                new CardinalityComparator<RowOfShapes>());
        orderedRowClusters.addAll(rowClusters);

        int i = 0;

        // find the two right-most clusters, and assume they are the margin & the tab
        DescriptiveStatistics rightMarginStats = null;
        DescriptiveStatistics rightTabStats = null;
        for (Set<RowOfShapes> cluster : orderedRowClusters) {
            DescriptiveStatistics rightStats = new DescriptiveStatistics();
            MeanAbsoluteDeviation rightDev = new MeanAbsoluteDeviation();
            for (RowOfShapes row : cluster) {
                int rowIndex = rightHandRows.indexOf(row);
                double right = rightCoordinates.get(rowIndex)[0];
                rightStats.addValue(right);
                rightDev.increment(right);
            }

            LOG.debug("Cluster " + i + ". Cardinality=" + cluster.size());
            LOG.debug("Right mean : " + rightStats.getMean());
            LOG.debug("Right dev: " + rightDev.getResult());

            if (cluster.size() >= minCardinalityForRightMargin) {
                if (rightMarginStats == null || rightMarginStats.getMean() < rightStats.getMean()) {
                    if (rightMarginStats != null)
                        rightTabStats = rightMarginStats;
                    rightMarginStats = rightStats;
                } else if (rightTabStats == null || rightTabStats.getMean() < rightStats.getMean()) {
                    rightTabStats = rightStats;
                }
            } else {
                break;
            }
            i++;
        } // next right-coordinate cluster

        double rightMargin = sourceImage.getWidth();
        double rightTab = sourceImage.getWidth();
        if (rightMarginStats != null) {
            rightMargin = rightMarginStats.getMean();
        } else {
            List<Rectangle> columnSeparators = sourceImage.findColumnSeparators();
            for (Rectangle columnSeparator : columnSeparators) {
                if (columnSeparator.getTop() <= columnGroupTop
                        && columnSeparator.getBottom() >= columnGroupBottom
                        && columnSeparator.getLeft() >= columnGroupRight) {
                    if (columnSeparator.getLeft() < rightMargin)
                        rightMargin = columnSeparator.getLeft();
                }
            }
        }
        if (rightTabStats != null) {
            rightTab = rightTabStats.getMean();
        }

        LOG.debug("rightMargin: " + rightMargin);
        LOG.debug("rightTab: " + rightTab);

        // left thresholds
        LOG.debug("Calculating left thresholds");

        // first, create a DBScan cluster of all rows by their adjusted left coordinate
        List<RowOfShapes> leftHandRows = new ArrayList<RowOfShapes>();
        List<double[]> leftCoordinates = new ArrayList<double[]>();

        for (Column column : columnGroup) {
            for (RowOfShapes row : column) {
                double left = row.getLeft() - row.getXAdjustment();
                //               double leftOverlap = this.findLargeShapeOverlapOnLeft(row, column, sourceImage);
                //               if (leftOverlap == 0) {
                //                  // leave out any overlapping rows from margin calcs,
                //                  // since we need accurate statistics here
                //                  leftHandRows.add(row);
                //                  leftCoordinates.add(new double[] {left});
                //               }
                leftHandRows.add(row);
                leftCoordinates.add(new double[] { left });
            }
        }

        int minCardinalityForLeftMargin = 5;
        DBSCANClusterer<RowOfShapes> leftMarginClusterer = new DBSCANClusterer<RowOfShapes>(leftHandRows,
                leftCoordinates);
        Set<Set<RowOfShapes>> leftRowClusters = leftMarginClusterer.cluster(epsilon,
                minCardinalityForLeftMargin, true);

        TreeSet<Set<RowOfShapes>> orderedLeftRowClusters = new TreeSet<Set<RowOfShapes>>(
                new CardinalityComparator<RowOfShapes>());
        orderedLeftRowClusters.addAll(leftRowClusters);

        i = 0;

        // find the two left-most clusters, and assume they are the margin & the tab
        DescriptiveStatistics leftMarginStats = null;
        DescriptiveStatistics leftTabStats = null;
        for (Set<RowOfShapes> cluster : orderedLeftRowClusters) {
            DescriptiveStatistics leftStats = new DescriptiveStatistics();
            MeanAbsoluteDeviation leftDev = new MeanAbsoluteDeviation();
            for (RowOfShapes row : cluster) {
                int rowIndex = leftHandRows.indexOf(row);
                double left = leftCoordinates.get(rowIndex)[0];
                leftStats.addValue(left);
                leftDev.increment(left);
            }

            LOG.debug("Cluster " + i + ". Cardinality=" + cluster.size());
            LOG.debug("Left mean : " + leftStats.getMean());
            LOG.debug("Left dev: " + leftDev.getResult());

            if (cluster.size() >= minCardinalityForLeftMargin) {
                if (leftMarginStats == null || leftMarginStats.getMean() > leftStats.getMean()) {
                    if (leftMarginStats != null)
                        leftTabStats = leftMarginStats;
                    leftMarginStats = leftStats;
                } else if (leftTabStats == null || leftTabStats.getMean() > leftStats.getMean()) {
                    leftTabStats = leftStats;
                }
            } else {
                break;
            }
            i++;
        } // next left-coordinate cluster

        double leftMargin = 0;
        double leftTab = 0;
        if (leftMarginStats != null) {
            leftMargin = leftMarginStats.getMean();
        } else {
            List<Rectangle> columnSeparators = sourceImage.findColumnSeparators();
            for (Rectangle columnSeparator : columnSeparators) {
                if (columnSeparator.getTop() <= columnGroupTop
                        && columnSeparator.getBottom() >= columnGroupBottom
                        && columnSeparator.getRight() <= columnGroupLeft) {
                    if (columnSeparator.getRight() > leftMargin)
                        leftMargin = columnSeparator.getRight();
                }
            }
        }
        if (leftTabStats != null) {
            leftTab = leftTabStats.getMean();
        }

        LOG.debug("leftMargin: " + leftMargin);
        LOG.debug("leftTab: " + leftTab);

        for (Column column : columnGroup) {
            if (sourceImage.isLeftToRight()) {
                column.startMargin = leftMargin;
                if (leftTabStats != null) {
                    column.startTab = leftTab;
                    column.hasTab = true;
                } else {
                    LOG.debug("No left tab - setting based on left margin");
                    column.startTab = leftMargin + (5.0 * sourceImage.getAverageShapeWidth());
                    column.hasTab = false;
                }

                column.endMargin = rightMargin;
            } else {
                column.startMargin = rightMargin;
                if (rightTabStats != null) {
                    column.startTab = rightTab;
                    column.hasTab = true;
                } else {
                    LOG.debug("No right tab - setting based on right margin");
                    column.startTab = rightMargin - (5.0 * sourceImage.getAverageShapeWidth());
                    column.hasTab = false;
                }

                column.endMargin = leftMargin;
            }
            LOG.debug("Margins for " + column);
            LOG.debug("startMargin: " + column.startMargin);
            LOG.debug("startTab: " + column.startTab);
            LOG.debug("endMargin: " + column.endMargin);
        } // next column
    } // next column group
    LOG.debug("hasTab: " + hasTab);

    double safetyMargin = 1.5 * sourceImage.getAverageShapeWidth();

    // Now, paragraphs are either "indented", "outdented" or not "dented" at all (no tabs).
    // This applies to the entire page.
    // To recognise indenting vs. outdenting, we have to see if the row preceding each
    // indent/outdent is full or partial. In the case of indentation, partial rows will
    // typically be followed by an indent. In the case of outdentation, partial rows will
    // typically be followed by an outdent.
    boolean isIndented = true;

    int indentCount = 0;
    int outdentCount = 0;
    for (List<Column> columnGroup : columnGroups) {
        LOG.debug("Next column group");
        boolean prevRowPartial = false;
        for (Column column : columnGroup) {
            if (column.hasTab) {
                for (RowOfShapes row : column) {
                    if (sourceImage.isLeftToRight()) {
                        if (prevRowPartial) {
                            if (row.getLeft() - row.getXAdjustment() > column.startTab - safetyMargin) {
                                indentCount++;
                            } else if (row.getLeft() - row.getXAdjustment() < column.startMargin
                                    + safetyMargin) {
                                outdentCount++;
                            }
                        }
                        if (row.getRight() - row.getXAdjustment() < column.endMargin - safetyMargin) {
                            prevRowPartial = true;
                        } else {
                            prevRowPartial = false;
                        }
                    } else {
                        if (prevRowPartial) {
                            if (row.getRight() - row.getXAdjustment() < column.startTab + safetyMargin) {
                                indentCount++;
                            } else if (row.getRight() - row.getXAdjustment() > column.startMargin
                                    - safetyMargin) {
                                outdentCount++;
                            }
                        }
                        if (row.getLeft() - row.getXAdjustment() > column.endMargin + safetyMargin) {
                            prevRowPartial = true;
                        } else {
                            prevRowPartial = false;
                        }
                    } // left-to-right?
                } // next row  
            } // column has tab
        } // next column
    } // next column group
    isIndented = (indentCount + 2 >= outdentCount);
    LOG.debug("indentCount: " + indentCount);
    LOG.debug("outdentCount: " + outdentCount);
    LOG.debug("isIndented: " + isIndented);

    // order the columns
    TreeSet<Column> orderedColumns = new TreeSet<SegmenterImpl.Column>(columns);
    columns.clear();
    columns.addAll(orderedColumns);

    // find the paragraphs found in each column
    for (Column column : columns) {
        LOG.debug("--- Next column ---");

        // break up the column into paragraphs 
        Paragraph paragraph = null;
        RowOfShapes previousRow = null;
        int maxShapesForStandaloneParagraph = 2;
        List<RowOfShapes> rowsForStandaloneParagraphs = new ArrayList<RowOfShapes>();
        Point2D previousPointStartMargin = null;
        Point2D previousPointStartTab = null;
        Point2D previousPointEndMargin = null;

        for (RowOfShapes row : column) {
            boolean rowForStandaloneParagraph = false;
            boolean newParagraph = false;
            if (row.getShapes().size() <= maxShapesForStandaloneParagraph) {
                rowsForStandaloneParagraphs.add(row);
                rowForStandaloneParagraph = true;
            } else {
                double rightOverlap = this.findLargeShapeOverlapOnRight(row, column, sourceImage);
                double leftOverlap = this.findLargeShapeOverlapOnLeft(row, column, sourceImage);

                if (drawSegmentation) {
                    double rowVerticalMidPoint = row.getBaseLineMiddlePoint();
                    double startMarginX = column.startMargin + row.getXAdjustment();
                    double startTabX = column.startTab + row.getXAdjustment();
                    double endMarginX = column.endMargin + row.getXAdjustment();

                    if (sourceImage.isLeftToRight()) {
                        startMarginX += safetyMargin;
                        startTabX -= safetyMargin;
                        endMarginX -= safetyMargin;

                        startMarginX += leftOverlap;
                        startTabX += leftOverlap;
                        endMarginX -= rightOverlap;
                    } else {
                        startMarginX -= safetyMargin;
                        startTabX += safetyMargin;
                        endMarginX += safetyMargin;

                        startMarginX -= rightOverlap;
                        startTabX -= rightOverlap;
                        endMarginX += leftOverlap;
                    }

                    Point2D.Double currentPointStartMargin = new Point2D.Double(startMarginX,
                            rowVerticalMidPoint);
                    Point2D.Double currentPointStartTab = new Point2D.Double(startTabX, rowVerticalMidPoint);
                    Point2D.Double currentPointEndMargin = new Point2D.Double(endMarginX, rowVerticalMidPoint);

                    if (previousPointStartMargin != null) {
                        graphics2D.setStroke(new BasicStroke(1));
                        graphics2D.setPaint(Color.BLUE);
                        graphics2D.drawLine((int) Math.round(previousPointStartMargin.getX()),
                                (int) Math.round(previousPointStartMargin.getY()),
                                (int) Math.round(currentPointStartMargin.getX()),
                                (int) Math.round(currentPointStartMargin.getY()));
                        graphics2D.drawLine((int) Math.round(previousPointEndMargin.getX()),
                                (int) Math.round(previousPointEndMargin.getY()),
                                (int) Math.round(currentPointEndMargin.getX()),
                                (int) Math.round(currentPointEndMargin.getY()));

                        graphics2D.setPaint(Color.RED);
                        graphics2D.drawLine((int) Math.round(previousPointStartTab.getX()),
                                (int) Math.round(previousPointStartTab.getY()),
                                (int) Math.round(currentPointStartTab.getX()),
                                (int) Math.round(currentPointStartTab.getY()));

                        graphics2D.setPaint(Color.RED);
                        graphics2D.drawLine((int) Math.round(previousPointEndMargin.getX()),
                                (int) Math.round(previousPointEndMargin.getY()),
                                (int) Math.round(currentPointEndMargin.getX()),
                                (int) Math.round(currentPointEndMargin.getY()));
                    }
                    previousPointStartMargin = currentPointStartMargin;
                    previousPointStartTab = currentPointStartTab;
                    previousPointEndMargin = currentPointEndMargin;
                }

                if (previousRow == null) {
                    LOG.debug("New paragraph (first)");
                    newParagraph = true;
                } else {
                    if (sourceImage.isLeftToRight()) {
                        if (previousRow.getRight() - previousRow.getXAdjustment()
                                - rightOverlap < column.endMargin - safetyMargin) {
                            LOG.debug("New paragraph (previous EOP)");
                            newParagraph = true;
                        } else if (column.hasTab && isIndented && row.getLeft() - row.getXAdjustment()
                                + leftOverlap > column.startTab - safetyMargin) {
                            LOG.debug("New paragraph (indent)");
                            newParagraph = true;
                        } else if (column.hasTab && !isIndented && row.getLeft() - row.getXAdjustment()
                                + leftOverlap < column.startMargin + safetyMargin) {
                            LOG.debug("New paragraph (outdent)");
                            newParagraph = true;
                        }
                    } else {
                        if (previousRow.getLeft() - previousRow.getXAdjustment()
                                + leftOverlap > column.endMargin + safetyMargin) {
                            LOG.debug("New paragraph (previous EOP)");
                            newParagraph = true;
                        } else if (column.hasTab && isIndented && row.getRight() - row.getXAdjustment()
                                - rightOverlap < column.startTab + safetyMargin) {
                            LOG.debug("New paragraph (indent)");
                            newParagraph = true;
                        } else if (column.hasTab && !isIndented && row.getRight() - row.getXAdjustment()
                                - rightOverlap > column.startMargin - safetyMargin) {
                            LOG.debug("New paragraph (outdent)");
                            newParagraph = true;
                        }
                    } // left-to-right?
                } // have previous row
            } // standalone paragraph?

            if (!rowForStandaloneParagraph)
                LOG.debug(row.toString());

            if (newParagraph) {
                if (rowsForStandaloneParagraphs.size() > 0) {
                    for (RowOfShapes oneRow : rowsForStandaloneParagraphs) {
                        LOG.debug("Standalone paragraph");
                        LOG.debug("Standalone row: left(" + oneRow.getLeft() + "), top(" + oneRow.getTop()
                                + "), right(" + oneRow.getRight() + "), bottom(" + oneRow.getBottom() + ")");
                        Paragraph standaloneParagraph = sourceImage.newParagraph();
                        standaloneParagraph.getRows().add(oneRow);
                    }
                    rowsForStandaloneParagraphs.clear();
                }
                paragraph = sourceImage.newParagraph();
            }
            //LOG.debug("Row: left(" + row.getLeft() + "), right(" + row.getRight() + "), width(" + (row.getRight() - row.getLeft() + 1) + ")");

            if (!rowForStandaloneParagraph) {
                paragraph.getRows().add(row);
                previousRow = row;
            }
        } // next row in column
        if (rowsForStandaloneParagraphs.size() > 0) {
            for (RowOfShapes oneRow : rowsForStandaloneParagraphs) {
                LOG.debug("Standalone paragraph");
                LOG.debug("Standalone row: left(" + oneRow.getLeft() + "), top(" + oneRow.getTop() + "), right("
                        + oneRow.getRight() + "), bottom(" + oneRow.getBottom() + ")");
                Paragraph standaloneParagraph = sourceImage.newParagraph();
                standaloneParagraph.getRows().add(oneRow);
            }
            rowsForStandaloneParagraphs.clear();
        }
    } // next column

}

From source file:com.joliciel.csvLearner.CSVLearner.java

/**
 * Runs the "evaluate" command.
 * <p>
 * Validates the configured arguments, reads the events from the CSV files, then trains and
 * evaluates a maxent model. Without cross-validation a single train/evaluate pass is done;
 * with cross-validation ten passes are done (segments 0 to 9), where every tenth event is
 * marked as a test event for the current segment. Results are written to
 * {@code <fileBase>_fscores.csv} and {@code <fileBase>_outcomes.csv} in the output directory.
 * In cross-validation mode the f-score file additionally receives, per outcome, the mean and
 * standard deviation of precision, recall and f-score over all runs, plus a TOTAL line.
 *
 * @throws IOException if a result file cannot be created or written
 * @throws RuntimeException if a required argument is missing or arguments are incompatible
 */
private void doCommandEvaluate() throws IOException {
    if (resultFilePath == null)
        throw new RuntimeException("Missing argument: resultFile");
    if (featureDir == null)
        throw new RuntimeException("Missing argument: featureDir");
    if (testIdFilePath != null) {
        if (crossValidation)
            throw new RuntimeException("Cannot combine testIdFile with cross validation");
        if (testSegment >= 0) {
            throw new RuntimeException("Cannot combine testIdFile with test segment");
        }
    }
    if (!crossValidation && testIdFilePath == null) {
        if (testSegment < 0)
            throw new RuntimeException("Missing argument: testSegment");
        if (testSegment > 9)
            throw new RuntimeException("testSegment must be an integer between 0 and 9");
    }
    if (outDirPath == null)
        throw new RuntimeException("Missing argument: outDir");

    LOG.info("Generating event list from CSV files...");
    CSVEventListReader reader = this.getReader(TrainingSetType.TEST_SEGMENT, false);

    GenericEvents events = reader.getEvents();

    File outDir = new File(outDirPath);
    outDir.mkdirs();
    // Derive a file-system friendly base name from the feature directory and the cutoff.
    String fileBase = this.featureDir.replace('/', '_');
    fileBase = fileBase.replace(':', '_');
    fileBase = fileBase + "_cutoff" + cutoff;

    if (generateEventFile) {
        File eventFile = new File(outDir, fileBase + "_events.txt");
        this.generateEventFile(eventFile, events);
    }

    File fscoreFile = new File(outDir, fileBase + "_fscores.csv");
    File outcomeFile = new File(outDir, fileBase + "_outcomes.csv");

    // Each writer gets its own try/finally so that a failure while opening the second file,
    // or while closing one of the writers, can never leave the other one open.
    Writer fscoreFileWriter = new BufferedWriter(
            new OutputStreamWriter(new FileOutputStream(fscoreFile, false), "UTF8"));
    try {
        Writer outcomeFileWriter = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(outcomeFile, false), "UTF8"));
        try {
            if (!crossValidation) {
                MaxentModel maxentModel = this.train(events, null);

                this.evaluate(maxentModel, events, fscoreFileWriter, outcomeFileWriter);
            } else {
                DescriptiveStatistics accuracyStats = new DescriptiveStatistics();
                // Per outcome: [0] precision, [1] recall, [2] f-score, one value per run.
                Map<String, DescriptiveStatistics[]> outcomeFscoreStats = new TreeMap<String, DescriptiveStatistics[]>();
                for (int segment = 0; segment <= 9; segment++) {
                    outcomeFileWriter.write("Run " + segment + ",\n");
                    fscoreFileWriter.write("Run " + segment + ",\n");
                    if (balanceOutcomes) {
                        // Count events separately per outcome, so that each outcome
                        // contributes every tenth of its own events to the test set.
                        for (String outcome : reader.getOutcomes()) {
                            int i = 0;
                            for (GenericEvent event : events) {
                                if (event.getOutcome().equals(outcome)) {
                                    boolean test = i % 10 == segment;
                                    event.setTest(test);
                                    i++;
                                }
                            }
                        }
                    } else {
                        int i = 0;
                        for (GenericEvent event : events) {
                            boolean test = i % 10 == segment;
                            event.setTest(test);
                            i++;
                        }
                    }

                    MaxentModel maxentModel = this.train(events, null);
                    FScoreCalculator<String> fscoreCalculator = this.evaluate(maxentModel, events,
                            fscoreFileWriter, outcomeFileWriter);

                    accuracyStats.addValue(fscoreCalculator.getTotalFScore());
                    for (String outcome : fscoreCalculator.getOutcomeSet()) {
                        DescriptiveStatistics[] stats = outcomeFscoreStats.get(outcome);
                        if (stats == null) {
                            stats = new DescriptiveStatistics[3];
                            stats[0] = new DescriptiveStatistics();
                            stats[1] = new DescriptiveStatistics();
                            stats[2] = new DescriptiveStatistics();
                            outcomeFscoreStats.put(outcome, stats);
                        }
                        stats[0].addValue(fscoreCalculator.getPrecision(outcome));
                        stats[1].addValue(fscoreCalculator.getRecall(outcome));
                        stats[2].addValue(fscoreCalculator.getFScore(outcome));
                    } // next outcome

                    outcomeFileWriter.write("\n");

                } // next segment

                fscoreFileWriter.write(
                        "outcome,precision avg., precision dev., recall avg., recall dev., f-score avg., f-score dev.,\n");
                for (Map.Entry<String, DescriptiveStatistics[]> entry : outcomeFscoreStats.entrySet()) {
                    DescriptiveStatistics[] stats = entry.getValue();
                    fscoreFileWriter
                            .write(CSVFormatter.format(entry.getKey()) + "," + CSVFormatter.format(stats[0].getMean())
                                    + "," + CSVFormatter.format(stats[0].getStandardDeviation()) + ","
                                    + CSVFormatter.format(stats[1].getMean()) + ","
                                    + CSVFormatter.format(stats[1].getStandardDeviation()) + ","
                                    + CSVFormatter.format(stats[2].getMean()) + ","
                                    + CSVFormatter.format(stats[2].getStandardDeviation()) + "," + "\n");
                }
                fscoreFileWriter.write("TOTAL,,,,," + CSVFormatter.format(accuracyStats.getMean()) + ","
                        + CSVFormatter.format(accuracyStats.getStandardDeviation()) + ",\n");

                LOG.info("Accuracy mean: " + accuracyStats.getMean());
                LOG.info("Accuracy std dev: " + accuracyStats.getStandardDeviation());
            }
        } finally {
            // close() flushes the buffered content before releasing the file.
            outcomeFileWriter.close();
        }
    } finally {
        fscoreFileWriter.close();
    }

    LOG.info("#### Complete ####");
}

From source file:net.sf.mzmine.modules.peaklistmethods.dataanalysis.heatmaps.HeatMapTask.java

/**
 * Builds the heat-map matrix of group averages relative to the reference group.
 * <p>
 * Raw data files are grouped by the value of {@code selectedParameter}. For every retained
 * peak-list row and every non-reference group, the cell value is the mean area (or height)
 * of the group's peaks divided by the mean area (or height) of the reference group's peaks,
 * optionally log-transformed. As side effects, {@code pValueMatrix}, {@code rowNames} and
 * {@code colNames} are (re)assigned.
 *
 * @param selectedParameter project parameter whose value defines the sample groups
 * @param referenceGroup    name of the group used as denominator
 * @return matrix indexed as {@code [nonReferenceGroup][retainedRow]}
 */
private double[][] groupingDataset(UserParameter selectedParameter, String referenceGroup) {
    // Accumulators, cleared and reused for every row / group
    DescriptiveStatistics meanControlStats = new DescriptiveStatistics();
    DescriptiveStatistics meanGroupStats = new DescriptiveStatistics();

    // Collect all data files
    Vector<RawDataFile> allDataFiles = new Vector<RawDataFile>();
    allDataFiles.addAll(Arrays.asList(peakList.getRawDataFiles()));

    // Split the raw data files into the reference group and the rest,
    // remembering every distinct group name in order of first appearance
    List<RawDataFile> referenceDataFiles = new ArrayList<RawDataFile>();
    List<RawDataFile> nonReferenceDataFiles = new ArrayList<RawDataFile>();
    List<String> groups = new ArrayList<String>();
    MZmineProject project = MZmineCore.getCurrentProject();

    for (RawDataFile rawDataFile : allDataFiles) {
        Object paramValue = project.getParameterValue(selectedParameter, rawDataFile);
        String groupName = String.valueOf(paramValue);
        if (!groups.contains(groupName)) {
            groups.add(groupName);
        }
        if (groupName.equals(referenceGroup)) {
            referenceDataFiles.add(rawDataFile);
        } else {
            nonReferenceDataFiles.add(rawDataFile);
        }
    }

    // Number of rows that survive the "only identified" filter
    int numRows = 0;
    for (int row = 0; row < peakList.getNumberOfRows(); row++) {
        if (!onlyIdentified || peakList.getRow(row).getPeakIdentities().length > 0) {
            numRows++;
        }
    }

    // One matrix line per non-reference group
    // (sized on the assumption that the reference group is one of the groups found)
    double[][] dataMatrix = new double[groups.size() - 1][numRows];
    pValueMatrix = new String[groups.size() - 1][numRows];

    // data files that should be in the heat map
    List<RawDataFile> shownDataFiles = nonReferenceDataFiles;

    int matrixRow = 0;
    for (int row = 0; row < peakList.getNumberOfRows(); row++) {
        PeakListRow rowPeak = peakList.getRow(row);
        if (onlyIdentified && rowPeak.getPeakIdentities().length <= 0) {
            continue;
        }

        // Average area or height of the reference group
        meanControlStats.clear();
        for (RawDataFile referenceFile : referenceDataFiles) {
            Feature referencePeak = rowPeak.getPeak(referenceFile);
            if (referencePeak != null) {
                meanControlStats.addValue(area ? referencePeak.getArea() : referencePeak.getHeight());
            }
        }

        // Divide the average area or height of each group by the average
        // of the reference peaks in this row
        int matrixColumn = 0;
        for (String group : groups) {
            meanGroupStats.clear();
            if (group.equals(referenceGroup)) {
                continue;
            }

            for (RawDataFile shownFile : shownDataFiles) {
                Object paramValue = project.getParameterValue(selectedParameter, shownFile);
                Feature peak = rowPeak.getPeak(shownFile);
                if (peak == null || !String.valueOf(paramValue).equals(group)) {
                    continue;
                }
                // The validity check looks at the area even when heights are averaged
                if (Double.isInfinite(peak.getArea()) || Double.isNaN(peak.getArea())) {
                    continue;
                }
                meanGroupStats.addValue(area ? peak.getArea() : peak.getHeight());
            }

            double ratio = meanGroupStats.getMean() / meanControlStats.getMean();

            // A p-value is only produced when both samples hold more than one value
            boolean enoughSamples = meanGroupStats.getN() > 1 && meanControlStats.getN() > 1;
            pValueMatrix[matrixColumn][matrixRow] = enoughSamples
                    ? this.getPvalue(meanGroupStats, meanControlStats)
                    : "";

            if (log) {
                ratio = Math.log(ratio);
            }
            dataMatrix[matrixColumn++][matrixRow] = ratio;
        }
        matrixRow++;
    }

    // Scale the data dividing the peak area/height by the standard
    // deviation of each column
    if (scale) {
        scale(dataMatrix);
    }

    // Create two arrays: row and column names
    rowNames = new String[dataMatrix[0].length];
    colNames = new String[groups.size() - 1];

    int nameColumn = 0;
    for (String group : groups) {
        if (!group.equals(referenceGroup)) {
            colNames[nameColumn++] = group;
        }
    }

    int nameRow = 0;
    for (int row = 0; row < peakList.getNumberOfRows(); row++) {
        PeakListRow currentRow = peakList.getRow(row);
        if (onlyIdentified && currentRow.getPeakIdentities().length <= 0) {
            continue;
        }
        boolean identified = currentRow.getPeakIdentities() != null
                && currentRow.getPeakIdentities().length > 0;
        rowNames[nameRow++] = identified ? currentRow.getPreferredPeakIdentity().getName() : "Unknown";
    }

    return dataMatrix;
}

From source file:datafu.hourglass.jobs.StagedOutputJob.java

/**
 * Writes Hadoop counters and other task statistics to a file in the file system.
 * /*from w  w w. j ava2 s .co  m*/
 * @param fs
 * @throws IOException
 */
private void writeCounters(final FileSystem fs) throws IOException {
    final Path actualOutputPath = FileOutputFormat.getOutputPath(this);

    SimpleDateFormat timestampFormat = new SimpleDateFormat("yyyyMMddHHmmss");

    String suffix = timestampFormat.format(new Date());

    if (_countersParentPath != null) {
        if (!fs.exists(_countersParentPath)) {
            _log.info("Creating counter parent path " + _countersParentPath);
            fs.mkdirs(_countersParentPath, FsPermission.valueOf("-rwxrwxr-x"));
        }
        // make the name as unique as possible in this case because this may be a directory
        // where other counter files will be dropped
        _countersPath = new Path(_countersParentPath, ".counters." + suffix);
    } else {
        _countersPath = new Path(actualOutputPath, ".counters." + suffix);
    }

    _log.info(String.format("Writing counters to %s", _countersPath));
    FSDataOutputStream counterStream = fs.create(_countersPath);
    BufferedOutputStream buffer = new BufferedOutputStream(counterStream, 256 * 1024);
    OutputStreamWriter writer = new OutputStreamWriter(buffer);
    for (String groupName : getCounters().getGroupNames()) {
        for (Counter counter : getCounters().getGroup(groupName)) {
            writeAndLog(writer, String.format("%s=%d", counter.getName(), counter.getValue()));
        }
    }

    JobID jobID = this.getJobID();

    org.apache.hadoop.mapred.JobID oldJobId = new org.apache.hadoop.mapred.JobID(jobID.getJtIdentifier(),
            jobID.getId());

    long minStart = Long.MAX_VALUE;
    long maxFinish = 0;
    long setupStart = Long.MAX_VALUE;
    long cleanupFinish = 0;
    DescriptiveStatistics mapStats = new DescriptiveStatistics();
    DescriptiveStatistics reduceStats = new DescriptiveStatistics();
    boolean success = true;

    JobClient jobClient = new JobClient(this.conf);

    Map<String, String> taskIdToType = new HashMap<String, String>();

    TaskReport[] setupReports = jobClient.getSetupTaskReports(oldJobId);
    if (setupReports.length > 0) {
        _log.info("Processing setup reports");
        for (TaskReport report : jobClient.getSetupTaskReports(oldJobId)) {
            taskIdToType.put(report.getTaskID().toString(), "SETUP");
            if (report.getStartTime() == 0) {
                _log.warn("Skipping report with zero start time");
                continue;
            }
            setupStart = Math.min(setupStart, report.getStartTime());
        }
    } else {
        _log.error("No setup reports");
    }

    TaskReport[] mapReports = jobClient.getMapTaskReports(oldJobId);
    if (mapReports.length > 0) {
        _log.info("Processing map reports");
        for (TaskReport report : mapReports) {
            taskIdToType.put(report.getTaskID().toString(), "MAP");
            if (report.getFinishTime() == 0 || report.getStartTime() == 0) {
                _log.warn("Skipping report with zero start or finish time");
                continue;
            }
            minStart = Math.min(minStart, report.getStartTime());
            mapStats.addValue(report.getFinishTime() - report.getStartTime());
        }
    } else {
        _log.error("No map reports");
    }

    TaskReport[] reduceReports = jobClient.getReduceTaskReports(oldJobId);
    if (reduceReports.length > 0) {
        _log.info("Processing reduce reports");
        for (TaskReport report : reduceReports) {
            taskIdToType.put(report.getTaskID().toString(), "REDUCE");
            if (report.getFinishTime() == 0 || report.getStartTime() == 0) {
                _log.warn("Skipping report with zero start or finish time");
                continue;
            }
            maxFinish = Math.max(maxFinish, report.getFinishTime());
            reduceStats.addValue(report.getFinishTime() - report.getStartTime());
        }
    } else {
        _log.error("No reduce reports");
    }

    TaskReport[] cleanupReports = jobClient.getCleanupTaskReports(oldJobId);
    if (cleanupReports.length > 0) {
        _log.info("Processing cleanup reports");
        for (TaskReport report : cleanupReports) {
            taskIdToType.put(report.getTaskID().toString(), "CLEANUP");
            if (report.getFinishTime() == 0) {
                _log.warn("Skipping report with finish time of zero");
                continue;
            }
            cleanupFinish = Math.max(cleanupFinish, report.getFinishTime());
        }
    } else {
        _log.error("No cleanup reports");
    }

    if (minStart == Long.MAX_VALUE) {
        _log.error("Could not determine map-reduce start time");
        success = false;
    }
    if (maxFinish == 0) {
        _log.error("Could not determine map-reduce finish time");
        success = false;
    }

    if (setupStart == Long.MAX_VALUE) {
        _log.error("Could not determine setup start time");
        success = false;
    }
    if (cleanupFinish == 0) {
        _log.error("Could not determine cleanup finish time");
        success = false;
    }

    // Collect statistics on successful/failed/killed task attempts, categorized by setup/map/reduce/cleanup.
    // Unfortunately the job client doesn't have an easier way to get these statistics.
    Map<String, Integer> attemptStats = new HashMap<String, Integer>();
    _log.info("Processing task attempts");
    for (TaskCompletionEvent event : getTaskCompletionEvents(jobClient, oldJobId)) {
        String type = taskIdToType.get(event.getTaskAttemptId().getTaskID().toString());
        String status = event.getTaskStatus().toString();

        String key = String.format("%s_%s_ATTEMPTS", status, type);
        if (!attemptStats.containsKey(key)) {
            attemptStats.put(key, 0);
        }
        attemptStats.put(key, attemptStats.get(key) + 1);
    }

    if (success) {
        writeAndLog(writer, String.format("SETUP_START_TIME_MS=%d", setupStart));
        writeAndLog(writer, String.format("CLEANUP_FINISH_TIME_MS=%d", cleanupFinish));
        writeAndLog(writer, String.format("COMPLETE_WALL_CLOCK_TIME_MS=%d", cleanupFinish - setupStart));

        writeAndLog(writer, String.format("MAP_REDUCE_START_TIME_MS=%d", minStart));
        writeAndLog(writer, String.format("MAP_REDUCE_FINISH_TIME_MS=%d", maxFinish));
        writeAndLog(writer, String.format("MAP_REDUCE_WALL_CLOCK_TIME_MS=%d", maxFinish - minStart));

        writeAndLog(writer, String.format("MAP_TOTAL_TASKS=%d", (long) mapStats.getN()));
        writeAndLog(writer, String.format("MAP_MAX_TIME_MS=%d", (long) mapStats.getMax()));
        writeAndLog(writer, String.format("MAP_MIN_TIME_MS=%d", (long) mapStats.getMin()));
        writeAndLog(writer, String.format("MAP_AVG_TIME_MS=%d", (long) mapStats.getMean()));
        writeAndLog(writer, String.format("MAP_STD_TIME_MS=%d", (long) mapStats.getStandardDeviation()));
        writeAndLog(writer, String.format("MAP_SUM_TIME_MS=%d", (long) mapStats.getSum()));

        writeAndLog(writer, String.format("REDUCE_TOTAL_TASKS=%d", (long) reduceStats.getN()));
        writeAndLog(writer, String.format("REDUCE_MAX_TIME_MS=%d", (long) reduceStats.getMax()));
        writeAndLog(writer, String.format("REDUCE_MIN_TIME_MS=%d", (long) reduceStats.getMin()));
        writeAndLog(writer, String.format("REDUCE_AVG_TIME_MS=%d", (long) reduceStats.getMean()));
        writeAndLog(writer, String.format("REDUCE_STD_TIME_MS=%d", (long) reduceStats.getStandardDeviation()));
        writeAndLog(writer, String.format("REDUCE_SUM_TIME_MS=%d", (long) reduceStats.getSum()));

        writeAndLog(writer, String.format("MAP_REDUCE_SUM_TIME_MS=%d",
                (long) mapStats.getSum() + (long) reduceStats.getSum()));

        for (Map.Entry<String, Integer> attemptStat : attemptStats.entrySet()) {
            writeAndLog(writer, String.format("%s=%d", attemptStat.getKey(), attemptStat.getValue()));
        }
    }

    writer.close();
    buffer.close();
    counterStream.close();
}

From source file:de.tudarmstadt.ukp.experiments.argumentation.sequence.feature.coreference.CoreferenceFeatures.java

/**
 * Extracts coreference-based features for one sentence.
 * <p>
 * For every coreference chain of the document, the method checks whether the given sentence
 * takes part in a chain that spans more than one sentence. If so it records where the sentence
 * sits in the chain (start / middle / end) and, for the neighbouring chain sentences on either
 * side, the distance in sentences and the reference-type / token transitions between the
 * adjoining links. The accumulated keys are emitted as binary features (value 1); the
 * distances and the per-chain link counts are emitted as min / max / average features.
 *
 * @param jCas           the document being processed
 * @param sentence       the sentence for which features are extracted
 * @param sentencePrefix prefix prepended to every generated feature name
 * @return the list of generated features
 * @throws TextClassificationException declared by the signature; not thrown directly in this body
 */
@Override
protected List<Feature> extract(JCas jCas, Sentence sentence, String sentencePrefix)
        throws TextClassificationException {
    List<List<CoreferenceLink>> coreferenceChains = extractCoreferenceChains(jCas);

    FrequencyDistribution<String> featuresAcrossAllChains = new FrequencyDistribution<>();
    // NOTE(review): chainLength is filled below but never turned into a feature
    DescriptiveStatistics chainLength = new DescriptiveStatistics();
    DescriptiveStatistics distanceToPreviousSentence = new DescriptiveStatistics();
    DescriptiveStatistics distanceToNextSentence = new DescriptiveStatistics();
    DescriptiveStatistics interSentencesCorLinks = new DescriptiveStatistics();

    for (List<CoreferenceLink> chain : coreferenceChains) {

        // links of this chain, grouped by the position of the sentence they occur in
        SortedMap<Integer, List<CoreferenceLink>> sentencesAndLinks = extractSentencesAndLinksFromChain(chain,
                jCas);

        int currentSentencePos = getCurrentSentencePos(jCas, sentence);

        log.debug(sentencesAndLinks.keySet() + ", current " + currentSentencePos);

        // is the sentence in chain that spans more sentences?
        boolean partOfChain = sentencesAndLinks.containsKey(currentSentencePos) && sentencesAndLinks.size() > 1;

        // is part of a chain?
        if (partOfChain) {
            log.debug(chainToString(chain));
            featuresAcrossAllChains.inc(FN_PART_OF_CHAIN);

            // starts the chain?
            if (sentencesAndLinks.firstKey().equals(currentSentencePos)) {
                featuresAcrossAllChains.inc(FN_STARTS_THE_CHAIN);
            } else if (sentencesAndLinks.lastKey().equals(currentSentencePos)) {
                // ends the chain?
                featuresAcrossAllChains.inc(FN_ENDS_THE_CHAIN);
            } else {
                // in the middle of chain?
                featuresAcrossAllChains.inc(FN_IN_THE_MIDDLE_OF_CHAIN);
            }

            // length of the chain (number of distinct sentences it touches)
            chainLength.addValue(sentencesAndLinks.size());

            List<CoreferenceLink> currentSentenceLinks = sentencesAndLinks.get(currentSentencePos);
            CoreferenceLink currentSentenceFirstLink = currentSentenceLinks.get(0);
            CoreferenceLink currentSentenceLastLink = currentSentenceLinks.get(currentSentenceLinks.size() - 1);

            // incoming transition from the previous link, i.e. NOMINAL -> PRONOMINAL
            if (!sentencesAndLinks.firstKey().equals(currentSentencePos)) {
                // find the closest preceding sentence that belongs to this chain
                List<CoreferenceLink> previousSentenceLinks = null;
                int prevSentNo = currentSentencePos;
                while (previousSentenceLinks == null && prevSentNo >= 0) {
                    prevSentNo--;

                    if (sentencesAndLinks.containsKey(prevSentNo)) {
                        previousSentenceLinks = sentencesAndLinks.get(prevSentNo);
                    }
                }

                // cannot happen as long as the current sentence is not the first key of the map
                if (previousSentenceLinks == null) {
                    throw new IllegalStateException("Oops :))");
                }

                // distance to previous sentence
                distanceToPreviousSentence.addValue(currentSentencePos - prevSentNo);

                // get the last link from the previous sentence
                CoreferenceLink prevSentenceLastLink = previousSentenceLinks
                        .get(previousSentenceLinks.size() - 1);

                // add type - type transition
                String prevSentenceLastLinkReferenceType = prevSentenceLastLink.getReferenceType();
                String currentSentenceFirstLinkReferenceType = currentSentenceFirstLink.getReferenceType();
                String transitionType = prevSentenceLastLinkReferenceType + GLUE
                        + currentSentenceFirstLinkReferenceType;
                featuresAcrossAllChains.addSample(FN_TRANSITION_IN_TYPE_TYPE + transitionType, 1);

                // add type - token transition (previous link's type, current link's tokens)
                String glueCoreferenceCurrentSentence = glueCoreferenceLinkTokens(currentSentenceFirstLink);
                String typeToken = prevSentenceLastLinkReferenceType + GLUE + glueCoreferenceCurrentSentence;
                featuresAcrossAllChains.addSample(FN_TRANSITION_IN_TYPE_TOKEN + typeToken, 1);

                // add token - type transition (previous link's tokens, current link's type)
                String glueCoreferencePrevSentence = glueCoreferenceLinkTokens(prevSentenceLastLink);
                String tokenType = glueCoreferencePrevSentence + GLUE + currentSentenceFirstLinkReferenceType;
                featuresAcrossAllChains.addSample(FN_TRANSITION_IN_TOKEN_TYPE + tokenType, 1);

                // add token - token transition
                String tokenToken = glueCoreferencePrevSentence + GLUE + glueCoreferenceCurrentSentence;
                featuresAcrossAllChains.addSample(FN_TRANSITION_IN_TOKEN_TOKEN + tokenToken, 1);

                // exact matching token-token reference?
                if (glueCoreferencePrevSentence.equals(glueCoreferenceCurrentSentence)) {
                    featuresAcrossAllChains.addSample(FN_TRANSITION_IN_TOKEN_TOKEN_MATCH, 1);
                }
            }

            // outgoing transition to the next link, i.e. NOMINAL -> PRONOMINAL
            if (!sentencesAndLinks.lastKey().equals(currentSentencePos)) {
                // find the closest following sentence that belongs to this chain
                List<CoreferenceLink> nextSentenceLinks = null;
                int nextSentNo = currentSentencePos;
                while (nextSentenceLinks == null && nextSentNo <= sentencesAndLinks.lastKey()) {
                    nextSentNo++;

                    if (sentencesAndLinks.containsKey(nextSentNo)) {
                        nextSentenceLinks = sentencesAndLinks.get(nextSentNo);
                    }
                }

                // cannot happen as long as the current sentence is not the last key of the map
                if (nextSentenceLinks == null) {
                    throw new IllegalStateException("Oops :))");
                }

                // distance to next sentence
                distanceToNextSentence.addValue(nextSentNo - currentSentencePos);

                // get the first link from the next sentence
                CoreferenceLink nextSentenceFirstLink = nextSentenceLinks.get(0);

                // add type - type transition
                String currentSentenceLastLinkReferenceType = currentSentenceLastLink.getReferenceType();
                String nextSentenceFirstLinkReferenceType = nextSentenceFirstLink.getReferenceType();
                String transitionType = currentSentenceLastLinkReferenceType + GLUE
                        + nextSentenceFirstLinkReferenceType;
                featuresAcrossAllChains.addSample(FN_TRANSITION_OUT_TYPE_TYPE + transitionType, 1);

                // add token - type transition (current link's tokens, next link's type)
                String glueCoreferenceCurrentSent = glueCoreferenceLinkTokens(currentSentenceLastLink);
                String typeToken = glueCoreferenceCurrentSent + GLUE + nextSentenceFirstLinkReferenceType;
                featuresAcrossAllChains.addSample(FN_TRANSITION_OUT_TOKEN_TYPE + typeToken, 1);

                // add type - token transition (current link's type, next link's tokens)
                String glueCoreferenceNextSent = glueCoreferenceLinkTokens(nextSentenceFirstLink);
                String tokenType = currentSentenceLastLinkReferenceType + GLUE + glueCoreferenceNextSent;
                featuresAcrossAllChains.addSample(FN_TRANSITION_OUT_TYPE_TOKEN + tokenType, 1);

                // add token - token transition
                String tokenToken = glueCoreferenceCurrentSent + GLUE + glueCoreferenceNextSent;
                featuresAcrossAllChains.addSample(FN_TRANSITION_OUT_TOKEN_TOKEN + tokenToken, 1);

                // exact matching token-token reference?
                if (glueCoreferenceNextSent.equals(glueCoreferenceCurrentSent)) {
                    featuresAcrossAllChains.addSample(FN_TRANSITION_OUT_TOKEN_TOKEN_MATCH, 1);
                }
            }
        }

        // number of links of this chain that fall into the current sentence
        // (recorded for single-sentence chains as well)
        if (sentencesAndLinks.containsKey(currentSentencePos)) {
            int coreferenceLinks = sentencesAndLinks.get(currentSentencePos).size();
            interSentencesCorLinks.addValue(coreferenceLinks);
        }

        /*
        Disabled earlier approach, kept for reference:

        List<Integer> positions = positionsOfSentenceInCurrentChain(chain, sentence);

        // ok, we're in a chain
        if (!positions.isEmpty()) {
        log.debug(printChain(chain));
        log.debug(sentence.getCoveredText());
        log.debug(positions);
        Integer lastPosition = positions.get(positions.size() - 1);
        Integer firstPosition = positions.get(0);

        if (lastPosition == positions.size() - 1) {
            log.debug("Last sentence of chain");
        }

        log.debug("-----");
        }
        */
    }

    List<Feature> result = new ArrayList<>();

    log.debug(featuresAcrossAllChains);
    // numeric features are only emitted when at least one value was collected
    if (distanceToNextSentence.getN() > 0) {
        log.debug("Next:" + distanceToNextSentence);

        result.add(new Feature(sentencePrefix + FEATURE_NAME + FN_DIST_TO_NEXT_MIN,
                distanceToNextSentence.getMin()));
        result.add(new Feature(sentencePrefix + FEATURE_NAME + FN_DIST_TO_NEXT_MAX,
                distanceToNextSentence.getMax()));
        result.add(new Feature(sentencePrefix + FEATURE_NAME + FN_DIST_TO_NEXT_AVG,
                distanceToNextSentence.getMean()));
    }
    if (distanceToPreviousSentence.getN() > 0) {

        log.debug("Prev: " + distanceToPreviousSentence);

        result.add(new Feature(sentencePrefix + FEATURE_NAME + FN_DIST_TO_PREV_MIN,
                distanceToPreviousSentence.getMin()));
        result.add(new Feature(sentencePrefix + FEATURE_NAME + FN_DIST_TO_PREV_MAX,
                distanceToPreviousSentence.getMax()));
        result.add(new Feature(sentencePrefix + FEATURE_NAME + FN_DIST_TO_PREV_AVG,
                distanceToPreviousSentence.getMean()));
    }

    if (interSentencesCorLinks.getN() > 0) {
        result.add(new Feature(sentencePrefix + FEATURE_NAME + FN_INTER_SENT_COR_MIN,
                interSentencesCorLinks.getMin()));
        result.add(new Feature(sentencePrefix + FEATURE_NAME + FN_INTER_SENT_COR_MAX,
                interSentencesCorLinks.getMax()));
        result.add(new Feature(sentencePrefix + FEATURE_NAME + FN_INTER_SENT_COR_AVG,
                interSentencesCorLinks.getMean()));
    }

    log.debug("----");

    for (String feat : featuresAcrossAllChains.getKeys()) {
        // binary: the feature is present regardless of how often the key was counted
        result.add(new Feature(sentencePrefix + FEATURE_NAME + feat, 1));
    }

    return result;
}

From source file:org.a3badran.platform.logging.writer.MetricsWriter.java

/**
 * Flattens all tracked metrics into a single name-to-value map.
 * <p>
 * Sample metrics contribute {@code .sampleCount}, {@code .max}, {@code .min}, {@code .avg}
 * and the 50th / 90th / 99th percentiles; sample counter metrics contribute the same keys
 * except {@code .sampleCount}. Scope and application totals are copied under their own names.
 * All statistics are truncated to {@code long}.
 *
 * @return a new map holding the current value of every metric
 */
private Map<String, Long> getAllMetrics() {
    final int[] percentiles = { 50, 90, 99 };
    Map<String, Long> result = new HashMap<String, Long>();

    for (Entry<String, DescriptiveStatistics> sample : sampleMetrics.entrySet()) {
        String prefix = sample.getKey();
        // work on a copy of the statistics to reduce locking
        DescriptiveStatistics snapshot = sample.getValue().copy();
        result.put(prefix + ".sampleCount", (long) snapshot.getN());
        result.put(prefix + ".max", (long) snapshot.getMax());
        result.put(prefix + ".min", (long) snapshot.getMin());
        result.put(prefix + ".avg", (long) snapshot.getMean());
        for (int percentile : percentiles) {
            result.put(prefix + "." + percentile + "p", (long) snapshot.getPercentile(percentile));
        }
    }

    for (Entry<String, DescriptiveStatistics> counter : sampleCounterMetrics.entrySet()) {
        String prefix = counter.getKey();
        // work on a copy of the statistics to reduce locking
        DescriptiveStatistics snapshot = counter.getValue().copy();
        result.put(prefix + ".max", (long) snapshot.getMax());
        result.put(prefix + ".min", (long) snapshot.getMin());
        result.put(prefix + ".avg", (long) snapshot.getMean());
        for (int percentile : percentiles) {
            result.put(prefix + "." + percentile + "p", (long) snapshot.getPercentile(percentile));
        }
    }

    // running totals are reported as-is
    for (Entry<String, AtomicLong> scopeTotal : scopeTotalMetrics.entrySet()) {
        result.put(scopeTotal.getKey(), scopeTotal.getValue().get());
    }

    for (Entry<String, AtomicLong> appTotal : appTotalMetrics.entrySet()) {
        result.put(appTotal.getKey(), appTotal.getValue().get());
    }

    return result;
}

From source file:org.bresearch.websec.test.CommonsMathTest.java

/**
 * Feeds the fixed data set {4, 3, 3, 2} into a DescriptiveStatistics and
 * checks the reported mean, standard deviation and sample count.
 *
 * Values are compared through their string form, as the other tests in
 * this class do.
 *
 * @throws Exception kept from the original signature
 */
public void test1() throws Exception {

    // Instantiated directly through the public constructor.
    DescriptiveStatistics stats = new DescriptiveStatistics();

    final double[] inputArray = { 4, 3, 3, 2 };

    // Add the data from the array
    for (double value : inputArray) {
        stats.addValue(value);
    }

    // Compute some statistics
    double mean = stats.getMean();
    double std = stats.getStandardDeviation();
    long n = stats.getN();
    assertEquals("3.0", "" + mean);
    assertEquals("0.816496580927726", "" + std);
    // The count was previously read but never checked; one value is added per array element.
    assertEquals("4", "" + n);
}

From source file:org.bresearch.websec.test.CommonsMathTest.java

/**
 * Filters a fixed sentence, splits it into a word list (expected size 27),
 * feeds every element of mapReduceCount(words, -1) into a
 * DescriptiveStatistics and checks the mean, standard deviation and
 * sample count against fixed expected strings.
 */
public void test2() {
    final WordProcessor processor = new WordProcessor();
    final String filtered = processor.filterOnlyAlphaNumeric(
            " !!!   Hello my name is a person.   Hello how are you doing.  hello, this is great.  What do you think?   ");

    final BotlistStringUtils utils = new BotlistStringUtils();
    final List<String> words = utils.buildWordList(filtered);
    assertEquals(27, words.size());

    final DescriptiveStatistics counts = new DescriptiveStatistics();

    // NOTE(review): mapReduceCount is evaluated again for every bound check and
    // every element read. Its return type is not visible here, so the call
    // pattern is kept exactly as it was rather than cached in a local.
    int idx = 0;
    while (idx < utils.mapReduceCount(words, -1).length) {
        counts.addValue(utils.mapReduceCount(words, -1)[idx]);
        idx++;
    }

    // Statistics are compared via their string rendering.
    final String meanText = String.valueOf(counts.getMean());
    final String stdText = String.valueOf(counts.getStandardDeviation());
    assertEquals("1.2666666666666666", meanText);
    assertEquals("0.5936168397046634", stdText);

    final String countText = String.valueOf(counts.getN());
    assertEquals("15", countText);

}

From source file:org.bresearch.websec.test.CommonsMathTest.java

/**
 * Filters a fixed sentence, splits it into a word list, feeds every element
 * of mapReduceWordSize(words, -1) into a DescriptiveStatistics and checks
 * the mean and the sum against fixed expected strings.
 */
public void test3() {
    final WordProcessor processor = new WordProcessor();
    final String filtered = processor.filterOnlyAlphaNumeric(
            " !!!   Hello my name is a person.   Hello how are you doing.  hello, this is great.  What do you think?   ");

    final BotlistStringUtils utils = new BotlistStringUtils();
    final List<String> words = utils.buildWordList(filtered);

    final DescriptiveStatistics sizes = new DescriptiveStatistics();
    // NOTE(review): mapReduceWordSize is evaluated again for every bound check
    // and every element read. Its return type is not visible here, so the
    // call pattern is kept exactly as it was rather than cached in a local.
    int idx = 0;
    while (idx < utils.mapReduceWordSize(words, -1).length) {
        sizes.addValue(utils.mapReduceWordSize(words, -1)[idx]);
        idx++;
    }

    // Statistics are compared via their string rendering.
    assertEquals("3.6", String.valueOf(sizes.getMean()));
    assertEquals("54.0", String.valueOf(sizes.getSum()));
}

From source file:org.bresearch.websec.test.CommonsMathTest.java

/**
 * Builds word statistics for ConstDoc.CONST_SM and prints the sum, mean,
 * sample count, geometric mean and maximum to standard output, one per
 * line. Nothing is asserted.
 */
public void test5() {
    final DescriptiveStatistics summary = new DocumentWordStats(ConstDoc.CONST_SM).mapReduceStats();

    // The typed println overloads render each value the same way string concatenation does.
    System.out.println(summary.getSum());
    System.out.println(summary.getMean());
    System.out.println(summary.getN());
    System.out.println(summary.getGeometricMean());
    System.out.println(summary.getMax());

}