Example usage for org.apache.commons.math.stat.descriptive DescriptiveStatistics addValue

Introduction

In this page you can find the example usage for org.apache.commons.math.stat.descriptive DescriptiveStatistics addValue.

Prototype

public void addValue(double v)

Source Link

Document

Adds the value to the dataset.

Usage

From source file:de.tudarmstadt.ukp.experiments.argumentation.sequence.feature.coreference.CoreferenceFeatures.java

@Override
protected List<Feature> extract(JCas jCas, Sentence sentence, String sentencePrefix)
        throws TextClassificationException {
    List<List<CoreferenceLink>> coreferenceChains = extractCoreferenceChains(jCas);

    FrequencyDistribution<String> featuresAcrossAllChains = new FrequencyDistribution<>();
    DescriptiveStatistics chainLength = new DescriptiveStatistics();
    DescriptiveStatistics distanceToPreviousSentence = new DescriptiveStatistics();
    DescriptiveStatistics distanceToNextSentence = new DescriptiveStatistics();
    DescriptiveStatistics interSentencesCorLinks = new DescriptiveStatistics();

    for (List<CoreferenceLink> chain : coreferenceChains) {

        SortedMap<Integer, List<CoreferenceLink>> sentencesAndLinks = extractSentencesAndLinksFromChain(chain,
                jCas);// w w  w .  java2 s . c  o  m

        int currentSentencePos = getCurrentSentencePos(jCas, sentence);

        log.debug(sentencesAndLinks.keySet() + ", current " + currentSentencePos);

        // is the sentence in chain that spans more sentences?
        boolean partOfChain = sentencesAndLinks.containsKey(currentSentencePos) && sentencesAndLinks.size() > 1;

        // is part of a chain?
        if (partOfChain) {
            log.debug(chainToString(chain));
            featuresAcrossAllChains.inc(FN_PART_OF_CHAIN);

            // starts the chain?
            if (sentencesAndLinks.firstKey().equals(currentSentencePos)) {
                featuresAcrossAllChains.inc(FN_STARTS_THE_CHAIN);
            } else if (sentencesAndLinks.lastKey().equals(currentSentencePos)) {
                // ends the chain?
                featuresAcrossAllChains.inc(FN_ENDS_THE_CHAIN);
            } else {
                // in the middle of chain?
                featuresAcrossAllChains.inc(FN_IN_THE_MIDDLE_OF_CHAIN);
            }

            // length of the chain
            chainLength.addValue(sentencesAndLinks.size());

            List<CoreferenceLink> currentSentenceLinks = sentencesAndLinks.get(currentSentencePos);
            CoreferenceLink currentSentenceFirstLink = currentSentenceLinks.get(0);
            CoreferenceLink currentSentenceLastLink = currentSentenceLinks.get(currentSentenceLinks.size() - 1);

            // transition to the previous link, i.e. NOMINAL -> PRONOMINAL
            if (!sentencesAndLinks.firstKey().equals(currentSentencePos)) {
                // find the previous sentence
                List<CoreferenceLink> previousSentenceLinks = null;
                int prevSentNo = currentSentencePos;
                while (previousSentenceLinks == null && prevSentNo >= 0) {
                    prevSentNo--;

                    if (sentencesAndLinks.containsKey(prevSentNo)) {
                        previousSentenceLinks = sentencesAndLinks.get(prevSentNo);
                    }
                }

                if (previousSentenceLinks == null) {
                    throw new IllegalStateException("Oops :))");
                }

                // distance to previous sentence
                distanceToPreviousSentence.addValue(currentSentencePos - prevSentNo);

                // get the last link from the previous sentence
                CoreferenceLink prevSentenceLastLink = previousSentenceLinks
                        .get(previousSentenceLinks.size() - 1);

                // add type type transition
                String prevSentenceLastLinkReferenceType = prevSentenceLastLink.getReferenceType();
                String currentSentenceFirstLinkReferenceType = currentSentenceFirstLink.getReferenceType();
                String transitionType = prevSentenceLastLinkReferenceType + GLUE
                        + currentSentenceFirstLinkReferenceType;
                featuresAcrossAllChains.addSample(FN_TRANSITION_IN_TYPE_TYPE + transitionType, 1);

                // add token - type transition
                String glueCoreferenceCurrentSentence = glueCoreferenceLinkTokens(currentSentenceFirstLink);
                String typeToken = prevSentenceLastLinkReferenceType + GLUE + glueCoreferenceCurrentSentence;
                featuresAcrossAllChains.addSample(FN_TRANSITION_IN_TYPE_TOKEN + typeToken, 1);

                // add type - token transition
                String glueCoreferencePrevSentence = glueCoreferenceLinkTokens(prevSentenceLastLink);
                String tokenType = glueCoreferencePrevSentence + GLUE + currentSentenceFirstLinkReferenceType;
                featuresAcrossAllChains.addSample(FN_TRANSITION_IN_TOKEN_TYPE + tokenType, 1);

                // add token token transition
                String tokenToken = glueCoreferencePrevSentence + GLUE + glueCoreferenceCurrentSentence;
                featuresAcrossAllChains.addSample(FN_TRANSITION_IN_TOKEN_TOKEN + tokenToken, 1);

                // exact matching token-token reference?
                if (glueCoreferencePrevSentence.equals(glueCoreferenceCurrentSentence)) {
                    featuresAcrossAllChains.addSample(FN_TRANSITION_IN_TOKEN_TOKEN_MATCH, 1);
                }
            }

            // transition to the previous link, i.e. NOMINAL -> PRONOMINAL
            if (!sentencesAndLinks.lastKey().equals(currentSentencePos)) {
                // find the previous sentence
                List<CoreferenceLink> nextSentenceLinks = null;
                int nextSentNo = currentSentencePos;
                while (nextSentenceLinks == null && nextSentNo <= sentencesAndLinks.lastKey()) {
                    nextSentNo++;

                    if (sentencesAndLinks.containsKey(nextSentNo)) {
                        nextSentenceLinks = sentencesAndLinks.get(nextSentNo);
                    }
                }

                if (nextSentenceLinks == null) {
                    throw new IllegalStateException("Oops :))");
                }

                // distance to next sentence
                distanceToNextSentence.addValue(nextSentNo - currentSentencePos);

                // get the last link from the previous sentence
                CoreferenceLink nextSentenceFirstLink = nextSentenceLinks.get(0);

                // add type type transition
                String currentSentenceLastLinkReferenceType = currentSentenceLastLink.getReferenceType();
                String nextSentenceFirstLinkReferenceType = nextSentenceFirstLink.getReferenceType();
                String transitionType = currentSentenceLastLinkReferenceType + GLUE
                        + nextSentenceFirstLinkReferenceType;
                featuresAcrossAllChains.addSample(FN_TRANSITION_OUT_TYPE_TYPE + transitionType, 1);

                // add token - type transition
                String glueCoreferenceCurrentSent = glueCoreferenceLinkTokens(currentSentenceLastLink);
                String typeToken = glueCoreferenceCurrentSent + GLUE + nextSentenceFirstLinkReferenceType;
                featuresAcrossAllChains.addSample(FN_TRANSITION_OUT_TOKEN_TYPE + typeToken, 1);

                // add type - token transition
                String glueCoreferenceNextSent = glueCoreferenceLinkTokens(nextSentenceFirstLink);
                String tokenType = currentSentenceLastLinkReferenceType + GLUE + glueCoreferenceNextSent;
                featuresAcrossAllChains.addSample(FN_TRANSITION_OUT_TYPE_TOKEN + tokenType, 1);

                // add token token transition
                String tokenToken = glueCoreferenceCurrentSent + GLUE + glueCoreferenceNextSent;
                featuresAcrossAllChains.addSample(FN_TRANSITION_OUT_TOKEN_TOKEN + tokenToken, 1);

                // exact matching token-token reference?
                if (glueCoreferenceNextSent.equals(glueCoreferenceCurrentSent)) {
                    featuresAcrossAllChains.addSample(FN_TRANSITION_OUT_TOKEN_TOKEN_MATCH, 1);
                }
            }
        }

        // number of inter-sentence coreference links
        if (sentencesAndLinks.containsKey(currentSentencePos)) {
            int coreferenceLinks = sentencesAndLinks.get(currentSentencePos).size();
            interSentencesCorLinks.addValue(coreferenceLinks);
        }

        /*
        List<Integer> positions = positionsOfSentenceInCurrentChain(chain, sentence);
                
        // ok, we're in a chain
        if (!positions.isEmpty()) {
        log.debug(printChain(chain));
        log.debug(sentence.getCoveredText());
        log.debug(positions);
        Integer lastPosition = positions.get(positions.size() - 1);
        Integer firstPosition = positions.get(0);
                
        if (lastPosition == positions.size() - 1) {
            log.debug("Last sentence of chain");
        }
                
        log.debug("-----");
        }
        */
    }

    List<Feature> result = new ArrayList<>();

    log.debug(featuresAcrossAllChains);
    if (distanceToNextSentence.getN() > 0) {
        log.debug("Next:" + distanceToNextSentence);

        result.add(new Feature(sentencePrefix + FEATURE_NAME + FN_DIST_TO_NEXT_MIN,
                distanceToNextSentence.getMin()));
        result.add(new Feature(sentencePrefix + FEATURE_NAME + FN_DIST_TO_NEXT_MAX,
                distanceToNextSentence.getMax()));
        result.add(new Feature(sentencePrefix + FEATURE_NAME + FN_DIST_TO_NEXT_AVG,
                distanceToNextSentence.getMean()));
    }
    if (distanceToPreviousSentence.getN() > 0) {

        log.debug("Prev: " + distanceToPreviousSentence);

        result.add(new Feature(sentencePrefix + FEATURE_NAME + FN_DIST_TO_PREV_MIN,
                distanceToPreviousSentence.getMin()));
        result.add(new Feature(sentencePrefix + FEATURE_NAME + FN_DIST_TO_PREV_MAX,
                distanceToPreviousSentence.getMax()));
        result.add(new Feature(sentencePrefix + FEATURE_NAME + FN_DIST_TO_PREV_AVG,
                distanceToPreviousSentence.getMean()));
    }

    if (interSentencesCorLinks.getN() > 0) {
        result.add(new Feature(sentencePrefix + FEATURE_NAME + FN_INTER_SENT_COR_MIN,
                interSentencesCorLinks.getMin()));
        result.add(new Feature(sentencePrefix + FEATURE_NAME + FN_INTER_SENT_COR_MAX,
                interSentencesCorLinks.getMax()));
        result.add(new Feature(sentencePrefix + FEATURE_NAME + FN_INTER_SENT_COR_AVG,
                interSentencesCorLinks.getMean()));
    }

    log.debug("----");

    for (String feat : featuresAcrossAllChains.getKeys()) {
        // binary
        result.add(new Feature(sentencePrefix + FEATURE_NAME + feat, 1));
    }

    return result;
}

From source file:com.facebook.presto.AbstractTestQueries.java

@Test
public void testTableSampleBernoulli() throws Exception {
    DescriptiveStatistics stats = new DescriptiveStatistics();

    int total = computeExpected("SELECT orderkey FROM orders", TupleInfo.SINGLE_LONG).getMaterializedTuples()
            .size();//from   ww  w.j a v  a2s.  c o  m

    for (int i = 0; i < 100; i++) {
        List<MaterializedTuple> values = computeActual("SELECT orderkey FROM ORDERS TABLESAMPLE BERNOULLI (50)")
                .getMaterializedTuples();

        assertEquals(values.size(), ImmutableSet.copyOf(values).size(), "TABLESAMPLE produced duplicate rows");
        stats.addValue(values.size() * 1.0 / total);
    }

    double mean = stats.getGeometricMean();
    assertTrue(mean > 0.45 && mean < 0.55,
            String.format("Expected mean sampling rate to be ~0.5, but was %s", mean));
}

From source file:com.joliciel.csvLearner.CSVLearner.java

private void doCommandEvaluate() throws IOException {
    if (resultFilePath == null)
        throw new RuntimeException("Missing argument: resultFile");
    if (featureDir == null)
        throw new RuntimeException("Missing argument: featureDir");
    if (testIdFilePath != null) {
        if (crossValidation)
            throw new RuntimeException("Cannot combine testIdFile with cross validation");
        if (testSegment >= 0) {
            throw new RuntimeException("Cannot combine testIdFile with test segment");
        }//  w w  w  . ja va 2s  .  c  o m
    }
    if (!crossValidation && testIdFilePath == null) {
        if (testSegment < 0)
            throw new RuntimeException("Missing argument: testSegment");
        if (testSegment > 9)
            throw new RuntimeException("testSegment must be an integer between 0 and 9");
    }
    if (outDirPath == null)
        throw new RuntimeException("Missing argument: outDir");

    LOG.info("Generating event list from CSV files...");
    CSVEventListReader reader = this.getReader(TrainingSetType.TEST_SEGMENT, false);

    GenericEvents events = reader.getEvents();

    File outDir = new File(outDirPath);
    outDir.mkdirs();
    String fileBase = this.featureDir.replace('/', '_');
    fileBase = fileBase.replace(':', '_');
    fileBase = fileBase + "_cutoff" + cutoff;

    if (generateEventFile) {
        File eventFile = new File(outDir, fileBase + "_events.txt");
        this.generateEventFile(eventFile, events);
    }

    File fscoreFile = new File(outDir, fileBase + "_fscores.csv");
    Writer fscoreFileWriter = new BufferedWriter(
            new OutputStreamWriter(new FileOutputStream(fscoreFile, false), "UTF8"));

    File outcomeFile = new File(outDir, fileBase + "_outcomes.csv");
    Writer outcomeFileWriter = new BufferedWriter(
            new OutputStreamWriter(new FileOutputStream(outcomeFile, false), "UTF8"));

    try {
        if (!crossValidation) {
            MaxentModel maxentModel = this.train(events, null);

            this.evaluate(maxentModel, events, fscoreFileWriter, outcomeFileWriter);
        } else {
            DescriptiveStatistics accuracyStats = new DescriptiveStatistics();
            Map<String, DescriptiveStatistics[]> outcomeFscoreStats = new TreeMap<String, DescriptiveStatistics[]>();
            for (int segment = 0; segment <= 9; segment++) {
                outcomeFileWriter.write("Run " + segment + ",\n");
                fscoreFileWriter.write("Run " + segment + ",\n");
                if (balanceOutcomes) {
                    for (String outcome : reader.getOutcomes()) {
                        int i = 0;
                        for (GenericEvent event : events) {
                            if (event.getOutcome().equals(outcome)) {
                                boolean test = i % 10 == segment;
                                event.setTest(test);
                                i++;
                            }
                        }
                    }
                } else {
                    int i = 0;
                    for (GenericEvent event : events) {
                        boolean test = i % 10 == segment;
                        event.setTest(test);
                        i++;
                    }
                }

                MaxentModel maxentModel = this.train(events, null);
                FScoreCalculator<String> fscoreCalculator = this.evaluate(maxentModel, events, fscoreFileWriter,
                        outcomeFileWriter);

                accuracyStats.addValue(fscoreCalculator.getTotalFScore());
                for (String outcome : fscoreCalculator.getOutcomeSet()) {
                    DescriptiveStatistics[] stats = outcomeFscoreStats.get(outcome);
                    if (stats == null) {
                        stats = new DescriptiveStatistics[3];
                        stats[0] = new DescriptiveStatistics();
                        stats[1] = new DescriptiveStatistics();
                        stats[2] = new DescriptiveStatistics();
                        outcomeFscoreStats.put(outcome, stats);
                    }
                    stats[0].addValue(fscoreCalculator.getPrecision(outcome));
                    stats[1].addValue(fscoreCalculator.getRecall(outcome));
                    stats[2].addValue(fscoreCalculator.getFScore(outcome));
                } // next outcome

                outcomeFileWriter.write("\n");

            } // next segment

            fscoreFileWriter.write(
                    "outcome,precision avg., precision dev., recall avg., recall dev., f-score avg., f-score dev.,\n");
            for (String outcome : outcomeFscoreStats.keySet()) {
                DescriptiveStatistics[] stats = outcomeFscoreStats.get(outcome);
                fscoreFileWriter
                        .write(CSVFormatter.format(outcome) + "," + CSVFormatter.format(stats[0].getMean())
                                + "," + CSVFormatter.format(stats[0].getStandardDeviation()) + ","
                                + CSVFormatter.format(stats[1].getMean()) + ","
                                + CSVFormatter.format(stats[1].getStandardDeviation()) + ","
                                + CSVFormatter.format(stats[2].getMean()) + ","
                                + CSVFormatter.format(stats[2].getStandardDeviation()) + "," + "\n");
            }
            fscoreFileWriter.write("TOTAL,,,,," + CSVFormatter.format(accuracyStats.getMean()) + ","
                    + CSVFormatter.format(accuracyStats.getStandardDeviation()) + ",\n");

            LOG.info("Accuracy mean: " + accuracyStats.getMean());
            LOG.info("Accuracy std dev: " + accuracyStats.getStandardDeviation());
        }
    } finally {
        fscoreFileWriter.flush();
        fscoreFileWriter.close();
        outcomeFileWriter.flush();
        outcomeFileWriter.close();
    }

    LOG.info("#### Complete ####");
}

From source file:net.sf.mzmine.modules.peaklistmethods.dataanalysis.heatmaps.HeatMapTask.java

private double[][] groupingDataset(UserParameter selectedParameter, String referenceGroup) {
    // Collect all data files
    Vector<RawDataFile> allDataFiles = new Vector<RawDataFile>();
    DescriptiveStatistics meanControlStats = new DescriptiveStatistics();
    DescriptiveStatistics meanGroupStats = new DescriptiveStatistics();
    allDataFiles.addAll(Arrays.asList(peakList.getRawDataFiles()));

    // Determine the reference group and non reference group (the rest of
    // the samples) for raw data files
    List<RawDataFile> referenceDataFiles = new ArrayList<RawDataFile>();
    List<RawDataFile> nonReferenceDataFiles = new ArrayList<RawDataFile>();

    List<String> groups = new ArrayList<String>();
    MZmineProject project = MZmineCore.getCurrentProject();

    for (RawDataFile rawDataFile : allDataFiles) {

        Object paramValue = project.getParameterValue(selectedParameter, rawDataFile);
        if (!groups.contains(String.valueOf(paramValue))) {
            groups.add(String.valueOf(paramValue));
        }//from   w ww .j av a 2  s .c  o m
        if (String.valueOf(paramValue).equals(referenceGroup)) {

            referenceDataFiles.add(rawDataFile);
        } else {

            nonReferenceDataFiles.add(rawDataFile);
        }
    }

    int numRows = 0;
    for (int row = 0; row < peakList.getNumberOfRows(); row++) {

        if (!onlyIdentified || (onlyIdentified && peakList.getRow(row).getPeakIdentities().length > 0)) {
            numRows++;
        }
    }

    // Create a new aligned peak list with all the samples if the reference
    // group has to be shown or with only
    // the non reference group if not.
    double[][] dataMatrix = new double[groups.size() - 1][numRows];
    pValueMatrix = new String[groups.size() - 1][numRows];

    // data files that should be in the heat map
    List<RawDataFile> shownDataFiles = nonReferenceDataFiles;

    for (int row = 0, rowIndex = 0; row < peakList.getNumberOfRows(); row++) {
        PeakListRow rowPeak = peakList.getRow(row);
        if (!onlyIdentified || (onlyIdentified && rowPeak.getPeakIdentities().length > 0)) {
            // Average area or height of the reference group
            meanControlStats.clear();
            for (int column = 0; column < referenceDataFiles.size(); column++) {

                if (rowPeak.getPeak(referenceDataFiles.get(column)) != null) {

                    if (area) {

                        meanControlStats.addValue(rowPeak.getPeak(referenceDataFiles.get(column)).getArea());
                    } else {

                        meanControlStats.addValue(rowPeak.getPeak(referenceDataFiles.get(column)).getHeight());
                    }

                }
            }

            // Divide the area or height of each peak by the average of the
            // area or height of the reference peaks in each row
            int columnIndex = 0;
            for (int column = 0; column < groups.size(); column++) {
                String group = groups.get(column);
                meanGroupStats.clear();
                if (!group.equals(referenceGroup)) {

                    for (int dataColumn = 0; dataColumn < shownDataFiles.size(); dataColumn++) {

                        Object paramValue = project.getParameterValue(selectedParameter,
                                shownDataFiles.get(dataColumn));
                        if (rowPeak.getPeak(shownDataFiles.get(dataColumn)) != null
                                && String.valueOf(paramValue).equals(group)) {

                            Feature peak = rowPeak.getPeak(shownDataFiles.get(dataColumn));

                            if (!Double.isInfinite(peak.getArea()) && !Double.isNaN(peak.getArea())) {

                                if (area) {

                                    meanGroupStats.addValue(peak.getArea());
                                } else {

                                    meanGroupStats.addValue(peak.getHeight());
                                }
                            }

                        }
                    }

                    double value = meanGroupStats.getMean() / meanControlStats.getMean();
                    if (meanGroupStats.getN() > 1 && meanControlStats.getN() > 1) {
                        pValueMatrix[columnIndex][rowIndex] = this.getPvalue(meanGroupStats, meanControlStats);
                    } else {
                        pValueMatrix[columnIndex][rowIndex] = "";
                    }

                    if (log) {

                        value = Math.log(value);
                    }
                    dataMatrix[columnIndex++][rowIndex] = value;
                }
            }
            rowIndex++;
        }
    }

    // Scale the data dividing the peak area/height by the standard
    // deviation of each column
    if (scale) {
        scale(dataMatrix);
    }

    // Create two arrays: row and column names
    rowNames = new String[dataMatrix[0].length];
    colNames = new String[groups.size() - 1];

    int columnIndex = 0;
    for (String group : groups) {

        if (!group.equals(referenceGroup)) {

            colNames[columnIndex++] = group;
        }
    }
    for (int row = 0, rowIndex = 0; row < peakList.getNumberOfRows(); row++) {
        if (!onlyIdentified || (onlyIdentified && peakList.getRow(row).getPeakIdentities().length > 0)) {
            if (peakList.getRow(row).getPeakIdentities() != null
                    && peakList.getRow(row).getPeakIdentities().length > 0) {

                rowNames[rowIndex++] = peakList.getRow(row).getPreferredPeakIdentity().getName();
            } else {

                rowNames[rowIndex++] = "Unknown";
            }
        }
    }

    return dataMatrix;
}

From source file:com.joliciel.jochre.graphics.SegmenterImpl.java

/**
 * Clear out anything found in the right & left margins
 * @param sourceImage// w w  w  .j  av a  2  s . c  om
 */
void cleanMargins(SourceImage sourceImage) {
    LOG.debug("########## cleanMargins #########");

    int minCardinalityForMargin = 8;
    double averageShapeWidth = sourceImage.getAverageShapeWidth();

    LOG.debug("Finding right margin");
    double rightLimit = (double) sourceImage.getWidth() * 0.67;

    // first, create a DBScan cluster of all rows near the right-hand side
    List<RowOfShapes> rightHandRows = new ArrayList<RowOfShapes>();
    List<double[]> rightCoordinates = new ArrayList<double[]>();

    for (RowOfShapes row : sourceImage.getRows()) {
        double right = row.getRight();
        if (right >= rightLimit) {
            LOG.trace(row.toString());
            LOG.trace(
                    "Right: " + right + " + " + row.getXAdjustment() + " = " + (right - row.getXAdjustment()));
            right -= row.getXAdjustment();
            rightHandRows.add(row);
            rightCoordinates.add(new double[] { right });
        }
    }

    DBSCANClusterer<RowOfShapes> rightMarginClusterer = new DBSCANClusterer<RowOfShapes>(rightHandRows,
            rightCoordinates);
    Set<Set<RowOfShapes>> rowClusters = rightMarginClusterer.cluster(averageShapeWidth, minCardinalityForMargin,
            true);

    TreeSet<Set<RowOfShapes>> orderedRowClusters = new TreeSet<Set<RowOfShapes>>(
            new CardinalityComparator<RowOfShapes>());
    orderedRowClusters.addAll(rowClusters);

    int i = 0;

    // find the right-most cluster with sufficient cardinality, and assume it's the right margin
    DescriptiveStatistics rightMarginStats = null;
    for (Set<RowOfShapes> cluster : orderedRowClusters) {
        DescriptiveStatistics rightStats = new DescriptiveStatistics();
        for (RowOfShapes row : cluster)
            rightStats.addValue(row.getRight() - row.getXAdjustment());

        LOG.debug("Cluster " + i + ". Cardinality=" + cluster.size());
        LOG.debug("Right mean : " + rightStats.getMean());
        LOG.debug("Right std dev: " + rightStats.getStandardDeviation());

        if (cluster.size() >= minCardinalityForMargin
                && (rightMarginStats == null || rightMarginStats.getMean() < rightStats.getMean())) {
            rightMarginStats = rightStats;
        }
        i++;
    }

    // see how many rows would violate this margin - if too many, assume no margin
    // these rows are only rows which extend across the margin
    if (rightMarginStats != null) {
        LOG.debug("Right margin mean : " + rightMarginStats.getMean());
        LOG.debug("Right margin std dev: " + rightMarginStats.getStandardDeviation());

        double rightMarginLimit = rightMarginStats.getMean() + sourceImage.getAverageShapeWidth();
        LOG.debug("rightMarginLimit: " + rightMarginLimit);
        int numRowsToChop = 0;
        for (RowOfShapes row : sourceImage.getRows()) {
            if (row.getRight() >= rightLimit) {
                if (row.getRight() - row.getXAdjustment() >= rightMarginLimit
                        && row.getLeft() - row.getXAdjustment() <= rightMarginLimit) {
                    LOG.debug("Found overlapping row : " + row);
                    LOG.debug("Adjusted right : " + (row.getRight() - row.getXAdjustment()));
                    numRowsToChop++;
                }
            }
        }
        if (numRowsToChop >= 3) {
            LOG.debug("Too many overlapping rows - ignoring margin");
            rightMarginStats = null;
        }
    }

    if (rightMarginStats != null) {
        double rightMarginLimit = rightMarginStats.getMean() + sourceImage.getAverageShapeWidth();
        List<RowOfShapes> rowsToRemove = new ArrayList<RowOfShapes>();
        for (RowOfShapes row : sourceImage.getRows()) {
            double right = row.getRight() - row.getXAdjustment();
            LOG.trace(row.toString());
            LOG.trace("Adjusted right: " + right);

            if (right >= rightMarginLimit) {
                LOG.trace("Has out-of-margin stuff!");
                // need to chop off groups to the right of this threshold
                List<GroupOfShapes> groupsToChop = new ArrayList<GroupOfShapes>();
                for (GroupOfShapes group : row.getGroups()) {
                    if (group.getLeft() - row.getXAdjustment() > rightMarginLimit) {
                        groupsToChop.add(group);
                        LOG.debug("Chopping group outside of right margin: " + group);
                    }
                }
                for (GroupOfShapes group : groupsToChop) {
                    row.getShapes().removeAll(group.getShapes());
                }
                row.getGroups().removeAll(groupsToChop);

                if (row.getGroups().size() == 0) {
                    LOG.debug("Removing empty " + row);
                    rowsToRemove.add(row);
                } else {
                    row.recalculate();
                    row.assignGuideLines();
                }
            } // does this row extend beyond the margin?
        } // next row
        sourceImage.getRows().removeAll(rowsToRemove);
    } // have a right margin

    LOG.debug("Finding left margin");
    double leftLimit = (double) sourceImage.getWidth() * 0.33;

    // first, create a DBScan cluster of all rows near the left-hand side
    List<RowOfShapes> leftHandRows = new ArrayList<RowOfShapes>();
    List<double[]> leftCoordinates = new ArrayList<double[]>();

    for (RowOfShapes row : sourceImage.getRows()) {
        double left = row.getLeft();
        if (left <= leftLimit) {
            LOG.trace(row.toString());
            LOG.trace("Left: " + left + " - " + row.getXAdjustment() + " = " + (left - row.getXAdjustment()));
            left -= row.getXAdjustment();
            leftHandRows.add(row);
            leftCoordinates.add(new double[] { left });
        }
    }

    DBSCANClusterer<RowOfShapes> leftMarginClusterer = new DBSCANClusterer<RowOfShapes>(leftHandRows,
            leftCoordinates);
    Set<Set<RowOfShapes>> rowClustersLeft = leftMarginClusterer.cluster(averageShapeWidth,
            minCardinalityForMargin, true);

    TreeSet<Set<RowOfShapes>> orderedRowClustersLeft = new TreeSet<Set<RowOfShapes>>(
            new CardinalityComparator<RowOfShapes>());
    orderedRowClustersLeft.addAll(rowClustersLeft);

    i = 0;

    // find the left-most cluster with sufficient cardinality, and assume it's the left margin
    DescriptiveStatistics leftMarginStats = null;
    for (Set<RowOfShapes> cluster : orderedRowClustersLeft) {
        DescriptiveStatistics leftStats = new DescriptiveStatistics();
        for (RowOfShapes row : cluster)
            leftStats.addValue(row.getLeft() - row.getXAdjustment());

        LOG.debug("Cluster " + i + ". Cardinality=" + cluster.size());
        LOG.debug("Left mean : " + leftStats.getMean());
        LOG.debug("Left std dev: " + leftStats.getStandardDeviation());

        if (cluster.size() >= minCardinalityForMargin
                && (leftMarginStats == null || leftMarginStats.getMean() > leftStats.getMean())) {
            leftMarginStats = leftStats;
        }
        i++;
    }

    // see how many rows would violate this margin - if too many, assume no margin
    // these rows are only rows which extend across the margin
    if (leftMarginStats != null) {
        LOG.debug("Left margin mean : " + leftMarginStats.getMean());
        LOG.debug("Left margin std dev: " + leftMarginStats.getStandardDeviation());

        double leftMarginLimit = leftMarginStats.getMean() - sourceImage.getAverageShapeWidth();
        LOG.debug("leftMarginLimit: " + leftMarginLimit);
        int numRowsToChop = 0;
        for (RowOfShapes row : sourceImage.getRows()) {
            if (row.getLeft() <= leftLimit) {
                if (row.getLeft() - row.getXAdjustment() <= leftMarginLimit
                        && row.getRight() - row.getXAdjustment() >= leftMarginLimit) {
                    LOG.debug("Found overlapping row : " + row);
                    LOG.debug("Adjusted left : " + (row.getLeft() - row.getXAdjustment()));
                    numRowsToChop++;
                }
            }
        }
        if (numRowsToChop >= 3) {
            LOG.debug("Too many overlapping rows - ignoring margin");
            leftMarginStats = null;
        }
    }

    if (leftMarginStats != null) {
        double leftMarginLimit = leftMarginStats.getMean() - sourceImage.getAverageShapeWidth();
        List<RowOfShapes> rowsToRemove = new ArrayList<RowOfShapes>();
        for (RowOfShapes row : sourceImage.getRows()) {
            double left = row.getLeft() - row.getXAdjustment();
            LOG.trace(row.toString());
            LOG.trace("Adjusted left: " + left);

            if (left <= leftMarginLimit) {
                LOG.trace("Has out-of-margin stuff!");
                // need to chop off groups to the left of this threshold
                List<GroupOfShapes> groupsToChop = new ArrayList<GroupOfShapes>();
                for (GroupOfShapes group : row.getGroups()) {
                    if (group.getRight() - row.getXAdjustment() < leftMarginLimit) {
                        groupsToChop.add(group);
                        LOG.debug("Chopping group outside of left margin: " + group);
                    }
                }
                for (GroupOfShapes group : groupsToChop) {
                    row.getShapes().removeAll(group.getShapes());
                }
                row.getGroups().removeAll(groupsToChop);

                if (row.getGroups().size() == 0) {
                    LOG.debug("Removing empty " + row);
                    rowsToRemove.add(row);
                } else {
                    row.recalculate();
                    row.assignGuideLines();
                }
            } // does this row extend beyond the margin?
        } // next row
        sourceImage.getRows().removeAll(rowsToRemove);
    } // have a left margin
}

From source file:com.joliciel.jochre.graphics.SegmenterImpl.java

List<RowOfShapes> groupShapesIntoRows(SourceImage sourceImage, List<Shape> shapes, List<Rectangle> whiteAreas,
        boolean useSlope) {
    LOG.debug("########## groupShapesIntoRows #########");
    LOG.debug("useSlope? " + useSlope);

    List<RowOfShapes> rows = new ArrayList<RowOfShapes>();
    for (Shape shape : shapes)
        shape.setRow(null);/*from  w w  w.  j  a va  2 s .c om*/

    List<Shape> shapesToRemove = new ArrayList<Shape>();
    for (Shape shape : shapes) {
        for (Rectangle whiteArea : whiteAreas) {
            double whiteAreaRight = whiteArea.getRight();
            double whiteAreaLeft = whiteArea.getLeft();
            if (useSlope) {
                double xAdjustment = sourceImage.getXAdjustment(shape.getTop());

                whiteAreaRight += xAdjustment;
                whiteAreaLeft += xAdjustment;
            }

            if (whiteAreaRight > shape.getRight() && whiteAreaLeft < shape.getLeft()
                    && whiteArea.getTop() < shape.getTop() && whiteArea.getBottom() > shape.getBottom()) {
                // shape is surrounded
                shapesToRemove.add(shape);
                LOG.debug("Removing shape " + shape);
                LOG.debug("Surrounded by white area: " + whiteArea);
            }
        }
    }
    shapes.removeAll(shapesToRemove);

    // calculate the means
    // get average shape width & height
    DescriptiveStatistics shapeWidthStats = new DescriptiveStatistics();
    for (Shape shape : shapes) {
        shapeWidthStats.addValue(shape.getWidth());
    }
    double averageShapeWidth = shapeWidthStats.getPercentile(50);
    LOG.debug("averageShapeWidth: " + averageShapeWidth);

    // now, arrange the shapes in rows
    // we're guaranteed that no two shapes overlap at this point.
    // Now, it's possible that two shapes in the same line have no vertical overlap (e.g. a comma and an apostrophe)
    // so we have to go searching a bit further afield, say five shapes in each direction
    // but if we go too far, we may end up joining two lines together if the page isn't quite straight

    // let's begin with any old shape and find the shapes closest to it horizontally
    // e.g. up to 8 horizontal means to the right and left
    // as we find shapes that go with it, we add them to the same line
    int i = 0;
    int j = 0;
    int numberOfMeanWidthsForSearch = 8;
    LOG.debug("numberOfMeanWidthsForSearch: " + numberOfMeanWidthsForSearch);
    LOG.debug("search distance: " + averageShapeWidth * numberOfMeanWidthsForSearch);

    for (Shape shape : shapes) {
        if (shape.getRow() == null) {
            RowOfShapes row = graphicsService.getEmptyRow(sourceImage);
            row.addShape(shape);
            row.setIndex(j++);
            rows.add(row);
            LOG.trace("========= New row " + row.getIndex() + "============");
            LOG.trace("Adding " + shape + " to row " + row.getIndex());
        }
        int searchLeft = (int) ((double) shape.getLeft() - (numberOfMeanWidthsForSearch * averageShapeWidth));
        int searchRight = (int) ((double) shape.getRight() + (numberOfMeanWidthsForSearch * averageShapeWidth));
        LOG.trace("Shape " + i++ + ": " + shape + "(row " + shape.getRow().getIndex() + ")");
        LOG.trace("searchLeft: " + searchLeft);
        LOG.trace("searchRight: " + searchRight);

        // construct an array to represent where white areas overlap with the search area
        int[][] leftSearchArea = new int[shape.getLeft() - searchLeft][2];
        int[][] rightSearchArea = new int[searchRight - shape.getRight()][2];
        for (int k = 0; k < leftSearchArea.length; k++) {
            leftSearchArea[k][0] = shape.getTop();
            leftSearchArea[k][1] = shape.getBottom();
        }
        for (int k = 0; k < rightSearchArea.length; k++) {
            rightSearchArea[k][0] = shape.getTop();
            rightSearchArea[k][1] = shape.getBottom();
        }

        int newSearchLeft = searchLeft;
        int newSearchRight = searchRight;
        for (Rectangle whiteArea : whiteAreas) {
            double whiteAreaRight = whiteArea.getRight();
            double whiteAreaLeft = whiteArea.getLeft();
            if (useSlope) {
                double xAdjustment = sourceImage.getXAdjustment(shape.getTop());

                whiteAreaRight += xAdjustment;
                whiteAreaLeft += xAdjustment;
                LOG.trace(whiteArea + ", xAdjustment=" + xAdjustment + " , whiteAreaLeft=" + whiteAreaLeft
                        + " , whiteAreaRight=" + whiteAreaRight);
            }

            if (whiteAreaRight > newSearchLeft && whiteAreaLeft < shape.getLeft()
                    && whiteArea.getTop() <= shape.getBottom() && whiteArea.getBottom() >= shape.getTop()) {

                LOG.trace("overlap on left with: " + whiteArea.toString());

                if (whiteArea.getTop() <= shape.getTop() && whiteArea.getBottom() >= shape.getBottom()
                        && whiteAreaRight > newSearchLeft) {
                    newSearchLeft = (int) Math.round(whiteAreaRight);
                    LOG.trace("Complete, newSearchLeft = " + newSearchLeft);
                } else {
                    LOG.trace("Partial, starting at " + whiteArea.getRight());
                    for (int k = whiteArea.getRight() - searchLeft; k >= 0; k--) {
                        if (k < leftSearchArea.length) {
                            if (whiteArea.getBottom() < shape.getBottom()
                                    && leftSearchArea[k][0] < whiteArea.getBottom())
                                leftSearchArea[k][0] = whiteArea.getBottom() + 1;
                            else if (whiteArea.getTop() > shape.getTop()
                                    && leftSearchArea[k][1] > whiteArea.getTop())
                                leftSearchArea[k][1] = whiteArea.getTop() - 1;

                            if (leftSearchArea[k][0] >= leftSearchArea[k][1]
                                    && searchLeft + k > newSearchLeft) {
                                newSearchLeft = searchLeft + k;
                                LOG.trace("Complete from " + newSearchLeft);
                                break;
                            }
                        }
                    }
                    //                  if (LOG.isTraceEnabled()) {
                    //                     StringBuilder sb = new StringBuilder();
                    //                     for (int k=0;k<leftSearchArea.length;k++) {
                    //                        String top = "" + (leftSearchArea[k][0]-shape.getTop());
                    //                        sb.append(String.format("%1$#" + 3 + "s", top)+ ",");
                    //                     }
                    //                     LOG.trace(sb.toString());
                    //                     sb = new StringBuilder();
                    //                     for (int k=0;k<leftSearchArea.length;k++) {
                    //                        String bottom = "" + (leftSearchArea[k][1]-shape.getTop());
                    //                        sb.append(String.format("%1$#" + 3 + "s", bottom)+ ",");
                    //                     }
                    //                     LOG.trace(sb.toString());
                    //                  }
                }
            } else if (whiteAreaLeft < newSearchRight && whiteAreaRight > shape.getRight()
                    && whiteArea.getTop() <= shape.getBottom() && whiteArea.getBottom() >= shape.getTop()) {
                LOG.trace("overlap on right with: " + whiteArea.toString());

                if (whiteArea.getTop() <= shape.getTop() && whiteArea.getBottom() >= shape.getBottom()
                        && newSearchRight > whiteAreaLeft) {
                    newSearchRight = (int) Math.round(whiteAreaLeft);
                    LOG.trace("Complete, newSearchRight = " + newSearchRight);

                } else {
                    LOG.trace("Partial, starting at " + whiteArea.getLeft());
                    for (int k = whiteArea.getLeft() - shape.getRight(); k < rightSearchArea.length; k++) {
                        if (k > 0 && k < leftSearchArea.length && k < rightSearchArea.length) {
                            if (whiteArea.getBottom() < shape.getBottom()
                                    && leftSearchArea[k][0] < whiteArea.getBottom())
                                rightSearchArea[k][0] = whiteArea.getBottom() + 1;
                            else if (whiteArea.getTop() > shape.getTop()
                                    && leftSearchArea[k][1] > whiteArea.getTop())
                                rightSearchArea[k][1] = whiteArea.getTop() - 1;

                            if (rightSearchArea[k][0] >= rightSearchArea[k][1]
                                    && newSearchRight > shape.getRight() + k) {
                                newSearchRight = shape.getRight() + k;
                                LOG.trace("Complete from " + newSearchRight);
                                break;
                            }
                        }
                    }
                    //                  if (LOG.isTraceEnabled()) {
                    //                     StringBuilder sb = new StringBuilder();
                    //                     for (int k=0;k<rightSearchArea.length;k++) {
                    //                        String top = "" + (rightSearchArea[k][0]-shape.getTop());
                    //                        sb.append(String.format("%1$#" + 3 + "s", top)+ ",");
                    //                     }
                    //                     LOG.trace(sb.toString());
                    //                     sb = new StringBuilder();
                    //                     for (int k=0;k<rightSearchArea.length;k++) {
                    //                        String bottom = "" + (rightSearchArea[k][1]-shape.getTop());
                    //                        sb.append(String.format("%1$#" + 3 + "s", bottom)+ ",");
                    //                     }
                    //                     LOG.trace(sb.toString());
                    //                  }
                }
            }
        }
        LOG.trace("searchLeft adjusted for white columns: " + newSearchLeft);
        LOG.trace("searchRight adjusted for white columns: " + newSearchRight);

        // min 10% overlap to assume same row
        double minOverlap = 0.10;

        for (Shape otherShape : shapes) {
            boolean haveSomeOverlap = false;
            if (!shape.getRow().equals(otherShape.getRow()) && !otherShape.equals(shape)) {

                // shapes are arranged from the top down
                if (otherShape.getTop() > shape.getBottom()) {
                    break;
                }

                if (otherShape.getRight() > newSearchLeft && otherShape.getRight() < shape.getLeft()
                        && otherShape.getTop() <= shape.getBottom()
                        && otherShape.getBottom() >= shape.getTop()) {
                    int k = otherShape.getRight() - searchLeft;
                    if (otherShape.getTop() <= leftSearchArea[k][1]
                            && otherShape.getBottom() >= leftSearchArea[k][0])
                        haveSomeOverlap = true;
                } else if (otherShape.getLeft() < newSearchRight && otherShape.getLeft() > shape.getRight()
                        && otherShape.getTop() <= shape.getBottom()
                        && otherShape.getBottom() >= shape.getTop()) {
                    int k = otherShape.getLeft() - shape.getRight();
                    if (otherShape.getTop() <= rightSearchArea[k][1]
                            && otherShape.getBottom() >= rightSearchArea[k][0])
                        haveSomeOverlap = true;
                }
                if (haveSomeOverlap) {
                    int overlap1 = shape.getBottom() - otherShape.getTop() + 1;
                    int overlap2 = otherShape.getBottom() - shape.getTop() + 1;
                    int overlap = overlap1 < overlap2 ? overlap1 : overlap2;
                    boolean addShapeToRow = false;
                    if ((((double) overlap / (double) shape.getHeight()) > minOverlap)
                            || (((double) overlap / (double) otherShape.getHeight()) > minOverlap)) {
                        addShapeToRow = true;
                    }

                    if (addShapeToRow) {
                        LOG.debug("Adding " + otherShape + " to row " + shape.getRow().getIndex());
                        if (otherShape.getRow() == null) {
                            shape.getRow().addShape(otherShape);
                        } else {
                            // two rows need to be merged
                            LOG.debug("========= Merge rows " + shape.getRow().getIndex() + " with "
                                    + otherShape.getRow().getIndex() + "==========");
                            RowOfShapes otherRow = otherShape.getRow();
                            shape.getRow().addShapes(otherRow.getShapes());
                            rows.remove(otherRow);
                        }
                    }
                } // add shape to row ?
            } // should shape be considered?
        } // next other shape
    } // next shape

    return rows;
}

From source file:com.joliciel.jochre.graphics.SegmenterImpl.java

/**
 * We attempt to remove specks, where a speck is defined as 
 * a relatively small shape at a relatively large distance from other shapes.
 * @param sourceImage//from ww w .  j ava 2 s .  c om
 */
void removeSpecks(SourceImage sourceImage, List<Shape> shapes) {
    LOG.debug("########## removeSpecks #########");

    DescriptiveStatistics shapeWidthStats = new DescriptiveStatistics();
    DescriptiveStatistics shapeHeightStats = new DescriptiveStatistics();

    for (Shape shape : shapes) {
        shapeWidthStats.addValue(shape.getWidth());
        shapeHeightStats.addValue(shape.getHeight());
    }

    double shapeWidthMedian = shapeWidthStats.getPercentile(65);
    double shapeHeightMedian = shapeHeightStats.getPercentile(65);
    LOG.debug("meanShapeWidth: " + shapeWidthMedian);
    LOG.debug("meanShapeHeight: " + shapeHeightMedian);

    int maxSpeckHeightFloor = (int) Math.ceil(shapeHeightMedian / 6.0);
    int maxSpeckWidthFloor = (int) Math.ceil(shapeWidthMedian / 6.0);
    int maxSpeckHeightCeiling = maxSpeckHeightFloor * 2;
    int maxSpeckWidthCeiling = maxSpeckWidthFloor * 2;

    int speckXDistanceThresholdFloor = (int) Math.floor(shapeWidthMedian);
    int speckYDistanceThresholdFloor = (int) Math.floor(shapeHeightMedian / 4.0);
    int speckXDistanceThresholdCeiling = speckXDistanceThresholdFloor * 2;
    int speckYDistanceThresholdCeiling = speckYDistanceThresholdFloor * 2;

    LOG.debug("maxSpeckHeightFloor=" + maxSpeckHeightFloor);
    LOG.debug("maxSpeckWidthFloor=" + maxSpeckWidthFloor);
    LOG.debug("speckXDistanceThresholdFloor=" + speckXDistanceThresholdFloor);
    LOG.debug("speckYDistanceThresholdFloor=" + speckYDistanceThresholdFloor);
    LOG.debug("maxSpeckHeightCeiling=" + maxSpeckHeightCeiling);
    LOG.debug("maxSpeckWidthCeiling=" + maxSpeckWidthCeiling);
    LOG.debug("speckXDistanceThresholdCeiling=" + speckXDistanceThresholdCeiling);
    LOG.debug("speckYDistanceThresholdCeiling=" + speckYDistanceThresholdCeiling);

    List<Shape> specks = new ArrayList<Shape>();
    List<double[]> speckCoordinates = new ArrayList<double[]>();
    for (Shape shape : shapes) {
        if (shape.getHeight() < maxSpeckHeightCeiling && shape.getWidth() < maxSpeckWidthCeiling) {
            specks.add(shape);
            speckCoordinates.add(shape.getCentrePoint());
        }
    }

    // group the specks into clusters, which will be added or removed as a whole
    // Note that a cluster could be a valid diacritic that's split into a few specks
    // or just a bunch of specks off on their own
    DBSCANClusterer<Shape> clusterer = new DBSCANClusterer<Shape>(specks, speckCoordinates);
    Set<Set<Shape>> speckClusters = clusterer.cluster(speckXDistanceThresholdFloor, 2, true);
    List<Shape> specksToRemove = new ArrayList<Shape>();
    for (Set<Shape> speckCluster : speckClusters) {

        int speckHeight = 0;
        int speckWidth = 0;
        int clusterTop = -1;
        int clusterBottom = -1;
        int clusterRight = -1;
        int clusterLeft = -1;
        for (Shape speck : speckCluster) {
            LOG.debug("Speck?, " + speck);
            if (speck.getWidth() > speckWidth)
                speckWidth = speck.getWidth();
            if (speck.getHeight() > speckHeight)
                speckHeight = speck.getHeight();

            if (clusterTop < 0 || speck.getTop() < clusterTop)
                clusterTop = speck.getTop();
            if (clusterLeft < 0 || speck.getLeft() < clusterLeft)
                clusterLeft = speck.getLeft();
            if (speck.getBottom() > clusterBottom)
                clusterBottom = speck.getBottom();
            if (speck.getRight() > clusterRight)
                clusterRight = speck.getRight();

        }

        boolean useWidth = speckWidth > speckHeight;
        double scale = 1.0;
        if (useWidth)
            scale = speckWidth < maxSpeckWidthFloor ? 0.0
                    : (speckWidth > maxSpeckWidthCeiling ? 1.0
                            : ((double) speckWidth - maxSpeckWidthFloor)
                                    / (maxSpeckWidthCeiling - maxSpeckWidthFloor));
        else
            scale = speckHeight < maxSpeckHeightFloor ? 0.0
                    : (speckHeight > maxSpeckHeightCeiling ? 1.0
                            : ((double) speckHeight - maxSpeckHeightFloor)
                                    / (maxSpeckHeightCeiling - maxSpeckHeightFloor));

        int speckXDistanceThreshold = (int) Math.ceil(speckXDistanceThresholdFloor
                + scale * (speckXDistanceThresholdCeiling - speckXDistanceThresholdFloor));
        int speckYDistanceThreshold = (int) Math.ceil(speckYDistanceThresholdFloor
                + scale * (speckYDistanceThresholdCeiling - speckYDistanceThresholdFloor));

        LOG.debug("speckHeight=" + speckHeight);
        LOG.debug("speckWidth=" + speckWidth);
        LOG.debug("speckXDistanceThreshold=" + speckXDistanceThreshold);
        LOG.debug("speckYDistanceThreshold=" + speckYDistanceThreshold);

        Shape nearestShape = null;
        double minDistance = 0.0;
        int nearestShapeXDiff = 0;
        int nearestShapeYDiff = 0;

        for (Shape otherShape : shapes) {
            // limit to nearby shapes
            if (otherShape.getTop() > clusterBottom + speckYDistanceThreshold + 1)
                break;
            if (otherShape.getBottom() < clusterTop - speckYDistanceThreshold - 1)
                continue;
            if (otherShape.getRight() < clusterLeft - speckXDistanceThreshold - 1)
                continue;
            if (otherShape.getLeft() > clusterRight + speckXDistanceThreshold + 1)
                continue;

            // Note: tried !specks.contains(otherShape), but sometimes we have a valid case
            // where a diacritic is "split" into two specks
            if (!specks.contains(otherShape)) {
                int xDiff = 0;
                int yDiff = 0;
                int leftDiff = 0;
                int rightDiff = 0;
                int topDiff = 0;
                int botDiff = 0;

                if (otherShape.getLeft() <= clusterRight && otherShape.getRight() >= clusterLeft) {
                    xDiff = 0;
                } else {
                    leftDiff = Math.abs(clusterLeft - otherShape.getRight());
                    rightDiff = Math.abs(clusterRight - otherShape.getLeft());
                    xDiff = (leftDiff < rightDiff) ? leftDiff : rightDiff;
                }

                if (otherShape.getTop() <= clusterBottom && otherShape.getBottom() >= clusterTop) {
                    yDiff = 0;
                } else {
                    int nearestTop = (otherShape.getTop() > otherShape.getTop() + otherShape.getMeanLine())
                            ? otherShape.getTop() + otherShape.getMeanLine()
                            : otherShape.getTop();
                    int nearestBot = (otherShape.getBottom() < otherShape.getTop() + otherShape.getBaseLine())
                            ? otherShape.getTop() + otherShape.getBaseLine()
                            : otherShape.getBottom();
                    topDiff = Math.abs(clusterTop - nearestBot);
                    botDiff = Math.abs(clusterBottom - nearestTop);
                    yDiff = (topDiff < botDiff) ? topDiff : botDiff;
                }

                double distance = Math.sqrt((xDiff * xDiff) + (yDiff * yDiff));

                if (nearestShape == null || distance < minDistance) {
                    nearestShape = otherShape;
                    minDistance = distance;
                    nearestShapeXDiff = xDiff;
                    nearestShapeYDiff = yDiff;
                    LOG.trace("leftDiff=" + leftDiff + ", rightDiff=" + rightDiff);
                    LOG.trace("topDiff=" + topDiff + ", botDiff=" + botDiff);
                } // found closer shape?
            } // is this the speck?
        } // loop shapes around the reference shape

        if (nearestShape != null) {
            LOG.trace("Nearest shape, top(" + nearestShape.getTop() + ") " + "left(" + nearestShape.getLeft()
                    + ") " + "bot(" + nearestShape.getBottom() + ") " + "right(" + nearestShape.getRight()
                    + ")");
            LOG.trace("Distance=" + minDistance + ", xDiff=" + nearestShapeXDiff + ", yDiff="
                    + nearestShapeYDiff);
        }
        boolean removeSpecks = false;
        if (nearestShape == null)
            removeSpecks = true;
        else {
            // calculate the shortest distance from the nearest shape to the speck cluster
            for (Shape speck : speckCluster) {
                int xDiff = 0;
                int yDiff = 0;
                int leftDiff = 0;
                int rightDiff = 0;
                int topDiff = 0;
                int botDiff = 0;

                if (nearestShape.getLeft() <= speck.getRight() && nearestShape.getRight() >= speck.getLeft()) {
                    xDiff = 0;
                } else {
                    leftDiff = Math.abs(speck.getLeft() - nearestShape.getRight());
                    rightDiff = Math.abs(speck.getRight() - nearestShape.getLeft());
                    xDiff = (leftDiff < rightDiff) ? leftDiff : rightDiff;
                }

                if (nearestShape.getTop() <= speck.getBottom() && nearestShape.getBottom() >= speck.getTop()) {
                    yDiff = 0;
                } else {
                    int nearestTop = (nearestShape.getTop() > nearestShape.getTop()
                            + nearestShape.getMeanLine()) ? nearestShape.getTop() + nearestShape.getMeanLine()
                                    : nearestShape.getTop();
                    int nearestBot = (nearestShape.getBottom() < nearestShape.getTop()
                            + nearestShape.getBaseLine()) ? nearestShape.getTop() + nearestShape.getBaseLine()
                                    : nearestShape.getBottom();
                    topDiff = Math.abs(speck.getTop() - nearestBot);
                    botDiff = Math.abs(speck.getBottom() - nearestTop);
                    yDiff = (topDiff < botDiff) ? topDiff : botDiff;
                }

                double distance = Math.sqrt((xDiff * xDiff) + (yDiff * yDiff));

                if (distance < minDistance) {
                    minDistance = distance;
                    nearestShapeXDiff = xDiff;
                    nearestShapeYDiff = yDiff;
                    LOG.debug("Found closer speck:");
                    LOG.debug("leftDiff=" + leftDiff + ", rightDiff=" + rightDiff);
                    LOG.debug("topDiff=" + topDiff + ", botDiff=" + botDiff);
                } // found closer shape?
            }
            // Then, for all of these specks, find the one that's closest to the nearest non-speck
            // if this distance > threshold, get rid of all of 'em
            // otherwise, keep 'em all
            if (nearestShapeXDiff > speckXDistanceThreshold || nearestShapeYDiff > speckYDistanceThreshold)
                removeSpecks = true;
        }
        if (removeSpecks) {
            for (Shape otherSpeck : speckCluster) {
                LOG.debug("Removing speck " + otherSpeck);
                specksToRemove.add(otherSpeck);
            }
        }
    } // next speck

    shapes.removeAll(specksToRemove);
}

From source file:com.joliciel.jochre.graphics.SegmenterImpl.java

/**
 * Detects paragraph splits and assign rows to correct paragraphs.
 * @param sourceImage/*from  www  . java 2s. co m*/
 */
void groupRowsIntoParagraphs(SourceImage sourceImage) {
    LOG.debug("########## groupRowsIntoParagraphs #########");
    // We'll use various possible indicators, including
    // indented start, indented end, and spacing between rows.

    // On pages with a single big paragraph makes it hypersensitive to differences in row-start/row-end
    // This means we cannot use deviation. Instead, we use the average shape width on the page.
    // We also adjust maxLeft & minRight to match the vertical line slope

    // This is now complicated by the possibility of multiple columns

    // Need to take into account a big horizontal space - Pietrushka page 14
    // Find horizontal spaces that go all the way across and are wider than a certain threshold
    // simply do a boolean column and black out everything in a row, than see if there are any remaining spaces above a certain threshold
    // Columns are thus arranged into "areas", separated by white-space.
    boolean[] fullRows = new boolean[sourceImage.getHeight()];
    for (RowOfShapes row : sourceImage.getRows()) {
        for (int y = row.getTop(); y <= row.getBottom(); y++) {
            fullRows[y] = true;
        }
    }
    DescriptiveStatistics rowHeightStats = new DescriptiveStatistics();

    for (RowOfShapes row : sourceImage.getRows()) {
        int height = row.getXHeight();
        rowHeightStats.addValue(height);
    }
    double avgRowHeight = rowHeightStats.getPercentile(50);
    LOG.debug("meanRowHeight: " + avgRowHeight);
    double minHeightForWhiteSpace = avgRowHeight * 1.3;
    LOG.debug("minHeightForWhiteSpace: " + minHeightForWhiteSpace);

    // find the "white rows" - any horizontal white space
    // in the page which is sufficiently high
    List<int[]> whiteRows = new ArrayList<int[]>();
    boolean inWhite = false;
    int startWhite = 0;
    for (int y = 0; y < sourceImage.getHeight(); y++) {
        if (!inWhite && !fullRows[y]) {
            inWhite = true;
            startWhite = y;
        } else if (inWhite && fullRows[y]) {
            int length = y - startWhite;
            if (length > minHeightForWhiteSpace) {
                LOG.debug("Adding whiteRow " + startWhite + "," + (y - 1));
                whiteRows.add(new int[] { startWhite, y - 1 });
            }
            inWhite = false;
        }
    }
    if (inWhite)
        whiteRows.add(new int[] { startWhite, sourceImage.getHeight() - 1 });
    whiteRows.add(new int[] { sourceImage.getHeight(), sourceImage.getHeight() });

    // place rows in "areas" defined by the "white rows" found above
    List<List<RowOfShapes>> areas = new ArrayList<List<RowOfShapes>>();
    int startY = -1;
    for (int[] whiteRow : whiteRows) {
        List<RowOfShapes> area = new ArrayList<RowOfShapes>();
        for (RowOfShapes row : sourceImage.getRows()) {
            if (row.getTop() >= startY && row.getBottom() <= whiteRow[0]) {
                area.add(row);
            }
        }
        if (area.size() > 0) {
            areas.add(area);
        }
        startY = whiteRow[1];
    }

    // break up each area into vertical columns
    LOG.debug("break up each area into vertical columns");
    List<Column> columns = new ArrayList<Column>();
    List<List<Column>> columnsPerAreaList = new ArrayList<List<Column>>();
    for (List<RowOfShapes> area : areas) {
        LOG.debug("Next area");
        List<Column> columnsPerArea = new ArrayList<SegmenterImpl.Column>();
        columnsPerAreaList.add(columnsPerArea);
        TreeSet<RowOfShapes> rows = new TreeSet<RowOfShapes>(new RowOfShapesVerticalLocationComparator());
        rows.addAll(area);
        for (RowOfShapes row : rows) {
            // try to place this row in one of the columns directly above it.
            // this means that a row which overlaps more than one column has to "close" this column, so it is no longer considered
            List<Column> overlappingColumns = new ArrayList<Column>();
            for (Column column : columnsPerArea) {
                if (!column.closed) {
                    RowOfShapes lastRowInColumn = column.get(column.size() - 1);
                    if (row.getRight() - row.getXAdjustment() >= lastRowInColumn.getLeft()
                            - lastRowInColumn.getXAdjustment()
                            && row.getLeft() - row.getXAdjustment() <= lastRowInColumn.getRight()
                                    - lastRowInColumn.getXAdjustment()) {
                        overlappingColumns.add(column);
                    }
                }
            }
            if (overlappingColumns.size() == 1) {
                Column myColumn = overlappingColumns.get(0);
                RowOfShapes lastRowInMyColumn = myColumn.get(0);

                // close any columns that are now at a distance of more than one row
                for (Column column : columnsPerArea) {
                    if (!column.closed && !column.equals(myColumn)) {
                        RowOfShapes lastRowInColumn = column.get(column.size() - 1);
                        if (lastRowInMyColumn.getTop() > lastRowInColumn.getBottom()) {
                            column.closed = true;
                            LOG.debug("Closing distant column " + lastRowInColumn);
                        }
                    }
                }

                myColumn.add(row);
                LOG.debug(row.toString());
                LOG.debug("  added to column " + lastRowInMyColumn);
            } else {
                for (Column overlappingColumn : overlappingColumns) {
                    overlappingColumn.closed = true;
                    RowOfShapes lastRowInColumn = overlappingColumn.get(overlappingColumn.size() - 1);
                    LOG.debug("Closing overlapping column " + lastRowInColumn);
                }
                Column myColumn = new Column(sourceImage);
                myColumn.add(row);
                LOG.debug("Found new column");
                LOG.debug(row.toString());
                columns.add(myColumn);
                columnsPerArea.add(myColumn);
            }
        }
    } // next area

    for (Column column : columns)
        column.recalculate();

    // Intermediate step to reform the vertical columns, if they exist
    // basically the idea is that if the columns are aligned vertically, then the thresholds for paragraph indents
    // should be shared, to increase the statistical sample size and reduce anomalies.
    // We'll assume that two columns from two consecutive areas are in the same vertical group if they overlap with each other horizontally
    // and don't overlap with any other column in the other column's area.
    List<List<Column>> columnGroups = new ArrayList<List<Column>>();
    List<Column> columnsInPrevArea = null;
    for (List<Column> columnsPerArea : columnsPerAreaList) {
        if (columnsInPrevArea != null) {
            for (Column prevColumn : columnsInPrevArea) {
                LOG.debug("Checking " + prevColumn);
                // find the column group containing the previous column
                List<Column> myColumnGroup = null;
                for (List<Column> columnGroup : columnGroups) {
                    if (columnGroup.contains(prevColumn)) {
                        myColumnGroup = columnGroup;
                        break;
                    }
                }
                if (myColumnGroup == null) {
                    myColumnGroup = new ArrayList<SegmenterImpl.Column>();
                    LOG.debug("Creating column group for column " + prevColumn.toString());
                    columnGroups.add(myColumnGroup);
                    myColumnGroup.add(prevColumn);
                }

                // does only one column overlap with this one?
                Column overlappingColumn = null;
                for (Column column : columnsPerArea) {
                    if (column.adjustedRight >= prevColumn.adjustedLeft
                            && column.adjustedLeft <= prevColumn.adjustedRight) {
                        if (overlappingColumn == null) {
                            LOG.debug("I overlap with " + column);

                            overlappingColumn = column;
                        } else {
                            LOG.debug("But I overlap also with " + column);

                            overlappingColumn = null;
                            break;
                        }
                    }
                }
                if (overlappingColumn != null) {
                    // does it overlap with only me?
                    for (Column otherPrevColumn : columnsInPrevArea) {
                        if (otherPrevColumn.equals(prevColumn))
                            continue;
                        if (overlappingColumn.adjustedRight >= otherPrevColumn.adjustedLeft
                                && overlappingColumn.adjustedLeft <= otherPrevColumn.adjustedRight) {
                            LOG.debug("But it overlaps also with " + otherPrevColumn);
                            overlappingColumn = null;
                            break;
                        }
                    }
                }
                if (overlappingColumn != null) {
                    myColumnGroup.add(overlappingColumn);
                    LOG.debug("Adding " + overlappingColumn);
                    LOG.debug(" to group with " + prevColumn);
                }

            } // next previous column
        } // have previous columns
        columnsInPrevArea = columnsPerArea;
    } // next area
    if (columnsInPrevArea != null) {
        for (Column prevColumn : columnsInPrevArea) {
            // find the column group containing the previous column
            List<Column> myColumnGroup = null;
            for (List<Column> columnGroup : columnGroups) {
                if (columnGroup.contains(prevColumn)) {
                    myColumnGroup = columnGroup;
                    break;
                }
            }
            if (myColumnGroup == null) {
                myColumnGroup = new ArrayList<SegmenterImpl.Column>();
                LOG.debug("Creating column group for column " + prevColumn.toString());
                columnGroups.add(myColumnGroup);
                myColumnGroup.add(prevColumn);
            }
        }
    }

    // What we really want here is, for each column (in the case of right-to-left),
    // two clusters on the right
    // and one relatively big cluster on the left.
    // anything outside of the cluster on the left is an EOP.
    boolean hasTab = false;
    for (List<Column> columnGroup : columnGroups) {
        LOG.debug("Next column group");
        double averageShapeWidth = sourceImage.getAverageShapeWidth();
        LOG.debug("averageShapeWidth: " + averageShapeWidth);
        double epsilon = averageShapeWidth / 2.0;
        LOG.debug("epsilon: " + epsilon);

        int columnGroupTop = sourceImage.getHeight();
        int columnGroupBottom = 0;
        int columnGroupLeft = sourceImage.getWidth();
        int columnGroupRight = 0;
        for (Column column : columnGroup) {
            if (column.top < columnGroupTop)
                columnGroupTop = (int) Math.round(column.top);
            if (column.bottom > columnGroupBottom)
                columnGroupBottom = (int) Math.round(column.bottom);
            if (column.adjustedLeft < columnGroupLeft)
                columnGroupLeft = (int) Math.round(column.adjustedLeft);
            if (column.adjustedRight > columnGroupRight)
                columnGroupRight = (int) Math.round(column.adjustedRight);
        }

        // right thresholds
        LOG.debug("Calculating right thresholds");

        // first, create a DBScan cluster of all rows by their adjusted right coordinate
        List<RowOfShapes> rightHandRows = new ArrayList<RowOfShapes>();
        List<double[]> rightCoordinates = new ArrayList<double[]>();

        for (Column column : columnGroup) {
            for (RowOfShapes row : column) {
                double right = row.getRight() - row.getXAdjustment();
                //               double rightOverlap = this.findLargeShapeOverlapOnRight(row, column, sourceImage);
                //               if (rightOverlap==0) {
                //                  // leave out any right-overlapping rows here
                //                  // since we need accurate statistics for margin detection
                //               // This is questionable - especially since a long vertical bar (see Petriushka)
                //               // tends to give all rows a left overlap. Also, because the overlap is calculated based
                //               // on the mean right & mean left, not based on any sort of margin clusters.
                //                  rightHandRows.add(row);
                //                  rightCoordinates.add(new double[] {right});
                //               }
                rightHandRows.add(row);
                rightCoordinates.add(new double[] { right });

            }
        }

        int minCardinalityForRightMargin = 5;
        DBSCANClusterer<RowOfShapes> rightMarginClusterer = new DBSCANClusterer<RowOfShapes>(rightHandRows,
                rightCoordinates);
        Set<Set<RowOfShapes>> rowClusters = rightMarginClusterer.cluster(epsilon, minCardinalityForRightMargin,
                true);

        TreeSet<Set<RowOfShapes>> orderedRowClusters = new TreeSet<Set<RowOfShapes>>(
                new CardinalityComparator<RowOfShapes>());
        orderedRowClusters.addAll(rowClusters);

        int i = 0;

        // find the two right-most clusters, and assume they are the margin & the tab
        DescriptiveStatistics rightMarginStats = null;
        DescriptiveStatistics rightTabStats = null;
        for (Set<RowOfShapes> cluster : orderedRowClusters) {
            DescriptiveStatistics rightStats = new DescriptiveStatistics();
            MeanAbsoluteDeviation rightDev = new MeanAbsoluteDeviation();
            for (RowOfShapes row : cluster) {
                int rowIndex = rightHandRows.indexOf(row);
                double right = rightCoordinates.get(rowIndex)[0];
                rightStats.addValue(right);
                rightDev.increment(right);
            }

            LOG.debug("Cluster " + i + ". Cardinality=" + cluster.size());
            LOG.debug("Right mean : " + rightStats.getMean());
            LOG.debug("Right dev: " + rightDev.getResult());

            if (cluster.size() >= minCardinalityForRightMargin) {
                if (rightMarginStats == null || rightMarginStats.getMean() < rightStats.getMean()) {
                    if (rightMarginStats != null)
                        rightTabStats = rightMarginStats;
                    rightMarginStats = rightStats;
                } else if (rightTabStats == null || rightTabStats.getMean() < rightStats.getMean()) {
                    rightTabStats = rightStats;
                }
            } else {
                break;
            }
            i++;
        } // next right-coordinate cluster

        double rightMargin = sourceImage.getWidth();
        double rightTab = sourceImage.getWidth();
        if (rightMarginStats != null) {
            rightMargin = rightMarginStats.getMean();
        } else {
            List<Rectangle> columnSeparators = sourceImage.findColumnSeparators();
            for (Rectangle columnSeparator : columnSeparators) {
                if (columnSeparator.getTop() <= columnGroupTop
                        && columnSeparator.getBottom() >= columnGroupBottom
                        && columnSeparator.getLeft() >= columnGroupRight) {
                    if (columnSeparator.getLeft() < rightMargin)
                        rightMargin = columnSeparator.getLeft();
                }
            }
        }
        if (rightTabStats != null) {
            rightTab = rightTabStats.getMean();
        }

        LOG.debug("rightMargin: " + rightMargin);
        LOG.debug("rightTab: " + rightTab);

        // left thresholds
        LOG.debug("Calculating left thresholds");

        // first, create a DBScan cluster of all rows by their adjusted left coordinate
        List<RowOfShapes> leftHandRows = new ArrayList<RowOfShapes>();
        List<double[]> leftCoordinates = new ArrayList<double[]>();

        for (Column column : columnGroup) {
            for (RowOfShapes row : column) {
                double left = row.getLeft() - row.getXAdjustment();
                //               double leftOverlap = this.findLargeShapeOverlapOnLeft(row, column, sourceImage);
                //               if (leftOverlap == 0) {
                //                  // leave out any overlapping rows from margin calcs,
                //                  // since we need accurate statistics here
                //                  leftHandRows.add(row);
                //                  leftCoordinates.add(new double[] {left});
                //               }
                leftHandRows.add(row);
                leftCoordinates.add(new double[] { left });
            }
        }

        int minCardinalityForLeftMargin = 5;
        DBSCANClusterer<RowOfShapes> leftMarginClusterer = new DBSCANClusterer<RowOfShapes>(leftHandRows,
                leftCoordinates);
        Set<Set<RowOfShapes>> leftRowClusters = leftMarginClusterer.cluster(epsilon,
                minCardinalityForLeftMargin, true);

        TreeSet<Set<RowOfShapes>> orderedLeftRowClusters = new TreeSet<Set<RowOfShapes>>(
                new CardinalityComparator<RowOfShapes>());
        orderedLeftRowClusters.addAll(leftRowClusters);

        i = 0;

        // find the two left-most clusters, and assume they are the margin & the tab
        DescriptiveStatistics leftMarginStats = null;
        DescriptiveStatistics leftTabStats = null;
        for (Set<RowOfShapes> cluster : orderedLeftRowClusters) {
            DescriptiveStatistics leftStats = new DescriptiveStatistics();
            MeanAbsoluteDeviation leftDev = new MeanAbsoluteDeviation();
            for (RowOfShapes row : cluster) {
                int rowIndex = leftHandRows.indexOf(row);
                double left = leftCoordinates.get(rowIndex)[0];
                leftStats.addValue(left);
                leftDev.increment(left);
            }

            LOG.debug("Cluster " + i + ". Cardinality=" + cluster.size());
            LOG.debug("Left mean : " + leftStats.getMean());
            LOG.debug("Left dev: " + leftDev.getResult());

            if (cluster.size() >= minCardinalityForLeftMargin) {
                if (leftMarginStats == null || leftMarginStats.getMean() > leftStats.getMean()) {
                    if (leftMarginStats != null)
                        leftTabStats = leftMarginStats;
                    leftMarginStats = leftStats;
                } else if (leftTabStats == null || leftTabStats.getMean() > leftStats.getMean()) {
                    leftTabStats = leftStats;
                }
            } else {
                break;
            }
            i++;
        } // next left-coordinate cluster

        double leftMargin = 0;
        double leftTab = 0;
        if (leftMarginStats != null) {
            leftMargin = leftMarginStats.getMean();
        } else {
            List<Rectangle> columnSeparators = sourceImage.findColumnSeparators();
            for (Rectangle columnSeparator : columnSeparators) {
                if (columnSeparator.getTop() <= columnGroupTop
                        && columnSeparator.getBottom() >= columnGroupBottom
                        && columnSeparator.getRight() <= columnGroupLeft) {
                    if (columnSeparator.getRight() > leftMargin)
                        leftMargin = columnSeparator.getRight();
                }
            }
        }
        if (leftTabStats != null) {
            leftTab = leftTabStats.getMean();
        }

        LOG.debug("leftMargin: " + leftMargin);
        LOG.debug("leftTab: " + leftTab);

        for (Column column : columnGroup) {
            if (sourceImage.isLeftToRight()) {
                column.startMargin = leftMargin;
                if (leftTabStats != null) {
                    column.startTab = leftTab;
                    column.hasTab = true;
                } else {
                    LOG.debug("No left tab - setting based on left margin");
                    column.startTab = leftMargin + (5.0 * sourceImage.getAverageShapeWidth());
                    column.hasTab = false;
                }

                column.endMargin = rightMargin;
            } else {
                column.startMargin = rightMargin;
                if (rightTabStats != null) {
                    column.startTab = rightTab;
                    column.hasTab = true;
                } else {
                    LOG.debug("No right tab - setting based on right margin");
                    column.startTab = rightMargin - (5.0 * sourceImage.getAverageShapeWidth());
                    column.hasTab = false;
                }

                column.endMargin = leftMargin;
            }
            LOG.debug("Margins for " + column);
            LOG.debug("startMargin: " + column.startMargin);
            LOG.debug("startTab: " + column.startTab);
            LOG.debug("endMargin: " + column.endMargin);
        } // next column
    } // next column group
    LOG.debug("hasTab: " + hasTab);

    double safetyMargin = 1.5 * sourceImage.getAverageShapeWidth();

    // Now, paragraphs are either "indented", "outdented" or not "dented" at all (no tabs).
    // This applies to the entire page.
    // To recognise indenting vs. outdenting, we have to see if the row preceding each
    // indent/outdent is full or partial. In the case of indentation, partial rows will
    // typically be followed by an indent. In the case of outdentation, partial rows will
    // typically be followed by an outdent.
    boolean isIndented = true;

    int indentCount = 0;
    int outdentCount = 0;
    for (List<Column> columnGroup : columnGroups) {
        LOG.debug("Next column group");
        boolean prevRowPartial = false;
        for (Column column : columnGroup) {
            if (column.hasTab) {
                for (RowOfShapes row : column) {
                    if (sourceImage.isLeftToRight()) {
                        if (prevRowPartial) {
                            if (row.getLeft() - row.getXAdjustment() > column.startTab - safetyMargin) {
                                indentCount++;
                            } else if (row.getLeft() - row.getXAdjustment() < column.startMargin
                                    + safetyMargin) {
                                outdentCount++;
                            }
                        }
                        if (row.getRight() - row.getXAdjustment() < column.endMargin - safetyMargin) {
                            prevRowPartial = true;
                        } else {
                            prevRowPartial = false;
                        }
                    } else {
                        if (prevRowPartial) {
                            if (row.getRight() - row.getXAdjustment() < column.startTab + safetyMargin) {
                                indentCount++;
                            } else if (row.getRight() - row.getXAdjustment() > column.startMargin
                                    - safetyMargin) {
                                outdentCount++;
                            }
                        }
                        if (row.getLeft() - row.getXAdjustment() > column.endMargin + safetyMargin) {
                            prevRowPartial = true;
                        } else {
                            prevRowPartial = false;
                        }
                    } // left-to-right?
                } // next row  
            } // column has tab
        } // next column
    } // next column group
    isIndented = (indentCount + 2 >= outdentCount);
    LOG.debug("indentCount: " + indentCount);
    LOG.debug("outdentCount: " + outdentCount);
    LOG.debug("isIndented: " + isIndented);

    // order the columns
    TreeSet<Column> orderedColumns = new TreeSet<SegmenterImpl.Column>(columns);
    columns.clear();
    columns.addAll(orderedColumns);

    // find the paragraphs found in each column
    for (Column column : columns) {
        LOG.debug("--- Next column ---");

        // break up the column into paragraphs 
        Paragraph paragraph = null;
        RowOfShapes previousRow = null;
        int maxShapesForStandaloneParagraph = 2;
        List<RowOfShapes> rowsForStandaloneParagraphs = new ArrayList<RowOfShapes>();
        Point2D previousPointStartMargin = null;
        Point2D previousPointStartTab = null;
        Point2D previousPointEndMargin = null;

        for (RowOfShapes row : column) {
            boolean rowForStandaloneParagraph = false;
            boolean newParagraph = false;
            if (row.getShapes().size() <= maxShapesForStandaloneParagraph) {
                rowsForStandaloneParagraphs.add(row);
                rowForStandaloneParagraph = true;
            } else {
                double rightOverlap = this.findLargeShapeOverlapOnRight(row, column, sourceImage);
                double leftOverlap = this.findLargeShapeOverlapOnLeft(row, column, sourceImage);

                if (drawSegmentation) {
                    double rowVerticalMidPoint = row.getBaseLineMiddlePoint();
                    double startMarginX = column.startMargin + row.getXAdjustment();
                    double startTabX = column.startTab + row.getXAdjustment();
                    double endMarginX = column.endMargin + row.getXAdjustment();

                    if (sourceImage.isLeftToRight()) {
                        startMarginX += safetyMargin;
                        startTabX -= safetyMargin;
                        endMarginX -= safetyMargin;

                        startMarginX += leftOverlap;
                        startTabX += leftOverlap;
                        endMarginX -= rightOverlap;
                    } else {
                        startMarginX -= safetyMargin;
                        startTabX += safetyMargin;
                        endMarginX += safetyMargin;

                        startMarginX -= rightOverlap;
                        startTabX -= rightOverlap;
                        endMarginX += leftOverlap;
                    }

                    Point2D.Double currentPointStartMargin = new Point2D.Double(startMarginX,
                            rowVerticalMidPoint);
                    Point2D.Double currentPointStartTab = new Point2D.Double(startTabX, rowVerticalMidPoint);
                    Point2D.Double currentPointEndMargin = new Point2D.Double(endMarginX, rowVerticalMidPoint);

                    if (previousPointStartMargin != null) {
                        graphics2D.setStroke(new BasicStroke(1));
                        graphics2D.setPaint(Color.BLUE);
                        graphics2D.drawLine((int) Math.round(previousPointStartMargin.getX()),
                                (int) Math.round(previousPointStartMargin.getY()),
                                (int) Math.round(currentPointStartMargin.getX()),
                                (int) Math.round(currentPointStartMargin.getY()));
                        graphics2D.drawLine((int) Math.round(previousPointEndMargin.getX()),
                                (int) Math.round(previousPointEndMargin.getY()),
                                (int) Math.round(currentPointEndMargin.getX()),
                                (int) Math.round(currentPointEndMargin.getY()));

                        graphics2D.setPaint(Color.RED);
                        graphics2D.drawLine((int) Math.round(previousPointStartTab.getX()),
                                (int) Math.round(previousPointStartTab.getY()),
                                (int) Math.round(currentPointStartTab.getX()),
                                (int) Math.round(currentPointStartTab.getY()));

                        graphics2D.setPaint(Color.RED);
                        graphics2D.drawLine((int) Math.round(previousPointEndMargin.getX()),
                                (int) Math.round(previousPointEndMargin.getY()),
                                (int) Math.round(currentPointEndMargin.getX()),
                                (int) Math.round(currentPointEndMargin.getY()));
                    }
                    previousPointStartMargin = currentPointStartMargin;
                    previousPointStartTab = currentPointStartTab;
                    previousPointEndMargin = currentPointEndMargin;
                }

                if (previousRow == null) {
                    LOG.debug("New paragraph (first)");
                    newParagraph = true;
                } else {
                    if (sourceImage.isLeftToRight()) {
                        if (previousRow.getRight() - previousRow.getXAdjustment()
                                - rightOverlap < column.endMargin - safetyMargin) {
                            LOG.debug("New paragraph (previous EOP)");
                            newParagraph = true;
                        } else if (column.hasTab && isIndented && row.getLeft() - row.getXAdjustment()
                                + leftOverlap > column.startTab - safetyMargin) {
                            LOG.debug("New paragraph (indent)");
                            newParagraph = true;
                        } else if (column.hasTab && !isIndented && row.getLeft() - row.getXAdjustment()
                                + leftOverlap < column.startMargin + safetyMargin) {
                            LOG.debug("New paragraph (outdent)");
                            newParagraph = true;
                        }
                    } else {
                        if (previousRow.getLeft() - previousRow.getXAdjustment()
                                + leftOverlap > column.endMargin + safetyMargin) {
                            LOG.debug("New paragraph (previous EOP)");
                            newParagraph = true;
                        } else if (column.hasTab && isIndented && row.getRight() - row.getXAdjustment()
                                - rightOverlap < column.startTab + safetyMargin) {
                            LOG.debug("New paragraph (indent)");
                            newParagraph = true;
                        } else if (column.hasTab && !isIndented && row.getRight() - row.getXAdjustment()
                                - rightOverlap > column.startMargin - safetyMargin) {
                            LOG.debug("New paragraph (outdent)");
                            newParagraph = true;
                        }
                    } // left-to-right?
                } // have previous row
            } // standalone paragraph?

            if (!rowForStandaloneParagraph)
                LOG.debug(row.toString());

            if (newParagraph) {
                if (rowsForStandaloneParagraphs.size() > 0) {
                    for (RowOfShapes oneRow : rowsForStandaloneParagraphs) {
                        LOG.debug("Standalone paragraph");
                        LOG.debug("Standalone row: left(" + oneRow.getLeft() + "), top(" + oneRow.getTop()
                                + "), right(" + oneRow.getRight() + "), bottom(" + oneRow.getBottom() + ")");
                        Paragraph standaloneParagraph = sourceImage.newParagraph();
                        standaloneParagraph.getRows().add(oneRow);
                    }
                    rowsForStandaloneParagraphs.clear();
                }
                paragraph = sourceImage.newParagraph();
            }
            //LOG.debug("Row: left(" + row.getLeft() + "), right(" + row.getRight() + "), width(" + (row.getRight() - row.getLeft() + 1) + ")");

            if (!rowForStandaloneParagraph) {
                paragraph.getRows().add(row);
                previousRow = row;
            }
        } // next row in column
        if (rowsForStandaloneParagraphs.size() > 0) {
            for (RowOfShapes oneRow : rowsForStandaloneParagraphs) {
                LOG.debug("Standalone paragraph");
                LOG.debug("Standalone row: left(" + oneRow.getLeft() + "), top(" + oneRow.getTop() + "), right("
                        + oneRow.getRight() + "), bottom(" + oneRow.getBottom() + ")");
                Paragraph standaloneParagraph = sourceImage.newParagraph();
                standaloneParagraph.getRows().add(oneRow);
            }
            rowsForStandaloneParagraphs.clear();
        }
    } // next column

}

From source file:org.apache.jackrabbit.performance.AbstractPerformanceTest.java

private DescriptiveStatistics runTest(AbstractTest test, Repository repository) throws Exception {
    DescriptiveStatistics statistics = new DescriptiveStatistics();

    test.setUp(repository, credentials);
    try {// ww  w. j a  va2  s .  c  o m
        // Run a few iterations to warm up the system
        long warmupEnd = System.currentTimeMillis() + warmup * 1000;
        while (System.currentTimeMillis() < warmupEnd) {
            test.execute();
        }

        // Run test iterations, and capture the execution times
        long runtimeEnd = System.currentTimeMillis() + runtime * 1000;
        while (System.currentTimeMillis() < runtimeEnd) {
            statistics.addValue(test.execute());
        }
    } finally {
        test.tearDown();
    }

    return statistics;
}

From source file:org.apache.sling.performance.FrameworkPerformanceMethod.java

/**
 * Method that runs 1 invocation of the timed test method
 *
 * @param testMethodToInvoke/*from   w w w  .j a va  2s .  c o m*/
 *            the test method to invoke
 * @param statistics
 *            the statistics object that collects the results
 * @param params
 *            the parameters for the invocation of the test method
 * @return the response from the method invocation
 * @throws Throwable
 */
private Object invokeTimedTestMethod(Method testMethodToInvoke, DescriptiveStatistics statistics,
        Object... params) throws Throwable {

    Object response = null;

    recursiveCallSpecificMethod(this.target.getClass(), this.target, BeforeMethodInvocation.class);

    // TODO: implement the method to run a before a specific test method
    // recursiveCallSpecificMethod(this.target.getClass(), this.target,
    // BeforeSpecificTest.class);

    // timing the test method execution
    // System.out.println("Start test: " + testMethodToInvoke.getName());
    long start = System.nanoTime();
    response = super.invokeExplosively(this.target, params);
    long timeMilliseconds = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS);
    statistics.addValue(timeMilliseconds);

    // System.out.println("End test: " + testMethodToInvoke.getName());

    // System.out.println("Test execution time (ms): " + timeMilliseconds);

    // TODO: implement the method to run a after a specific test method
    // recursiveCallSpecificMethod(this.target.getClass(), this.target,
    // AfterSpecificTest.class);

    recursiveCallSpecificMethod(this.target.getClass(), this.target, AfterMethodInvocation.class);

    return response;
}