List of usage examples for the org.apache.commons.math.stat.descriptive.DescriptiveStatistics constructor DescriptiveStatistics()
public DescriptiveStatistics()
From source file:com.joliciel.jochre.graphics.SourceImageImpl.java
void calculateShapeStatistics() { if (!shapeStatisticsCalculated) { DescriptiveStatistics shapeWidthStats = new DescriptiveStatistics(); DescriptiveStatistics shapeHeightStats = new DescriptiveStatistics(); for (RowOfShapes row : this.getRows()) { for (Shape shape : row.getShapes()) { shapeWidthStats.addValue(shape.getWidth()); shapeHeightStats.addValue(shape.getHeight()); }//from w w w.j a va2 s . co m } double minWidth = shapeWidthStats.getPercentile(50); double maxWidth = shapeWidthStats.getPercentile(80); double minHeight = shapeHeightStats.getPercentile(50); double maxHeight = shapeHeightStats.getPercentile(80); this.averageShapeWidth = shapeWidthStats.getPercentile(65); this.averageShapeHeight = shapeHeightStats.getPercentile(65); this.averageShapeWidthMargin = (maxWidth - minWidth) / 2.0; this.averageShapeHeightMargin = (maxHeight - minHeight) / 2.0; this.shapeStatisticsCalculated = true; } }
From source file:net.sf.mzmine.modules.peaklistmethods.dataanalysis.heatmaps.HeatMapTask.java
private void scale(double[][] peakList) { DescriptiveStatistics stdDevStats = new DescriptiveStatistics(); for (int columns = 0; columns < peakList.length; columns++) { stdDevStats.clear();// w w w. j ava 2s .c o m for (int row = 0; row < peakList[columns].length; row++) { if (!Double.isInfinite(peakList[columns][row]) && !Double.isNaN(peakList[columns][row])) { stdDevStats.addValue(peakList[columns][row]); } } double stdDev = stdDevStats.getStandardDeviation(); for (int row = 0; row < peakList[columns].length; row++) { if (stdDev != 0) { peakList[columns][row] = peakList[columns][row] / stdDev; } } } }
From source file:com.joliciel.jochre.graphics.RowOfShapesImpl.java
void calculateShapeStatistics() { if (!shapeStatisticsCalculated) { DescriptiveStatistics shapeWidthStats = new DescriptiveStatistics(); DescriptiveStatistics shapeHeightStats = new DescriptiveStatistics(); for (Shape shape : this.getShapes()) { shapeWidthStats.addValue(shape.getWidth()); shapeHeightStats.addValue(shape.getHeight()); }// www .jav a 2 s . co m double minWidth = shapeWidthStats.getPercentile(33); double maxWidth = shapeWidthStats.getPercentile(66); double minHeight = shapeHeightStats.getPercentile(33); double maxHeight = shapeHeightStats.getPercentile(66); this.averageShapeWidth = shapeWidthStats.getPercentile(50); this.averageShapeHeight = shapeHeightStats.getPercentile(50); this.averageShapeWidthMargin = (maxWidth - minWidth) / 2.0; this.averageShapeHeightMargin = (maxHeight - minHeight) / 2.0; this.shapeStatisticsCalculated = true; } }
From source file:net.sf.mzmine.modules.peaklistmethods.dataanalysis.heatmaps.HeatMapTask.java
private double[][] groupingDataset(UserParameter selectedParameter, String referenceGroup) { // Collect all data files Vector<RawDataFile> allDataFiles = new Vector<RawDataFile>(); DescriptiveStatistics meanControlStats = new DescriptiveStatistics(); DescriptiveStatistics meanGroupStats = new DescriptiveStatistics(); allDataFiles.addAll(Arrays.asList(peakList.getRawDataFiles())); // Determine the reference group and non reference group (the rest of // the samples) for raw data files List<RawDataFile> referenceDataFiles = new ArrayList<RawDataFile>(); List<RawDataFile> nonReferenceDataFiles = new ArrayList<RawDataFile>(); List<String> groups = new ArrayList<String>(); MZmineProject project = MZmineCore.getCurrentProject(); for (RawDataFile rawDataFile : allDataFiles) { Object paramValue = project.getParameterValue(selectedParameter, rawDataFile); if (!groups.contains(String.valueOf(paramValue))) { groups.add(String.valueOf(paramValue)); }/*from w ww. ja v a 2 s .c o m*/ if (String.valueOf(paramValue).equals(referenceGroup)) { referenceDataFiles.add(rawDataFile); } else { nonReferenceDataFiles.add(rawDataFile); } } int numRows = 0; for (int row = 0; row < peakList.getNumberOfRows(); row++) { if (!onlyIdentified || (onlyIdentified && peakList.getRow(row).getPeakIdentities().length > 0)) { numRows++; } } // Create a new aligned peak list with all the samples if the reference // group has to be shown or with only // the non reference group if not. 
double[][] dataMatrix = new double[groups.size() - 1][numRows]; pValueMatrix = new String[groups.size() - 1][numRows]; // data files that should be in the heat map List<RawDataFile> shownDataFiles = nonReferenceDataFiles; for (int row = 0, rowIndex = 0; row < peakList.getNumberOfRows(); row++) { PeakListRow rowPeak = peakList.getRow(row); if (!onlyIdentified || (onlyIdentified && rowPeak.getPeakIdentities().length > 0)) { // Average area or height of the reference group meanControlStats.clear(); for (int column = 0; column < referenceDataFiles.size(); column++) { if (rowPeak.getPeak(referenceDataFiles.get(column)) != null) { if (area) { meanControlStats.addValue(rowPeak.getPeak(referenceDataFiles.get(column)).getArea()); } else { meanControlStats.addValue(rowPeak.getPeak(referenceDataFiles.get(column)).getHeight()); } } } // Divide the area or height of each peak by the average of the // area or height of the reference peaks in each row int columnIndex = 0; for (int column = 0; column < groups.size(); column++) { String group = groups.get(column); meanGroupStats.clear(); if (!group.equals(referenceGroup)) { for (int dataColumn = 0; dataColumn < shownDataFiles.size(); dataColumn++) { Object paramValue = project.getParameterValue(selectedParameter, shownDataFiles.get(dataColumn)); if (rowPeak.getPeak(shownDataFiles.get(dataColumn)) != null && String.valueOf(paramValue).equals(group)) { Feature peak = rowPeak.getPeak(shownDataFiles.get(dataColumn)); if (!Double.isInfinite(peak.getArea()) && !Double.isNaN(peak.getArea())) { if (area) { meanGroupStats.addValue(peak.getArea()); } else { meanGroupStats.addValue(peak.getHeight()); } } } } double value = meanGroupStats.getMean() / meanControlStats.getMean(); if (meanGroupStats.getN() > 1 && meanControlStats.getN() > 1) { pValueMatrix[columnIndex][rowIndex] = this.getPvalue(meanGroupStats, meanControlStats); } else { pValueMatrix[columnIndex][rowIndex] = ""; } if (log) { value = Math.log(value); } 
dataMatrix[columnIndex++][rowIndex] = value; } } rowIndex++; } } // Scale the data dividing the peak area/height by the standard // deviation of each column if (scale) { scale(dataMatrix); } // Create two arrays: row and column names rowNames = new String[dataMatrix[0].length]; colNames = new String[groups.size() - 1]; int columnIndex = 0; for (String group : groups) { if (!group.equals(referenceGroup)) { colNames[columnIndex++] = group; } } for (int row = 0, rowIndex = 0; row < peakList.getNumberOfRows(); row++) { if (!onlyIdentified || (onlyIdentified && peakList.getRow(row).getPeakIdentities().length > 0)) { if (peakList.getRow(row).getPeakIdentities() != null && peakList.getRow(row).getPeakIdentities().length > 0) { rowNames[rowIndex++] = peakList.getRow(row).getPreferredPeakIdentity().getName(); } else { rowNames[rowIndex++] = "Unknown"; } } } return dataMatrix; }
From source file:com.joliciel.jochre.graphics.RowOfShapesImpl.java
/** * The regression passes through the bottom of average shapes on this line. * It gives the line's slope, and a starting point for finding the baseline and meanline. *///from ww w . ja va 2 s .co m public SimpleRegression getRegression() { if (this.regression == null) { // begin by calculating some sort of average line crossing the whole row, so that we can see if the row is // rising or falling to start with? // Calculate the line crossing the mid-point of all "average" shapes on this row // get the "smoothed" linear approximation of the mid-points regression = new SimpleRegression(); int numShapes = 0; int minShapes = 10; DescriptiveStatistics shapeWidthStats = new DescriptiveStatistics(); DescriptiveStatistics shapeHeightStats = new DescriptiveStatistics(); for (Shape shape : this.getShapes()) { shapeWidthStats.addValue(shape.getWidth()); shapeHeightStats.addValue(shape.getHeight()); } double minWidth = shapeWidthStats.getPercentile(25); double maxWidth = shapeWidthStats.getPercentile(75); double minHeight = shapeHeightStats.getPercentile(25); double maxHeight = shapeHeightStats.getPercentile(75); for (Shape shape : this.getShapes()) { // only add points whose shape is of "average" width and height (to leave out commas, etc.) 
if (shape.getWidth() >= minWidth && shape.getWidth() <= maxWidth && shape.getHeight() >= minHeight && shape.getHeight() <= maxHeight) { // using bottom only, since rows with different font sizes tend to align bottom regression.addData((((double) shape.getLeft() + (double) shape.getRight()) / 2.0), ((double) shape.getBottom())); numShapes++; } } // special case where row contains very few shapes (generally letter or number + period) boolean horizontalLine = false; if (numShapes < minShapes) { LOG.debug("Too few shapes: " + numShapes + ", assuming straight horizontal line"); horizontalLine = true; } else if ((this.getRight() - this.getLeft()) < (this.getContainer().getWidth() / 6.0)) { LOG.debug("Too narrow: " + (this.getRight() - this.getLeft()) + ", assuming straight horizontal line"); horizontalLine = true; } if (horizontalLine) { // assume a straight horizontal line Mean midPointMean = new Mean(); for (Shape shape : this.getShapes()) { // only add points whose shape is of "average" height (to leave out commas, etc.) if (shape.getWidth() >= minWidth && shape.getWidth() <= maxWidth && shape.getHeight() >= minHeight && shape.getHeight() <= maxHeight) { midPointMean.increment((double) shape.getBottom()); } } if (midPointMean.getN() == 0) { for (Shape shape : this.getShapes()) { midPointMean.increment((double) shape.getBottom()); } } double meanMidPoint = midPointMean.getResult(); regression = new SimpleRegression(); regression.addData(this.getLeft(), meanMidPoint); regression.addData(this.getRight(), meanMidPoint); } // displays intercept of regression line LOG.debug("intercept: " + regression.getIntercept()); // displays slope of regression line LOG.debug("slope: " + regression.getSlope()); // displays slope standard error LOG.debug("std err: " + regression.getSlopeStdErr()); LOG.debug("x = 0, y = " + regression.predict(0)); LOG.debug("x = " + this.getContainer().getWidth() + ", y = " + regression.predict(this.getContainer().getWidth())); } return regression; }
From source file:com.joliciel.jochre.graphics.SourceImageImpl.java
@Override public List<Rectangle> getWhiteAreas(List<Shape> shapes) { LOG.debug("#### getWhiteAreas ####"); // Delimit area to be examined based on shapes int top = Integer.MAX_VALUE, bottom = 0, left = Integer.MAX_VALUE, right = 0; for (Shape shape : shapes) { if (shape.getTop() < top) top = shape.getTop();//from w w w .j a v a 2s . co m if (shape.getBottom() > bottom) bottom = shape.getBottom(); if (shape.getLeft() < left) left = shape.getLeft(); if (shape.getRight() > right) right = shape.getRight(); } // get average shape width & height DescriptiveStatistics shapeWidthStats = new DescriptiveStatistics(); DescriptiveStatistics shapeHeightStats = new DescriptiveStatistics(); for (Shape shape : shapes) { shapeWidthStats.addValue(shape.getWidth()); shapeHeightStats.addValue(shape.getHeight()); } double averageShapeWidth = shapeWidthStats.getPercentile(75); double averageShapeHeight = shapeHeightStats.getPercentile(75); LOG.debug("averageShapeWidth: " + averageShapeWidth); LOG.debug("averageShapeHeight: " + averageShapeHeight); List<Rectangle> whiteAreas = new ArrayList<Rectangle>(); // Horizontal white areas double minHorizontalWhiteAreaWidth = 40.0 * averageShapeWidth; double minHorizontalWhiteAreaHeight = 2.5 * averageShapeHeight; LOG.debug("minHorizontalWhiteAreaWidth: " + minHorizontalWhiteAreaWidth); LOG.debug("minHorizontalWhiteAreaHeight: " + minHorizontalWhiteAreaHeight); WhiteAreaFinder whiteAreaFinder = new WhiteAreaFinder(); List<Rectangle> blackAreas = new ArrayList<Rectangle>(); blackAreas.addAll(shapes); List<Rectangle> horizontalWhiteAreas = whiteAreaFinder.getWhiteAreas(blackAreas, left, top, right, bottom, minHorizontalWhiteAreaWidth, minHorizontalWhiteAreaHeight); // we add the horizontal white areas to the "black areas", since we don't want vertical // white areas detected at page top & page bottom, splitting a valid row blackAreas.addAll(horizontalWhiteAreas); whiteAreas.addAll(horizontalWhiteAreas); // Long vertical white areas double 
minVerticalWhiteAreaWidth = 2.5 * averageShapeWidth; double minVerticalWhiteAreaHeight = 10.0 * averageShapeHeight; LOG.debug("minVerticalWhiteAreaWidth: " + minVerticalWhiteAreaWidth); LOG.debug("minVerticalWhiteAreaHeight: " + minVerticalWhiteAreaHeight); List<Rectangle> verticalWhiteAreas = whiteAreaFinder.getWhiteAreas(blackAreas, left, top, right, bottom, minVerticalWhiteAreaWidth, minVerticalWhiteAreaHeight); whiteAreas.addAll(verticalWhiteAreas); // Square white areas double minSquareWhiteAreaWidth = 4.0 * averageShapeWidth; double minSquareWhiteAreaHeight = 4.0 * averageShapeHeight; LOG.debug("minSquareWhiteAreaWidth: " + minSquareWhiteAreaWidth); LOG.debug("minSquareWhiteAreaHeight: " + minSquareWhiteAreaHeight); List<Rectangle> squareWhiteAreas = whiteAreaFinder.getWhiteAreas(blackAreas, left, top, right, bottom, minSquareWhiteAreaWidth, minSquareWhiteAreaHeight); whiteAreas.addAll(squareWhiteAreas); blackAreas.addAll(squareWhiteAreas); blackAreas.addAll(this.getWhiteAreasAroundLargeShapes(shapes)); // Long narrow vertical white areas minVerticalWhiteAreaWidth = 1.0 * averageShapeWidth; minVerticalWhiteAreaHeight = 20.0 * averageShapeHeight; LOG.debug("minVerticalWhiteAreaWidth: " + minVerticalWhiteAreaWidth); LOG.debug("minVerticalWhiteAreaHeight: " + minVerticalWhiteAreaHeight); List<Rectangle> verticalWhiteAreas2 = whiteAreaFinder.getWhiteAreas(blackAreas, left, top, right, bottom, minVerticalWhiteAreaWidth, minVerticalWhiteAreaHeight); whiteAreas.addAll(verticalWhiteAreas2); return whiteAreas; }
From source file:com.joliciel.csvLearner.CSVLearner.java
private void doCommandEvaluate() throws IOException { if (resultFilePath == null) throw new RuntimeException("Missing argument: resultFile"); if (featureDir == null) throw new RuntimeException("Missing argument: featureDir"); if (testIdFilePath != null) { if (crossValidation) throw new RuntimeException("Cannot combine testIdFile with cross validation"); if (testSegment >= 0) { throw new RuntimeException("Cannot combine testIdFile with test segment"); }// ww w . j a v a 2 s. c o m } if (!crossValidation && testIdFilePath == null) { if (testSegment < 0) throw new RuntimeException("Missing argument: testSegment"); if (testSegment > 9) throw new RuntimeException("testSegment must be an integer between 0 and 9"); } if (outDirPath == null) throw new RuntimeException("Missing argument: outDir"); LOG.info("Generating event list from CSV files..."); CSVEventListReader reader = this.getReader(TrainingSetType.TEST_SEGMENT, false); GenericEvents events = reader.getEvents(); File outDir = new File(outDirPath); outDir.mkdirs(); String fileBase = this.featureDir.replace('/', '_'); fileBase = fileBase.replace(':', '_'); fileBase = fileBase + "_cutoff" + cutoff; if (generateEventFile) { File eventFile = new File(outDir, fileBase + "_events.txt"); this.generateEventFile(eventFile, events); } File fscoreFile = new File(outDir, fileBase + "_fscores.csv"); Writer fscoreFileWriter = new BufferedWriter( new OutputStreamWriter(new FileOutputStream(fscoreFile, false), "UTF8")); File outcomeFile = new File(outDir, fileBase + "_outcomes.csv"); Writer outcomeFileWriter = new BufferedWriter( new OutputStreamWriter(new FileOutputStream(outcomeFile, false), "UTF8")); try { if (!crossValidation) { MaxentModel maxentModel = this.train(events, null); this.evaluate(maxentModel, events, fscoreFileWriter, outcomeFileWriter); } else { DescriptiveStatistics accuracyStats = new DescriptiveStatistics(); Map<String, DescriptiveStatistics[]> outcomeFscoreStats = new TreeMap<String, 
DescriptiveStatistics[]>(); for (int segment = 0; segment <= 9; segment++) { outcomeFileWriter.write("Run " + segment + ",\n"); fscoreFileWriter.write("Run " + segment + ",\n"); if (balanceOutcomes) { for (String outcome : reader.getOutcomes()) { int i = 0; for (GenericEvent event : events) { if (event.getOutcome().equals(outcome)) { boolean test = i % 10 == segment; event.setTest(test); i++; } } } } else { int i = 0; for (GenericEvent event : events) { boolean test = i % 10 == segment; event.setTest(test); i++; } } MaxentModel maxentModel = this.train(events, null); FScoreCalculator<String> fscoreCalculator = this.evaluate(maxentModel, events, fscoreFileWriter, outcomeFileWriter); accuracyStats.addValue(fscoreCalculator.getTotalFScore()); for (String outcome : fscoreCalculator.getOutcomeSet()) { DescriptiveStatistics[] stats = outcomeFscoreStats.get(outcome); if (stats == null) { stats = new DescriptiveStatistics[3]; stats[0] = new DescriptiveStatistics(); stats[1] = new DescriptiveStatistics(); stats[2] = new DescriptiveStatistics(); outcomeFscoreStats.put(outcome, stats); } stats[0].addValue(fscoreCalculator.getPrecision(outcome)); stats[1].addValue(fscoreCalculator.getRecall(outcome)); stats[2].addValue(fscoreCalculator.getFScore(outcome)); } // next outcome outcomeFileWriter.write("\n"); } // next segment fscoreFileWriter.write( "outcome,precision avg., precision dev., recall avg., recall dev., f-score avg., f-score dev.,\n"); for (String outcome : outcomeFscoreStats.keySet()) { DescriptiveStatistics[] stats = outcomeFscoreStats.get(outcome); fscoreFileWriter .write(CSVFormatter.format(outcome) + "," + CSVFormatter.format(stats[0].getMean()) + "," + CSVFormatter.format(stats[0].getStandardDeviation()) + "," + CSVFormatter.format(stats[1].getMean()) + "," + CSVFormatter.format(stats[1].getStandardDeviation()) + "," + CSVFormatter.format(stats[2].getMean()) + "," + CSVFormatter.format(stats[2].getStandardDeviation()) + "," + "\n"); } 
fscoreFileWriter.write("TOTAL,,,,," + CSVFormatter.format(accuracyStats.getMean()) + "," + CSVFormatter.format(accuracyStats.getStandardDeviation()) + ",\n"); LOG.info("Accuracy mean: " + accuracyStats.getMean()); LOG.info("Accuracy std dev: " + accuracyStats.getStandardDeviation()); } } finally { fscoreFileWriter.flush(); fscoreFileWriter.close(); outcomeFileWriter.flush(); outcomeFileWriter.close(); } LOG.info("#### Complete ####"); }
From source file:com.joliciel.jochre.graphics.RowOfShapesImpl.java
/** * Assign guidelines for a certain subset of shapes, and return the x-height. * @param startShape//from w w w. jav a2 s. c om * @param endShape * @return */ int assignGuideLines(List<GroupOfShapes> groupsToAssign) { LOG.debug("assignGuideLines internal"); double meanHorizontalSlope = this.getContainer().getMeanHorizontalSlope(); // the base-line and mean-line will be at a fixed distance away from the midpoint // the question is, which distance! // To find this out, we count number of black pixels on each row above this line // And then start analysing from the top and the bottom until the number drops off sharply // The notion of "groupsToAssign" is used to only assign guidelines // to a subset of the groups on the line // when the line contains two different font sizes List<Shape> shapes = new ArrayList<Shape>(); if (groupsToAssign != null) { for (GroupOfShapes group : groupsToAssign) { shapes.addAll(group.getShapes()); } } else { shapes = this.getShapes(); } int i = 0; DescriptiveStatistics shapeWidthStats = new DescriptiveStatistics(); DescriptiveStatistics shapeHeightStats = new DescriptiveStatistics(); for (Shape shape : this.getShapes()) { shapeWidthStats.addValue(shape.getWidth()); shapeHeightStats.addValue(shape.getHeight()); } double minWidth = shapeWidthStats.getPercentile(25); double maxWidth = shapeWidthStats.getPercentile(75); double minHeight = shapeHeightStats.getPercentile(45); double maxHeight = shapeHeightStats.getPercentile(75); double rowMidPointX = (double) (this.getLeft() + this.getRight()) / 2.0; // calculating the Y midpoint by the shapes in the row, instead of by the top & bottom of row Mean rowMidPointYMean = new Mean(); for (Shape shape : this.getShapes()) { // only add points whose shape is of "average" width and height (to leave out commas, etc.) 
if (shape.getWidth() >= minWidth && shape.getWidth() <= maxWidth && shape.getHeight() >= minHeight && shape.getHeight() <= maxHeight) { rowMidPointYMean.increment((double) (shape.getBottom() + shape.getTop()) / 2.0); } } double rowMidPointY = (double) (this.getTop() + this.getBottom()) / 2.0; if (rowMidPointYMean.getN() > 0) rowMidPointY = rowMidPointYMean.getResult(); LOG.debug("rowMidPointX: " + rowMidPointX); LOG.debug("rowMidPointY: " + rowMidPointY); // figure out where the top-most shape starts and the bottom-most shape ends, relative to the y midline int minTop = Integer.MAX_VALUE; int maxBottom = Integer.MIN_VALUE; List<Integer> rowYMidPoints = new ArrayList<Integer>(shapes.size()); for (Shape shape : shapes) { double shapeMidPointX = (double) (shape.getLeft() + shape.getRight()) / 2.0; int shapeMidPointY = (int) Math .round(rowMidPointY + (meanHorizontalSlope * (shapeMidPointX - rowMidPointX))); rowYMidPoints.add(shapeMidPointY); int relativeTop = shape.getTop() - shapeMidPointY; int relativeBottom = shape.getBottom() - shapeMidPointY; if (relativeTop < minTop) minTop = relativeTop; if (relativeBottom > maxBottom) maxBottom = relativeBottom; } if (minTop > 0) minTop = 0; if (maxBottom < 0) maxBottom = 0; int yIntervalTop = 0 - minTop; int yIntervalBottom = maxBottom; int yInterval = yIntervalTop + 1 + yIntervalBottom; LOG.debug("yIntervalTop: " + yIntervalTop); LOG.debug("yIntervalBottom: " + yIntervalBottom); LOG.debug("yInterval: " + yInterval); int[] pixelCounts = new int[yInterval]; // Get the pixel count for each row // examining one shape at a time to limit ourselves to the pixels that are // actually considered to be in this row int blackThreshold = this.getContainer().getSeparationThreshold(); int shapeIndex = 0; int shapeCount = 0; for (Shape shape : shapes) { if (shape.getHeight() >= minHeight) { LOG.trace(shape.toString()); shapeCount++; int shapeMidPointY = rowYMidPoints.get(shapeIndex); int zeroLine = shapeMidPointY - yIntervalTop; int 
topIndex = shape.getTop() - zeroLine; for (int x = 0; x < shape.getWidth(); x++) { for (int y = 0; y < shape.getHeight(); y++) { int yIndex = topIndex + y; if (yIndex >= 0 && yIndex < pixelCounts.length && shape.isPixelBlack(x, y, blackThreshold)) { pixelCounts[yIndex]++; } } } } shapeIndex++; } LOG.debug("Got pixels from " + shapeCount + " shapes."); boolean notEnoughShapes = shapeCount < 3; LOG.debug("notEnoughShapes? " + notEnoughShapes); // We start at the top // As soon as we reach a line with more pixels than the mean, we assume this is the mean-line Mean pixelCountMeanTop = new Mean(); StandardDeviation pixelCountStdDevTop = new StandardDeviation(); for (i = 0; i <= yIntervalTop; i++) { pixelCountMeanTop.increment(pixelCounts[i]); pixelCountStdDevTop.increment(pixelCounts[i]); } LOG.debug("Top: pixel count mean: " + pixelCountMeanTop.getResult() + ", std dev: " + pixelCountStdDevTop.getResult()); double threshold = pixelCountMeanTop.getResult() * 1.1; if (notEnoughShapes) { threshold = threshold / 2.0; } double lowerThreshold = threshold / 2.0; LOG.debug("Top threshold: " + threshold); LOG.debug("Top lowerThreshold: " + lowerThreshold); int meanLine = 0; boolean findMeanLine = true; for (i = 0; i <= yIntervalTop; i++) { int pixelCount = pixelCounts[i]; if (findMeanLine && pixelCount > threshold) { meanLine = i; findMeanLine = false; } else if (!findMeanLine && pixelCount < lowerThreshold) { findMeanLine = true; } } // We start at the bottom // As soon as we reach a line with more pixels than the mean, we assume this is the base-line Mean pixelCountMeanBottom = new Mean(); StandardDeviation pixelCountStdDevBottom = new StandardDeviation(); for (i = pixelCounts.length - 1; i >= yIntervalTop; i--) { pixelCountMeanBottom.increment(pixelCounts[i]); pixelCountStdDevBottom.increment(pixelCounts[i]); } LOG.debug("Bottom: pixel count mean: " + pixelCountMeanBottom.getResult() + ", std dev: " + pixelCountStdDevBottom.getResult()); threshold = 
pixelCountMeanBottom.getResult() * 1.1; if (notEnoughShapes) { threshold = threshold / 2.0; } lowerThreshold = threshold / 2.0; LOG.debug("Bottom threshold: " + threshold); LOG.debug("Bottom lowerThreshold: " + lowerThreshold); int baseLine = meanLine; boolean findBaseLine = true; for (i = pixelCounts.length - 1; i >= yIntervalTop; i--) { int pixelCount = pixelCounts[i]; if (findBaseLine && pixelCount > threshold) { baseLine = i; findBaseLine = false; } else if (!findBaseLine && pixelCount < lowerThreshold) { findBaseLine = true; } } for (i = 0; i < yInterval; i++) { int pixelCount = pixelCounts[i]; if (i == meanLine) LOG.trace("======= MEAN LINE " + i + " =========="); LOG.trace("pixel row " + i + ". pixel count " + pixelCount); if (i == baseLine) LOG.trace("======= BASE LINE " + i + " =========="); } // assign base lines and mean lines to each shape shapeIndex = 0; for (Shape shape : shapes) { int shapeMidPointY = rowYMidPoints.get(shapeIndex); int yMeanline = (shapeMidPointY - yIntervalTop) + meanLine; int yBaseline = (shapeMidPointY - yIntervalTop) + baseLine; LOG.trace(shape.toString() + ", meanLine: " + (yMeanline - shape.getTop()) + ", baseLine: " + (yBaseline - shape.getTop())); shape.setBaseLine(yBaseline - shape.getTop()); shape.setMeanLine(yMeanline - shape.getTop()); shapeIndex++; } // next shape int xHeight = baseLine - meanLine; return xHeight; }
From source file:com.joliciel.talismane.stats.FScoreCalculator.java
/** * Combine the results of n cross validation results into a single f-score file. * @param directory//from ww w . j av a2 s . c om * @param prefix * @param suffix * @param csvFileWriter */ static void combineCrossValidationResults(File directory, String prefix, String suffix, Writer csvFileWriter) { try { File[] files = directory.listFiles(); Map<Integer, Map<String, FScoreStats>> fileStatsMap = new HashMap<Integer, Map<String, FScoreStats>>(); for (File file : files) { if (file.getName().startsWith(prefix) && file.getName().endsWith(suffix)) { int index = Integer.parseInt(file.getName().substring(prefix.length(), prefix.length() + 1)); Map<String, FScoreStats> statsMap = new HashMap<String, FScoreCalculator.FScoreStats>(); fileStatsMap.put(index, statsMap); Scanner scanner = new Scanner( new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8"))); boolean firstLine = true; int truePositivePos = -1; while (scanner.hasNextLine()) { String line = scanner.nextLine(); List<String> cells = CSV.getCSVCells(line); if (firstLine) { int i = 0; for (String cell : cells) { if (cell.equals("true+")) { truePositivePos = i; break; } i++; } if (truePositivePos < 0) { throw new JolicielException("Couldn't find true+ on first line"); } firstLine = false; } else { FScoreStats stats = new FScoreStats(); String outcome = cells.get(0); stats.outcome = outcome; if (outcome.equals("AVERAGE")) break; stats.truePos = Integer.parseInt(cells.get(truePositivePos)); stats.falsePos = Integer.parseInt(cells.get(truePositivePos + 1)); stats.falseNeg = Integer.parseInt(cells.get(truePositivePos + 2)); stats.precision = Double.parseDouble(cells.get(truePositivePos + 3)); stats.recall = Double.parseDouble(cells.get(truePositivePos + 4)); stats.fScore = Double.parseDouble(cells.get(truePositivePos + 5)); statsMap.put(outcome, stats); } // firstLine? 
} // has more lines scanner.close(); } // file in current series } // next file int numFiles = fileStatsMap.size(); if (numFiles == 0) { throw new JolicielException("No files found matching prefix and suffix provided"); } Map<String, DescriptiveStatistics> descriptiveStatsMap = new HashMap<String, DescriptiveStatistics>(); Map<String, FScoreStats> outcomeStats = new HashMap<String, FScoreCalculator.FScoreStats>(); Set<String> outcomes = new TreeSet<String>(); for (Map<String, FScoreStats> statsMap : fileStatsMap.values()) { for (FScoreStats stats : statsMap.values()) { DescriptiveStatistics fScoreStats = descriptiveStatsMap.get(stats.outcome + "fScore"); if (fScoreStats == null) { fScoreStats = new DescriptiveStatistics(); descriptiveStatsMap.put(stats.outcome + "fScore", fScoreStats); } fScoreStats.addValue(stats.fScore); DescriptiveStatistics precisionStats = descriptiveStatsMap.get(stats.outcome + "precision"); if (precisionStats == null) { precisionStats = new DescriptiveStatistics(); descriptiveStatsMap.put(stats.outcome + "precision", precisionStats); } precisionStats.addValue(stats.precision); DescriptiveStatistics recallStats = descriptiveStatsMap.get(stats.outcome + "recall"); if (recallStats == null) { recallStats = new DescriptiveStatistics(); descriptiveStatsMap.put(stats.outcome + "recall", recallStats); } recallStats.addValue(stats.recall); FScoreStats outcomeStat = outcomeStats.get(stats.outcome); if (outcomeStat == null) { outcomeStat = new FScoreStats(); outcomeStat.outcome = stats.outcome; outcomeStats.put(stats.outcome, outcomeStat); } outcomeStat.truePos += stats.truePos; outcomeStat.falsePos += stats.falsePos; outcomeStat.falseNeg += stats.falseNeg; outcomes.add(stats.outcome); } } csvFileWriter.write(CSV.format(prefix + suffix)); csvFileWriter.write("\n"); csvFileWriter.write(CSV.format("outcome")); csvFileWriter.write(CSV.format("true+") + CSV.format("false+") + CSV.format("false-") + CSV.format("tot precision") + CSV.format("avg precision") 
+ CSV.format("dev precision") + CSV.format("tot recall") + CSV.format("avg recall") + CSV.format("dev recall") + CSV.format("tot f-score") + CSV.format("avg f-score") + CSV.format("dev f-score") + "\n"); for (String outcome : outcomes) { csvFileWriter.write(CSV.format(outcome)); FScoreStats outcomeStat = outcomeStats.get(outcome); DescriptiveStatistics fScoreStats = descriptiveStatsMap.get(outcome + "fScore"); DescriptiveStatistics precisionStats = descriptiveStatsMap.get(outcome + "precision"); DescriptiveStatistics recallStats = descriptiveStatsMap.get(outcome + "recall"); outcomeStat.calculate(); csvFileWriter.write(CSV.format(outcomeStat.truePos)); csvFileWriter.write(CSV.format(outcomeStat.falsePos)); csvFileWriter.write(CSV.format(outcomeStat.falseNeg)); csvFileWriter.write(CSV.format(outcomeStat.precision * 100)); csvFileWriter.write(CSV.format(precisionStats.getMean())); csvFileWriter.write(CSV.format(precisionStats.getStandardDeviation())); csvFileWriter.write(CSV.format(outcomeStat.recall * 100)); csvFileWriter.write(CSV.format(recallStats.getMean())); csvFileWriter.write(CSV.format(recallStats.getStandardDeviation())); csvFileWriter.write(CSV.format(outcomeStat.fScore * 100)); csvFileWriter.write(CSV.format(fScoreStats.getMean())); csvFileWriter.write(CSV.format(fScoreStats.getStandardDeviation())); csvFileWriter.write("\n"); csvFileWriter.flush(); } } catch (IOException ioe) { throw new RuntimeException(ioe); } }
From source file:com.joliciel.jochre.graphics.SourceImageImpl.java
public List<Rectangle> findColumnSeparators() { if (columnSeparators == null) { LOG.debug("############ findColumnSeparators ##############"); double slope = this.getMeanHorizontalSlope(); double imageMidPointX = (double) this.getWidth() / 2.0; int[] horizontalCounts = new int[this.getHeight()]; DescriptiveStatistics rowXHeightStats = new DescriptiveStatistics(); // first get the fill factor for each horizontal row in the image for (RowOfShapes row : this.getRows()) { rowXHeightStats.addValue(row.getXHeight()); for (Shape shape : row.getShapes()) { double shapeMidPointX = (double) (shape.getLeft() + shape.getRight()) / 2.0; int slopeAdjustedTop = (int) Math .round(shape.getTop() + (slope * (shapeMidPointX - imageMidPointX))); if (slopeAdjustedTop >= 0 && slopeAdjustedTop < this.getHeight()) { for (int i = 0; i < shape.getHeight(); i++) { if (slopeAdjustedTop + i < horizontalCounts.length) horizontalCounts[slopeAdjustedTop + i] += shape.getWidth(); }/*from w w w .j a v a 2 s . c om*/ } } } DescriptiveStatistics horizontalStats = new DescriptiveStatistics(); DescriptiveStatistics horizontalStatsNonEmpty = new DescriptiveStatistics(); for (int i = 0; i < this.getHeight(); i++) { // LOG.trace("Row " + i + ": " + horizontalCounts[i]); horizontalStats.addValue(horizontalCounts[i]); if (horizontalCounts[i] > 0) horizontalStatsNonEmpty.addValue(horizontalCounts[i]); } LOG.debug("Mean horizontal count: " + horizontalStats.getMean()); LOG.debug("Median horizontal count: " + horizontalStats.getPercentile(50)); LOG.debug("25 percentile horizontal count: " + horizontalStats.getPercentile(25)); LOG.debug("Mean horizontal count (non empty): " + horizontalStatsNonEmpty.getMean()); LOG.debug("Median horizontal count (non empty): " + horizontalStatsNonEmpty.getPercentile(50)); LOG.debug("25 percentile horizontal count (non empty): " + horizontalStatsNonEmpty.getPercentile(25)); LOG.debug("10 percentile horizontal count (non empty): " + horizontalStatsNonEmpty.getPercentile(10)); 
double maxEmptyRowCount = horizontalStatsNonEmpty.getMean() / 8.0; LOG.debug("maxEmptyRowCount: " + maxEmptyRowCount); boolean inEmptyHorizontalRange = false; List<int[]> emptyHorizontalRanges = new ArrayList<int[]>(); int emptyHorizontalRangeStart = 0; for (int i = 0; i < this.getHeight(); i++) { if (!inEmptyHorizontalRange && horizontalCounts[i] <= maxEmptyRowCount) { inEmptyHorizontalRange = true; emptyHorizontalRangeStart = i; } else if (inEmptyHorizontalRange && horizontalCounts[i] > maxEmptyRowCount) { inEmptyHorizontalRange = false; emptyHorizontalRanges.add(new int[] { emptyHorizontalRangeStart, i }); } } if (inEmptyHorizontalRange) { emptyHorizontalRanges.add(new int[] { emptyHorizontalRangeStart, this.getHeight() - 1 }); } LOG.debug("rowXHeight mean: " + rowXHeightStats.getMean()); LOG.debug("rowXHeight median: " + rowXHeightStats.getPercentile(50)); double minHorizontalBreak = rowXHeightStats.getMean() * 2.0; LOG.debug("minHorizontalBreak: " + minHorizontalBreak); int smallBreakCount = 0; int mainTextTop = 0; int bigBreakCount = 0; for (int[] emptyHorizontalRange : emptyHorizontalRanges) { int height = emptyHorizontalRange[1] - emptyHorizontalRange[0]; LOG.trace("empty range: " + emptyHorizontalRange[0] + ", " + emptyHorizontalRange[1] + " = " + height); if (bigBreakCount < 2 && smallBreakCount < 2 && height > minHorizontalBreak) { mainTextTop = emptyHorizontalRange[1]; bigBreakCount++; } if (height <= minHorizontalBreak) smallBreakCount++; } LOG.debug("mainTextTop:" + mainTextTop); // lift mainTextTop upwards by max an x-height or till we reach a zero row int minTop = mainTextTop - (int) (rowXHeightStats.getMean() / 2.0); if (minTop < 0) minTop = 0; for (int i = mainTextTop; i > minTop; i--) { mainTextTop = i; if (horizontalCounts[i] == 0) { break; } } LOG.debug("mainTextTop (adjusted):" + mainTextTop); smallBreakCount = 0; bigBreakCount = 0; int mainTextBottom = this.getHeight(); for (int i = emptyHorizontalRanges.size() - 1; i >= 0; i--) { int[] 
emptyHorizontalRange = emptyHorizontalRanges.get(i); int height = emptyHorizontalRange[1] - emptyHorizontalRange[0]; LOG.trace("emptyHorizontalRange: " + emptyHorizontalRange[0] + ", height: " + height + ", bigBreakCount: " + bigBreakCount + ", smallBreakCount: " + smallBreakCount); if ((bigBreakCount + smallBreakCount) <= 2 && height > minHorizontalBreak) { mainTextBottom = emptyHorizontalRange[0]; LOG.trace("Set mainTextBottom to " + mainTextBottom); bigBreakCount++; } if (height <= minHorizontalBreak) smallBreakCount++; if ((bigBreakCount + smallBreakCount) > 2) break; } LOG.debug("mainTextBottom:" + mainTextBottom); // lower mainTextBottom downwards by max an x-height or till we reach a zero row int maxBottom = mainTextBottom + (int) (rowXHeightStats.getMean() / 2.0); if (maxBottom > this.getHeight()) maxBottom = this.getHeight(); for (int i = mainTextBottom; i < maxBottom; i++) { mainTextBottom = i; if (horizontalCounts[i] == 0) { break; } } LOG.debug("mainTextBottom (adjusted):" + mainTextBottom); int[] verticalCounts = new int[this.getWidth()]; // first get the fill factor for each horizontal row in the image for (RowOfShapes row : this.getRows()) { for (Shape shape : row.getShapes()) { int slopeAdjustedLeft = (int) Math.round(shape.getLeft() - row.getXAdjustment()); double shapeMidPointX = (double) (shape.getLeft() + shape.getRight()) / 2.0; int slopeAdjustedTop = (int) Math .round(shape.getTop() + (slope * (shapeMidPointX - imageMidPointX))); if (slopeAdjustedTop >= mainTextTop && slopeAdjustedTop <= mainTextBottom && slopeAdjustedLeft >= 0 && slopeAdjustedLeft < this.getWidth()) { for (int i = 0; i < shape.getWidth(); i++) { if (slopeAdjustedLeft + i < this.getWidth()) verticalCounts[slopeAdjustedLeft + i] += shape.getHeight(); } } } } DescriptiveStatistics verticalStats = new DescriptiveStatistics(); DescriptiveStatistics verticalStatsNonEmpty = new DescriptiveStatistics(); for (int i = 0; i < this.getWidth(); i++) { // LOG.trace("Column " + i + ": " + 
verticalCounts[i]); verticalStats.addValue(verticalCounts[i]); if (verticalCounts[i] > 0) verticalStatsNonEmpty.addValue(verticalCounts[i]); } LOG.debug("Mean vertical count: " + verticalStats.getMean()); LOG.debug("Median vertical count: " + verticalStats.getPercentile(50)); LOG.debug("25 percentile vertical count: " + verticalStats.getPercentile(25)); LOG.debug("Mean vertical count (non empty): " + verticalStatsNonEmpty.getMean()); LOG.debug("Median vertical count (non empty): " + verticalStatsNonEmpty.getPercentile(50)); LOG.debug("25 percentile vertical count (non empty): " + verticalStatsNonEmpty.getPercentile(25)); LOG.debug("10 percentile vertical count (non empty): " + verticalStatsNonEmpty.getPercentile(10)); LOG.debug("1 percentile vertical count (non empty): " + verticalStatsNonEmpty.getPercentile(1)); // double maxEmptyColumnCount = verticalStatsNonEmpty.getMean() / 8.0; double maxEmptyColumnCount = verticalStatsNonEmpty.getPercentile(1); LOG.debug("maxEmptyColumnCount: " + maxEmptyColumnCount); boolean inEmptyVerticalRange = false; List<int[]> emptyVerticalRanges = new ArrayList<int[]>(); int emptyVerticalRangeStart = 0; for (int i = 0; i < this.getWidth(); i++) { if (!inEmptyVerticalRange && verticalCounts[i] <= maxEmptyColumnCount) { inEmptyVerticalRange = true; emptyVerticalRangeStart = i; } else if (inEmptyVerticalRange && verticalCounts[i] > maxEmptyColumnCount) { inEmptyVerticalRange = false; emptyVerticalRanges.add(new int[] { emptyVerticalRangeStart, i }); } } if (inEmptyVerticalRange) { emptyVerticalRanges.add(new int[] { emptyVerticalRangeStart, this.getWidth() - 1 }); } LOG.debug("rowXHeight mean: " + rowXHeightStats.getMean()); LOG.debug("rowXHeight median: " + rowXHeightStats.getPercentile(50)); double minVerticalBreak = rowXHeightStats.getMean() * 1.0; LOG.debug("minVerticalBreak: " + minVerticalBreak); List<int[]> columnBreaks = new ArrayList<int[]>(); for (int[] emptyVerticalRange : emptyVerticalRanges) { int width = 
emptyVerticalRange[1] - emptyVerticalRange[0]; LOG.trace("empty range: " + emptyVerticalRange[0] + ", " + emptyVerticalRange[1] + " = " + width); if (width >= minVerticalBreak) { columnBreaks.add(emptyVerticalRange); LOG.trace("Found column break!"); } } columnSeparators = new ArrayList<Rectangle>(); for (int[] columnBreak : columnBreaks) { // reduce the column break to the thickest empty area if possible int[] bestColumnBreak = null; double originalCount = maxEmptyColumnCount; maxEmptyColumnCount = 0; while (bestColumnBreak == null && maxEmptyColumnCount <= originalCount) { inEmptyVerticalRange = false; emptyVerticalRanges = new ArrayList<int[]>(); emptyVerticalRangeStart = columnBreak[0]; for (int i = columnBreak[0]; i <= columnBreak[1]; i++) { if (!inEmptyVerticalRange && verticalCounts[i] <= maxEmptyColumnCount) { inEmptyVerticalRange = true; emptyVerticalRangeStart = i; } else if (inEmptyVerticalRange && verticalCounts[i] > maxEmptyColumnCount) { inEmptyVerticalRange = false; emptyVerticalRanges.add(new int[] { emptyVerticalRangeStart, i }); } } if (inEmptyVerticalRange) { emptyVerticalRanges.add(new int[] { emptyVerticalRangeStart, columnBreak[1] }); } for (int[] emptyVerticalRange : emptyVerticalRanges) { if (bestColumnBreak == null || (emptyVerticalRange[1] - emptyVerticalRange[0] > bestColumnBreak[1] - bestColumnBreak[0])) bestColumnBreak = emptyVerticalRange; } maxEmptyColumnCount += (originalCount / 8.0); } if (bestColumnBreak == null) bestColumnBreak = columnBreak; Rectangle whiteArea = new WhiteArea(bestColumnBreak[0], mainTextTop, bestColumnBreak[1], mainTextBottom); columnSeparators.add(whiteArea); LOG.debug("ColumnBreak: " + whiteArea); } // next column break } return columnSeparators; }