List of usage examples for org.apache.commons.math.stat.Frequency.getCount
public long getCount(char v)
From source file:com.gtwm.pb.model.manageData.WordCloud.java
/**
 * Builds the tags of a word cloud from a piece of text.
 * <p>
 * The text is split on non-word characters; stop words and words shorter than
 * {@code minWordLength} are dropped; the remaining words are stemmed and the stems are
 * counted. Stems whose count is a high outlier (and, for large inputs, a low outlier) are
 * removed, the set is trimmed to at most {@code maxTags} stems, and each surviving stem is
 * added to {@code words} with a weight scaled between {@code minWeight} and
 * {@code maxWeight}.
 *
 * @param textLowerCase
 *            Input text, must be lower case
 * @param minWeight
 *            Minimum tag weight, e.g. a font size
 * @param maxWeight
 *            Max. tag weight
 * @param maxTags
 *            Maximum number of tags to return, -1 (or any negative value) for all tags.
 *            With no limit, low-frequency stems are kept and no trimming is done; only
 *            high-frequency outliers are removed.
 * @param additionalStopWords
 *            Set of words to specifically exclude, in addition to the standard set [and,
 *            not, after, yes, no, ...]. May be {@code null} or empty.
 */
public WordCloud(String textLowerCase, int minWeight, int maxWeight, int maxTags,
        Set<String> additionalStopWords) {
    String[] wordArray = textLowerCase.split("\\W");
    Set<String> stopWords = new HashSet<String>(Arrays.asList(stopWordsArray));
    if (additionalStopWords != null) {
        for (String additionalStopWord : additionalStopWords) {
            stopWords.add(additionalStopWord.toLowerCase().trim());
        }
    }
    LancasterStemmer stemmer = new LancasterStemmer();
    String wordStem;
    Frequency frequencies = new Frequency();
    for (String wordString : wordArray) {
        if ((!stopWords.contains(wordString)) && (wordString.length() >= minWordLength)) {
            wordStem = stemmer.stripSuffixes(wordString);
            // Record the mapping of the stem to its origin so the most
            // common origin can be re-introduced when the cloud is
            // generated
            this.recordStemOrigin(wordString, wordStem);
            frequencies.addValue(wordStem);
        }
    }

    // Compute std. dev of frequencies so we can remove outliers
    DescriptiveStatistics stats = new DescriptiveStatistics();
    Iterator freqIt = frequencies.valuesIterator();
    long stemFreq;
    while (freqIt.hasNext()) {
        stemFreq = frequencies.getCount(freqIt.next());
        stats.addValue(stemFreq);
    }
    double mean = stats.getMean();
    double stdDev = stats.getStandardDeviation();
    long minFreq = Long.MAX_VALUE;
    long maxFreq = 0;

    // Remove outliers
    freqIt = frequencies.valuesIterator();
    int upperLimit = (int) (mean + (stdDev * 10));
    int lowerLimit = (int) (mean - stdDev);
    if (lowerLimit < 2) {
        lowerLimit = 2;
    }
    int numWords = 0;
    int numRawWords = wordArray.length;
    // A negative maxTags means "no limit": there is then no tag budget to trim towards.
    boolean limitTags = maxTags >= 0;
    // Widen to long so that a very large maxTags cannot overflow the multiplication.
    boolean removeLowOutliers = limitTags && (numRawWords > ((long) maxTags * 10));
    while (freqIt.hasNext()) {
        wordStem = (String) freqIt.next();
        stemFreq = frequencies.getCount(wordStem);
        // For a large input set, remove high and low outliers.
        // For a smaller set, just high freq. outliers
        if ((stemFreq > upperLimit) || ((stemFreq < lowerLimit) && removeLowOutliers)) {
            freqIt.remove();
        } else {
            numWords++;
            // Track min and max independently: a value that raises the maximum (always
            // the case for the first kept stem) may also be the minimum.
            if (stemFreq > maxFreq) {
                maxFreq = stemFreq;
            }
            if (stemFreq < minFreq) {
                minFreq = stemFreq;
            }
        }
    }

    // Cut down to exact required number of tags by removing smallest
    if ((numWords > 0) && (lowerLimit < minFreq)) {
        lowerLimit = (int) minFreq;
    }
    if (limitTags && (numWords > maxTags)) {
        while (numWords > maxTags) {
            freqIt = frequencies.valuesIterator();
            while (freqIt.hasNext()) {
                stemFreq = frequencies.getCount(freqIt.next());
                if (stemFreq < lowerLimit) {
                    freqIt.remove();
                    numWords--;
                    if (numWords == maxTags) {
                        break;
                    }
                }
            }
            // Raise the removal threshold for the next pass; always advance by at least
            // one so the loop is guaranteed to make progress.
            int step = (int) ((mean - lowerLimit) / 3);
            if (step < 1) {
                step = 1;
            }
            lowerLimit += step;
        }
        // The new min. freq. may have changed
        minFreq = Long.MAX_VALUE;
        freqIt = frequencies.valuesIterator();
        while (freqIt.hasNext()) {
            stemFreq = frequencies.getCount(freqIt.next());
            if (stemFreq < minFreq) {
                minFreq = stemFreq;
            }
        }
    }

    // Scale and create tag objects
    double scaleFactor;
    if (maxFreq == minFreq) {
        // TODO: a realistic scale factor in this case
        scaleFactor = (double) (maxWeight - minWeight) / 4;
    } else {
        scaleFactor = (double) (maxWeight - minWeight) / (maxFreq - minFreq);
    }
    freqIt = frequencies.valuesIterator();
    int weight;
    while (freqIt.hasNext()) {
        wordStem = (String) freqIt.next();
        stemFreq = frequencies.getCount(wordStem);
        // Might still be some left less than the min. threshold
        if (stemFreq <= minFreq) {
            weight = minWeight;
        } else {
            weight = (int) (Math.ceil((double) (stemFreq - minFreq) * scaleFactor) + minWeight);
        }
        SortedSet<WordInfo> origins = this.stemOriginMap.get(wordStem);
        // The last element of the origin set is used as the display name; every origin
        // of the stem is also recorded as a synonym.
        String mostCommonOrigin = origins.last().getName();
        Set<String> synonyms = new TreeSet<String>();
        for (WordInfo origin : origins) {
            synonyms.add(origin.getName());
        }
        WordInfo word = new Word(mostCommonOrigin, weight, synonyms);
        this.words.add(word);
    }
}
From source file:geogebra.kernel.AlgoFrequency.java
protected final void compute() { // Validate input arguments //======================================================= if (!dataList.isDefined() || dataList.size() == 0) { frequency.setUndefined();//from w ww.j a va2 s. c om return; } if (!(dataList.getElementType() == GeoElement.GEO_CLASS_TEXT || dataList.getElementType() == GeoElement.GEO_CLASS_NUMERIC)) { frequency.setUndefined(); return; } if (classList != null) { if (classList.getElementType() != GeoElement.GEO_CLASS_NUMERIC || classList.size() < 2) { frequency.setUndefined(); return; } } if (density != null) { if (density.getDouble() <= 0) { frequency.setUndefined(); return; } } frequency.setDefined(true); frequency.clear(); if (value != null) value.clear(); double numMax = 0, numMin = 0; boolean doCumulative = isCumulative != null && isCumulative.getBoolean(); // Load the data into f, an instance of Frequency class //======================================================= Frequency f = new Frequency(); for (int i = 0; i < dataList.size(); i++) { if (dataList.getElementType() == GeoElement.GEO_CLASS_TEXT) f.addValue(((GeoText) dataList.get(i)).toValueString()); if (dataList.getElementType() == GeoElement.GEO_CLASS_NUMERIC) f.addValue(((GeoNumeric) dataList.get(i)).getDouble()); } // If classList does not exist, // get the unique value list and compute frequencies for this list //======================================================= // handle string data if (dataList.getElementType() == GeoElement.GEO_CLASS_TEXT) { Iterator itr = f.valuesIterator(); String strMax = (String) itr.next(); String strMin = strMax; itr = f.valuesIterator(); while (itr.hasNext()) { String s = (String) itr.next(); if (s.compareTo(strMax) > 0) strMax = s; if (s.compareTo(strMin) < 0) strMin = s; GeoText text = new GeoText(cons); text.setTextString(s); value.add(text); if (classList == null) if (doCumulative) frequency.add(new GeoNumeric(cons, f.getCumFreq((Comparable) s))); else frequency.add(new GeoNumeric(cons, 
f.getCount((Comparable) s))); } } // handle numeric data else { Iterator itr = f.valuesIterator(); numMax = (Double) itr.next(); numMin = numMax; itr = f.valuesIterator(); while (itr.hasNext()) { Double n = (Double) itr.next(); if (n > numMax) numMax = n.doubleValue(); if (n < numMin) numMin = n.doubleValue(); value.add(new GeoNumeric(cons, n)); if (classList == null) if (doCumulative) frequency.add(new GeoNumeric(cons, f.getCumFreq((Comparable) n))); else frequency.add(new GeoNumeric(cons, f.getCount((Comparable) n))); } } // If classList exists, compute frequencies using the classList //======================================================= if (classList != null) { double lowerClassBound = 0; double upperClassBound = 0; double classFreq = 0; //set density conditions boolean hasDensity = false; if (useDensity != null) hasDensity = useDensity.getBoolean(); double densityValue = 1; // default density if (density != null) { densityValue = density.getDouble(); } double cumulativeClassFreq = 0; double swap; int length = classList.size(); for (int i = 1; i < length; i++) { lowerClassBound = ((GeoNumeric) classList.get(i - 1)).getDouble(); upperClassBound = ((GeoNumeric) classList.get(i)).getDouble(); boolean increasing = true; if (lowerClassBound > upperClassBound) { swap = upperClassBound; upperClassBound = lowerClassBound; lowerClassBound = swap; increasing = false; } classFreq = f.getCumFreq((Comparable) upperClassBound) - f.getCumFreq((Comparable) lowerClassBound) + f.getCount((Comparable) lowerClassBound); if ((i != length - 1 && increasing) || (i != 1 && !increasing)) classFreq -= f.getCount((Comparable) upperClassBound); // System.out.println(" ================================="); // System.out.println("class freq: " + classFreq + " " + density); if (hasDensity) { classFreq = densityValue * classFreq / (upperClassBound - lowerClassBound); } if (doCumulative) cumulativeClassFreq += classFreq; // System.out.println("class freq: " + classFreq); // add the frequency 
to the output GeoList frequency.add(new GeoNumeric(cons, doCumulative ? cumulativeClassFreq : classFreq)); } // handle the last (highest) class frequency specially // it must also count values equal to the highest class bound } }
From source file:geogebra.common.kernel.statistics.AlgoFrequency.java
@Override public final void compute() { if (isContingencyTable) { computeContingencyTable();//from w w w .j a v a2 s . c om return; } // Validate input arguments // ======================================================= if (!dataList.isDefined() || dataList.size() == 0) { frequency.setUndefined(); return; } if (!(dataList.getElementType().equals(GeoClass.TEXT) || dataList.getElementType().equals(GeoClass.NUMERIC))) { frequency.setUndefined(); return; } if (classList != null) { if (!classList.getElementType().equals(GeoClass.NUMERIC) || classList.size() < 2) { frequency.setUndefined(); return; } } if (density != null) { if (density.getDouble() <= 0) { frequency.setUndefined(); return; } } if (scale != null) { if (!scale.isDefined()) { frequency.setUndefined(); return; } scaleFactor = scale.getValue(); } frequency.setDefined(true); frequency.clear(); if (value != null) value.clear(); double numMax = 0, numMin = 0; boolean doCumulative = isCumulative != null && isCumulative.getBoolean(); // Load the data into f, an instance of Frequency class // ======================================================= Frequency f = new FrequencyGgb(); for (int i = 0; i < dataList.size(); i++) { if (dataList.getElementType().equals(GeoClass.TEXT)) f.addValue(((GeoText) dataList.get(i)).toValueString(StringTemplate.defaultTemplate)); if (dataList.getElementType().equals(GeoClass.NUMERIC)) f.addValue(((GeoNumeric) dataList.get(i)).getDouble()); } // If classList does not exist, // get the unique value list and compute frequencies for this list // ======================================================= // handle string data if (dataList.getElementType().equals(GeoClass.TEXT)) { Iterator<Comparable<?>> itr = f.valuesIterator(); String strMax = (String) itr.next(); String strMin = strMax; itr = f.valuesIterator(); while (itr.hasNext()) { String s = (String) itr.next(); if (s.compareTo(strMax) > 0) strMax = s; if (s.compareTo(strMin) < 0) strMin = s; GeoText text = new GeoText(cons); 
text.setTextString(s); value.add(text); if (classList == null) { if (doCumulative) { addValue(f.getCumFreq(s)); } else { addValue(f.getCount(s)); } } } } // handle numeric data else { Iterator<Comparable<?>> itr = f.valuesIterator(); numMax = (Double) itr.next(); numMin = numMax; itr = f.valuesIterator(); while (itr.hasNext()) { Double n = (Double) itr.next(); if (n > numMax) numMax = n.doubleValue(); if (n < numMin) numMin = n.doubleValue(); value.add(new GeoNumeric(cons, n)); if (classList == null) if (doCumulative) addValue(f.getCumFreq(n)); else addValue(f.getCount(n)); } } // If classList exists, compute frequencies using the classList // ======================================================= if (classList != null) { double lowerClassBound = 0; double upperClassBound = 0; double classFreq = 0; // set density conditions boolean hasDensity = false; if (useDensity != null) hasDensity = useDensity.getBoolean(); double densityValue = 1; // default density if (density != null) { densityValue = density.getDouble(); } double cumulativeClassFreq = 0; double swap; int length = classList.size(); for (int i = 1; i < length; i++) { lowerClassBound = ((GeoNumeric) classList.get(i - 1)).getDouble(); upperClassBound = ((GeoNumeric) classList.get(i)).getDouble(); // handle roundoff errror in class list values (this is possible // if auto-generated by another cmd) lowerClassBound = Kernel.checkDecimalFraction(lowerClassBound); upperClassBound = Kernel.checkDecimalFraction(upperClassBound); boolean increasing = true; if (lowerClassBound > upperClassBound) { swap = upperClassBound; upperClassBound = lowerClassBound; lowerClassBound = swap; increasing = false; } classFreq = f.getCumFreq(upperClassBound) - f.getCumFreq(lowerClassBound) + f.getCount(lowerClassBound); if ((i != length - 1 && increasing) || (i != 1 && !increasing)) classFreq -= f.getCount(upperClassBound); // System.out.println(" ================================="); // System.out.println("class freq: " + classFreq + 
" " + // density); if (doCumulative) cumulativeClassFreq += classFreq; // adjust the frequency and add to the output GeoList double v = doCumulative ? cumulativeClassFreq : classFreq; if (hasDensity) { v = densityValue * v / (upperClassBound - lowerClassBound); } addValue(v); } // handle the last (highest) class frequency specially // it must also count values equal to the highest class bound } }