Example usage for org.apache.commons.math.stat Frequency valuesIterator

List of usage examples for org.apache.commons.math.stat Frequency valuesIterator

Introduction

On this page you can find usage examples for the valuesIterator method of org.apache.commons.math.stat.Frequency.

Prototype

public Iterator<Comparable<?>> valuesIterator() 

Source Link

Document

Returns an Iterator over the set of values that have been added.

Usage

From source file:com.gtwm.pb.model.manageData.WordCloud.java

/**
 * Builds a word cloud from the given text. The text is split on non-word
 * characters; stop words and words shorter than minWordLength are dropped;
 * the remaining words are stemmed and the stems counted. Frequency outliers
 * are then removed, the set is trimmed to at most maxTags stems and each
 * surviving stem is scaled to a weight between minWeight and maxWeight.
 *
 * @param textLowerCase
 *            Input text, must be lower case
 * @param minWeight
 *            Minimum tag weight, e.g. a font size
 * @param maxWeight
 *            Max. tag weight
 * @param maxTags
 *            Maximum number of tags to return, -1 (or any negative value)
 *            for all tags
 * @param additionalStopWords
 *            Set of words to specifically exclude, in addition to the
 *            standard set [and, not, after, yes, no, ...]. May be null, in
 *            which case only the standard set is used
 */
public WordCloud(String textLowerCase, int minWeight, int maxWeight, int maxTags,
        Set<String> additionalStopWords) {
    String[] wordArray = textLowerCase.split("\\W");
    Set<String> stopWords = new HashSet<String>(Arrays.asList(stopWordsArray));
    if (additionalStopWords != null) {
        for (String additionalStopWord : additionalStopWords) {
            stopWords.add(additionalStopWord.toLowerCase().trim());
        }
    }
    LancasterStemmer stemmer = new LancasterStemmer();
    String wordStem;
    Frequency frequencies = new Frequency();
    for (String wordString : wordArray) {
        if ((!stopWords.contains(wordString)) && (wordString.length() >= minWordLength)) {
            wordStem = stemmer.stripSuffixes(wordString);
            // Record the mapping of the stem to its origin so the most
            // common origin can be re-introduced when the cloud is
            // generated
            this.recordStemOrigin(wordString, wordStem);
            frequencies.addValue(wordStem);
        }
    }
    // Compute std. dev of frequencies so we can remove outliers
    DescriptiveStatistics stats = new DescriptiveStatistics();
    Iterator<Comparable<?>> freqIt = frequencies.valuesIterator();
    long stemFreq;
    while (freqIt.hasNext()) {
        stemFreq = frequencies.getCount(freqIt.next());
        stats.addValue(stemFreq);
    }
    double mean = stats.getMean();
    double stdDev = stats.getStandardDeviation();
    long minFreq = Long.MAX_VALUE;
    long maxFreq = 0;
    // Remove outliers
    freqIt = frequencies.valuesIterator();
    int upperLimit = (int) (mean + (stdDev * 10));
    int lowerLimit = (int) (mean - stdDev);
    if (lowerLimit < 2) {
        lowerLimit = 2;
    }
    int numWords = 0;
    int numRawWords = wordArray.length;
    // A negative maxTags means "no limit on the number of tags"
    boolean limitTags = (maxTags >= 0);
    // Multiply as long so a very large maxTags cannot overflow int.
    // NOTE(review): for a negative maxTags this is always true, i.e. low
    // outliers are still removed in "all tags" mode - confirm that is wanted
    boolean removeLowOutliers = (numRawWords > ((long) maxTags * 10));
    while (freqIt.hasNext()) {
        wordStem = (String) freqIt.next();
        stemFreq = frequencies.getCount(wordStem);
        // For a large input set, remove high and low outliers.
        // For a smaller set, just high freq. outliers
        if ((stemFreq > upperLimit) || ((stemFreq < lowerLimit) && removeLowOutliers)) {
            freqIt.remove();
        } else {
            numWords++;
            // Track min and max independently: a value that raises the max
            // (e.g. the very first one) may also be the new min
            if (stemFreq > maxFreq) {
                maxFreq = stemFreq;
            }
            if (stemFreq < minFreq) {
                minFreq = stemFreq;
            }
        }
    }
    // Cut down to exact required number of tags by removing smallest
    if ((numWords > 0) && (lowerLimit < minFreq)) {
        lowerLimit = (int) minFreq;
    }
    if (limitTags && (numWords > maxTags)) {
        while (numWords > maxTags) {
            freqIt = frequencies.valuesIterator();
            SMALLREMOVAL: while (freqIt.hasNext()) {
                stemFreq = frequencies.getCount(freqIt.next());
                if (stemFreq < lowerLimit) {
                    freqIt.remove();
                    numWords--;
                    if (numWords == maxTags) {
                        break SMALLREMOVAL;
                    }
                }
            }
            // Raise the threshold for the next pass; always by at least 1
            // so the loop is guaranteed to make progress
            int step = (int) ((mean - lowerLimit) / 3);
            if (step < 1) {
                step = 1;
            }
            lowerLimit += step;
        }
        // The new min. freq. may have changed
        minFreq = Long.MAX_VALUE;
        freqIt = frequencies.valuesIterator();
        while (freqIt.hasNext()) {
            stemFreq = frequencies.getCount(freqIt.next());
            if (stemFreq < minFreq) {
                minFreq = stemFreq;
            }
        }
    }
    // Scale and create tag objects
    double scaleFactor;
    if (maxFreq == minFreq) {
        // TODO: a realistic scale factor in this case
        scaleFactor = (double) (maxWeight - minWeight) / 4;
    } else {
        scaleFactor = (double) (maxWeight - minWeight) / (maxFreq - minFreq);
    }
    freqIt = frequencies.valuesIterator();
    int weight;
    while (freqIt.hasNext()) {
        wordStem = (String) freqIt.next();
        stemFreq = frequencies.getCount(wordStem);
        // Might still be some left less than the min. threshold
        if (stemFreq <= minFreq) {
            weight = minWeight;
        } else {
            weight = (int) (Math.ceil((double) (stemFreq - minFreq) * scaleFactor) + minWeight);
        }
        // Present the stem using its most common original word; the other
        // origins are attached as synonyms
        SortedSet<WordInfo> origins = this.stemOriginMap.get(wordStem);
        String mostCommonOrigin = origins.last().getName();
        Set<String> synonyms = new TreeSet<String>();
        for (WordInfo origin : origins) {
            synonyms.add(origin.getName());
        }
        WordInfo word = new Word(mostCommonOrigin, weight, synonyms);
        this.words.add(word);
    }
}

From source file:geogebra.kernel.AlgoFrequency.java

/**
 * Recomputes the output lists from dataList.
 *
 * The frequency list is set undefined when the data list is undefined or
 * empty, when its elements are neither text nor numeric, when a class list
 * is given that is not numeric or has fewer than two entries, or when a
 * density is given that is not positive.
 *
 * Otherwise the unique data values are written to value (when present).
 * Without a class list, frequency receives the count (or cumulative count)
 * of each unique value; with a class list, it receives one (optionally
 * density-adjusted, optionally cumulative) frequency per class.
 */
protected final void compute() {

    // Validate input arguments
    //=======================================================

    if (!dataList.isDefined() || dataList.size() == 0) {
        frequency.setUndefined();
        return;
    }

    boolean isText = dataList.getElementType() == GeoElement.GEO_CLASS_TEXT;
    boolean isNumeric = dataList.getElementType() == GeoElement.GEO_CLASS_NUMERIC;
    if (!(isText || isNumeric)) {
        frequency.setUndefined();
        return;
    }

    if (classList != null) {
        if (classList.getElementType() != GeoElement.GEO_CLASS_NUMERIC || classList.size() < 2) {
            frequency.setUndefined();
            return;
        }
    }

    if (density != null) {
        if (density.getDouble() <= 0) {
            frequency.setUndefined();
            return;
        }
    }

    frequency.setDefined(true);
    frequency.clear();
    if (value != null) {
        value.clear();
    }

    boolean doCumulative = isCumulative != null && isCumulative.getBoolean();

    // Load the data into f, an instance of Frequency class
    //=======================================================

    Frequency f = new Frequency();
    for (int i = 0; i < dataList.size(); i++) {
        if (isText) {
            f.addValue(((GeoText) dataList.get(i)).toValueString());
        } else if (isNumeric) {
            f.addValue(((GeoNumeric) dataList.get(i)).getDouble());
        }
    }

    // Get the unique value list and, if classList does not exist,
    // compute frequencies for this list
    //=======================================================

    Iterator<Comparable<?>> itr = f.valuesIterator();
    if (isText) {
        // handle string data
        while (itr.hasNext()) {
            String s = (String) itr.next();
            // value is treated as optional above, so guard it here as well
            if (value != null) {
                GeoText text = new GeoText(cons);
                text.setTextString(s);
                value.add(text);
            }
            if (classList == null) {
                if (doCumulative) {
                    frequency.add(new GeoNumeric(cons, f.getCumFreq(s)));
                } else {
                    frequency.add(new GeoNumeric(cons, f.getCount(s)));
                }
            }
        }
    } else {
        // handle numeric data
        while (itr.hasNext()) {
            Double n = (Double) itr.next();
            if (value != null) {
                value.add(new GeoNumeric(cons, n));
            }
            if (classList == null) {
                if (doCumulative) {
                    frequency.add(new GeoNumeric(cons, f.getCumFreq(n)));
                } else {
                    frequency.add(new GeoNumeric(cons, f.getCount(n)));
                }
            }
        }
    }

    // If classList exists, compute frequencies using the classList
    //=======================================================

    if (classList != null) {

        //set density conditions
        boolean hasDensity = false;
        if (useDensity != null) {
            hasDensity = useDensity.getBoolean();
        }

        double densityValue = 1; // default density
        if (density != null) {
            densityValue = density.getDouble();
        }

        double cumulativeClassFreq = 0;
        int length = classList.size();
        for (int i = 1; i < length; i++) {

            double lowerClassBound = ((GeoNumeric) classList.get(i - 1)).getDouble();
            double upperClassBound = ((GeoNumeric) classList.get(i)).getDouble();

            // The class list may be given in decreasing order; normalise
            // so that lowerClassBound <= upperClassBound
            boolean increasing = true;
            if (lowerClassBound > upperClassBound) {
                double swap = upperClassBound;
                upperClassBound = lowerClassBound;
                lowerClassBound = swap;
                increasing = false;
            }

            Double lower = Double.valueOf(lowerClassBound);
            Double upper = Double.valueOf(upperClassBound);

            // Number of data values in the closed interval [lower, upper]
            double classFreq = f.getCumFreq(upper) - f.getCumFreq(lower) + f.getCount(lower);

            // Classes are half-open [lower, upper), except the highest
            // class (last one of an increasing list, first one of a
            // decreasing list), which also counts values equal to its
            // upper bound
            if ((i != length - 1 && increasing) || (i != 1 && !increasing)) {
                classFreq -= f.getCount(upper);
            }

            if (hasDensity) {
                classFreq = densityValue * classFreq / (upperClassBound - lowerClassBound);
            }
            if (doCumulative) {
                cumulativeClassFreq += classFreq;
            }

            // add the frequency to the output GeoList
            frequency.add(new GeoNumeric(cons, doCumulative ? cumulativeClassFreq : classFreq));
        }
    }
}

From source file:geogebra.common.kernel.statistics.AlgoFrequency.java

/**
 * Recomputes the output lists from dataList.
 *
 * Contingency tables are delegated to computeContingencyTable(). Otherwise
 * the frequency list is set undefined when the data list is undefined or
 * empty, when its elements are neither text nor numeric, when a class list
 * is given that is not numeric or has fewer than two entries, when a
 * density is given that is not positive, or when a scale is given that is
 * undefined.
 *
 * For valid input the unique data values are written to value (when
 * present). Without a class list, the count (or cumulative count) of each
 * unique value is passed to addValue; with a class list, one (optionally
 * cumulative, optionally density-adjusted) frequency per class is passed
 * to addValue.
 */
@Override
public final void compute() {

    if (isContingencyTable) {
        computeContingencyTable();
        return;
    }

    // Validate input arguments
    // =======================================================

    if (!dataList.isDefined() || dataList.size() == 0) {
        frequency.setUndefined();
        return;
    }

    boolean isText = dataList.getElementType().equals(GeoClass.TEXT);
    boolean isNumeric = dataList.getElementType().equals(GeoClass.NUMERIC);
    if (!(isText || isNumeric)) {
        frequency.setUndefined();
        return;
    }

    if (classList != null) {
        if (!classList.getElementType().equals(GeoClass.NUMERIC) || classList.size() < 2) {
            frequency.setUndefined();
            return;
        }
    }

    if (density != null) {
        if (density.getDouble() <= 0) {
            frequency.setUndefined();
            return;
        }
    }

    if (scale != null) {
        if (!scale.isDefined()) {
            frequency.setUndefined();
            return;
        }
        scaleFactor = scale.getValue();
    }

    frequency.setDefined(true);
    frequency.clear();
    if (value != null) {
        value.clear();
    }

    boolean doCumulative = isCumulative != null && isCumulative.getBoolean();

    // Load the data into f, an instance of Frequency class
    // =======================================================

    Frequency f = new FrequencyGgb();
    for (int i = 0; i < dataList.size(); i++) {
        if (isText) {
            f.addValue(((GeoText) dataList.get(i)).toValueString(StringTemplate.defaultTemplate));
        } else if (isNumeric) {
            f.addValue(((GeoNumeric) dataList.get(i)).getDouble());
        }
    }

    // Get the unique value list and, if classList does not exist,
    // compute frequencies for this list
    // =======================================================

    Iterator<Comparable<?>> itr = f.valuesIterator();
    if (isText) {
        // handle string data
        while (itr.hasNext()) {
            String s = (String) itr.next();
            // value is treated as optional above, so guard it here as well
            if (value != null) {
                GeoText text = new GeoText(cons);
                text.setTextString(s);
                value.add(text);
            }
            if (classList == null) {
                if (doCumulative) {
                    addValue(f.getCumFreq(s));
                } else {
                    addValue(f.getCount(s));
                }
            }
        }
    } else {
        // handle numeric data
        while (itr.hasNext()) {
            Double n = (Double) itr.next();
            if (value != null) {
                value.add(new GeoNumeric(cons, n));
            }
            if (classList == null) {
                if (doCumulative) {
                    addValue(f.getCumFreq(n));
                } else {
                    addValue(f.getCount(n));
                }
            }
        }
    }

    // If classList exists, compute frequencies using the classList
    // =======================================================

    if (classList != null) {

        // set density conditions
        boolean hasDensity = false;
        if (useDensity != null) {
            hasDensity = useDensity.getBoolean();
        }

        double densityValue = 1; // default density
        if (density != null) {
            densityValue = density.getDouble();
        }

        double cumulativeClassFreq = 0;
        int length = classList.size();
        for (int i = 1; i < length; i++) {

            double lowerClassBound = ((GeoNumeric) classList.get(i - 1)).getDouble();
            double upperClassBound = ((GeoNumeric) classList.get(i)).getDouble();

            // handle roundoff error in class list values (this is possible
            // if auto-generated by another cmd)
            lowerClassBound = Kernel.checkDecimalFraction(lowerClassBound);
            upperClassBound = Kernel.checkDecimalFraction(upperClassBound);

            // The class list may be given in decreasing order; normalise
            // so that lowerClassBound <= upperClassBound
            boolean increasing = true;
            if (lowerClassBound > upperClassBound) {
                double swap = upperClassBound;
                upperClassBound = lowerClassBound;
                lowerClassBound = swap;
                increasing = false;
            }

            // Number of data values in the closed interval [lower, upper]
            double classFreq = f.getCumFreq(upperClassBound) - f.getCumFreq(lowerClassBound)
                    + f.getCount(lowerClassBound);

            // Classes are half-open [lower, upper), except the highest
            // class (last one of an increasing list, first one of a
            // decreasing list), which also counts values equal to its
            // upper bound
            if ((i != length - 1 && increasing) || (i != 1 && !increasing)) {
                classFreq -= f.getCount(upperClassBound);
            }

            if (doCumulative) {
                cumulativeClassFreq += classFreq;
            }

            // adjust the frequency and add to the output GeoList
            double v = doCumulative ? cumulativeClassFreq : classFreq;
            if (hasDensity) {
                v = densityValue * v / (upperClassBound - lowerClassBound);
            }
            addValue(v);
        }
    }
}