Example usage for weka.core Utils stableSort

List of usage examples for weka.core Utils stableSort

Introduction

In this page you can find the example usage for weka.core Utils stableSort.

Prototype

public staticint[] stableSort(double[] array) 

Source Link

Document

Sorts a given array of doubles in ascending order and returns an array of integers with the positions of the elements of the original array in the sorted array.

Usage

From source file:cba.Apriori.java

License:Open Source License

/**
 * Method that generates all large itemsets with a minimum support, and from
 * these all association rules with a minimum confidence.
 *
 * @param instances the instances to be used for generating the associations
 * @throws Exception if rules can't be built successfully
 *///from w  ww.j  a  va2s .c o  m
public void buildAssociations(Instances instances) throws Exception {

    double[] confidences, supports;
    int[] indices;
    FastVector[] sortedRuleSet;
    int necSupport = 0;

    instances = new Instances(instances);

    if (m_removeMissingCols) {
        instances = removeMissingColumns(instances);
    }
    if (m_car && m_metricType != CONFIDENCE)
        throw new Exception("For CAR-Mining metric type has to be confidence!");

    // only set class index if CAR is requested
    if (m_car) {
        if (m_classIndex == -1) {
            instances.setClassIndex(instances.numAttributes() - 1);
        } else if (m_classIndex <= instances.numAttributes() && m_classIndex > 0) {
            instances.setClassIndex(m_classIndex - 1);
        } else {
            throw new Exception("Invalid class index.");
        }
    }

    // can associator handle the data?
    getCapabilities().testWithFail(instances);

    m_cycles = 0;
    if (m_car) {
        //m_instances does not contain the class attribute
        m_instances = LabeledItemSet.divide(instances, false);

        //m_onlyClass contains only the class attribute
        m_onlyClass = LabeledItemSet.divide(instances, true);
    } else
        m_instances = instances;

    if (m_car && m_numRules == Integer.MAX_VALUE) {
        // Set desired minimum support
        m_minSupport = m_lowerBoundMinSupport;
    } else {
        // Decrease minimum support until desired number of rules found.
        m_minSupport = m_upperBoundMinSupport - m_delta;
        m_minSupport = (m_minSupport < m_lowerBoundMinSupport) ? m_lowerBoundMinSupport : m_minSupport;
    }

    do {

        // Reserve space for variables
        m_Ls = new FastVector();
        m_hashtables = new FastVector();
        m_allTheRules = new FastVector[6];
        m_allTheRules[0] = new FastVector();
        m_allTheRules[1] = new FastVector();
        m_allTheRules[2] = new FastVector();
        if (m_metricType != CONFIDENCE || m_significanceLevel != -1) {
            m_allTheRules[3] = new FastVector();
            m_allTheRules[4] = new FastVector();
            m_allTheRules[5] = new FastVector();
        }
        sortedRuleSet = new FastVector[6];
        sortedRuleSet[0] = new FastVector();
        sortedRuleSet[1] = new FastVector();
        sortedRuleSet[2] = new FastVector();
        if (m_metricType != CONFIDENCE || m_significanceLevel != -1) {
            sortedRuleSet[3] = new FastVector();
            sortedRuleSet[4] = new FastVector();
            sortedRuleSet[5] = new FastVector();
        }
        if (!m_car) {
            // Find large itemsets and rules
            findLargeItemSets();
            if (m_significanceLevel != -1 || m_metricType != CONFIDENCE)
                findRulesBruteForce();
            else
                findRulesQuickly();
        } else {
            findLargeCarItemSets();
            findCarRulesQuickly();
        }

        // Sort rules according to their support
        /* supports = new double[m_allTheRules[2].size()];
         for (int i = 0; i < m_allTheRules[2].size(); i++) 
        supports[i] = (double)((AprioriItemSet)m_allTheRules[1].elementAt(i)).support();
         indices = Utils.stableSort(supports);
         for (int i = 0; i < m_allTheRules[2].size(); i++) {
        sortedRuleSet[0].addElement(m_allTheRules[0].elementAt(indices[i]));
        sortedRuleSet[1].addElement(m_allTheRules[1].elementAt(indices[i]));
        sortedRuleSet[2].addElement(m_allTheRules[2].elementAt(indices[i]));
        if (m_metricType != CONFIDENCE || m_significanceLevel != -1) {
        sortedRuleSet[3].addElement(m_allTheRules[3].elementAt(indices[i]));
        sortedRuleSet[4].addElement(m_allTheRules[4].elementAt(indices[i]));
        sortedRuleSet[5].addElement(m_allTheRules[5].elementAt(indices[i]));
        }
         }*/

        int j = m_allTheRules[2].size() - 1;
        supports = new double[m_allTheRules[2].size()];
        for (int i = 0; i < (j + 1); i++)
            supports[j - i] = ((double) ((ItemSet) m_allTheRules[1].elementAt(j - i)).support()) * (-1);
        indices = Utils.stableSort(supports);
        for (int i = 0; i < (j + 1); i++) {
            sortedRuleSet[0].addElement(m_allTheRules[0].elementAt(indices[j - i]));
            sortedRuleSet[1].addElement(m_allTheRules[1].elementAt(indices[j - i]));
            sortedRuleSet[2].addElement(m_allTheRules[2].elementAt(indices[j - i]));
            if (m_metricType != CONFIDENCE || m_significanceLevel != -1) {
                sortedRuleSet[3].addElement(m_allTheRules[3].elementAt(indices[j - i]));
                sortedRuleSet[4].addElement(m_allTheRules[4].elementAt(indices[j - i]));
                sortedRuleSet[5].addElement(m_allTheRules[5].elementAt(indices[j - i]));
            }
        }

        // Sort rules according to their confidence
        m_allTheRules[0].removeAllElements();
        m_allTheRules[1].removeAllElements();
        m_allTheRules[2].removeAllElements();
        if (m_metricType != CONFIDENCE || m_significanceLevel != -1) {
            m_allTheRules[3].removeAllElements();
            m_allTheRules[4].removeAllElements();
            m_allTheRules[5].removeAllElements();
        }
        confidences = new double[sortedRuleSet[2].size()];
        int sortType = 2 + m_metricType;

        for (int i = 0; i < sortedRuleSet[2].size(); i++)
            confidences[i] = ((Double) sortedRuleSet[sortType].elementAt(i)).doubleValue();
        indices = Utils.stableSort(confidences);
        for (int i = sortedRuleSet[0].size() - 1; (i >= (sortedRuleSet[0].size() - m_numRules))
                && (i >= 0); i--) {
            m_allTheRules[0].addElement(sortedRuleSet[0].elementAt(indices[i]));
            m_allTheRules[1].addElement(sortedRuleSet[1].elementAt(indices[i]));
            m_allTheRules[2].addElement(sortedRuleSet[2].elementAt(indices[i]));
            if (m_metricType != CONFIDENCE || m_significanceLevel != -1) {
                m_allTheRules[3].addElement(sortedRuleSet[3].elementAt(indices[i]));
                m_allTheRules[4].addElement(sortedRuleSet[4].elementAt(indices[i]));
                m_allTheRules[5].addElement(sortedRuleSet[5].elementAt(indices[i]));
            }
        }

        if (m_verbose) {
            if (m_Ls.size() > 1) {
                System.out.println(toString());
            }
        }
        if (m_minSupport == m_lowerBoundMinSupport || m_minSupport - m_delta > m_lowerBoundMinSupport)
            m_minSupport -= m_delta;
        else
            m_minSupport = m_lowerBoundMinSupport;

        necSupport = Math.round((float) ((m_minSupport * (double) m_instances.numInstances()) + 0.5));

        m_cycles++;
    } while ((m_allTheRules[0].size() < m_numRules) && (Utils.grOrEq(m_minSupport, m_lowerBoundMinSupport))
    /*        (necSupport >= lowerBoundNumInstancesSupport)*/
    /*        (Utils.grOrEq(m_minSupport, m_lowerBoundMinSupport)) */ && (necSupport >= 1));
    m_minSupport += m_delta;
}

From source file:Classifiers.BRkNN.java

License:Open Source License

/**
 * used for BRkNN-b (break ties arbitrarily)
 *
 * @param confidences the probabilities for each label
 * @return a bipartition/*w w  w  .j  a  va  2 s.c  om*/
 */
protected boolean[] labelsFromConfidences3(double[] confidences) {
    boolean[] bipartition = new boolean[numLabels];

    int[] indices = Utils.stableSort(confidences);

    ArrayList<Integer> lastindices = new ArrayList<Integer>();

    int counter = 0;
    int i = numLabels - 1;

    while (i > 0) {
        if (confidences[indices[i]] > confidences[indices[numLabels - avgPredictedLabels]]) {
            bipartition[indices[i]] = true;
            counter++;
        } else if (confidences[indices[i]] == confidences[indices[numLabels - avgPredictedLabels]]) {
            lastindices.add(indices[i]);
        } else {
            break;
        }
        i--;
    }

    int size = lastindices.size();

    int j = avgPredictedLabels - counter;
    while (j > 0) {
        int next = random.nextInt(size);
        if (bipartition[lastindices.get(next)] != true) {
            bipartition[lastindices.get(next)] = true;
            j--;
        }
    }

    return bipartition;
}

From source file:com.entopix.maui.filters.MauiFilter.java

License:Open Source License

/**
 * Converts an instance./*w w w  .ja va  2s  .co m*/
 */
private FastVector convertInstance(Instance instance, boolean training) {

    FastVector vector = new FastVector();

    String fileName = instance.stringValue(fileNameAtt);

    if (debugMode) {
        log.info("-- Converting instance for document " + fileName);
    }

    // Get the key phrases for the document
    HashMap<String, Counter> hashKeyphrases = null;

    if (!instance.isMissing(keyphrasesAtt)) {
        String keyphrases = instance.stringValue(keyphrasesAtt);
        hashKeyphrases = getGivenKeyphrases(keyphrases);
    }

    // Get the document text
    String documentText = instance.stringValue(documentAtt);

    // Compute the candidate topics
    HashMap<String, Candidate> candidateList;
    if (allCandidates != null && allCandidates.containsKey(instance)) {
        candidateList = allCandidates.get(instance);
    } else {
        candidateList = getCandidates(documentText);
    }
    if (debugMode) {
        log.info(candidateList.size() + " candidates ");
    }

    // Set indices for key attributes
    int tfidfAttIndex = documentAtt + 2;
    int distAttIndex = documentAtt + 3;
    int probsAttIndex = documentAtt + numFeatures;

    int countPos = 0;
    int countNeg = 0;

    // Go through the phrases and convert them into instances
    for (Candidate candidate : candidateList.values()) {

        if (candidate.getFrequency() < minOccurFrequency) {
            continue;
        }

        String name = candidate.getName();
        String orig = candidate.getBestFullForm();
        if (!vocabularyName.equals("none")) {
            orig = candidate.getTitle();
        }

        double[] vals = computeFeatureValues(candidate, training, hashKeyphrases, candidateList);

        Instance inst = new Instance(instance.weight(), vals);

        inst.setDataset(classifierData);

        double[] probs = null;
        try {
            // Get probability of a phrase being key phrase
            probs = classifier.distributionForInstance(inst);
        } catch (Exception e) {
            log.error("Exception while getting probability for candidate " + candidate.getName());
            continue;
        }

        double prob = probs[0];
        if (nominalClassValue) {
            prob = probs[1];
        }

        // Compute attribute values for final instance
        double[] newInst = new double[instance.numAttributes() + numFeatures + 2];

        int pos = 0;
        for (int i = 1; i < instance.numAttributes(); i++) {

            if (i == documentAtt) {

                // output of values for a given phrase:

                // 0 Add phrase
                int index = outputFormatPeek().attribute(pos).addStringValue(name);
                newInst[pos++] = index;

                // 1 Add original version
                if (orig != null) {
                    index = outputFormatPeek().attribute(pos).addStringValue(orig);
                } else {
                    index = outputFormatPeek().attribute(pos).addStringValue(name);
                }

                // 2
                newInst[pos++] = index;

                // Add features
                newInst[pos++] = inst.value(tfIndex); // 3
                newInst[pos++] = inst.value(idfIndex); // 4
                newInst[pos++] = inst.value(tfidfIndex); // 5
                newInst[pos++] = inst.value(firstOccurIndex); // 6
                newInst[pos++] = inst.value(lastOccurIndex); // 7
                newInst[pos++] = inst.value(spreadOccurIndex); // 8
                newInst[pos++] = inst.value(domainKeyphIndex); // 9
                newInst[pos++] = inst.value(lengthIndex); // 10 
                newInst[pos++] = inst.value(generalityIndex); // 11
                newInst[pos++] = inst.value(nodeDegreeIndex); // 12
                newInst[pos++] = inst.value(invWikipFreqIndex); // 13
                newInst[pos++] = inst.value(totalWikipKeyphrIndex); // 14
                newInst[pos++] = inst.value(wikipGeneralityIndex); // 15

                // Add probability
                probsAttIndex = pos;
                newInst[pos++] = prob; // 16

                // Set rank to missing (computed below)
                newInst[pos++] = Instance.missingValue(); // 17

            } else if (i == keyphrasesAtt) {
                newInst[pos++] = inst.classValue();
            } else {
                newInst[pos++] = instance.value(i);
            }
        }

        Instance ins = new Instance(instance.weight(), newInst);
        ins.setDataset(outputFormatPeek());
        vector.addElement(ins);

        if (inst.classValue() == 0) {
            countNeg++;
        } else {
            countPos++;
        }

    }
    if (debugMode) {
        log.info(countPos + " positive; " + countNeg + " negative instances");
    }

    // Sort phrases according to their distance (stable sort)
    double[] vals = new double[vector.size()];
    for (int i = 0; i < vals.length; i++) {
        vals[i] = ((Instance) vector.elementAt(i)).value(distAttIndex);
    }
    FastVector newVector = new FastVector(vector.size());
    int[] sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their tfxidf value (stable sort)
    for (int i = 0; i < vals.length; i++) {
        vals[i] = -((Instance) vector.elementAt(i)).value(tfidfAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their probability (stable sort)
    for (int i = 0; i < vals.length; i++) {
        vals[i] = 1 - ((Instance) vector.elementAt(i)).value(probsAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Compute rank of phrases. Check for subphrases that are ranked
    // lower than superphrases and assign probability -1 and set the
    // rank to Integer.MAX_VALUE
    int rank = 1;
    for (int i = 0; i < vals.length; i++) {
        Instance currentInstance = (Instance) vector.elementAt(i);

        // log.info(vals[i] + "\t" + currentInstance);

        // Short cut: if phrase very unlikely make rank very low and
        // continue
        if (Utils.grOrEq(vals[i], 1.0)) {
            currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE);
            continue;
        }

        // Otherwise look for super phrase starting with first phrase
        // in list that has same probability, TFxIDF value, and distance as
        // current phrase. We do this to catch all superphrases
        // that have same probability, TFxIDF value and distance as current
        // phrase.
        int startInd = i;
        while (startInd < vals.length) {
            Instance inst = (Instance) vector.elementAt(startInd);
            if ((inst.value(tfidfAttIndex) != currentInstance.value(tfidfAttIndex))
                    || (inst.value(probsAttIndex) != currentInstance.value(probsAttIndex))
                    || (inst.value(distAttIndex) != currentInstance.value(distAttIndex))) {
                break;
            }
            startInd++;
        }
        currentInstance.setValue(probsAttIndex + 1, rank++);

    }

    return vector;
}

From source file:com.openkm.kea.filter.KEAFilter.java

License:Open Source License

/**
 * Converts an instance.//from   w w  w . j a va  2 s  .  c  o m
 */
private FastVector convertInstance(Instance instance, boolean training) throws Exception {

    FastVector vector = new FastVector();

    if (m_Debug) {
        log.info("-- Converting instance");
    }

    // Get the key phrases for the document
    HashMap<String, Counter> hashKeyphrases = null;
    HashMap<String, Counter> hashKeysEval = null;
    if (!instance.isMissing(m_KeyphrasesAtt)) {
        String keyphrases = instance.stringValue(m_KeyphrasesAtt);
        hashKeyphrases = getGivenKeyphrases(keyphrases, false);
        hashKeysEval = getGivenKeyphrases(keyphrases, true);
    }

    // Get the phrases for the document
    HashMap<String, FastVector> hash = new HashMap<String, FastVector>();
    int length = getPhrases(hash, instance.stringValue(m_DocumentAtt));
    //   hash = getComposits(hash);

    /* Experimental:
     To compute how many of the manual keyphrases appear in the documents:
            
    log.info("Doc phrases found " + hash.size());
    log.info("Manual keyphrases: ");
    Iterator iter = hashKeyphrases.keySet().iterator();
    int count = 0;
    while (iter.hasNext()) {
       String id = (String)iter.next();
       if (hash.containsKey(id)) {
    count++;
       }
    }
            
    double max_recall = (double)count/(double)hashKeyphrases.size();
            
            
    m_max_recall += max_recall;
    doc++;
    double avg_m_max_recall = m_max_recall/(double)doc;
            
    String file = instance.stringValue(2);
    log.info(count + " out of " + hashKeyphrases.size() + " are in the document ");
    log.info("Max recall : " + avg_m_max_recall + " on " + doc + " documents ");
    */

    // Compute number of extra attributes
    int numFeatures = 5;
    if (m_Debug) {
        if (m_KFused) {
            numFeatures = numFeatures + 1;
        }
    }
    if (m_STDEVfeature) {
        numFeatures = numFeatures + 1;
    }
    if (m_NODEfeature) {
        numFeatures = numFeatures + 1;
    }
    if (m_LENGTHfeature) {
        numFeatures = numFeatures + 1;
    }

    // Set indices of key attributes
    //int phraseAttIndex = m_DocumentAtt;
    int tfidfAttIndex = m_DocumentAtt + 2;
    int distAttIndex = m_DocumentAtt + 3;
    int probsAttIndex = m_DocumentAtt + numFeatures - 1;
    //int classAttIndex = numFeatures;

    // Go through the phrases and convert them into instances
    Iterator<String> it = hash.keySet().iterator();
    while (it.hasNext()) {
        String id = it.next();
        FastVector phraseInfo = (FastVector) hash.get(id);

        double[] vals = featVals(id, phraseInfo, training, hashKeysEval, hashKeyphrases, length, hash);

        Instance inst = new Instance(instance.weight(), vals);

        inst.setDataset(m_ClassifierData);

        // Get probability of a phrase being key phrase
        double[] probs = m_Classifier.distributionForInstance(inst);

        // If simple Naive Bayes used, change here to
        //double prob = probs[1];
        double prob = probs[0];

        // Compute attribute values for final instance
        double[] newInst = new double[instance.numAttributes() + numFeatures];
        int pos = 0;
        for (int i = 0; i < instance.numAttributes(); i++) {
            if (i == m_DocumentAtt) {

                // output of values for a given phrase:

                // Add phrase
                int index = outputFormatPeek().attribute(pos).addStringValue(id);
                newInst[pos++] = index;

                // Add original version
                String orig = (String) phraseInfo.elementAt(2);

                if (orig != null) {
                    index = outputFormatPeek().attribute(pos).addStringValue(orig);
                } else {
                    index = outputFormatPeek().attribute(pos).addStringValue(id);
                }
                newInst[pos++] = index;

                // Add TFxIDF
                newInst[pos++] = inst.value(m_TfidfIndex);

                // Add distance
                newInst[pos++] = inst.value(m_FirstOccurIndex);

                // Add other features
                if (m_Debug) {
                    if (m_KFused) {
                        newInst[pos++] = inst.value(m_KeyFreqIndex);
                    }
                }
                if (m_STDEVfeature) {
                    newInst[pos++] = inst.value(m_STDEVIndex);
                }
                if (m_NODEfeature) {
                    newInst[pos++] = inst.value(m_NodeIndex);
                }
                if (m_LENGTHfeature) {
                    newInst[pos++] = inst.value(m_LengthIndex);
                }

                // Add probability 
                probsAttIndex = pos;
                newInst[pos++] = prob;

                // Set rank to missing (computed below)
                newInst[pos++] = Instance.missingValue();

            } else if (i == m_KeyphrasesAtt) {
                newInst[pos++] = inst.classValue();
            } else {
                newInst[pos++] = instance.value(i);
            }
        }
        Instance ins = new Instance(instance.weight(), newInst);
        ins.setDataset(outputFormatPeek());
        vector.addElement(ins);
    }

    // Add dummy instances for keyphrases that don't occur
    // in the document
    if (hashKeysEval != null) {
        Iterator<String> phrases = hashKeysEval.keySet().iterator();
        while (phrases.hasNext()) {
            String phrase = phrases.next();
            double[] newInst = new double[instance.numAttributes() + numFeatures];
            int pos = 0;
            for (int i = 0; i < instance.numAttributes(); i++) {
                if (i == m_DocumentAtt) {
                    // log.info("Here: " + phrase);
                    // Add phrase
                    int index = outputFormatPeek().attribute(pos).addStringValue(phrase);
                    newInst[pos++] = (double) index;

                    // Add original version
                    index = outputFormatPeek().attribute(pos).addStringValue(phrase);
                    newInst[pos++] = (double) index;

                    // Add TFxIDF
                    newInst[pos++] = Instance.missingValue();

                    // Add distance
                    newInst[pos++] = Instance.missingValue();

                    // Add other features
                    if (m_Debug) {
                        if (m_KFused) {
                            newInst[pos++] = Instance.missingValue();
                        }
                    }
                    if (m_STDEVfeature) {
                        newInst[pos++] = Instance.missingValue();
                    }
                    if (m_NODEfeature) {
                        newInst[pos++] = Instance.missingValue();
                    }
                    if (m_LENGTHfeature) {
                        newInst[pos++] = Instance.missingValue();
                    }

                    // Add probability and rank
                    newInst[pos++] = -Double.MAX_VALUE;
                    // newInst[pos++] = Instance.missingValue();
                } else if (i == m_KeyphrasesAtt) {
                    newInst[pos++] = 1; // Keyphrase
                } else {
                    newInst[pos++] = instance.value(i);
                }

                Instance inst = new Instance(instance.weight(), newInst);
                inst.setDataset(outputFormatPeek());
                vector.addElement(inst);
            }

        }
    }

    // Sort phrases according to their distance (stable sort)
    double[] vals = new double[vector.size()];
    for (int i = 0; i < vals.length; i++) {
        vals[i] = ((Instance) vector.elementAt(i)).value(distAttIndex);
    }
    FastVector newVector = new FastVector(vector.size());
    int[] sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their tfxidf value (stable sort)
    for (int i = 0; i < vals.length; i++) {
        vals[i] = -((Instance) vector.elementAt(i)).value(tfidfAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their probability (stable sort)
    for (int i = 0; i < vals.length; i++) {
        vals[i] = 1 - ((Instance) vector.elementAt(i)).value(probsAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Compute rank of phrases. Check for subphrases that are ranked
    // lower than superphrases and assign probability -1 and set the
    // rank to Integer.MAX_VALUE
    int rank = 1;
    for (int i = 0; i < vals.length; i++) {
        Instance currentInstance = (Instance) vector.elementAt(i);
        // Short cut: if phrase very unlikely make rank very low and continue
        if (Utils.grOrEq(vals[i], 1.0)) {
            currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE);
            continue;
        }

        // Otherwise look for super phrase starting with first phrase
        // in list that has same probability, TFxIDF value, and distance as
        // current phrase. We do this to catch all superphrases
        // that have same probability, TFxIDF value and distance as current phrase.
        int startInd = i;
        while (startInd < vals.length) {
            Instance inst = (Instance) vector.elementAt(startInd);
            if ((inst.value(tfidfAttIndex) != currentInstance.value(tfidfAttIndex))
                    || (inst.value(probsAttIndex) != currentInstance.value(probsAttIndex))
                    || (inst.value(distAttIndex) != currentInstance.value(distAttIndex))) {
                break;
            }
            startInd++;
        }
        currentInstance.setValue(probsAttIndex + 1, rank++);

    }
    return vector;
}

From source file:data.statistics.MLStatistics.java

License:Open Source License

/**
 * Returns the indices of the labels that have the strongest Phi correlation
 * with the label which is given as a parameter. The second parameter is the
 * number of labels that will be returned. It requires the method
 * calculatePhi to be previously called.
 *
 * @param labelIndex// w w  w  .  java  2s  .  c om
 *            The label index.
 * @param k
 *            The number of labels that will be returned. The number of
 *            labels that will be returned.
 * @return The indices of the k most correlated labels.
 */
public int[] topPhiCorrelatedLabels(int labelIndex, int k) {
    // create a new array containing the absolute values of the original
    // array
    double[] absCorrelations = new double[numLabels];
    for (int i = 0; i < numLabels; i++) {
        absCorrelations[i] = Math.abs(phi[labelIndex][i]);
    }
    // sort the array of correlations
    int[] sorted = Utils.stableSort(absCorrelations);

    int[] topPhiCorrelated = new int[k + 1];
    // the k last values of the sorted array are the indices of the top k
    // correlated labels
    for (int i = 0; i < k; i++) {
        topPhiCorrelated[i] = sorted[numLabels - 1 - i];
    }
    // one more for the class
    topPhiCorrelated[k] = numLabels;

    return topPhiCorrelated;
}

From source file:filters.MauiFilter.java

License:Open Source License

/**
 * Converts an instance./*w w  w  .  j ava2 s .c o m*/
 */
private FastVector convertInstance(Instance instance, boolean training) throws Exception {

    FastVector vector = new FastVector();

    String fileName = instance.stringValue(fileNameAtt);

    if (debugMode) {
        System.err.println("-- Converting instance for document " + fileName);
    }

    // Get the key phrases for the document
    HashMap<String, Counter> hashKeyphrases = null;

    if (!instance.isMissing(keyphrasesAtt)) {
        String keyphrases = instance.stringValue(keyphrasesAtt);
        hashKeyphrases = getGivenKeyphrases(keyphrases);
    }

    // Get the document text
    String documentText = instance.stringValue(documentAtt);

    // Compute the candidate topics
    HashMap<String, Candidate> candidateList;
    if (allCandidates != null && allCandidates.containsKey(instance)) {
        candidateList = allCandidates.get(instance);
    } else {
        candidateList = getCandidates(documentText);
    }
    if (debugMode) {
        System.err.println(candidateList.size() + " candidates ");
    }

    // Set indices for key attributes
    int tfidfAttIndex = documentAtt + 2;
    int distAttIndex = documentAtt + 3;
    int probsAttIndex = documentAtt + numFeatures;

    int countPos = 0;
    int countNeg = 0;

    // Go through the phrases and convert them into instances
    for (Candidate candidate : candidateList.values()) {

        if (candidate.getFrequency() < minOccurFrequency) {
            continue;
        }

        String name = candidate.getName();
        String orig = candidate.getBestFullForm();
        if (!vocabularyName.equals("none")) {
            orig = candidate.getTitle();
        }

        double[] vals = computeFeatureValues(candidate, training, hashKeyphrases, candidateList);

        Instance inst = new Instance(instance.weight(), vals);

        inst.setDataset(classifierData);

        // Get probability of a phrase being key phrase
        double[] probs = classifier.distributionForInstance(inst);

        double prob = probs[0];
        if (nominalClassValue) {
            prob = probs[1];
        }

        // Compute attribute values for final instance
        double[] newInst = new double[instance.numAttributes() + numFeatures + 2];

        int pos = 0;
        for (int i = 1; i < instance.numAttributes(); i++) {

            if (i == documentAtt) {

                // output of values for a given phrase:

                // Add phrase
                int index = outputFormatPeek().attribute(pos).addStringValue(name);
                newInst[pos++] = index;

                // Add original version
                if (orig != null) {
                    index = outputFormatPeek().attribute(pos).addStringValue(orig);
                } else {
                    index = outputFormatPeek().attribute(pos).addStringValue(name);
                }

                newInst[pos++] = index;

                // Add features
                newInst[pos++] = inst.value(tfIndex);
                newInst[pos++] = inst.value(idfIndex);
                newInst[pos++] = inst.value(tfidfIndex);
                newInst[pos++] = inst.value(firstOccurIndex);
                newInst[pos++] = inst.value(lastOccurIndex);
                newInst[pos++] = inst.value(spreadOccurIndex);
                newInst[pos++] = inst.value(domainKeyphIndex);
                newInst[pos++] = inst.value(lengthIndex);
                newInst[pos++] = inst.value(generalityIndex);
                newInst[pos++] = inst.value(nodeDegreeIndex);
                newInst[pos++] = inst.value(semRelIndex);
                newInst[pos++] = inst.value(wikipKeyphrIndex);
                newInst[pos++] = inst.value(invWikipFreqIndex);
                newInst[pos++] = inst.value(totalWikipKeyphrIndex);

                // Add probability
                probsAttIndex = pos;
                newInst[pos++] = prob;

                // Set rank to missing (computed below)
                newInst[pos++] = Instance.missingValue();

            } else if (i == keyphrasesAtt) {
                newInst[pos++] = inst.classValue();
            } else {
                newInst[pos++] = instance.value(i);
            }
        }

        Instance ins = new Instance(instance.weight(), newInst);
        ins.setDataset(outputFormatPeek());
        vector.addElement(ins);

        if (inst.classValue() == 0) {
            countNeg++;
        } else {
            countPos++;
        }
    }
    if (debugMode) {
        System.err.println(countPos + " positive; " + countNeg + " negative instances");
    }

    // Sort phrases according to their distance (stable sort)
    double[] vals = new double[vector.size()];
    for (int i = 0; i < vals.length; i++) {
        vals[i] = ((Instance) vector.elementAt(i)).value(distAttIndex);
    }
    FastVector newVector = new FastVector(vector.size());
    int[] sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their tfxidf value (stable sort)
    for (int i = 0; i < vals.length; i++) {
        vals[i] = -((Instance) vector.elementAt(i)).value(tfidfAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their probability (stable sort)
    for (int i = 0; i < vals.length; i++) {
        vals[i] = 1 - ((Instance) vector.elementAt(i)).value(probsAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Compute rank of phrases. Check for subphrases that are ranked
    // lower than superphrases and assign probability -1 and set the
    // rank to Integer.MAX_VALUE
    int rank = 1;
    for (int i = 0; i < vals.length; i++) {
        Instance currentInstance = (Instance) vector.elementAt(i);
        // Short cut: if phrase very unlikely make rank very low and
        // continue
        if (Utils.grOrEq(vals[i], 1.0)) {
            currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE);
            continue;
        }

        // Otherwise look for super phrase starting with first phrase
        // in list that has same probability, TFxIDF value, and distance as
        // current phrase. We do this to catch all superphrases
        // that have same probability, TFxIDF value and distance as current
        // phrase.
        int startInd = i;
        while (startInd < vals.length) {
            Instance inst = (Instance) vector.elementAt(startInd);
            if ((inst.value(tfidfAttIndex) != currentInstance.value(tfidfAttIndex))
                    || (inst.value(probsAttIndex) != currentInstance.value(probsAttIndex))
                    || (inst.value(distAttIndex) != currentInstance.value(distAttIndex))) {
                break;
            }
            startInd++;
        }
        currentInstance.setValue(probsAttIndex + 1, rank++);

    }

    return vector;
}

From source file:kea.KEAFilter.java

License:Open Source License

/**
 * Converts an instance.//from   ww  w. ja va2  s  .  c om
 */
private FastVector convertInstance(Instance instance, boolean training) throws Exception {

    FastVector vector = new FastVector();

    if (m_Debug) {
        System.err.println("-- Converting instance");
    }

    // Get the key phrases for the document
    HashMap hashKeyphrases = null;
    HashMap hashKeysEval = null;
    if (!instance.isMissing(m_KeyphrasesAtt)) {
        String keyphrases = instance.stringValue(m_KeyphrasesAtt);
        hashKeyphrases = getGivenKeyphrases(keyphrases, false);
        hashKeysEval = getGivenKeyphrases(keyphrases, true);
    }

    // Get the phrases for the document
    HashMap hash = new HashMap();
    int length = getPhrases(hash, instance.stringValue(m_DocumentAtt));

    // Compute number of extra attributes
    int numFeatures = 5;
    if (m_Debug) {
        if (m_KFused) {
            numFeatures = numFeatures + 1;
        }
    }

    // Set indices of key attributes
    int phraseAttIndex = m_DocumentAtt;
    int tfidfAttIndex = m_DocumentAtt + 2;
    int distAttIndex = m_DocumentAtt + 3;
    int probsAttIndex = m_DocumentAtt + numFeatures - 1;

    // Go through the phrases and convert them into instances
    Iterator it = hash.keySet().iterator();
    while (it.hasNext()) {
        String phrase = (String) it.next();
        FastVector phraseInfo = (FastVector) hash.get(phrase);
        double[] vals = featVals(phrase, phraseInfo, training, hashKeysEval, hashKeyphrases, length);
        Instance inst = new Instance(instance.weight(), vals);
        inst.setDataset(m_ClassifierData);

        // Get probability of phrase being key phrase
        double[] probs = m_Classifier.distributionForInstance(inst);
        double prob = probs[1];

        // Compute attribute values for final instance
        double[] newInst = new double[instance.numAttributes() + numFeatures];
        int pos = 0;
        for (int i = 0; i < instance.numAttributes(); i++) {
            if (i == m_DocumentAtt) {

                // Add phrase
                int index = outputFormatPeek().attribute(pos).addStringValue(phrase);
                newInst[pos++] = index;

                // Add original version
                index = outputFormatPeek().attribute(pos).addStringValue((String) phraseInfo.elementAt(2));
                newInst[pos++] = index;

                // Add TFxIDF
                newInst[pos++] = inst.value(m_TfidfIndex);

                // Add distance
                newInst[pos++] = inst.value(m_FirstOccurIndex);

                // Add other features
                if (m_Debug) {
                    if (m_KFused) {
                        newInst[pos++] = inst.value(m_KeyFreqIndex);
                    }
                }

                // Add probability 
                probsAttIndex = pos;
                newInst[pos++] = prob;

                // Set rank to missing (computed below)
                newInst[pos++] = Instance.missingValue();
            } else if (i == m_KeyphrasesAtt) {
                newInst[pos++] = inst.classValue();
            } else {
                newInst[pos++] = instance.value(i);
            }
        }
        Instance ins = new Instance(instance.weight(), newInst);
        ins.setDataset(outputFormatPeek());
        vector.addElement(ins);
    }

    // Add dummy instances for keyphrases that don't occur
    // in the document
    if (hashKeysEval != null) {
        Iterator phrases = hashKeysEval.keySet().iterator();
        while (phrases.hasNext()) {
            String phrase = (String) phrases.next();
            double[] newInst = new double[instance.numAttributes() + numFeatures];
            int pos = 0;
            for (int i = 0; i < instance.numAttributes(); i++) {
                if (i == m_DocumentAtt) {

                    // Add phrase
                    int index = outputFormatPeek().attribute(pos).addStringValue(phrase);
                    newInst[pos++] = (double) index;

                    // Add original version
                    index = outputFormatPeek().attribute(pos).addStringValue((String) hashKeysEval.get(phrase));
                    newInst[pos++] = (double) index;

                    // Add TFxIDF
                    newInst[pos++] = Instance.missingValue();

                    // Add distance
                    newInst[pos++] = Instance.missingValue();

                    // Add other features
                    if (m_Debug) {
                        if (m_KFused) {
                            newInst[pos++] = Instance.missingValue();
                        }
                    }

                    // Add probability and rank
                    newInst[pos++] = -Double.MAX_VALUE;
                    newInst[pos++] = Instance.missingValue();
                } else if (i == m_KeyphrasesAtt) {
                    newInst[pos++] = 1; // Keyphrase
                } else {
                    newInst[pos++] = instance.value(i);
                }
            }
            Instance inst = new Instance(instance.weight(), newInst);
            inst.setDataset(outputFormatPeek());
            vector.addElement(inst);
        }
    }

    // Sort phrases according to their distance (stable sort)
    double[] vals = new double[vector.size()];
    for (int i = 0; i < vals.length; i++) {
        vals[i] = ((Instance) vector.elementAt(i)).value(distAttIndex);
    }
    FastVector newVector = new FastVector(vector.size());
    int[] sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their tfxidf value (stable sort)
    for (int i = 0; i < vals.length; i++) {
        vals[i] = -((Instance) vector.elementAt(i)).value(tfidfAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their probability (stable sort)
    for (int i = 0; i < vals.length; i++) {
        vals[i] = 1 - ((Instance) vector.elementAt(i)).value(probsAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Compute rank of phrases. Check for subphrases that are ranked
    // lower than superphrases and assign probability -1 and set the
    // rank to Integer.MAX_VALUE
    int rank = 1;
    for (int i = 0; i < vals.length; i++) {
        Instance currentInstance = (Instance) vector.elementAt(i);

        // Short cut: if phrase very unlikely make rank very low and continue
        if (Utils.grOrEq(vals[i], 1.0)) {
            currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE);
            continue;
        }

        // Otherwise look for super phrase starting with first phrase
        // in list that has same probability, TFxIDF value, and distance as
        // current phrase. We do this to catch all superphrases
        // that have same probability, TFxIDF value and distance as current phrase.
        int startInd = i;
        while (startInd < vals.length) {
            Instance inst = (Instance) vector.elementAt(startInd);
            if ((inst.value(tfidfAttIndex) != currentInstance.value(tfidfAttIndex))
                    || (inst.value(probsAttIndex) != currentInstance.value(probsAttIndex))
                    || (inst.value(distAttIndex) != currentInstance.value(distAttIndex))) {
                break;
            }
            startInd++;
        }
        String val = currentInstance.stringValue(phraseAttIndex);
        boolean foundSuperphrase = false;
        for (int j = startInd - 1; j >= 0; j--) {
            if (j != i) {
                Instance candidate = (Instance) vector.elementAt(j);
                String potSuperphrase = candidate.stringValue(phraseAttIndex);
                if (val.length() <= potSuperphrase.length()) {
                    if (KEAFilter.contains(val, potSuperphrase)) {
                        foundSuperphrase = true;
                        break;
                    }
                }
            }
        }
        if (foundSuperphrase) {
            currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE);
        } else {
            currentInstance.setValue(probsAttIndex + 1, rank++);
        }
    }
    return vector;
}

From source file:mulan.data.Statistics.java

License:Open Source License

/**
 * Returns the indices of the labels that have the strongest phi correlation
 * with the label which is given as a parameter. The second parameter is
 * the number of labels that will be returned.
 *
 * @param labelIndex/*  w w w . jav  a  2  s. c  o m*/
 * @param k
 * @return the indices of the k most correlated labels
 */
public int[] topPhiCorrelatedLabels(int labelIndex, int k) {
    //create a new array containing the absolute values of the original array
    double[] absCorrelations = new double[numLabels];
    for (int i = 0; i < numLabels; i++) {
        absCorrelations[i] = Math.abs(phi[labelIndex][i]);
    }
    //sort the array of correlations
    int[] sorted = Utils.stableSort(absCorrelations);

    int[] topPhiCorrelated = new int[k + 1];
    //the k last values of the sorted array are the indices of the top k correlated labels
    for (int i = 0; i < k; i++) {
        topPhiCorrelated[i] = sorted[numLabels - 1 - i];
    }
    // one more for the class
    topPhiCorrelated[k] = numLabels;

    return topPhiCorrelated;
}

From source file:outlier.INNE.java

License:Open Source License

private void calAUC() {
    long tp = 0L;
    long fp = 0L;
    auc = 0.0D;/*from  w  ww .j a  va  2  s .c o  m*/

    int[] idx = Utils.stableSort(this.scores);

    for (int i = 0; i < this.numInstances; i++) {
        if (instances.instance(idx[i]).value(numAttributes) == 1.0D) {
            tp += 1L;
        } else {
            auc += tp;
            fp += 1L;
        }
    }

    auc /= (tp * fp);
    System.out.println("AUC : " + auc);
}