Example usage for weka.core Instance stringValue

Introduction

This page collects usage examples for the weka.core Instance method stringValue.

Prototype

public String stringValue(Attribute att);

Document

Returns the value of a nominal, string, date, or relational attribute for the instance as a string.
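
Most of the examples below call the index-based companion overload, stringValue(int attIndex), which looks up the same label by attribute index. A minimal, self-contained sketch of the call (assuming the ArrayList-based API of Weka 3.7+; several examples below use the older FastVector API instead):

import java.util.ArrayList;

import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;

public class StringValueDemo {
    public static void main(String[] args) {
        // One nominal attribute with two possible labels.
        ArrayList<String> colors = new ArrayList<String>();
        colors.add("red");
        colors.add("blue");
        ArrayList<Attribute> atts = new ArrayList<Attribute>();
        atts.add(new Attribute("color", colors));

        Instances data = new Instances("demo", atts, 1);
        Instance inst = new DenseInstance(1);
        inst.setDataset(data); // stringValue needs the attribute definitions
        inst.setValue(atts.get(0), "blue");

        // Prints "blue" -- the label, not the internal index 1.0.
        System.out.println(inst.stringValue(atts.get(0)));
        System.out.println(inst.stringValue(0)); // index-based overload
    }
}

As the examples below do, guard the call with isMissing() first; reading a missing value as a string is not meaningful.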

Usage

From source file:etc.aloe.filters.WordFeaturesExtractor.java

License:Open Source License

@Override
protected Instance process(Instance instance) throws Exception {
    if (selectedAttributeIndex < 0) {
        throw new IllegalStateException("String attribute not set");
    }

    int numOldValues = instance.numAttributes();
    int numNewFeatures = unigrams.size() + bigrams.size();
    double[] newValues = new double[numOldValues + numNewFeatures];

    // Copy all attributes from input to output
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
        if (getInputFormat().attribute(i).type() != Attribute.STRING) {
            // Add simple nominal and numeric attributes directly
            if (instance.value(i) != 0.0) {
                newValues[i] = instance.value(i);
            }
        } else {
            if (instance.isMissing(i)) {
                newValues[i] = Utils.missingValue();
            } else {

                // If this is a string attribute, we have to first add
                // this value to the range of possible values, then add
                // its new internal index.
                if (outputFormatPeek().attribute(i).numValues() == 0) {
                    // Note that the first string value in a
                    // SparseInstance doesn't get printed.
                    outputFormatPeek().attribute(i).addStringValue("Hack to defeat SparseInstance bug");
                }
                int newIndex = outputFormatPeek().attribute(i).addStringValue(instance.stringValue(i));
                newValues[i] = newIndex;
            }
        }
    }

    if (!instance.isMissing(selectedAttributeIndex)) {
        // Only read the string value once we know it is present.
        String stringValue = instance.stringValue(selectedAttributeIndex);

        List<String> words = tokenizeDocument(instance);
        Set<String> wordSet = new HashSet<String>(words);

        for (int i = 0; i < unigrams.size(); i++) {
            String unigram = unigrams.get(i);
            int count = 0;
            if (wordSet.contains(unigram)) {
                //Count the times the word is in the document
                for (int w = 0; w < words.size(); w++) {
                    if (words.get(w).equals(unigram)) {
                        count += 1;
                    }
                }
            }

            int featureIndex = numOldValues + i;
            newValues[featureIndex] = count;
        }

        for (int i = 0; i < bigrams.size(); i++) {
            Bigram bigram = bigrams.get(i);
            int count = bigram.getTimesInDocument(words);
            int featureIndex = numOldValues + unigrams.size() + i;
            newValues[featureIndex] = count;
        }
    }

    Instance result = new SparseInstance(instance.weight(), newValues);
    return result;
}

From source file:facebookpostpuller.RemoveEntriesGUI.java

private void btnDeletePostsActionPerformed(java.awt.event.ActionEvent evt) {
    String name = txtFacebookName.getText();
    btnDeletePosts.setVisible(false);
    btnSaveArff.setVisible(false);
    btnLoadArff.setVisible(false);
    int ctr = 0;
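    // Walk backwards so that deleting instance i does not shift the
    // indices of the instances still to be checked.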
    for (int i = data.numInstances() - 1; i >= 0; i--) {
        Instance inst = data.instance(i);
        if (inst.stringValue(0).equals(name)) {
            System.out.println("Deleted!");
            data.delete(i);
            ctr++;
        }
    }
    btnDeletePosts.setVisible(true);
    btnSaveArff.setVisible(true);
    btnLoadArff.setVisible(true);
    txtOutput.append(name + ": " + ctr + " posts deleted!\n");
}

From source file:filters.MauiFilter.java

License:Open Source License

private void selectCandidates() throws Exception {

    if (debugMode) {
        System.err.println("--- Computing candidates...");
    }

    allCandidates = new HashMap<Instance, HashMap<String, Candidate>>();

    // Convert pending input instances into data for classifier
    int totalDocuments = getInputFormat().numInstances();
    for (int i = 0; i < totalDocuments; i++) {

        Instance current = getInputFormat().instance(i);

        String fileName = current.stringValue(fileNameAtt);
        int j = i + 1;
        if (debugMode) {
            System.err.println(
                    "---- Processing document " + fileName + ", " + j + " out of " + totalDocuments + "...");
        }

        // Get the phrases for the document
        String documentText = current.stringValue(documentAtt);

        HashMap<String, Candidate> candidateList = getCandidates(documentText);

        if (debugMode) {
            System.err.println("---- " + candidateList.size() + " candidates");
        }
        allCandidates.put(current, candidateList);
    }

}

From source file:filters.MauiFilter.java

License:Open Source License

/**
 * Builds the classifier.
 */
private void buildClassifier() throws Exception {

    // Generate input format for classifier
    FastVector atts = new FastVector();
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
        if (i == documentAtt) {
            atts.addElement(new Attribute("Term_frequency")); // 2
            atts.addElement(new Attribute("IDF")); // 
            atts.addElement(new Attribute("TFxIDF")); // 
            atts.addElement(new Attribute("First_occurrence")); // 
            atts.addElement(new Attribute("Last_occurrence")); // 
            atts.addElement(new Attribute("Spread")); // 
            atts.addElement(new Attribute("Domain_keyphraseness")); // 
            atts.addElement(new Attribute("Length")); //
            atts.addElement(new Attribute("Generality")); //
            atts.addElement(new Attribute("Node_degree")); // 
            atts.addElement(new Attribute("Semantic_relatedness")); // 
            atts.addElement(new Attribute("Wikipedia_keyphraseness")); // 
            atts.addElement(new Attribute("Inverse_Wikip_frequency")); // 
            atts.addElement(new Attribute("Total_Wikip_keyphraseness")); // 13

        } else if (i == keyphrasesAtt) {
            if (nominalClassValue) {
                FastVector vals = new FastVector(2);
                vals.addElement("False");
                vals.addElement("True");
                atts.addElement(new Attribute("Keyphrase?", vals));
            } else {
                atts.addElement(new Attribute("Keyphrase?"));
            }
        }
    }

    classifierData = new Instances("ClassifierData", atts, 0);

    classifierData.setClassIndex(numFeatures);

    if (debugMode) {
        System.err.println("--- Converting instances for classifier");
    }
    int totalDocuments = getInputFormat().numInstances();
    // Convert pending input instances into data for classifier
    for (int i = 0; i < totalDocuments; i++) {
        Instance current = getInputFormat().instance(i);

        // Get the key phrases for the document
        String keyphrases = current.stringValue(keyphrasesAtt);
        HashMap<String, Counter> hashKeyphrases = getGivenKeyphrases(keyphrases);

        // Get the phrases for the document
        HashMap<String, Candidate> candidateList = allCandidates.get(current);

        // Compute the feature values for each phrase and
        // add the instance to the data for the classifier
        int countPos = 0;
        int countNeg = 0;

        if (debugMode) {
            System.err
                    .println("--- Computing features for document " + i + " out of " + totalDocuments + "...");
        }

        for (Candidate candidate : candidateList.values()) {

            // ignore all candidates that appear less than a threshold
            if (candidate.getFrequency() < minOccurFrequency) {
                continue;
            }

            // compute feature values
            double[] vals = computeFeatureValues(candidate, true, hashKeyphrases, candidateList);

            if (vals[vals.length - 1] == 0) {
                countNeg++;
            } else {
                countPos++;
            }
            Instance inst = new Instance(current.weight(), vals);
            // System.out.println(candidate + "\t" + inst);
            classifierData.add(inst);

        }
        if (debugMode) {
            System.err.println(countPos + " positive; " + countNeg + " negative instances");
        }
    }

    if (debugMode) {
        System.err.println("--- Building classifier");
    }

    if (classifier == null) {
        // Build classifier
        if (nominalClassValue) {

            //         FilteredClassifier fclass = new FilteredClassifier();
            //         fclass.setClassifier(new NaiveBayesSimple());
            //         fclass.setFilter(new Discretize());
            //         classifier = fclass;

            classifier = new Bagging(); // try also the commented-out FilteredClassifier above
            classifier.setOptions(
                    Utils.splitOptions("-P 10 -S 1 -I 10 -W weka.classifiers.trees.J48 -- -U -M 2"));

        } else {

            classifier = new Bagging();
            // try also
            // classifier.setOptions(Utils.splitOptions("-P 10 -S 1 -I 10 -W
            // weka.classifiers.trees.J48 -- -U -M 2")) ;
            String optionsString = "-P 100 -S 1 -I 10 -W weka.classifiers.trees.M5P -- -U -M 7.0";
            String[] options = Utils.splitOptions(optionsString);
            classifier.setOptions(options);

        }
    }
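    // Dump the classifier's training data to an ARFF file for inspection.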
    FileOutputStream out = new FileOutputStream(new File("19docs.arff"));
    PrintWriter printer = new PrintWriter(out);

    printer.write(classifierData.toString());

    printer.close();
    out.close();

    classifier.buildClassifier(classifierData);

    if (debugMode) {
        System.err.println(classifier);
    }

    // Save space
    classifierData = new Instances(classifierData, 0);
}

From source file:filters.MauiFilter.java

License:Open Source License

/**
 * Converts an instance.
 */
private FastVector convertInstance(Instance instance, boolean training) throws Exception {

    FastVector vector = new FastVector();

    String fileName = instance.stringValue(fileNameAtt);

    if (debugMode) {
        System.err.println("-- Converting instance for document " + fileName);
    }

    // Get the key phrases for the document
    HashMap<String, Counter> hashKeyphrases = null;

    if (!instance.isMissing(keyphrasesAtt)) {
        String keyphrases = instance.stringValue(keyphrasesAtt);
        hashKeyphrases = getGivenKeyphrases(keyphrases);
    }

    // Get the document text
    String documentText = instance.stringValue(documentAtt);

    // Compute the candidate topics
    HashMap<String, Candidate> candidateList;
    if (allCandidates != null && allCandidates.containsKey(instance)) {
        candidateList = allCandidates.get(instance);
    } else {
        candidateList = getCandidates(documentText);
    }
    if (debugMode) {
        System.err.println(candidateList.size() + " candidates ");
    }

    // Set indices for key attributes
    int tfidfAttIndex = documentAtt + 2;
    int distAttIndex = documentAtt + 3;
    int probsAttIndex = documentAtt + numFeatures;

    int countPos = 0;
    int countNeg = 0;

    // Go through the phrases and convert them into instances
    for (Candidate candidate : candidateList.values()) {

        if (candidate.getFrequency() < minOccurFrequency) {
            continue;
        }

        String name = candidate.getName();
        String orig = candidate.getBestFullForm();
        if (!vocabularyName.equals("none")) {
            orig = candidate.getTitle();
        }

        double[] vals = computeFeatureValues(candidate, training, hashKeyphrases, candidateList);

        Instance inst = new Instance(instance.weight(), vals);

        inst.setDataset(classifierData);

        // Get probability of a phrase being key phrase
        double[] probs = classifier.distributionForInstance(inst);

        double prob = probs[0];
        if (nominalClassValue) {
            prob = probs[1];
        }

        // Compute attribute values for final instance
        double[] newInst = new double[instance.numAttributes() + numFeatures + 2];

        int pos = 0;
        for (int i = 1; i < instance.numAttributes(); i++) {

            if (i == documentAtt) {

                // output of values for a given phrase:

                // Add phrase
                int index = outputFormatPeek().attribute(pos).addStringValue(name);
                newInst[pos++] = index;

                // Add original version
                if (orig != null) {
                    index = outputFormatPeek().attribute(pos).addStringValue(orig);
                } else {
                    index = outputFormatPeek().attribute(pos).addStringValue(name);
                }

                newInst[pos++] = index;

                // Add features
                newInst[pos++] = inst.value(tfIndex);
                newInst[pos++] = inst.value(idfIndex);
                newInst[pos++] = inst.value(tfidfIndex);
                newInst[pos++] = inst.value(firstOccurIndex);
                newInst[pos++] = inst.value(lastOccurIndex);
                newInst[pos++] = inst.value(spreadOccurIndex);
                newInst[pos++] = inst.value(domainKeyphIndex);
                newInst[pos++] = inst.value(lengthIndex);
                newInst[pos++] = inst.value(generalityIndex);
                newInst[pos++] = inst.value(nodeDegreeIndex);
                newInst[pos++] = inst.value(semRelIndex);
                newInst[pos++] = inst.value(wikipKeyphrIndex);
                newInst[pos++] = inst.value(invWikipFreqIndex);
                newInst[pos++] = inst.value(totalWikipKeyphrIndex);

                // Add probability
                probsAttIndex = pos;
                newInst[pos++] = prob;

                // Set rank to missing (computed below)
                newInst[pos++] = Instance.missingValue();

            } else if (i == keyphrasesAtt) {
                newInst[pos++] = inst.classValue();
            } else {
                newInst[pos++] = instance.value(i);
            }
        }

        Instance ins = new Instance(instance.weight(), newInst);
        ins.setDataset(outputFormatPeek());
        vector.addElement(ins);

        if (inst.classValue() == 0) {
            countNeg++;
        } else {
            countPos++;
        }
    }
    if (debugMode) {
        System.err.println(countPos + " positive; " + countNeg + " negative instances");
    }

    // Sort phrases according to their distance (stable sort)
    double[] vals = new double[vector.size()];
    for (int i = 0; i < vals.length; i++) {
        vals[i] = ((Instance) vector.elementAt(i)).value(distAttIndex);
    }
    FastVector newVector = new FastVector(vector.size());
    int[] sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their tfxidf value (stable sort)
    for (int i = 0; i < vals.length; i++) {
        vals[i] = -((Instance) vector.elementAt(i)).value(tfidfAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their probability (stable sort)
    for (int i = 0; i < vals.length; i++) {
        vals[i] = 1 - ((Instance) vector.elementAt(i)).value(probsAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Compute rank of phrases. Check for subphrases that are ranked
    // lower than superphrases and assign probability -1 and set the
    // rank to Integer.MAX_VALUE
    int rank = 1;
    for (int i = 0; i < vals.length; i++) {
        Instance currentInstance = (Instance) vector.elementAt(i);
        // Short cut: if phrase very unlikely make rank very low and
        // continue
        if (Utils.grOrEq(vals[i], 1.0)) {
            currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE);
            continue;
        }

        // Otherwise look for super phrase starting with first phrase
        // in list that has same probability, TFxIDF value, and distance as
        // current phrase. We do this to catch all superphrases
        // that have same probability, TFxIDF value and distance as current
        // phrase.
        int startInd = i;
        while (startInd < vals.length) {
            Instance inst = (Instance) vector.elementAt(startInd);
            if ((inst.value(tfidfAttIndex) != currentInstance.value(tfidfAttIndex))
                    || (inst.value(probsAttIndex) != currentInstance.value(probsAttIndex))
                    || (inst.value(distAttIndex) != currentInstance.value(distAttIndex))) {
                break;
            }
            startInd++;
        }
        currentInstance.setValue(probsAttIndex + 1, rank++);

    }

    return vector;
}
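
The three consecutive stable sorts above implement a multi-key sort: because each pass is stable, sorting by the least significant key first (distance), then the next (TFxIDF), and the most significant key last (probability) leaves ties on the later keys ordered by the earlier ones. A minimal sketch of the idiom with weka.core.Utils.stableSort, using made-up data (note the filters negate values, since stableSort orders ascending):

import weka.core.Utils;

public class MultiKeySort {
    public static void main(String[] args) {
        double[] secondary = { 3, 1, 2, 4 }; // tie-breaking key
        double[] primary = { 1, 1, 0, 0 }; // dominant key

        // Pass 1: order the items by the secondary key.
        int[] order = Utils.stableSort(secondary);

        // Pass 2: re-sort by the primary key; stability preserves
        // the secondary ordering within groups of equal primary values.
        double[] primaryReordered = new double[primary.length];
        for (int i = 0; i < order.length; i++) {
            primaryReordered[i] = primary[order[i]];
        }
        int[] finalOrder = Utils.stableSort(primaryReordered);

        // Prints the original item indices: 2, 3, 1, 0
        for (int i : finalOrder) {
            System.out.println("item " + order[i]);
        }
    }
}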

From source file:imba.classifier.NBTubes.java

@Override
public double[] distributionForInstance(Instance instance) throws Exception {
    // Compute the probability of each class for the instance
    // passed in as the method parameter.
    Instances temp = null;
    Instance p;
    Filter f;
    double[] a = new double[infoClassifier.get(0).get(0).size()];
    int i, j, k, l, x, c;
    double t, prev;
    Enumeration n;
    boolean big;
    String val;
    String[] valMinMax;

    if (wasNumeric) {

        header_Instances.add(instance);

        f = new Normalize();
        try {
            f.setInputFormat(header_Instances);

            for (Instance i1 : header_Instances) {
                f.input(i1);
            }

            f.batchFinished();
        } catch (Exception ex) {
            Logger.getLogger(NBTubes.class.getName()).log(Level.SEVERE, null, ex);
        }

        temp = f.getOutputFormat();

        while ((p = f.output()) != null) {
            temp.add(p);
        }
    }

    f = new NumericToNominal();

    if (wasNumeric) {
        try {
            f.setInputFormat(temp);

            for (Instance i1 : temp) {
                f.input(i1);
            }

            f.batchFinished();
        } catch (Exception ex) {
            Logger.getLogger(NBTubes.class.getName()).log(Level.SEVERE, null, ex);
        }

        temp = f.getOutputFormat();

        while ((p = f.output()) != null) {
            temp.add(p);
        }

        instance = temp.lastInstance();

        header_Instances.remove(header_Instances.size() - 1);
    } else {
        f.setInputFormat(header_Instances);

        f.input(instance);

        f.batchFinished();

        instance = f.output();
    }

    // Compute the distribution of the instance over each class
    i = 0;
    while (i < (a.length)) {
        a[i] = (double) sumClass[i] / dataSize;

        j = 0;
        k = 0;
        while (j < infoClassifier.size()) {

            if (j == classIdx) {
                k++;
            }

            if (wasNumeric) {
                if (filter.equals("Discretize")) {
                    l = 0;
                    big = false;
                    while (l < dataset.attribute(k).numValues() && !big) {
                        // Parse the bin label produced by Discretize
                        // (e.g. "'(-inf-0.5]'") into numeric boundaries.
                        val = String.valueOf(dataset.attribute(k).value(l));
                        //System.out.println("k = " + k);
                        //System.out.println("nilai = " + instance.stringValue(k));
                        val = val.replaceAll("'", "");
                        val = val.replaceAll("\\(", "");
                        val = val.replaceAll("\\)", "");
                        val = val.replaceAll("]", "");

                        //System.out.println(val);

                        valMinMax = val.split("-");

                        // match the instance value against this bin

                        if (valMinMax.length == 3) {
                            if (valMinMax[1].equals("inf")) {
                                valMinMax[1] = "0.0";
                            }
                            //System.out.println("Min = " + valMinMax[1]);
                            //System.out.println("Max = " + valMinMax[2]);
                            if (Double.valueOf(instance.stringValue(k)) > Double.valueOf(valMinMax[1]) && Double
                                    .valueOf(instance.stringValue(k)) <= Double.valueOf(valMinMax[2])) {
                                big = true;
                            }
                        } else {
                            if (valMinMax.length == 2) {
                                if (valMinMax[1].equals("inf")) {
                                    valMinMax[1] = "1.0";
                                }

                                if (Double.valueOf(instance.stringValue(k)) > Double.valueOf(valMinMax[0])
                                        && Double.valueOf(instance.stringValue(k)) <= Double
                                                .valueOf(valMinMax[1])) {
                                    big = true;
                                }
                            } else {
                                l = dataset.attribute(k).indexOfValue(instance.stringValue(k));
                                big = true;
                            }
                            //System.out.println("Min = " + valMinMax[0]);
                            //System.out.println("Max = " + valMinMax[1]);

                        }
                        l++;
                    }

                    x = l - 1;

                    //System.out.println("x = " + x);
                } else {
                    big = false;
                    l = 0;
                    n = dataset.attribute(k).enumerateValues();

                    t = 0;
                    prev = 0;
                    while (l < dataset.attribute(k).numValues() && !big) {
                        t = Double.valueOf(n.nextElement().toString());

                        //System.out.println(prev + "   " + t);
                        if (Double.valueOf(instance.stringValue(k)) <= t) {
                            big = true;
                        } else {
                            prev = t;
                        }

                        l++;
                    }

                    if (big && t != Double.valueOf(instance.stringValue(k))) {
                        System.out.println(prev + "   " + Double.valueOf(instance.stringValue(k)) + "   " + t);
                    }
                    //x = l - 1;

                    if (classIdx < 2) {
                        c = 2;
                    } else {
                        c = 1;
                    }

                    if (big && l > c) {
                        if ((Double.valueOf(instance.stringValue(k)) - prev) <= (t
                                - Double.valueOf(instance.stringValue(k)))) {
                            x = l - 2;
                        } else {
                            x = l - 1;
                        }
                    } else {
                        x = l - 1;
                    }
                }

            } else {
                x = dataset.attribute(k).indexOfValue(instance.stringValue(k));
            }

            a[i] *= infoClassifier.get(j).get(x).get(i);

            k++;
            j++;
        }

        i++;
    }

    return a;
}

From source file:jjj.asap.sas.parser.job.ImportParserData.java

License:Open Source License

private void process(final String parent, int essaySet, Map<Double, List<String>> tags,
        Map<Double, List<String>> parseTrees, Map<Double, List<String>> depends) {

    // check if output exists
    boolean any = false;

    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-extra-stats.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-pos-tags.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-parse-tree.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends0.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends1.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends2.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends3.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends4.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends5.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends6.arff"))
        any = true;

    if (!any) {
        Job.log("NOTE", "work/datasets/" + parent + "/" + essaySet
                + "-*.arff returns all required datasets - nothing to do");
        return;
    }

    // Load an existing dataset to use as a template.
    Instances dataset = Dataset.load("work/datasets/" + parent + "/" + essaySet + "-spell-checked.arff");

    // create the output datasets here. except for the extra statistics, 
    // the format is the same as 'dataset'.

    Instances tagsData = new Instances(dataset, 0);
    tagsData.setRelationName(essaySet + "-pos-tags.arff");
    Instances treeData = new Instances(dataset, 0);
    treeData.setRelationName(essaySet + "-parse-tree.arff");

    Instances dependsData[] = new Instances[7];
    for (int j = 0; j < 7; j++) {
        dependsData[j] = new Instances(dataset, 0);
        dependsData[j].setRelationName(essaySet + "-depends" + j + ".arff");
    }

    // extra stats
    DatasetBuilder builder = new DatasetBuilder();
    builder.addVariable("id");
    if (Contest.isMultiChoice(essaySet)) {
        builder.addNominalVariable("color", Contest.COLORS);
    }
    builder.addVariable("x_sent");
    builder.addVariable("x_para");
    builder.addVariable("x_length");
    builder.addVariable("x_words");
    builder.addVariable("x_unique_words");
    builder.addNominalVariable("score", Contest.getRubrics(essaySet));

    Instances extraStats = builder.getDataset(essaySet + "-extra-stats.arff");

    // now add rows for each instance

    for (int i = 0; i < dataset.numInstances(); i++) {

        // common variables
        Instance ob = dataset.instance(i);
        double id = ob.value(0);
        String y = ob.isMissing(dataset.numAttributes() - 1) ? null
                : ob.stringValue(dataset.numAttributes() - 1);
        String color = Contest.isMultiChoice(essaySet) ? ob.stringValue(dataset.attribute("color")) : null;
        String str = ob.stringValue(dataset.attribute("text"));

        //
        // Extra stats
        //

        int nSent = tags.containsKey(id) ? tags.get(id).size() : 0;
        int nPara = 0;
        for (int a = 0; a < str.length(); a++) {
            if (str.charAt(a) == '^')
                nPara++;
        }
        int nLength = str.length();
        int nWords = 0;
        int nUniqueWords = 0;
        String[] words = str.toLowerCase().split(" ");
        nWords = words.length;
        Set<String> u = new HashSet<String>();
        for (String w : words) {
            u.add(w);
        }
        nUniqueWords = u.size();

        extraStats.add(new DenseInstance(extraStats.numAttributes()));
        Instance extra = extraStats.lastInstance();
        extra.setValue(0, id);
        if (Contest.isMultiChoice(essaySet)) {
            extra.setValue(1, color);
        }

        extra.setValue(extraStats.attribute("x_sent"), nSent);
        extra.setValue(extraStats.attribute("x_para"), nPara);
        extra.setValue(extraStats.attribute("x_length"), nLength);
        extra.setValue(extraStats.attribute("x_words"), nWords);
        extra.setValue(extraStats.attribute("x_unique_words"), nUniqueWords);

        if (y == null)
            extra.setValue(extraStats.numAttributes() - 1, Utils.missingValue());
        else
            extra.setValue(extraStats.numAttributes() - 1, y);

        //
        // POS tags
        //

        String tagsText = "";
        List<String> tagsList = tags.get(id);
        if (tagsList == null || tagsList.isEmpty()) {
            Job.log("WARNING", "no tags for " + id);
            tagsText = "x";
        } else {
            for (String tagsItem : tagsList) {
                tagsText += tagsItem;
            }
        }

        tagsData.add(new DenseInstance(ob.numAttributes()));
        Instance tagsOb = tagsData.lastInstance();
        tagsOb.setValue(0, id);
        if (Contest.isMultiChoice(essaySet)) {
            tagsOb.setValue(1, color);
            tagsOb.setValue(2, tagsText.trim());
            if (y == null) {
                tagsOb.setValue(3, Utils.missingValue());
            } else {
                tagsOb.setValue(3, y);
            }
        } else {
            tagsOb.setValue(1, tagsText.trim());
            if (y == null) {
                tagsOb.setValue(2, Utils.missingValue());
            } else {
                tagsOb.setValue(2, y);
            }
        }

        //
        // Parse Tree
        //

        String treeText = "";
        List<String> treeList = parseTrees.get(id);
        if (treeList == null || treeList.isEmpty()) {
            Job.log("WARNING", "no parse tree for " + id);
            treeText = "x";
        } else {
            for (String treeItem : treeList) {
                treeText += treeItem;
            }
        }

        treeData.add(new DenseInstance(ob.numAttributes()));
        Instance treeOb = treeData.lastInstance();
        treeOb.setValue(0, id);
        if (Contest.isMultiChoice(essaySet)) {
            treeOb.setValue(1, color);
            treeOb.setValue(2, treeText.trim());
            if (y == null) {
                treeOb.setValue(3, Utils.missingValue());
            } else {
                treeOb.setValue(3, y);
            }
        } else {
            treeOb.setValue(1, treeText.trim());
            if (y == null) {
                treeOb.setValue(2, Utils.missingValue());
            } else {
                treeOb.setValue(2, y);
            }
        }

        //
        // Depends data
        //

        for (int j = 0; j < 7; j++) {
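            // Seven projections of each three-part dependency term:
            // j = 0 keeps all parts, j = 1..3 drop one part each,
            // and j = 4..6 keep just one part.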

            String text = "";
            List<String> list = depends.get(id);
            if (list == null || list.isEmpty()) {
                Job.log("WARNING", "no depends for " + id);
                text = "x";
            } else {
                for (String item : list) {
                    String[] term = StringUtils.safeSplit(item, "/", 3);
                    switch (j) {
                    case 0:
                        text += item;
                        break;
                    case 1:
                        text += term[1] + "/" + term[2];
                        break;
                    case 2:
                        text += term[0] + "/" + term[2];
                        break;
                    case 3:
                        text += term[0] + "/" + term[1];
                        break;
                    case 4:
                        text += term[0];
                        break;
                    case 5:
                        text += term[1];
                        break;
                    case 6:
                        text += term[2];
                        break;
                    }
                    text += " ";
                }
            }

            dependsData[j].add(new DenseInstance(ob.numAttributes()));
            Instance dependsOb = dependsData[j].lastInstance();
            dependsOb.setValue(0, id);
            if (Contest.isMultiChoice(essaySet)) {
                dependsOb.setValue(1, color);
                dependsOb.setValue(2, text.trim());
                if (y == null) {
                    dependsOb.setValue(3, Utils.missingValue());
                } else {
                    dependsOb.setValue(3, y);
                }
            } else {
                dependsOb.setValue(1, text.trim());
                if (y == null) {
                    dependsOb.setValue(2, Utils.missingValue());
                } else {
                    dependsOb.setValue(2, y);
                }
            }

        } // j
    } // dataset

    // Now save the new datasets

    Dataset.save("work/datasets/" + parent + "/" + tagsData.relationName(), tagsData);
    Dataset.save("work/datasets/" + parent + "/" + treeData.relationName(), treeData);
    for (int j = 0; j < 7; j++) {
        Dataset.save("work/datasets/" + parent + "/" + dependsData[j].relationName(), dependsData[j]);
    }
    Dataset.save("work/datasets/" + parent + "/" + extraStats.relationName(), extraStats);

}

From source file:kea.KEAFilter.java

License:Open Source License

/**
 * Builds the classifier.
 */
private void buildClassifier() throws Exception {

    // Generate input format for classifier
    FastVector atts = new FastVector();
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
        if (i == m_DocumentAtt) {
            atts.addElement(new Attribute("TFxIDF"));
            atts.addElement(new Attribute("First_occurrence"));
            if (m_KFused) {
                atts.addElement(new Attribute("Keyphrase_frequency"));
            }
        } else if (i == m_KeyphrasesAtt) {
            FastVector vals = new FastVector(2);
            vals.addElement("False");
            vals.addElement("True");
            atts.addElement(new Attribute("Keyphrase?", vals));
        }
    }
    m_ClassifierData = new Instances("ClassifierData", atts, 0);
    m_ClassifierData.setClassIndex(m_NumFeatures);

    if (m_Debug) {
        System.err.println("--- Converting instances for classifier");
    }

    // Convert pending input instances into data for classifier
    for (int i = 0; i < getInputFormat().numInstances(); i++) {
        Instance current = getInputFormat().instance(i);

        // Get the key phrases for the document
        String keyphrases = current.stringValue(m_KeyphrasesAtt);
        HashMap hashKeyphrases = getGivenKeyphrases(keyphrases, false);
        HashMap hashKeysEval = getGivenKeyphrases(keyphrases, true);

        // Get the phrases for the document
        HashMap hash = new HashMap();
        int length = getPhrases(hash, current.stringValue(m_DocumentAtt));

        // Compute the feature values for each phrase and
        // add the instance to the data for the classifier
        Iterator it = hash.keySet().iterator();
        while (it.hasNext()) {
            String phrase = (String) it.next();
            FastVector phraseInfo = (FastVector) hash.get(phrase);
            double[] vals = featVals(phrase, phraseInfo, true, hashKeysEval, hashKeyphrases, length);
            Instance inst = new Instance(current.weight(), vals);
            m_ClassifierData.add(inst);
        }
    }

    if (m_Debug) {
        System.err.println("--- Building classifier");
    }

    // Build classifier
    FilteredClassifier fclass = new FilteredClassifier();
    fclass.setClassifier(new NaiveBayesSimple());
    fclass.setFilter(new Discretize());
    m_Classifier = fclass;
    m_Classifier.buildClassifier(m_ClassifierData);

    if (m_Debug) {
        System.err.println(m_Classifier);
    }

    // Save space
    m_ClassifierData = new Instances(m_ClassifierData, 0);
}

From source file:kea.KEAFilter.java

License:Open Source License

/**
 * Converts an instance.
 */
private FastVector convertInstance(Instance instance, boolean training) throws Exception {

    FastVector vector = new FastVector();

    if (m_Debug) {
        System.err.println("-- Converting instance");
    }

    // Get the key phrases for the document
    HashMap hashKeyphrases = null;
    HashMap hashKeysEval = null;
    if (!instance.isMissing(m_KeyphrasesAtt)) {
        String keyphrases = instance.stringValue(m_KeyphrasesAtt);
        hashKeyphrases = getGivenKeyphrases(keyphrases, false);
        hashKeysEval = getGivenKeyphrases(keyphrases, true);
    }

    // Get the phrases for the document
    HashMap hash = new HashMap();
    int length = getPhrases(hash, instance.stringValue(m_DocumentAtt));

    // Compute number of extra attributes
    int numFeatures = 5;
    if (m_Debug) {
        if (m_KFused) {
            numFeatures = numFeatures + 1;
        }
    }

    // Set indices of key attributes
    int phraseAttIndex = m_DocumentAtt;
    int tfidfAttIndex = m_DocumentAtt + 2;
    int distAttIndex = m_DocumentAtt + 3;
    int probsAttIndex = m_DocumentAtt + numFeatures - 1;

    // Go through the phrases and convert them into instances
    Iterator it = hash.keySet().iterator();
    while (it.hasNext()) {
        String phrase = (String) it.next();
        FastVector phraseInfo = (FastVector) hash.get(phrase);
        double[] vals = featVals(phrase, phraseInfo, training, hashKeysEval, hashKeyphrases, length);
        Instance inst = new Instance(instance.weight(), vals);
        inst.setDataset(m_ClassifierData);

        // Get probability of phrase being key phrase
        double[] probs = m_Classifier.distributionForInstance(inst);
        double prob = probs[1];

        // Compute attribute values for final instance
        double[] newInst = new double[instance.numAttributes() + numFeatures];
        int pos = 0;
        for (int i = 0; i < instance.numAttributes(); i++) {
            if (i == m_DocumentAtt) {

                // Add phrase
                int index = outputFormatPeek().attribute(pos).addStringValue(phrase);
                newInst[pos++] = index;

                // Add original version
                index = outputFormatPeek().attribute(pos).addStringValue((String) phraseInfo.elementAt(2));
                newInst[pos++] = index;

                // Add TFxIDF
                newInst[pos++] = inst.value(m_TfidfIndex);

                // Add distance
                newInst[pos++] = inst.value(m_FirstOccurIndex);

                // Add other features
                if (m_Debug) {
                    if (m_KFused) {
                        newInst[pos++] = inst.value(m_KeyFreqIndex);
                    }
                }

                // Add probability 
                probsAttIndex = pos;
                newInst[pos++] = prob;

                // Set rank to missing (computed below)
                newInst[pos++] = Instance.missingValue();
            } else if (i == m_KeyphrasesAtt) {
                newInst[pos++] = inst.classValue();
            } else {
                newInst[pos++] = instance.value(i);
            }
        }
        Instance ins = new Instance(instance.weight(), newInst);
        ins.setDataset(outputFormatPeek());
        vector.addElement(ins);
    }

    // Add dummy instances for keyphrases that don't occur
    // in the document
    if (hashKeysEval != null) {
        Iterator phrases = hashKeysEval.keySet().iterator();
        while (phrases.hasNext()) {
            String phrase = (String) phrases.next();
            double[] newInst = new double[instance.numAttributes() + numFeatures];
            int pos = 0;
            for (int i = 0; i < instance.numAttributes(); i++) {
                if (i == m_DocumentAtt) {

                    // Add phrase
                    int index = outputFormatPeek().attribute(pos).addStringValue(phrase);
                    newInst[pos++] = (double) index;

                    // Add original version
                    index = outputFormatPeek().attribute(pos).addStringValue((String) hashKeysEval.get(phrase));
                    newInst[pos++] = (double) index;

                    // Add TFxIDF
                    newInst[pos++] = Instance.missingValue();

                    // Add distance
                    newInst[pos++] = Instance.missingValue();

                    // Add other features
                    if (m_Debug) {
                        if (m_KFused) {
                            newInst[pos++] = Instance.missingValue();
                        }
                    }

                    // Add probability and rank
                    newInst[pos++] = -Double.MAX_VALUE;
                    newInst[pos++] = Instance.missingValue();
                } else if (i == m_KeyphrasesAtt) {
                    newInst[pos++] = 1; // Keyphrase
                } else {
                    newInst[pos++] = instance.value(i);
                }
            }
            Instance inst = new Instance(instance.weight(), newInst);
            inst.setDataset(outputFormatPeek());
            vector.addElement(inst);
        }
    }

    // Sort phrases according to their distance (stable sort)
    double[] vals = new double[vector.size()];
    for (int i = 0; i < vals.length; i++) {
        vals[i] = ((Instance) vector.elementAt(i)).value(distAttIndex);
    }
    FastVector newVector = new FastVector(vector.size());
    int[] sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their tfxidf value (stable sort)
    for (int i = 0; i < vals.length; i++) {
        vals[i] = -((Instance) vector.elementAt(i)).value(tfidfAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their probability (stable sort)
    for (int i = 0; i < vals.length; i++) {
        vals[i] = 1 - ((Instance) vector.elementAt(i)).value(probsAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Compute rank of phrases. Check for subphrases that are ranked
    // lower than superphrases and assign probability -1 and set the
    // rank to Integer.MAX_VALUE
    int rank = 1;
    for (int i = 0; i < vals.length; i++) {
        Instance currentInstance = (Instance) vector.elementAt(i);

        // Short cut: if phrase very unlikely make rank very low and continue
        if (Utils.grOrEq(vals[i], 1.0)) {
            currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE);
            continue;
        }

        // Otherwise look for super phrase starting with first phrase
        // in list that has same probability, TFxIDF value, and distance as
        // current phrase. We do this to catch all superphrases
        // that have same probability, TFxIDF value and distance as current phrase.
        int startInd = i;
        while (startInd < vals.length) {
            Instance inst = (Instance) vector.elementAt(startInd);
            if ((inst.value(tfidfAttIndex) != currentInstance.value(tfidfAttIndex))
                    || (inst.value(probsAttIndex) != currentInstance.value(probsAttIndex))
                    || (inst.value(distAttIndex) != currentInstance.value(distAttIndex))) {
                break;
            }
            startInd++;
        }
        String val = currentInstance.stringValue(phraseAttIndex);
        boolean foundSuperphrase = false;
        for (int j = startInd - 1; j >= 0; j--) {
            if (j != i) {
                Instance candidate = (Instance) vector.elementAt(j);
                String potSuperphrase = candidate.stringValue(phraseAttIndex);
                if (val.length() <= potSuperphrase.length()) {
                    if (KEAFilter.contains(val, potSuperphrase)) {
                        foundSuperphrase = true;
                        break;
                    }
                }
            }
        }
        if (foundSuperphrase) {
            currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE);
        } else {
            currentInstance.setValue(probsAttIndex + 1, rank++);
        }
    }
    return vector;
}

From source file:kea.KEAPhraseFilter.java

License:Open Source License

/** 
 * Converts an instance by removing all non-alphanumeric characters
 * from its string attribute values.
 */
private void convertInstance(Instance instance) throws Exception {

    double[] instVals = new double[instance.numAttributes()];

    for (int i = 0; i < instance.numAttributes(); i++) {
        if (!instance.attribute(i).isString() || instance.isMissing(i)) {
            instVals[i] = instance.value(i);
        } else {
            if (!m_SelectCols.isInRange(i)) {
                int index = getOutputFormat().attribute(i).addStringValue(instance.stringValue(i));
                instVals[i] = (double) index;
                continue;
            }
            String str = instance.stringValue(i);
            StringBuffer resultStr = new StringBuffer();
            int j = 0;
            boolean phraseStart = true;
            boolean seenNewLine = false;
            boolean haveSeenHyphen = false;
            boolean haveSeenSlash = false;
            while (j < str.length()) {
                boolean isWord = false;
                boolean potNumber = false;
                int startj = j;
                while (j < str.length()) {
                    char ch = str.charAt(j);
                    if (Character.isLetterOrDigit(ch)) {
                        potNumber = true;
                        if (Character.isLetter(ch)) {
                            isWord = true;
                        }
                        j++;
                    } else if ((!m_DisallowInternalPeriods && (ch == '.')) || (ch == '@') || (ch == '_')
                            || (ch == '&') || (ch == '/') || (ch == '-')) {
                        if ((j > 0) && (j + 1 < str.length()) && Character.isLetterOrDigit(str.charAt(j - 1))
                                && Character.isLetterOrDigit(str.charAt(j + 1))) {
                            j++;
                        } else {
                            break;
                        }
                    } else if (ch == '\'') {
                        if ((j > 0) && Character.isLetterOrDigit(str.charAt(j - 1))) {
                            j++;
                        } else {
                            break;
                        }
                    } else {
                        break;
                    }
                }
                if (isWord) {
                    if (!phraseStart) {
                        if (haveSeenHyphen) {
                            resultStr.append('-');
                        } else if (haveSeenSlash) {
                            resultStr.append('/');
                        } else {
                            resultStr.append(' ');
                        }
                    }
                    resultStr.append(str.substring(startj, j));
                    if (j == str.length()) {
                        break;
                    }
                    phraseStart = false;
                    seenNewLine = false;
                    haveSeenHyphen = false;
                    haveSeenSlash = false;
                    if (Character.isWhitespace(str.charAt(j))) {
                        if (str.charAt(j) == '\n') {
                            seenNewLine = true;
                        }
                    } else if (str.charAt(j) == '-') {
                        haveSeenHyphen = true;
                    } else if (str.charAt(j) == '/') {
                        haveSeenSlash = true;
                    } else {
                        phraseStart = true;
                        resultStr.append('\n');
                    }
                    j++;
                } else if (j == str.length()) {
                    break;
                } else if (str.charAt(j) == '\n') {
                    if (seenNewLine) {
                        if (!phraseStart) {
                            resultStr.append('\n');
                            phraseStart = true;
                        }
                    } else if (potNumber) {
                        if (!phraseStart) {
                            phraseStart = true;
                            resultStr.append('\n');
                        }
                    }
                    seenNewLine = true;
                    j++;
                } else if (Character.isWhitespace(str.charAt(j))) {
                    if (potNumber) {
                        if (!phraseStart) {
                            phraseStart = true;
                            resultStr.append('\n');
                        }
                    }
                    j++;
                } else {
                    if (!phraseStart) {
                        resultStr.append('\n');
                        phraseStart = true;
                    }
                    j++;
                }
            }
            int index = getOutputFormat().attribute(i).addStringValue(resultStr.toString());
            instVals[i] = (double) index;
        }
    }
    Instance inst = new Instance(instance.weight(), instVals);
    inst.setDataset(getOutputFormat());
    push(inst);
}