Example usage for weka.core Instance stringValue

Introduction

This page collects usage examples for the weka.core Instance method stringValue.

Prototype

public String stringValue(Attribute att);

Document

Returns the value of a nominal, string, date, or relational attribute for the instance as a string.
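
Most of the examples below call the index-based companion overload, stringValue(int attIndex), which looks up the same label by attribute index. A minimal, self-contained sketch of the call (assuming the ArrayList-based API of Weka 3.7+; several examples below use the older FastVector API instead):

import java.util.ArrayList;

import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;

public class StringValueDemo {
    public static void main(String[] args) {
        // One nominal attribute with two possible labels.
        ArrayList<String> colors = new ArrayList<String>();
        colors.add("red");
        colors.add("blue");
        ArrayList<Attribute> atts = new ArrayList<Attribute>();
        atts.add(new Attribute("color", colors));

        Instances data = new Instances("demo", atts, 1);
        Instance inst = new DenseInstance(1);
        inst.setDataset(data); // stringValue needs the attribute definitions
        inst.setValue(atts.get(0), "blue");

        // Prints "blue" -- the label, not the internal index 1.0.
        System.out.println(inst.stringValue(atts.get(0)));
        System.out.println(inst.stringValue(0)); // index-based overload
    }
}

As the examples below do, guard the call with isMissing() first; reading a missing value as a string is not meaningful.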

Usage

From source file:etc.aloe.filters.WordFeaturesExtractor.java

License:Open Source License

@Override
protected Instance process(Instance instance) throws Exception {
    if (selectedAttributeIndex < 0) {
        throw new IllegalStateException("String attribute not set");
    }

    int numOldValues = instance.numAttributes();
    int numNewFeatures = unigrams.size() + bigrams.size();
    double[] newValues = new double[numOldValues + numNewFeatures];

    // Copy all attributes from input to output
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
        if (getInputFormat().attribute(i).type() != Attribute.STRING) {
            // Add simple nominal and numeric attributes directly
            if (instance.value(i) != 0.0) {
                newValues[i] = instance.value(i);
            }
        } else {
            if (instance.isMissing(i)) {
                newValues[i] = Utils.missingValue();
            } else {

                // If this is a string attribute, we have to first add
                // this value to the range of possible values, then add
                // its new internal index.
                if (outputFormatPeek().attribute(i).numValues() == 0) {
                    // Note that the first string value in a
                    // SparseInstance doesn't get printed.
                    outputFormatPeek().attribute(i).addStringValue("Hack to defeat SparseInstance bug");
                }
                int newIndex = outputFormatPeek().attribute(i).addStringValue(instance.stringValue(i));
                newValues[i] = newIndex;
            }
        }
    }

    if (!instance.isMissing(selectedAttributeIndex)) {
        // Only read the string value once we know it is present.
        String stringValue = instance.stringValue(selectedAttributeIndex);

        List<String> words = tokenizeDocument(instance);
        Set<String> wordSet = new HashSet<String>(words);

        for (int i = 0; i < unigrams.size(); i++) {
            String unigram = unigrams.get(i);
            int count = 0;
            if (wordSet.contains(unigram)) {
                //Count the times the word is in the document
                for (int w = 0; w < words.size(); w++) {
                    if (words.get(w).equals(unigram)) {
                        count += 1;
                    }
                }
            }

            int featureIndex = numOldValues + i;
            newValues[featureIndex] = count;
        }

        for (int i = 0; i < bigrams.size(); i++) {
            Bigram bigram = bigrams.get(i);
            int count = bigram.getTimesInDocument(words);
            int featureIndex = numOldValues + unigrams.size() + i;
            newValues[featureIndex] = count;
        }
    }

    Instance result = new SparseInstance(instance.weight(), newValues);
    return result;
}

From source file:facebookpostpuller.RemoveEntriesGUI.java

private void btnDeletePostsActionPerformed(java.awt.event.ActionEvent evt) {
    String name = txtFacebookName.getText();
    btnDeletePosts.setVisible(false);
    btnSaveArff.setVisible(false);
    btnLoadArff.setVisible(false);
    int ctr = 0;
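    // Walk backwards so that deleting instance i does not shift the
    // indices of the instances still to be checked.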
    for (int i = data.numInstances() - 1; i >= 0; i--) {
        Instance inst = data.instance(i);
        if (inst.stringValue(0).equals(name)) {
            System.out.println("Deleted!");
            data.delete(i);
            ctr++;
        }
    }
    btnDeletePosts.setVisible(true);
    btnSaveArff.setVisible(true);
    btnLoadArff.setVisible(true);
    txtOutput.append(name + ": " + ctr + " posts deleted!\n");
}

From source file:filters.MauiFilter.java

License:Open Source License

private void selectCandidates() throws Exception {

    if (debugMode) {
        System.err.println("--- Computing candidates...");
    }

    allCandidates = new HashMap<Instance, HashMap<String, Candidate>>();

    // Convert pending input instances into data for classifier
    int totalDocuments = getInputFormat().numInstances();
    for (int i = 0; i < totalDocuments; i++) {

        Instance current = getInputFormat().instance(i);

        String fileName = current.stringValue(fileNameAtt);
        int j = i + 1;
        if (debugMode) {
            System.err.println(
                    "---- Processing document " + fileName + ", " + j + " out of " + totalDocuments + "...");
        }

        // Get the phrases for the document
        String documentText = current.stringValue(documentAtt);

        HashMap<String, Candidate> candidateList = getCandidates(documentText);

        if (debugMode) {
            System.err.println("---- " + candidateList.size() + " candidates");
        }
        allCandidates.put(current, candidateList);
    }

}

From source file:filters.MauiFilter.java

License:Open Source License

/**
 * Builds the classifier.
 */
private void buildClassifier() throws Exception {

    // Generate input format for classifier
    FastVector atts = new FastVector();
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
        if (i == documentAtt) {
            atts.addElement(new Attribute("Term_frequency")); // 2
            atts.addElement(new Attribute("IDF")); // 
            atts.addElement(new Attribute("TFxIDF")); // 
            atts.addElement(new Attribute("First_occurrence")); // 
            atts.addElement(new Attribute("Last_occurrence")); // 
            atts.addElement(new Attribute("Spread")); // 
            atts.addElement(new Attribute("Domain_keyphraseness")); // 
            atts.addElement(new Attribute("Length")); //
            atts.addElement(new Attribute("Generality")); //
            atts.addElement(new Attribute("Node_degree")); // 
            atts.addElement(new Attribute("Semantic_relatedness")); // 
            atts.addElement(new Attribute("Wikipedia_keyphraseness")); // 
            atts.addElement(new Attribute("Inverse_Wikip_frequency")); // 
            atts.addElement(new Attribute("Total_Wikip_keyphraseness")); // 13

        } else if (i == keyphrasesAtt) {
            if (nominalClassValue) {
                FastVector vals = new FastVector(2);
                vals.addElement("False");
                vals.addElement("True");
                atts.addElement(new Attribute("Keyphrase?", vals));
            } else {
                atts.addElement(new Attribute("Keyphrase?"));
            }
        }
    }

    classifierData = new Instances("ClassifierData", atts, 0);

    classifierData.setClassIndex(numFeatures);

    if (debugMode) {
        System.err.println("--- Converting instances for classifier");
    }
    int totalDocuments = getInputFormat().numInstances();
    // Convert pending input instances into data for classifier
    for (int i = 0; i < totalDocuments; i++) {
        Instance current = getInputFormat().instance(i);

        // Get the key phrases for the document
        String keyphrases = current.stringValue(keyphrasesAtt);
        HashMap<String, Counter> hashKeyphrases = getGivenKeyphrases(keyphrases);

        // Get the phrases for the document
        HashMap<String, Candidate> candidateList = allCandidates.get(current);

        // Compute the feature values for each phrase and
        // add the instance to the data for the classifier
        int countPos = 0;
        int countNeg = 0;

        if (debugMode) {
            System.err
                    .println("--- Computing features for document " + i + " out of " + totalDocuments + "...");
        }

        for (Candidate candidate : candidateList.values()) {

            // ignore all candidates that appear less than a threshold
            if (candidate.getFrequency() < minOccurFrequency) {
                continue;
            }

            // compute feature values
            double[] vals = computeFeatureValues(candidate, true, hashKeyphrases, candidateList);

            if (vals[vals.length - 1] == 0) {
                countNeg++;
            } else {
                countPos++;
            }
            Instance inst = new Instance(current.weight(), vals);
            // System.out.println(candidate + "\t" + inst);
            classifierData.add(inst);

        }
        if (debugMode) {
            System.err.println(countPos + " positive; " + countNeg + " negative instances");
        }
    }

    if (debugMode) {
        System.err.println("--- Building classifier");
    }

    if (classifier == null) {
        // Build classifier
        if (nominalClassValue) {

            //         FilteredClassifier fclass = new FilteredClassifier();
            //         fclass.setClassifier(new NaiveBayesSimple());
            //         fclass.setFilter(new Discretize());
            //         classifier = fclass;

            classifier = new Bagging(); // try also the commented-out FilteredClassifier above
            classifier.setOptions(
                    Utils.splitOptions("-P 10 -S 1 -I 10 -W weka.classifiers.trees.J48 -- -U -M 2"));

        } else {

            classifier = new Bagging();
            // try also
            // classifier.setOptions(Utils.splitOptions("-P 10 -S 1 -I 10 -W
            // weka.classifiers.trees.J48 -- -U -M 2")) ;
            String optionsString = "-P 100 -S 1 -I 10 -W weka.classifiers.trees.M5P -- -U -M 7.0";
            String[] options = Utils.splitOptions(optionsString);
            classifier.setOptions(options);

        }
    }
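    // Dump the classifier's training data to an ARFF file for inspection.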
    FileOutputStream out = new FileOutputStream(new File("19docs.arff"));
    PrintWriter printer = new PrintWriter(out);

    printer.write(classifierData.toString());

    printer.close();
    out.close();

    classifier.buildClassifier(classifierData);

    if (debugMode) {
        System.err.println(classifier);
    }

    // Save space
    classifierData = new Instances(classifierData, 0);
}

From source file:filters.MauiFilter.java

License:Open Source License

/**
 * Converts an instance.
 */
private FastVector convertInstance(Instance instance, boolean training) throws Exception {

    FastVector vector = new FastVector();

    String fileName = instance.stringValue(fileNameAtt);

    if (debugMode) {
        System.err.println("-- Converting instance for document " + fileName);
    }

    // Get the key phrases for the document
    HashMap<String, Counter> hashKeyphrases = null;

    if (!instance.isMissing(keyphrasesAtt)) {
        String keyphrases = instance.stringValue(keyphrasesAtt);
        hashKeyphrases = getGivenKeyphrases(keyphrases);
    }

    // Get the document text
    String documentText = instance.stringValue(documentAtt);

    // Compute the candidate topics
    HashMap<String, Candidate> candidateList;
    if (allCandidates != null && allCandidates.containsKey(instance)) {
        candidateList = allCandidates.get(instance);
    } else {
        candidateList = getCandidates(documentText);
    }
    if (debugMode) {
        System.err.println(candidateList.size() + " candidates ");
    }

    // Set indices for key attributes
    int tfidfAttIndex = documentAtt + 2;
    int distAttIndex = documentAtt + 3;
    int probsAttIndex = documentAtt + numFeatures;

    int countPos = 0;
    int countNeg = 0;

    // Go through the phrases and convert them into instances
    for (Candidate candidate : candidateList.values()) {

        if (candidate.getFrequency() < minOccurFrequency) {
            continue;
        }

        String name = candidate.getName();
        String orig = candidate.getBestFullForm();
        if (!vocabularyName.equals("none")) {
            orig = candidate.getTitle();
        }

        double[] vals = computeFeatureValues(candidate, training, hashKeyphrases, candidateList);

        Instance inst = new Instance(instance.weight(), vals);

        inst.setDataset(classifierData);

        // Get probability of a phrase being key phrase
        double[] probs = classifier.distributionForInstance(inst);

        double prob = probs[0];
        if (nominalClassValue) {
            prob = probs[1];
        }

        // Compute attribute values for final instance
        double[] newInst = new double[instance.numAttributes() + numFeatures + 2];

        int pos = 0;
        for (int i = 1; i < instance.numAttributes(); i++) {

            if (i == documentAtt) {

                // output of values for a given phrase:

                // Add phrase
                int index = outputFormatPeek().attribute(pos).addStringValue(name);
                newInst[pos++] = index;

                // Add original version
                if (orig != null) {
                    index = outputFormatPeek().attribute(pos).addStringValue(orig);
                } else {
                    index = outputFormatPeek().attribute(pos).addStringValue(name);
                }

                newInst[pos++] = index;

                // Add features
                newInst[pos++] = inst.value(tfIndex);
                newInst[pos++] = inst.value(idfIndex);
                newInst[pos++] = inst.value(tfidfIndex);
                newInst[pos++] = inst.value(firstOccurIndex);
                newInst[pos++] = inst.value(lastOccurIndex);
                newInst[pos++] = inst.value(spreadOccurIndex);
                newInst[pos++] = inst.value(domainKeyphIndex);
                newInst[pos++] = inst.value(lengthIndex);
                newInst[pos++] = inst.value(generalityIndex);
                newInst[pos++] = inst.value(nodeDegreeIndex);
                newInst[pos++] = inst.value(semRelIndex);
                newInst[pos++] = inst.value(wikipKeyphrIndex);
                newInst[pos++] = inst.value(invWikipFreqIndex);
                newInst[pos++] = inst.value(totalWikipKeyphrIndex);

                // Add probability
                probsAttIndex = pos;
                newInst[pos++] = prob;

                // Set rank to missing (computed below)
                newInst[pos++] = Instance.missingValue();

            } else if (i == keyphrasesAtt) {
                newInst[pos++] = inst.classValue();
            } else {
                newInst[pos++] = instance.value(i);
            }
        }

        Instance ins = new Instance(instance.weight(), newInst);
        ins.setDataset(outputFormatPeek());
        vector.addElement(ins);

        if (inst.classValue() == 0) {
            countNeg++;
        } else {
            countPos++;
        }
    }
    if (debugMode) {
        System.err.println(countPos + " positive; " + countNeg + " negative instances");
    }

    // Sort phrases according to their distance (stable sort)
    double[] vals = new double[vector.size()];
    for (int i = 0; i < vals.length; i++) {
        vals[i] = ((Instance) vector.elementAt(i)).value(distAttIndex);
    }
    FastVector newVector = new FastVector(vector.size());
    int[] sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their tfxidf value (stable sort)
    for (int i = 0; i < vals.length; i++) {
        vals[i] = -((Instance) vector.elementAt(i)).value(tfidfAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their probability (stable sort)
    for (int i = 0; i < vals.length; i++) {
        vals[i] = 1 - ((Instance) vector.elementAt(i)).value(probsAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Compute rank of phrases. Check for subphrases that are ranked
    // lower than superphrases and assign probability -1 and set the
    // rank to Integer.MAX_VALUE
    int rank = 1;
    for (int i = 0; i < vals.length; i++) {
        Instance currentInstance = (Instance) vector.elementAt(i);
        // Short cut: if phrase very unlikely make rank very low and
        // continue
        if (Utils.grOrEq(vals[i], 1.0)) {
            currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE);
            continue;
        }

        // Otherwise look for super phrase starting with first phrase
        // in list that has same probability, TFxIDF value, and distance as
        // current phrase. We do this to catch all superphrases
        // that have same probability, TFxIDF value and distance as current
        // phrase.
        int startInd = i;
        while (startInd < vals.length) {
            Instance inst = (Instance) vector.elementAt(startInd);
            if ((inst.value(tfidfAttIndex) != currentInstance.value(tfidfAttIndex))
                    || (inst.value(probsAttIndex) != currentInstance.value(probsAttIndex))
                    || (inst.value(distAttIndex) != currentInstance.value(distAttIndex))) {
                break;
            }
            startInd++;
        }
        currentInstance.setValue(probsAttIndex + 1, rank++);

    }

    return vector;
}
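
The three consecutive stable sorts above implement a multi-key sort: because each pass is stable, sorting by the least significant key first (distance), then the next (TFxIDF), and the most significant key last (probability) leaves ties on the later keys ordered by the earlier ones. A minimal sketch of the idiom with weka.core.Utils.stableSort, using made-up data (note the filters negate values, since stableSort orders ascending):

import weka.core.Utils;

public class MultiKeySort {
    public static void main(String[] args) {
        double[] secondary = { 3, 1, 2, 4 }; // tie-breaking key
        double[] primary = { 1, 1, 0, 0 }; // dominant key

        // Pass 1: order the items by the secondary key.
        int[] order = Utils.stableSort(secondary);

        // Pass 2: re-sort by the primary key; stability preserves
        // the secondary ordering within groups of equal primary values.
        double[] primaryReordered = new double[primary.length];
        for (int i = 0; i < order.length; i++) {
            primaryReordered[i] = primary[order[i]];
        }
        int[] finalOrder = Utils.stableSort(primaryReordered);

        // Prints the original item indices: 2, 3, 1, 0
        for (int i : finalOrder) {
            System.out.println("item " + order[i]);
        }
    }
}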

From source file:imba.classifier.NBTubes.java

@Override
public double[] distributionForInstance(Instance instance) throws Exception {
    // Compute the probability of each class for the instance
    // passed in as the method parameter.
    Instances temp = null;
    Instance p;
    Filter f;
    double[] a = new double[infoClassifier.get(0).get(0).size()];
    int i, j, k, l, x, c;
    double t, prev;
    Enumeration n;
    boolean big;
    String val;
    String[] valMinMax;

    if (wasNumeric) {

        header_Instances.add(instance);

        f = new Normalize();
        try {
            f.setInputFormat(header_Instances);

            for (Instance i1 : header_Instances) {
                f.input(i1);
            }

            f.batchFinished();
        } catch (Exception ex) {
            Logger.getLogger(NBTubes.class.getName()).log(Level.SEVERE, null, ex);
        }

        temp = f.getOutputFormat();

        while ((p = f.output()) != null) {
            temp.add(p);
        }
    }

    f = new NumericToNominal();

    if (wasNumeric) {
        try {
            f.setInputFormat(temp);

            for (Instance i1 : temp) {
                f.input(i1);
            }

            f.batchFinished();
        } catch (Exception ex) {
            Logger.getLogger(NBTubes.class.getName()).log(Level.SEVERE, null, ex);
        }

        temp = f.getOutputFormat();

        while ((p = f.output()) != null) {
            temp.add(p);
        }

        instance = temp.lastInstance();

        header_Instances.remove(header_Instances.size() - 1);
    } else {
        f.setInputFormat(header_Instances);

        f.input(instance);

        f.batchFinished();

        instance = f.output();
    }

    // Compute the distribution of the instance over each class
    i = 0;
    while (i < (a.length)) {
        a[i] = (double) sumClass[i] / dataSize;

        j = 0;
        k = 0;
        while (j < infoClassifier.size()) {

            if (j == classIdx) {
                k++;
            }

            if (wasNumeric) {
                if (filter.equals("Discretize")) {
                    l = 0;
                    big = false;
                    while (l < dataset.attribute(k).numValues() && !big) {
                        // Parse the bin label produced by Discretize
                        // (e.g. "'(-inf-0.5]'") into numeric boundaries.
                        val = String.valueOf(dataset.attribute(k).value(l));
                        //System.out.println("k = " + k);
                        //System.out.println("nilai = " + instance.stringValue(k));
                        val = val.replaceAll("'", "");
                        val = val.replaceAll("\\(", "");
                        val = val.replaceAll("\\)", "");
                        val = val.replaceAll("]", "");

                        //System.out.println(val);

                        valMinMax = val.split("-");

                        // match the instance value against this bin

                        if (valMinMax.length == 3) {
                            if (valMinMax[1].equals("inf")) {
                                valMinMax[1] = "0.0";
                            }
                            //System.out.println("Min = " + valMinMax[1]);
                            //System.out.println("Max = " + valMinMax[2]);
                            if (Double.valueOf(instance.stringValue(k)) > Double.valueOf(valMinMax[1]) && Double
                                    .valueOf(instance.stringValue(k)) <= Double.valueOf(valMinMax[2])) {
                                big = true;
                            }
                        } else {
                            if (valMinMax.length == 2) {
                                if (valMinMax[1].equals("inf")) {
                                    valMinMax[1] = "1.0";
                                }

                                if (Double.valueOf(instance.stringValue(k)) > Double.valueOf(valMinMax[0])
                                        && Double.valueOf(instance.stringValue(k)) <= Double
                                                .valueOf(valMinMax[1])) {
                                    big = true;
                                }
                            } else {
                                l = dataset.attribute(k).indexOfValue(instance.stringValue(k));
                                big = true;
                            }
                            //System.out.println("Min = " + valMinMax[0]);
                            //System.out.println("Max = " + valMinMax[1]);

                        }
                        l++;
                    }

                    x = l - 1;

                    //System.out.println("x = " + x);
                } else {
                    big = false;
                    l = 0;
                    n = dataset.attribute(k).enumerateValues();

                    t = 0;
                    prev = 0;
                    while (l < dataset.attribute(k).numValues() && !big) {
                        t = Double.valueOf(n.nextElement().toString());

                        //System.out.println(prev + "   " + t);
                        if (Double.valueOf(instance.stringValue(k)) <= t) {
                            big = true;
                        } else {
                            prev = t;
                        }

                        l++;
                    }

                    if (big && t != Double.valueOf(instance.stringValue(k))) {
                        System.out.println(prev + "   " + Double.valueOf(instance.stringValue(k)) + "   " + t);
                    }
                    //x = l - 1;

                    if (classIdx < 2) {
                        c = 2;
                    } else {
                        c = 1;
                    }

                    if (big && l > c) {
                        if ((Double.valueOf(instance.stringValue(k)) - prev) <= (t
                                - Double.valueOf(instance.stringValue(k)))) {
                            x = l - 2;
                        } else {
                            x = l - 1;
                        }
                    } else {
                        x = l - 1;
                    }
                }

            } else {
                x = dataset.attribute(k).indexOfValue(instance.stringValue(k));
            }

            a[i] *= infoClassifier.get(j).get(x).get(i);

            k++;
            j++;
        }

        i++;
    }

    return a;
}

From source file:jjj.asap.sas.parser.job.ImportParserData.java

License:Open Source License

private void process(final String parent, int essaySet, Map<Double, List<String>> tags,
        Map<Double, List<String>> parseTrees, Map<Double, List<String>> depends) {

    // check if output exists
    boolean any = false;

    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-extra-stats.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-pos-tags.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-parse-tree.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends0.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends1.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends2.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends3.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends4.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends5.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends6.arff"))
        any = true;

    if (!any) {
        Job.log("NOTE", "work/datasets/" + parent + "/" + essaySet
                + "-*.arff returns all required datasets - nothing to do");
        return;
    }

    // Load an existing dataset to use as a template.
    Instances dataset = Dataset.load("work/datasets/" + parent + "/" + essaySet + "-spell-checked.arff");

    // create the output datasets here. except for the extra statistics, 
    // the format is the same as 'dataset'.

    Instances tagsData = new Instances(dataset, 0);
    tagsData.setRelationName(essaySet + "-pos-tags.arff");
    Instances treeData = new Instances(dataset, 0);
    treeData.setRelationName(essaySet + "-parse-tree.arff");

    Instances dependsData[] = new Instances[7];
    for (int j = 0; j < 7; j++) {
        dependsData[j] = new Instances(dataset, 0);
        dependsData[j].setRelationName(essaySet + "-depends" + j + ".arff");
    }

    // extra stats
    DatasetBuilder builder = new DatasetBuilder();
    builder.addVariable("id");
    if (Contest.isMultiChoice(essaySet)) {
        builder.addNominalVariable("color", Contest.COLORS);
    }
    builder.addVariable("x_sent");
    builder.addVariable("x_para");
    builder.addVariable("x_length");
    builder.addVariable("x_words");
    builder.addVariable("x_unique_words");
    builder.addNominalVariable("score", Contest.getRubrics(essaySet));

    Instances extraStats = builder.getDataset(essaySet + "-extra-stats.arff");

    // now add rows for each instance

    for (int i = 0; i < dataset.numInstances(); i++) {

        // common variables
        Instance ob = dataset.instance(i);
        double id = ob.value(0);
        String y = ob.isMissing(dataset.numAttributes() - 1) ? null
                : ob.stringValue(dataset.numAttributes() - 1);
        String color = Contest.isMultiChoice(essaySet) ? ob.stringValue(dataset.attribute("color")) : null;
        String str = ob.stringValue(dataset.attribute("text"));

        //
        // Extra stats
        //

        int nSent = tags.containsKey(id) ? tags.get(id).size() : 0;
        int nPara = 0;
        for (int a = 0; a < str.length(); a++) {
            if (str.charAt(a) == '^')
                nPara++;
        }
        int nLength = str.length();
        int nWords = 0;
        int nUniqueWords = 0;
        String[] words = str.toLowerCase().split(" ");
        nWords = words.length;
        Set<String> u = new HashSet<String>();
        for (String w : words) {
            u.add(w);
        }
        nUniqueWords = u.size();

        extraStats.add(new DenseInstance(extraStats.numAttributes()));
        Instance extra = extraStats.lastInstance();
        extra.setValue(0, id);
        if (Contest.isMultiChoice(essaySet)) {
            extra.setValue(1, color);
        }

        extra.setValue(extraStats.attribute("x_sent"), nSent);
        extra.setValue(extraStats.attribute("x_para"), nPara);
        extra.setValue(extraStats.attribute("x_length"), nLength);
        extra.setValue(extraStats.attribute("x_words"), nWords);
        extra.setValue(extraStats.attribute("x_unique_words"), nUniqueWords);

        if (y == null)
            extra.setValue(extraStats.numAttributes() - 1, Utils.missingValue());
        else
            extra.setValue(extraStats.numAttributes() - 1, y);

        //
        // POS tags
        //

        String tagsText = "";
        List<String> tagsList = tags.get(id);
        if (tagsList == null || tagsList.isEmpty()) {
            Job.log("WARNING", "no tags for " + id);
            tagsText = "x";
        } else {
            for (String tagsItem : tagsList) {
                tagsText += tagsItem;
            }
        }

        tagsData.add(new DenseInstance(ob.numAttributes()));
        Instance tagsOb = tagsData.lastInstance();
        tagsOb.setValue(0, id);
        if (Contest.isMultiChoice(essaySet)) {
            tagsOb.setValue(1, color);
            tagsOb.setValue(2, tagsText.trim());
            if (y == null) {
                tagsOb.setValue(3, Utils.missingValue());
            } else {
                tagsOb.setValue(3, y);
            }
        } else {
            tagsOb.setValue(1, tagsText.trim());
            if (y == null) {
                tagsOb.setValue(2, Utils.missingValue());
            } else {
                tagsOb.setValue(2, y);
            }
        }

        //
        // Parse Tree
        //

        String treeText = "";
        List<String> treeList = parseTrees.get(id);
        if (treeList == null || treeList.isEmpty()) {
            Job.log("WARNING", "no parse tree for " + id);
            treeText = "x";
        } else {
            for (String treeItem : treeList) {
                treeText += treeItem;
            }
        }

        treeData.add(new DenseInstance(ob.numAttributes()));
        Instance treeOb = treeData.lastInstance();
        treeOb.setValue(0, id);
        if (Contest.isMultiChoice(essaySet)) {
            treeOb.setValue(1, color);
            treeOb.setValue(2, treeText.trim());
            if (y == null) {
                treeOb.setValue(3, Utils.missingValue());
            } else {
                treeOb.setValue(3, y);
            }
        } else {
            treeOb.setValue(1, treeText.trim());
            if (y == null) {
                treeOb.setValue(2, Utils.missingValue());
            } else {
                treeOb.setValue(2, y);
            }
        }

        //
        // Depends data
        //

        for (int j = 0; j < 7; j++) {
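            // Seven projections of each three-part dependency term:
            // j = 0 keeps all parts, j = 1..3 drop one part each,
            // and j = 4..6 keep just one part.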

            String text = "";
            List<String> list = depends.get(id);
            if (list == null || list.isEmpty()) {
                Job.log("WARNING", "no depends for " + id);
                text = "x";
            } else {
                for (String item : list) {
                    String[] term = StringUtils.safeSplit(item, "/", 3);
                    switch (j) {
                    case 0:
                        text += item;
                        break;
                    case 1:
                        text += term[1] + "/" + term[2];
                        break;
                    case 2:
                        text += term[0] + "/" + term[2];
                        break;
                    case 3:
                        text += term[0] + "/" + term[1];
                        break;
                    case 4:
                        text += term[0];
                        break;
                    case 5:
                        text += term[1];
                        break;
                    case 6:
                        text += term[2];
                        break;
                    }
                    text += " ";
                }
            }

            dependsData[j].add(new DenseInstance(ob.numAttributes()));
            Instance dependsOb = dependsData[j].lastInstance();
            dependsOb.setValue(0, id);
            if (Contest.isMultiChoice(essaySet)) {
                dependsOb.setValue(1, color);
                dependsOb.setValue(2, text.trim());
                if (y == null) {
                    dependsOb.setValue(3, Utils.missingValue());
                } else {
                    dependsOb.setValue(3, y);
                }
            } else {
                dependsOb.setValue(1, text.trim());
                if (y == null) {
                    dependsOb.setValue(2, Utils.missingValue());
                } else {
                    dependsOb.setValue(2, y);
                }
            }

        } // j
    } // dataset

    // Now save the new datasets

    Dataset.save("work/datasets/" + parent + "/" + tagsData.relationName(), tagsData);
    Dataset.save("work/datasets/" + parent + "/" + treeData.relationName(), treeData);
    for (int j = 0; j < 7; j++) {
        Dataset.save("work/datasets/" + parent + "/" + dependsData[j].relationName(), dependsData[j]);
    }
    Dataset.save("work/datasets/" + parent + "/" + extraStats.relationName(), extraStats);

}

From source file:kea.KEAFilter.java

License:Open Source License

/**
 * Builds the classifier.
 */
private void buildClassifier() throws Exception {

    // Generate input format for classifier
    FastVector atts = new FastVector();
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
        if (i == m_DocumentAtt) {
            atts.addElement(new Attribute("TFxIDF"));
            atts.addElement(new Attribute("First_occurrence"));
            if (m_KFused) {
                atts.addElement(new Attribute("Keyphrase_frequency"));
            }
        } else if (i == m_KeyphrasesAtt) {
            FastVector vals = new FastVector(2);
            vals.addElement("False");
            vals.addElement("True");
            atts.addElement(new Attribute("Keyphrase?", vals));
        }
    }
    m_ClassifierData = new Instances("ClassifierData", atts, 0);
    m_ClassifierData.setClassIndex(m_NumFeatures);

    if (m_Debug) {
        System.err.println("--- Converting instances for classifier");
    }

    // Convert pending input instances into data for classifier
    for (int i = 0; i < getInputFormat().numInstances(); i++) {
        Instance current = getInputFormat().instance(i);

        // Get the key phrases for the document
        String keyphrases = current.stringValue(m_KeyphrasesAtt);
        HashMap hashKeyphrases = getGivenKeyphrases(keyphrases, false);
        HashMap hashKeysEval = getGivenKeyphrases(keyphrases, true);

        // Get the phrases for the document
        HashMap hash = new HashMap();
        int length = getPhrases(hash, current.stringValue(m_DocumentAtt));

        // Compute the feature values for each phrase and
        // add the instance to the data for the classifier
        Iterator it = hash.keySet().iterator();
        while (it.hasNext()) {
            String phrase = (String) it.next();
            FastVector phraseInfo = (FastVector) hash.get(phrase);
            double[] vals = featVals(phrase, phraseInfo, true, hashKeysEval, hashKeyphrases, length);
            Instance inst = new Instance(current.weight(), vals);
            m_ClassifierData.add(inst);
        }
    }

    if (m_Debug) {
        System.err.println("--- Building classifier");
    }

    // Build classifier
    FilteredClassifier fclass = new FilteredClassifier();
    fclass.setClassifier(new NaiveBayesSimple());
    fclass.setFilter(new Discretize());
    m_Classifier = fclass;
    m_Classifier.buildClassifier(m_ClassifierData);

    if (m_Debug) {
        System.err.println(m_Classifier);
    }

    // Save space
    m_ClassifierData = new Instances(m_ClassifierData, 0);
}

From source file:kea.KEAFilter.java

License:Open Source License

/**
 * Converts an instance.
 */
private FastVector convertInstance(Instance instance, boolean training) throws Exception {

    FastVector vector = new FastVector();

    if (m_Debug) {
        System.err.println("-- Converting instance");
    }

    // Get the key phrases for the document
    HashMap hashKeyphrases = null;
    HashMap hashKeysEval = null;
    if (!instance.isMissing(m_KeyphrasesAtt)) {
        String keyphrases = instance.stringValue(m_KeyphrasesAtt);
        hashKeyphrases = getGivenKeyphrases(keyphrases, false);
        hashKeysEval = getGivenKeyphrases(keyphrases, true);
    }

    // Get the phrases for the document
    HashMap hash = new HashMap();
    int length = getPhrases(hash, instance.stringValue(m_DocumentAtt));

    // Compute number of extra attributes
    int numFeatures = 5;
    if (m_Debug) {
        if (m_KFused) {
            numFeatures = numFeatures + 1;
        }
    }

    // Set indices of key attributes
    int phraseAttIndex = m_DocumentAtt;
    int tfidfAttIndex = m_DocumentAtt + 2;
    int distAttIndex = m_DocumentAtt + 3;
    int probsAttIndex = m_DocumentAtt + numFeatures - 1;

    // Go through the phrases and convert them into instances
    Iterator it = hash.keySet().iterator();
    while (it.hasNext()) {
        String phrase = (String) it.next();
        FastVector phraseInfo = (FastVector) hash.get(phrase);
        double[] vals = featVals(phrase, phraseInfo, training, hashKeysEval, hashKeyphrases, length);
        Instance inst = new Instance(instance.weight(), vals);
        inst.setDataset(m_ClassifierData);

        // Get probability of phrase being key phrase
        double[] probs = m_Classifier.distributionForInstance(inst);
        double prob = probs[1];

        // Compute attribute values for final instance
        double[] newInst = new double[instance.numAttributes() + numFeatures];
        int pos = 0;
        for (int i = 0; i < instance.numAttributes(); i++) {
            if (i == m_DocumentAtt) {

                // Add phrase
                int index = outputFormatPeek().attribute(pos).addStringValue(phrase);
                newInst[pos++] = index;

                // Add original version
                index = outputFormatPeek().attribute(pos).addStringValue((String) phraseInfo.elementAt(2));
                newInst[pos++] = index;

                // Add TFxIDF
                newInst[pos++] = inst.value(m_TfidfIndex);

                // Add distance
                newInst[pos++] = inst.value(m_FirstOccurIndex);

                // Add other features
                if (m_Debug) {
                    if (m_KFused) {
                        newInst[pos++] = inst.value(m_KeyFreqIndex);
                    }
                }

                // Add probability 
                probsAttIndex = pos;
                newInst[pos++] = prob;

                // Set rank to missing (computed below)
                newInst[pos++] = Instance.missingValue();
            } else if (i == m_KeyphrasesAtt) {
                newInst[pos++] = inst.classValue();
            } else {
                newInst[pos++] = instance.value(i);
            }
        }
        Instance ins = new Instance(instance.weight(), newInst);
        ins.setDataset(outputFormatPeek());
        vector.addElement(ins);
    }

    // Add dummy instances for keyphrases that don't occur
    // in the document
    if (hashKeysEval != null) {
        Iterator phrases = hashKeysEval.keySet().iterator();
        while (phrases.hasNext()) {
            String phrase = (String) phrases.next();
            double[] newInst = new double[instance.numAttributes() + numFeatures];
            int pos = 0;
            for (int i = 0; i < instance.numAttributes(); i++) {
                if (i == m_DocumentAtt) {

                    // Add phrase
                    int index = outputFormatPeek().attribute(pos).addStringValue(phrase);
                    newInst[pos++] = (double) index;

                    // Add original version
                    index = outputFormatPeek().attribute(pos).addStringValue((String) hashKeysEval.get(phrase));
                    newInst[pos++] = (double) index;

                    // Add TFxIDF
                    newInst[pos++] = Instance.missingValue();

                    // Add distance
                    newInst[pos++] = Instance.missingValue();

                    // Add other features
                    if (m_Debug) {
                        if (m_KFused) {
                            newInst[pos++] = Instance.missingValue();
                        }
                    }

                    // Add probability and rank
                    newInst[pos++] = -Double.MAX_VALUE;
                    newInst[pos++] = Instance.missingValue();
                } else if (i == m_KeyphrasesAtt) {
                    newInst[pos++] = 1; // Keyphrase
                } else {
                    newInst[pos++] = instance.value(i);
                }
            }
            Instance inst = new Instance(instance.weight(), newInst);
            inst.setDataset(outputFormatPeek());
            vector.addElement(inst);
        }
    }

    // Sort phrases according to their distance (stable sort)
    double[] vals = new double[vector.size()];
    for (int i = 0; i < vals.length; i++) {
        vals[i] = ((Instance) vector.elementAt(i)).value(distAttIndex);
    }
    FastVector newVector = new FastVector(vector.size());
    int[] sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their tfxidf value (stable sort)
    for (int i = 0; i < vals.length; i++) {
        vals[i] = -((Instance) vector.elementAt(i)).value(tfidfAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their probability (stable sort)
    for (int i = 0; i < vals.length; i++) {
        vals[i] = 1 - ((Instance) vector.elementAt(i)).value(probsAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Compute rank of phrases. Check for subphrases that are ranked
    // lower than superphrases and assign probability -1 and set the
    // rank to Integer.MAX_VALUE
    int rank = 1;
    for (int i = 0; i < vals.length; i++) {
        Instance currentInstance = (Instance) vector.elementAt(i);

        // Short cut: if phrase very unlikely make rank very low and continue
        if (Utils.grOrEq(vals[i], 1.0)) {
            currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE);
            continue;
        }

        // Otherwise look for super phrase starting with first phrase
        // in list that has same probability, TFxIDF value, and distance as
        // current phrase. We do this to catch all superphrases
        // that have same probability, TFxIDF value and distance as current phrase.
        int startInd = i;
        while (startInd < vals.length) {
            Instance inst = (Instance) vector.elementAt(startInd);
            if ((inst.value(tfidfAttIndex) != currentInstance.value(tfidfAttIndex))
                    || (inst.value(probsAttIndex) != currentInstance.value(probsAttIndex))
                    || (inst.value(distAttIndex) != currentInstance.value(distAttIndex))) {
                break;
            }
            startInd++;
        }
        String val = currentInstance.stringValue(phraseAttIndex);
        boolean foundSuperphrase = false;
        for (int j = startInd - 1; j >= 0; j--) {
            if (j != i) {
                Instance candidate = (Instance) vector.elementAt(j);
                String potSuperphrase = candidate.stringValue(phraseAttIndex);
                if (val.length() <= potSuperphrase.length()) {
                    if (KEAFilter.contains(val, potSuperphrase)) {
                        foundSuperphrase = true;
                        break;
                    }
                }
            }
        }
        if (foundSuperphrase) {
            currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE);
        } else {
            currentInstance.setValue(probsAttIndex + 1, rank++);
        }
    }
    return vector;
}

From source file:kea.KEAPhraseFilter.java

License:Open Source License

/** 
 * Converts an instance by removing all non-alphanumeric characters
 * from its string attribute values.
 */
private void convertInstance(Instance instance) throws Exception {

    double[] instVals = new double[instance.numAttributes()];

    for (int i = 0; i < instance.numAttributes(); i++) {
        if (!instance.attribute(i).isString() || instance.isMissing(i)) {
            instVals[i] = instance.value(i);
        } else {
            if (!m_SelectCols.isInRange(i)) {
                int index = getOutputFormat().attribute(i).addStringValue(instance.stringValue(i));
                instVals[i] = (double) index;
                continue;
            }
            String str = instance.stringValue(i);
            StringBuffer resultStr = new StringBuffer();
            int j = 0;
            boolean phraseStart = true;
            boolean seenNewLine = false;
            boolean haveSeenHyphen = false;
            boolean haveSeenSlash = false;
            while (j < str.length()) {
                boolean isWord = false;
                boolean potNumber = false;
                int startj = j;
                while (j < str.length()) {
                    char ch = str.charAt(j);
                    if (Character.isLetterOrDigit(ch)) {
                        potNumber = true;
                        if (Character.isLetter(ch)) {
                            isWord = true;
                        }
                        j++;
                    } else if ((!m_DisallowInternalPeriods && (ch == '.')) || (ch == '@') || (ch == '_')
                            || (ch == '&') || (ch == '/') || (ch == '-')) {
                        if ((j > 0) && (j + 1 < str.length()) && Character.isLetterOrDigit(str.charAt(j - 1))
                                && Character.isLetterOrDigit(str.charAt(j + 1))) {
                            j++;
                        } else {
                            break;
                        }
                    } else if (ch == '\'') {
                        if ((j > 0) && Character.isLetterOrDigit(str.charAt(j - 1))) {
                            j++;
                        } else {
                            break;
                        }
                    } else {
                        break;
                    }
                }
                if (isWord) {
                    if (!phraseStart) {
                        if (haveSeenHyphen) {
                            resultStr.append('-');
                        } else if (haveSeenSlash) {
                            resultStr.append('/');
                        } else {
                            resultStr.append(' ');
                        }
                    }
                    resultStr.append(str.substring(startj, j));
                    if (j == str.length()) {
                        break;
                    }
                    phraseStart = false;
                    seenNewLine = false;
                    haveSeenHyphen = false;
                    haveSeenSlash = false;
                    if (Character.isWhitespace(str.charAt(j))) {
                        if (str.charAt(j) == '\n') {
                            seenNewLine = true;
                        }
                    } else if (str.charAt(j) == '-') {
                        haveSeenHyphen = true;
                    } else if (str.charAt(j) == '/') {
                        haveSeenSlash = true;
                    } else {
                        phraseStart = true;
                        resultStr.append('\n');
                    }
                    j++;
                } else if (j == str.length()) {
                    break;
                } else if (str.charAt(j) == '\n') {
                    if (seenNewLine) {
                        if (!phraseStart) {
                            resultStr.append('\n');
                            phraseStart = true;
                        }
                    } else if (potNumber) {
                        if (!phraseStart) {
                            phraseStart = true;
                            resultStr.append('\n');
                        }
                    }
                    seenNewLine = true;
                    j++;
                } else if (Character.isWhitespace(str.charAt(j))) {
                    if (potNumber) {
                        if (!phraseStart) {
                            phraseStart = true;
                            resultStr.append('\n');
                        }
                    }
                    j++;
                } else {
                    if (!phraseStart) {
                        resultStr.append('\n');
                        phraseStart = true;
                    }
                    j++;
                }
            }
            int index = getOutputFormat().attribute(i).addStringValue(resultStr.toString());
            instVals[i] = (double) index;
        }
    }
    Instance inst = new Instance(instance.weight(), instVals);
    inst.setDataset(getOutputFormat());
    push(inst);
}