List of usage examples for weka.core.Instance.isMissing

Tests if a specific value is "missing".

public boolean isMissing(int attIndex);
public boolean isMissing(Attribute att);
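The examples below come from real projects and appear to target the older Weka 3.x API, where Instance is a concrete class and FastVector is still in use (from 3.7 on, Instance is an interface and DenseInstance is the usual implementation). As a minimal, self-contained sketch of both overloads (attribute names and values are invented for illustration):

import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;

public class IsMissingDemo {
    public static void main(String[] args) {
        // A tiny two-attribute dataset, built only for this sketch.
        FastVector atts = new FastVector();
        atts.addElement(new Attribute("temperature"));
        atts.addElement(new Attribute("humidity"));
        Instances data = new Instances("demo", atts, 0);

        Instance inst = new Instance(data.numAttributes()); // all values start out missing
        inst.setDataset(data);
        inst.setValue(0, 21.5); // temperature gets a value; humidity stays missing

        System.out.println(inst.isMissing(0));                 // false
        System.out.println(inst.isMissing(1));                 // true
        System.out.println(inst.isMissing(data.attribute(1))); // true (Attribute overload)
    }
}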
From source file: wekimini.DataManager.java

public void setOutputValue(int index, int whichOutput, double val) {
    Instance i = allInstances.instance(index);
    if (i == null) {
        return;
    }
    // Setting a value where one was missing grows the example count for this output.
    boolean changesNumberOfInstances = i.isMissing(numMetaData + numInputs + whichOutput);
    if (isDiscrete[whichOutput]) {
        int v = (int) val;
        Attribute a = i.attribute(numMetaData + numInputs + whichOutput);
        if (a.isNominal() && v >= 0 && v <= numClasses[whichOutput]) {
            i.setValue(numMetaData + numInputs + whichOutput, v);
        } else {
            logger.log(Level.SEVERE, "Attribute value out of range"); //TODO: Check this
        }
    } else {
        //TODO insert error checking / range limiting for this version!
        i.setValue(numMetaData + numInputs + whichOutput, val);
    }
    if (changesNumberOfInstances) {
        setNumExamplesPerOutput(whichOutput, getNumExamplesPerOutput(whichOutput) + 1);
    }
}
From source file: wekimini.DataManager.java

public void setOutputMissing(int index, int outputNum) {
    //if (paramNum >= 0 && paramNum < numParams) {
    Instance i = allInstances.instance(index);
    if (!i.isMissing(numMetaData + numInputs + outputNum)) {
        i.setMissing(numMetaData + numInputs + outputNum);
        setNumExamplesPerOutput(outputNum, getNumExamplesPerOutput(outputNum) - 1);
    }
    //Need to recompute numOutputs!
    //}
}
From source file: wekimini.DataManager.java

public boolean isOutputMissing(int index, int outputNum) {
    Instance i = allInstances.instance(index);
    return (i.isMissing(numMetaData + numInputs + outputNum));
}
From source file: wekimini.DataManager.java

public double getOutputValue(int index, int whichOutput) {
    Instance i = allInstances.instance(index);
    if (i == null || i.numAttributes() < (numInputs + numMetaData + whichOutput)) {
        return Double.NaN;
    }
    if (i.isMissing(numMetaData + numInputs + whichOutput)) {
        return Double.NaN;
    }
    return i.value(numMetaData + numInputs + whichOutput);
    /*
    if (i.attribute(numMetaData + numInputs + whichOutput).isNumeric()) {
        return i.value(numMetaData + numInputs + whichOutput);
    } else {
        //What we need to do if we allow classes that don't start at 1:
        //return Double.parseDouble(i.attribute(numMetaData + numInputs + whichOutput).value((int) i.value(numMetaData + numInputs + whichOutput)));
        return i.value(numMetaData + numInputs + whichOutput) + 1;
    }
    */
}
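The four DataManager methods above share one pattern: an output lives at column numMetaData + numInputs + whichOutput, and every read or write of that column is guarded by isMissing. A condensed sketch of the guarded read, with the surrounding class and field layout as illustrative stand-ins for the real ones:

import weka.core.Instance;
import weka.core.Instances;

class OutputColumnReader {
    // Mirrors getOutputValue() above: report a missing output as NaN
    // instead of exposing Weka's internal missing-value encoding.
    static double readOutput(Instances allInstances, int row, int whichOutput,
            int numMetaData, int numInputs) {
        Instance inst = allInstances.instance(row);
        int col = numMetaData + numInputs + whichOutput;
        return inst.isMissing(col) ? Double.NaN : inst.value(col);
    }
}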
From source file: wekimini.gui.WekiArffLoader.java

private void receivedConfiguration(int[] selectedIndices, boolean overwrite, boolean ignoreWithNoOutputs) {
    //Now load the data. TODO
    //For each instance:
    //Slow, not great, but should work:
    //addImportedData(double[] inputs, double[][] outputs, boolean[] inputMask, boolean[] outputMask) {
    //w.getSupervisedLearningManager().addBundleToTraining(null, outputs, recordingMask);
    //w.getDataManager().addToTraining(inputs, outputs, recordingMask, recordingRound);
    w.getSupervisedLearningManager().incrementRecordingRound();
    if (overwrite) {
        w.getSupervisedLearningManager().deleteAllExamples();
    }
    boolean[] inputMaskForSet = createInputMaskForSet(selectedIndices);
    boolean[] outputMaskForSet = createOutputMaskForSet(selectedIndices);
    try {
        //Get enumerator for instances...
        Instance nextInstance = af.getNextInstance(structure);
        int numInputs = inputMaskForSet.length;
        int numOutputs = outputMaskForSet.length;
        while (nextInstance != null) {
            double[] inputs = new double[inputMaskForSet.length];
            double[] outputs = new double[outputMaskForSet.length];
            boolean[] inputMask = new boolean[inputMaskForSet.length];
            System.arraycopy(inputMaskForSet, 0, inputMask, 0, inputMask.length);
            boolean[] outputMask = new boolean[outputMaskForSet.length];
            System.arraycopy(outputMaskForSet, 0, outputMask, 0, outputMask.length);
            int numOutputsMissing = 0;
            for (int i = 0; i < selectedIndices.length; i++) {
                //selectedIndices[i] says which input/output corresponds to the ith attribute
                int projectIndexForCol = projectIndicesPerColumn.get(i).get(selectedIndices[i]);
                if (projectIndexForCol == 0) {
                    //do nothing: ignore it
                } else if (projectIndexForCol <= inputs.length) {
                    //it's an input
                    if (nextInstance.isMissing(i)) {
                        inputs[projectIndexForCol - 1] = 0;
                        inputMask[projectIndexForCol - 1] = false;
                    } else {
                        inputs[projectIndexForCol - 1] = nextInstance.value(i);
                    }
                } else {
                    //it's an output
                    if (nextInstance.isMissing(i)) {
                        outputs[projectIndexForCol - 1 - numInputs] = 0;
                        outputMask[projectIndexForCol - 1 - numInputs] = false;
                        numOutputsMissing++;
                    } else {
                        double val = nextInstance.value(i);
                        outputs[projectIndexForCol - 1 - numInputs] = val;
                    }
                }
            }
            if (!ignoreWithNoOutputs || numOutputsMissing < numOutputs) {
                w.getSupervisedLearningManager().addToTraining(inputs, outputs, inputMask, outputMask);
            }
            nextInstance = af.getNextInstance(structure);
        }
    } catch (IOException ex) {
        w.getStatusUpdateCenter().warn(this, "Encountered error in reading from ARFF file.");
        Logger.getLogger(WekiArffLoader.class.getName()).log(Level.SEVERE, null, ex);
        recv.completed();
    }
    //TODO: Prevent this from being available when in DTW mode.
    recv.completed();
}
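Stripped of the project-specific masking, the loop above is Weka's incremental-loading idiom: read the header with getStructure(), then pull instances one at a time and test each attribute with isMissing before trusting its value. A minimal standalone version (the file name is a placeholder):

import java.io.File;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.converters.ArffLoader;

public class MissingScan {
    public static void main(String[] args) throws Exception {
        ArffLoader loader = new ArffLoader();
        loader.setSource(new File("data.arff")); // placeholder path
        Instances structure = loader.getStructure();

        Instance inst;
        while ((inst = loader.getNextInstance(structure)) != null) {
            for (int j = 0; j < inst.numAttributes(); j++) {
                if (inst.isMissing(j)) {
                    System.out.println("missing value in attribute: " + structure.attribute(j).name());
                }
            }
        }
    }
}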
From source file: zhaop.textmining.proj.MyStringToWordVector.java
License: Open Source License

/**
 * Determines the dictionary.
 */
private void determineDictionary() {
    // initialize stopwords
    Stopwords stopwords = new Stopwords();
    if (getUseStoplist()) {
        try {
            if (getStopwords().exists() && !getStopwords().isDirectory())
                stopwords.read(getStopwords());
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    // Operate on a per-class basis if class attribute is set
    int classInd = getInputFormat().classIndex();
    int values = 1;
    if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {
        values = getInputFormat().attribute(classInd).numValues();
    }

    TreeMap[] dictionaryArr = new TreeMap[values];
    for (int i = 0; i < values; i++) {
        dictionaryArr[i] = new TreeMap();
    }

    // Make sure we know which fields to convert
    determineSelectedRange();

    // Tokenize all training text into an orderedMap of "words".
    long pruneRate = Math.round((m_PeriodicPruningRate / 100.0) * getInputFormat().numInstances());
    for (int i = 0; i < getInputFormat().numInstances(); i++) {
        Instance instance = getInputFormat().instance(i);
        int vInd = 0;
        if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {
            vInd = (int) instance.classValue();
        }

        // Iterate through all relevant string attributes of the current instance
        Hashtable h = new Hashtable();
        for (int j = 0; j < instance.numAttributes(); j++) {
            if (m_SelectedRange.isInRange(j) && (instance.isMissing(j) == false)) {
                // Get tokenizer
                m_Tokenizer.tokenize(instance.stringValue(j));

                // Iterate through tokens, perform stemming, and remove stopwords
                // (if required)
                while (m_Tokenizer.hasMoreElements()) {
                    String word = ((String) m_Tokenizer.nextElement()).intern();
                    if (this.m_lowerCaseTokens == true)
                        word = word.toLowerCase();
                    // stop first.
                    if (this.m_useStoplist == true)
                        if (stopwords.is(word)) {
                            continue;
                        }
                    // stem next
                    word = m_Stemmer.stem(word);
                    if (!(h.contains(word)))
                        h.put(word, new Integer(0));
                    Count count = (Count) dictionaryArr[vInd].get(word);
                    if (count == null) {
                        dictionaryArr[vInd].put(word, new Count(1));
                    } else {
                        count.count++;
                    }
                }
            }
        }

        // Update the docCount for the words that have occurred in this
        // instance (document).
        Enumeration e = h.keys();
        while (e.hasMoreElements()) {
            String word = (String) e.nextElement();
            Count c = (Count) dictionaryArr[vInd].get(word);
            if (c != null) {
                c.docCount++;
            } else
                System.err.println("Warning: A word should definitely be in the "
                        + "dictionary. Please check the code");
        }

        if (pruneRate > 0) {
            if (i % pruneRate == 0 && i > 0) {
                for (int z = 0; z < values; z++) {
                    Vector d = new Vector(1000);
                    Iterator it = dictionaryArr[z].keySet().iterator();
                    while (it.hasNext()) {
                        String word = (String) it.next();
                        Count count = (Count) dictionaryArr[z].get(word);
                        if (count.count <= 1) {
                            d.add(word);
                        }
                    }
                    Iterator iter = d.iterator();
                    while (iter.hasNext()) {
                        String word = (String) iter.next();
                        dictionaryArr[z].remove(word);
                    }
                }
            }
        }
    }

    // Figure out the minimum required word frequency
    int totalsize = 0;
    int prune[] = new int[values];
    for (int z = 0; z < values; z++) {
        totalsize += dictionaryArr[z].size();
        int array[] = new int[dictionaryArr[z].size()];
        int pos = 0;
        Iterator it = dictionaryArr[z].keySet().iterator();
        while (it.hasNext()) {
            String word = (String) it.next();
            Count count = (Count) dictionaryArr[z].get(word);
            array[pos] = count.count;
            pos++;
        }
        // sort the array
        sortArray(array);
        if (array.length < m_WordsToKeep) {
            // if there aren't enough words, set the threshold to minFreq
            prune[z] = m_minTermFreq;
        } else {
            // otherwise set it to be at least minFreq
            prune[z] = Math.max(m_minTermFreq, array[array.length - m_WordsToKeep]);
        }
    }

    // Convert the dictionary into an attribute index
    // and create one attribute per word
    FastVector attributes = new FastVector(totalsize + getInputFormat().numAttributes());

    // Add the non-converted attributes
    int classIndex = -1;
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
        if (!m_SelectedRange.isInRange(i)) {
            if (getInputFormat().classIndex() == i) {
                classIndex = attributes.size();
            }
            attributes.addElement(getInputFormat().attribute(i).copy());
        }
    }

    // Add the word vector attributes (eliminating duplicates
    // that occur in multiple classes)
    TreeMap newDictionary = new TreeMap();
    int index = attributes.size();
    for (int z = 0; z < values; z++) {
        Iterator it = dictionaryArr[z].keySet().iterator();
        while (it.hasNext()) {
            String word = (String) it.next();
            Count count = (Count) dictionaryArr[z].get(word);
            if (count.count >= prune[z]) {
                if (newDictionary.get(word) == null) {
                    newDictionary.put(word, new Integer(index++));
                    attributes.addElement(new Attribute(m_Prefix + word));
                }
            }
        }
    }

    // Compute document frequencies
    m_DocsCounts = new int[attributes.size()];
    Iterator it = newDictionary.keySet().iterator();
    while (it.hasNext()) {
        String word = (String) it.next();
        int idx = ((Integer) newDictionary.get(word)).intValue();
        int docsCount = 0;
        for (int j = 0; j < values; j++) {
            Count c = (Count) dictionaryArr[j].get(word);
            if (c != null)
                docsCount += c.docCount;
        }
        m_DocsCounts[idx] = docsCount;
    }

    // Trim vector and set instance variables
    attributes.trimToSize();
    m_Dictionary = newDictionary;
    m_NumInstances = getInputFormat().numInstances();

    // Set the filter's output format
    Instances outputFormat = new Instances(getInputFormat().relationName(), attributes, 0);
    outputFormat.setClassIndex(classIndex);
    setOutputFormat(outputFormat);
}
From source file: zhaop.textmining.proj.MyStringToWordVector.java
License: Open Source License

/**
 * Converts the instance w/o normalization.
 *
 * @param instance the instance to convert
 * @param v the vector the converted instance is added to
 * @return the index of the first word attribute (the number of non-converted attributes copied)
 */
private int convertInstancewoDocNorm(Instance instance, FastVector v) {
    // Convert the instance into a sorted set of indexes
    TreeMap contained = new TreeMap();

    // Copy all non-converted attributes from input to output
    int firstCopy = 0;
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
        if (!m_SelectedRange.isInRange(i)) {
            if (getInputFormat().attribute(i).type() != Attribute.STRING) {
                // Add simple nominal and numeric attributes directly
                if (instance.value(i) != 0.0) {
                    contained.put(new Integer(firstCopy), new Double(instance.value(i)));
                }
            } else {
                if (instance.isMissing(i)) {
                    contained.put(new Integer(firstCopy), new Double(Instance.missingValue()));
                } else {
                    // If this is a string attribute, we have to first add
                    // this value to the range of possible values, then add
                    // its new internal index.
                    if (outputFormatPeek().attribute(firstCopy).numValues() == 0) {
                        // Note that the first string value in a
                        // SparseInstance doesn't get printed.
                        outputFormatPeek().attribute(firstCopy)
                                .addStringValue("Hack to defeat SparseInstance bug");
                    }
                    int newIndex = outputFormatPeek().attribute(firstCopy)
                            .addStringValue(instance.stringValue(i));
                    contained.put(new Integer(firstCopy), new Double(newIndex));
                }
            }
            firstCopy++;
        }
    }

    for (int j = 0; j < instance.numAttributes(); j++) {
        if (m_SelectedRange.isInRange(j) && (instance.isMissing(j) == false)) {
            m_Tokenizer.tokenize(instance.stringValue(j));
            while (m_Tokenizer.hasMoreElements()) {
                String word = (String) m_Tokenizer.nextElement();
                if (this.m_lowerCaseTokens == true)
                    word = word.toLowerCase();
                word = m_Stemmer.stem(word);
                Integer index = (Integer) m_Dictionary.get(word);
                if (index != null) {
                    if (m_OutputCounts) {
                        // Separate if here rather than two lines down
                        // to avoid hashtable lookup
                        Double count = (Double) contained.get(index);
                        if (count != null) {
                            contained.put(index, new Double(count.doubleValue() + 1.0));
                        } else {
                            contained.put(index, new Double(1));
                        }
                    } else {
                        contained.put(index, new Double(1));
                    }
                }
            }
        }
    }

    // Doing TFTransform
    if (m_TFTransform == true) {
        Iterator it = contained.keySet().iterator();
        for (int i = 0; it.hasNext(); i++) {
            Integer index = (Integer) it.next();
            if (index.intValue() >= firstCopy) {
                double val = ((Double) contained.get(index)).doubleValue();
                val = Math.log(val + 1);
                contained.put(index, new Double(val));
            }
        }
    }

    // Doing IDFTransform
    if (m_IDFTransform == true) {
        Iterator it = contained.keySet().iterator();
        for (int i = 0; it.hasNext(); i++) {
            Integer index = (Integer) it.next();
            if (index.intValue() >= firstCopy) {
                double val = ((Double) contained.get(index)).doubleValue();
                val = val * Math.log(m_NumInstances / (double) m_DocsCounts[index.intValue()]);
                contained.put(index, new Double(val));
            }
        }
    }

    // Convert the set to structures needed to create a sparse instance.
    double[] values = new double[contained.size()];
    int[] indices = new int[contained.size()];
    Iterator it = contained.keySet().iterator();
    for (int i = 0; it.hasNext(); i++) {
        Integer index = (Integer) it.next();
        Double value = (Double) contained.get(index);
        values[i] = value.doubleValue();
        indices[i] = index.intValue();
    }
    Instance inst = new SparseInstance(instance.weight(), values, indices, outputFormatPeek().numAttributes());
    inst.setDataset(outputFormatPeek());
    v.addElement(inst);
    return firstCopy;
}
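One detail worth noting in the conversion above: missing values have to be stored in the sparse representation explicitly (as Instance.missingValue()), because a SparseInstance treats an absent index as 0.0, not as missing. A small illustration of that distinction, again using the older Weka 3.x API:

import weka.core.Instance;
import weka.core.SparseInstance;

public class SparseMissingDemo {
    public static void main(String[] args) {
        // Five attributes; index 0 holds 1.0, index 2 holds an explicit missing value.
        double[] values = { 1.0, Instance.missingValue() };
        int[] indices = { 0, 2 };
        Instance inst = new SparseInstance(1.0, values, indices, 5);

        System.out.println(inst.isMissing(2)); // true: explicitly stored as missing
        System.out.println(inst.isMissing(1)); // false: absent index reads as 0.0
    }
}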