Usage examples for `weka.core.Instance.isMissing`, which reports whether this instance's value for the given attribute is missing:
public boolean isMissing(Attribute att);
From source file:kea.KEAFilter.java
License:Open Source License
/**
 * Converts a single document instance into one output instance per candidate
 * phrase found in the document.
 *
 * <p>Each candidate is scored by the trained classifier, the resulting
 * instances are stably sorted by first-occurrence distance, then TFxIDF
 * (descending), then probability (descending), and finally a rank is assigned.
 * Candidates that have a superphrase with identical probability, TFxIDF and
 * distance are demoted to rank {@code Integer.MAX_VALUE}.
 *
 * @param instance the document instance to convert
 * @param training whether this conversion happens during training
 * @return a FastVector of converted instances, sorted as described above
 * @throws Exception if the classifier fails to score a candidate
 */
private FastVector convertInstance(Instance instance, boolean training) throws Exception {
    FastVector vector = new FastVector();
    if (m_Debug) {
        System.err.println("-- Converting instance");
    }
    // Get the manually assigned key phrases for the document, if present
    // (two variants: one for feature computation, one for evaluation).
    HashMap hashKeyphrases = null;
    HashMap hashKeysEval = null;
    if (!instance.isMissing(m_KeyphrasesAtt)) {
        String keyphrases = instance.stringValue(m_KeyphrasesAtt);
        hashKeyphrases = getGivenKeyphrases(keyphrases, false);
        hashKeysEval = getGivenKeyphrases(keyphrases, true);
    }
    // Extract all candidate phrases from the document text.
    HashMap hash = new HashMap();
    int length = getPhrases(hash, instance.stringValue(m_DocumentAtt));
    // Number of attributes added to the output format (one more when
    // keyphrase-frequency is used in debug mode).
    int numFeatures = 5;
    if (m_Debug) {
        if (m_KFused) {
            numFeatures = numFeatures + 1;
        }
    }
    // Indices of the key output attributes, relative to the document attribute.
    // NOTE(review): probsAttIndex is recomputed below while filling newInst;
    // this initial value only matters if the first loop body never runs.
    int phraseAttIndex = m_DocumentAtt;
    int tfidfAttIndex = m_DocumentAtt + 2;
    int distAttIndex = m_DocumentAtt + 3;
    int probsAttIndex = m_DocumentAtt + numFeatures - 1;
    // Turn each candidate phrase into an output instance.
    Iterator it = hash.keySet().iterator();
    while (it.hasNext()) {
        String phrase = (String) it.next();
        FastVector phraseInfo = (FastVector) hash.get(phrase);
        double[] vals = featVals(phrase, phraseInfo, training, hashKeysEval, hashKeyphrases, length);
        Instance inst = new Instance(instance.weight(), vals);
        inst.setDataset(m_ClassifierData);
        // Probability of the phrase being a keyphrase (class index 1).
        double[] probs = m_Classifier.distributionForInstance(inst);
        double prob = probs[1];
        // Compute attribute values for the final instance.
        double[] newInst = new double[instance.numAttributes() + numFeatures];
        int pos = 0;
        for (int i = 0; i < instance.numAttributes(); i++) {
            if (i == m_DocumentAtt) {
                // Add phrase (stemmed form).
                int index = outputFormatPeek().attribute(pos).addStringValue(phrase);
                newInst[pos++] = index;
                // Add original (unstemmed) version of the phrase.
                index = outputFormatPeek().attribute(pos).addStringValue((String) phraseInfo.elementAt(2));
                newInst[pos++] = index;
                // Add TFxIDF.
                newInst[pos++] = inst.value(m_TfidfIndex);
                // Add distance of first occurrence.
                newInst[pos++] = inst.value(m_FirstOccurIndex);
                // Add keyphrase-frequency feature when enabled.
                if (m_Debug) {
                    if (m_KFused) {
                        newInst[pos++] = inst.value(m_KeyFreqIndex);
                    }
                }
                // Add probability; remember its actual position for the sorts below.
                probsAttIndex = pos;
                newInst[pos++] = prob;
                // Set rank to missing (computed below).
                newInst[pos++] = Instance.missingValue();
            } else if (i == m_KeyphrasesAtt) {
                newInst[pos++] = inst.classValue();
            } else {
                newInst[pos++] = instance.value(i);
            }
        }
        Instance ins = new Instance(instance.weight(), newInst);
        ins.setDataset(outputFormatPeek());
        vector.addElement(ins);
    }
    // Add dummy instances for assigned keyphrases that don't occur
    // in the document (needed so evaluation can see them).
    if (hashKeysEval != null) {
        Iterator phrases = hashKeysEval.keySet().iterator();
        while (phrases.hasNext()) {
            String phrase = (String) phrases.next();
            double[] newInst = new double[instance.numAttributes() + numFeatures];
            int pos = 0;
            for (int i = 0; i < instance.numAttributes(); i++) {
                if (i == m_DocumentAtt) {
                    // Add phrase.
                    int index = outputFormatPeek().attribute(pos).addStringValue(phrase);
                    newInst[pos++] = (double) index;
                    // Add original version.
                    index = outputFormatPeek().attribute(pos).addStringValue((String) hashKeysEval.get(phrase));
                    newInst[pos++] = (double) index;
                    // TFxIDF and distance are unknown for phrases not in the document.
                    newInst[pos++] = Instance.missingValue();
                    newInst[pos++] = Instance.missingValue();
                    if (m_Debug) {
                        if (m_KFused) {
                            newInst[pos++] = Instance.missingValue();
                        }
                    }
                    // Lowest possible probability so these sort last; rank missing.
                    newInst[pos++] = -Double.MAX_VALUE;
                    newInst[pos++] = Instance.missingValue();
                } else if (i == m_KeyphrasesAtt) {
                    newInst[pos++] = 1; // Keyphrase
                } else {
                    newInst[pos++] = instance.value(i);
                }
            }
            Instance inst = new Instance(instance.weight(), newInst);
            inst.setDataset(outputFormatPeek());
            vector.addElement(inst);
        }
    }
    // Sort phrases according to their distance (stable sort).
    double[] vals = new double[vector.size()];
    for (int i = 0; i < vals.length; i++) {
        vals[i] = ((Instance) vector.elementAt(i)).value(distAttIndex);
    }
    FastVector newVector = new FastVector(vector.size());
    int[] sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;
    // Sort phrases according to their TFxIDF value, descending (stable sort).
    for (int i = 0; i < vals.length; i++) {
        vals[i] = -((Instance) vector.elementAt(i)).value(tfidfAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;
    // Sort phrases according to their probability, descending (stable sort);
    // vals[i] == 1 - prob, so vals[i] >= 1.0 means prob <= 0.
    for (int i = 0; i < vals.length; i++) {
        vals[i] = 1 - ((Instance) vector.elementAt(i)).value(probsAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;
    // Compute rank of phrases. Check for subphrases that are ranked
    // lower than superphrases and set their rank to Integer.MAX_VALUE.
    int rank = 1;
    for (int i = 0; i < vals.length; i++) {
        Instance currentInstance = (Instance) vector.elementAt(i);
        // Short cut: if phrase very unlikely make rank very low and continue.
        if (Utils.grOrEq(vals[i], 1.0)) {
            currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE);
            continue;
        }
        // Otherwise look for a superphrase among the run of phrases that share
        // the same probability, TFxIDF value, and distance as the current one.
        int startInd = i;
        while (startInd < vals.length) {
            Instance inst = (Instance) vector.elementAt(startInd);
            if ((inst.value(tfidfAttIndex) != currentInstance.value(tfidfAttIndex))
                    || (inst.value(probsAttIndex) != currentInstance.value(probsAttIndex))
                    || (inst.value(distAttIndex) != currentInstance.value(distAttIndex))) {
                break;
            }
            startInd++;
        }
        String val = currentInstance.stringValue(phraseAttIndex);
        boolean foundSuperphrase = false;
        for (int j = startInd - 1; j >= 0; j--) {
            if (j != i) {
                Instance candidate = (Instance) vector.elementAt(j);
                String potSuperphrase = candidate.stringValue(phraseAttIndex);
                if (val.length() <= potSuperphrase.length()) {
                    if (KEAFilter.contains(val, potSuperphrase)) {
                        foundSuperphrase = true;
                        break;
                    }
                }
            }
        }
        if (foundSuperphrase) {
            currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE);
        } else {
            currentInstance.setValue(probsAttIndex + 1, rank++);
        }
    }
    return vector;
}
From source file:kea.KEAPhraseFilter.java
License:Open Source License
/**
 * Converts an instance by cleaning all of its (in-range) string attribute
 * values: words are kept, joined by the separator that connected them in the
 * source text (' ', '-' or '/'), and phrase boundaries are emitted as '\n'
 * wherever non-word material (punctuation, numbers, blank lines) is found.
 *
 * @param instance the instance to convert; the cleaned copy is pushed to the
 *                 output queue
 * @throws Exception if adding a string value to the output format fails
 */
private void convertInstance(Instance instance) throws Exception {
    double[] instVals = new double[instance.numAttributes()];
    for (int i = 0; i < instance.numAttributes(); i++) {
        if (!instance.attribute(i).isString() || instance.isMissing(i)) {
            // Non-string and missing attributes pass through unchanged.
            instVals[i] = instance.value(i);
        } else {
            if (!m_SelectCols.isInRange(i)) {
                // String attribute not selected for filtering: copy verbatim.
                int index = getOutputFormat().attribute(i).addStringValue(instance.stringValue(i));
                instVals[i] = (double) index;
                continue;
            }
            String str = instance.stringValue(i);
            StringBuffer resultStr = new StringBuffer();
            int j = 0;
            boolean phraseStart = true;      // true while no word has been emitted for the current phrase
            boolean seenNewLine = false;     // last separator seen was a newline
            boolean haveSeenHyphen = false;  // last separator seen was '-'
            boolean haveSeenSlash = false;   // last separator seen was '/'
            while (j < str.length()) {
                boolean isWord = false;      // token contains at least one letter
                boolean potNumber = false;   // token contains at least one letter or digit
                int startj = j;
                // Scan one token: letters/digits, plus internal '.', '@', '_',
                // '&', '/', '-' (only when surrounded by alphanumerics) and
                // apostrophes preceded by an alphanumeric.
                while (j < str.length()) {
                    char ch = str.charAt(j);
                    if (Character.isLetterOrDigit(ch)) {
                        potNumber = true;
                        if (Character.isLetter(ch)) {
                            isWord = true;
                        }
                        j++;
                    } else if ((!m_DisallowInternalPeriods && (ch == '.')) || (ch == '@') || (ch == '_')
                            || (ch == '&') || (ch == '/') || (ch == '-')) {
                        if ((j > 0) && (j + 1 < str.length()) && Character.isLetterOrDigit(str.charAt(j - 1))
                                && Character.isLetterOrDigit(str.charAt(j + 1))) {
                            j++;
                        } else {
                            break;
                        }
                    } else if (ch == '\'') {
                        if ((j > 0) && Character.isLetterOrDigit(str.charAt(j - 1))) {
                            j++;
                        } else {
                            break;
                        }
                    } else {
                        break;
                    }
                }
                if (isWord == true) {
                    // Emit the word, preceded by the separator that linked it
                    // to the previous word of the same phrase.
                    if (!phraseStart) {
                        if (haveSeenHyphen) {
                            resultStr.append('-');
                        } else if (haveSeenSlash) {
                            resultStr.append('/');
                        } else {
                            resultStr.append(' ');
                        }
                    }
                    resultStr.append(str.substring(startj, j));
                    if (j == str.length()) {
                        break;
                    }
                    // Classify the character that terminated the token.
                    phraseStart = false;
                    seenNewLine = false;
                    haveSeenHyphen = false;
                    haveSeenSlash = false;
                    if (Character.isWhitespace(str.charAt(j))) {
                        if (str.charAt(j) == '\n') {
                            seenNewLine = true;
                        }
                    } else if (str.charAt(j) == '-') {
                        haveSeenHyphen = true;
                    } else if (str.charAt(j) == '/') {
                        haveSeenSlash = true;
                    } else {
                        // Any other character ends the current phrase.
                        phraseStart = true;
                        resultStr.append('\n');
                    }
                    j++;
                } else if (j == str.length()) {
                    break;
                } else if (str.charAt(j) == '\n') {
                    // A second consecutive newline, or a newline after a
                    // number-like token, ends the current phrase.
                    if (seenNewLine) {
                        if (phraseStart == false) {
                            resultStr.append('\n');
                            phraseStart = true;
                        }
                    } else if (potNumber) {
                        if (phraseStart == false) {
                            phraseStart = true;
                            resultStr.append('\n');
                        }
                    }
                    seenNewLine = true;
                    j++;
                } else if (Character.isWhitespace(str.charAt(j))) {
                    // Whitespace after a number-like token ends the phrase.
                    if (potNumber) {
                        if (phraseStart == false) {
                            phraseStart = true;
                            resultStr.append('\n');
                        }
                    }
                    j++;
                } else {
                    // Any other non-token character ends the phrase.
                    if (phraseStart == false) {
                        resultStr.append('\n');
                        phraseStart = true;
                    }
                    j++;
                }
            }
            int index = getOutputFormat().attribute(i).addStringValue(resultStr.toString());
            instVals[i] = (double) index;
        }
    }
    Instance inst = new Instance(instance.weight(), instVals);
    inst.setDataset(getOutputFormat());
    push(inst);
}
From source file:kea.NumbersFilter.java
License:Open Source License
/** * Converts an instance. A phrase boundary is inserted where * a number is found.//from w w w . ja va 2 s. c o m */ private void convertInstance(Instance instance) throws Exception { double[] instVals = new double[instance.numAttributes()]; for (int i = 0; i < instance.numAttributes(); i++) { if ((!instance.attribute(i).isString()) || instance.isMissing(i)) { instVals[i] = instance.value(i); } else { String str = instance.stringValue(i); StringBuffer resultStr = new StringBuffer(); StringTokenizer tok = new StringTokenizer(str, " \t\n", true); while (tok.hasMoreTokens()) { String token = tok.nextToken(); // Everything that doesn't contain at least // one letter is considered to be a number boolean isNumber = true; for (int j = 0; j < token.length(); j++) { if (Character.isLetter(token.charAt(j))) { isNumber = false; break; } } if (!isNumber) { resultStr.append(token); } else { if (token.equals(" ") || token.equals("\t") || token.equals("\n")) { resultStr.append(token); } else { resultStr.append(" \n "); } } } int index = getOutputFormat().attribute(i).addStringValue(resultStr.toString()); instVals[i] = (double) index; } } Instance inst = new Instance(instance.weight(), instVals); inst.setDataset(getOutputFormat()); push(inst); }
From source file:kmeans.MyKMeans.java
void updateCentroidForNominal(int numCentroid, int numAttr) { // System.out.println("Update centroid "+numCentroid+" attr "+dataSource.attribute(numAttr)+"|"+numAttr); int distinctValue = dataSource.attribute(numAttr).numValues(); int[] countInst = new int[distinctValue]; for (int i = 0; i < distinctValue; i++) countInst[i]++;//from ww w . j av a2 s. c o m Attribute attr = dataSource.attribute(numAttr); List<Integer> listInst = listClusteredInstance.get(numCentroid); //Mencari nilai attribut paling banyak dalam 1 cluster for (int i = 0; i < listInst.size(); i++) { Instance inst = dataSource.get(listInst.get(i)); if (!inst.isMissing(attr)) { String attrValue = inst.toString(attr); int indexValue = attr.indexOfValue(attrValue); // System.out.println(inst+"|"+attrValue+"|"+indexValue); countInst[indexValue]++; } } int max = -1, idxMax = -1; for (int i = 0; i < distinctValue; i++) { if (countInst[i] > max) { idxMax = i; max = countInst[i]; } } String newValue = attr.value(idxMax); Instance tempCentroid = centroid.get(numCentroid); tempCentroid.setValue(attr, newValue); centroid.set(numCentroid, tempCentroid); }
From source file:lattice.Lattice.java
License:Open Source License
/**
 * Constructor of a lattice over the given variables of the dataset.
 *
 * <p>Builds, for every attribute, one BitSet per attribute value (plus one
 * extra slot for missing values) marking which rows take that value, then
 * creates the root node and one lattice node per single variable.
 *
 * @param dataset the (fully loaded) instances to build the lattice from
 */
public Lattice(Instances dataset) {
    // Initialise internal structure for counting (TID sets).
    this.nbInstances = dataset.numInstances();
    this.nbVariables = dataset.numAttributes();
    BitSet[][] presence = new BitSet[nbVariables][];
    TreeSet<Integer> allAttributesNumbers = new TreeSet<Integer>();
    int[] nbValuesForAttribute = new int[nbVariables];
    for (int a = 0; a < nbVariables; a++) {
        nbValuesForAttribute[a] = dataset.numDistinctValues(a) + 1; // +1 for missing
        presence[a] = new BitSet[nbValuesForAttribute[a]];
        allAttributesNumbers.add(a);
        for (int v = 0; v < presence[a].length; v++) {
            presence[a][v] = new BitSet();
        }
    }
    // Mark, for each row and attribute, which value index the row takes.
    for (int i = 0; i < nbInstances; i++) {
        Instance row = dataset.instance(i);
        for (int a = 0; a < nbVariables; a++) {
            int indexOfValue;
            if (row.isMissing(a)) {
                indexOfValue = dataset.numDistinctValues(a); // missing bucket at the end
            } else {
                String value = row.stringValue(a);
                indexOfValue = row.attribute(a).indexOfValue(value);
            }
            presence[a][indexOfValue].set(i);
        }
    }
    // Initialise the first nodes of the lattice, i.e. the ones
    // corresponding to single variables.
    this.all = new LatticeNode(this, nbValuesForAttribute);
    this.singleNodes = new LatticeNode[nbVariables];
    for (int a = 0; a < nbVariables; a++) {
        int[] variablesNumbers = { a };
        LatticeNode node = new LatticeNode(this, variablesNumbers, nbValuesForAttribute, presence[a], all);
        singleNodes[a] = node;
    }
}
From source file:lattice.Lattice.java
License:Open Source License
/**
 * Constructor of a lattice that streams instances from an ARFF reader instead
 * of requiring a fully loaded dataset.
 *
 * <p>Same TID-set construction as the dataset-based constructor, but rows are
 * read one at a time from {@code loader} and counted as they arrive, so only
 * the dataset structure (header) is needed up front.
 *
 * @param structure the dataset header describing the attributes
 * @param loader    the reader supplying instances one by one
 * @throws IOException if reading an instance fails
 */
public Lattice(Instances structure, ArffReader loader) throws IOException {
    // Initialise internal structure for counting (TID sets).
    this.nbInstances = 0;
    this.nbVariables = structure.numAttributes();
    BitSet[][] presence = new BitSet[nbVariables][];
    TreeSet<Integer> allAttributesNumbers = new TreeSet<Integer>();
    int[] nbValuesForAttribute = new int[nbVariables];
    for (int a = 0; a < nbVariables; a++) {
        nbValuesForAttribute[a] = structure.numDistinctValues(a) + 1; // +1 for missing
        presence[a] = new BitSet[nbValuesForAttribute[a]];
        allAttributesNumbers.add(a);
        for (int v = 0; v < presence[a].length; v++) {
            presence[a][v] = new BitSet();
        }
    }
    // Stream rows, marking which value index each row takes per attribute.
    Instance row;
    while ((row = loader.readInstance(structure)) != null) {
        for (int a = 0; a < nbVariables; a++) {
            int indexOfValue;
            if (row.isMissing(a)) {
                indexOfValue = structure.numDistinctValues(a); // missing bucket at the end
            } else {
                String value = row.stringValue(a);
                indexOfValue = row.attribute(a).indexOfValue(value);
            }
            presence[a][indexOfValue].set(this.nbInstances);
        }
        this.nbInstances++;
    }
    // Initialise the first nodes of the lattice, i.e. the ones
    // corresponding to single variables.
    this.all = new LatticeNode(this, nbValuesForAttribute);
    this.singleNodes = new LatticeNode[nbVariables];
    for (int a = 0; a < nbVariables; a++) {
        int[] variablesNumbers = { a };
        LatticeNode node = new LatticeNode(this, variablesNumbers, nbValuesForAttribute, presence[a], all);
        singleNodes[a] = node;
    }
}
From source file:lu.lippmann.cdb.common.gui.ts.TimeSeriesChartUtil.java
License:Open Source License
private static void fillWithSingleAxis(final Instances dataSet, final int dateIdx, final TimeSeriesCollection tsDataset) { final int numInstances = dataSet.numInstances(); final Calendar cal = Calendar.getInstance(); for (final Integer i : WekaDataStatsUtil.getNumericAttributesIndexes(dataSet)) { if (dataSet.attributeStats(i).missingCount == dataSet.numInstances()) { System.out.println("TimeSeriesChartUtil: Only missing values for '" + dataSet.attribute(i).name() + "', so skip it!"); continue; }//from w ww . j a va 2 s . co m final TimeSeries ts = new TimeSeries(dataSet.attribute(i).name()); for (int k = 0; k < numInstances; k++) { final Instance instancek = dataSet.instance(k); final long timeInMilliSec = (long) instancek.value(dateIdx); cal.setTimeInMillis(timeInMilliSec); if (instancek.isMissing(i)) { ts.addOrUpdate(new Millisecond(cal.getTime()), null); } else { ts.addOrUpdate(new Millisecond(cal.getTime()), instancek.value(i)); } } if (!ts.isEmpty()) tsDataset.addSeries(ts); } }
From source file:lu.lippmann.cdb.common.gui.ts.TimeSeriesChartUtil.java
License:Open Source License
private static void fillWithSingleAxisInterval(final Instances dataSet, final int dateIdx, final YIntervalSeriesCollection tsDataset, final double deviation, final int deviatedAttrIdx) { final int numInstances = dataSet.numInstances(); for (final Integer i : WekaDataStatsUtil.getNumericAttributesIndexes(dataSet)) { if (dataSet.attributeStats(i).missingCount == dataSet.numInstances()) { System.out.println("TimeSeriesChartUtil: Only missing values for '" + dataSet.attribute(i).name() + "', so skip it!"); continue; }/*from ww w.j a v a 2 s .c o m*/ final YIntervalSeries ts = new YIntervalSeries(dataSet.attribute(i).name()); for (int k = 0; k < numInstances; k++) { final Instance instancek = dataSet.instance(k); final long timeInMilliSec = (long) instancek.value(dateIdx); if (instancek.isMissing(i)) { //ts.add(timeInMilliSec,null,0d,0d); } else { if (i == deviatedAttrIdx && k > 0 && k < (numInstances - 1)) { System.out.println(numInstances + " " + k + " " + instancek.value(i) + " " + (instancek.value(i) - deviation) + " " + (instancek.value(i) + deviation)); ts.add(timeInMilliSec, instancek.value(i), instancek.value(i) - deviation, instancek.value(i) + deviation); } else { ts.add(timeInMilliSec, instancek.value(i), instancek.value(i), instancek.value(i)); } //System.out.println(instancek.value(i)+" "+(instancek.value(i)-deviation)+" "+(instancek.value(i)+deviation)); } } if (!ts.isEmpty()) tsDataset.addSeries(ts); } }
From source file:lu.lippmann.cdb.ext.hydviga.ui.GapFillingFrame.java
License:Open Source License
private Instances buildCorrectedDataset(final Instances diff) { //System.out.println("Build a corrected dataset ..."); final Instances correctedDataSet = new Instances(dataSet); final int corrNumInstances = correctedDataSet.numInstances(); final int diffNumInstances = diff.numInstances(); final int diffNumAttributes = diff.numAttributes(); final int idxInDiff = 0; for (int k = 0; k < diffNumInstances; k++) { final Instance diffInstanceK = diff.instance(k); if (diffInstanceK.isMissing(idxInDiff)) continue; final long timestamp = (long) diffInstanceK.value(diffNumAttributes - 1); for (int h = 0; h < corrNumInstances; h++) { if ((long) correctedDataSet.instance(h).value(dateIdx) == timestamp) { correctedDataSet.instance(h).setValue(attr, diffInstanceK.value(idxInDiff)); break; }//ww w .j av a 2s . c o m } } //System.out.println("... corrected dataset built!"); return correctedDataSet; }
From source file:lu.lippmann.cdb.ext.hydviga.util.TransformTimeSeries.java
License:Open Source License
/**
 * Main method. Loads a time-series ARFF file, perturbs every attribute by
 * adding to it the values of two randomly chosen other attributes (rows where
 * any of the three values is missing are left untouched), and saves the
 * result as a "fake" ARFF file.
 *
 * @param args command line arguments (unused)
 */
public static final void main(final String[] args) {
    try {
        final Instances dataSet = WekaDataAccessUtil.loadInstancesFromARFFOrCSVFile(new File("."
                + File.separatorChar + "data_fake" + File.separatorChar + "all_valid_q_series_complete2.arff"));
        System.out.println(dataSet.toSummaryString());
        final int attrCount = dataSet.numAttributes();
        final int rowCount = dataSet.numInstances();
        for (int a = 0; a < attrCount; a++) {
            // Two random partner attributes drawn from the first attrCount-3
            // columns (keeps the trailing columns out of the mix).
            final int partnerOne = (int) (Math.random() * (double) (attrCount - 3));
            final int partnerTwo = (int) (Math.random() * (double) (attrCount - 3));
            for (int r = 0; r < rowCount; r++) {
                final Instance row = dataSet.instance(r);
                if (row.isMissing(a) || row.isMissing(partnerOne) || row.isMissing(partnerTwo)) {
                    continue;
                }
                row.setValue(a, row.value(a) + row.value(partnerOne) + row.value(partnerTwo));
            }
        }
        WekaDataAccessUtil.saveInstancesIntoARFFFile(dataSet, new File("." + File.separatorChar
                + "data_fake" + File.separatorChar + "all_valid_q_series_complete2_fake.arff"));
    } catch (final Exception e) {
        e.printStackTrace();
    }
}