Example usage for weka.core Instance isMissing

List of usage examples for weka.core Instance isMissing

Introduction

In this page you can find the example usage for weka.core Instance isMissing.

Prototype

public boolean isMissing(Attribute att);

Source Link

Document

Tests if a specific value is "missing".

Usage

From source file:kea.KEAFilter.java

License:Open Source License

/**
 * Converts an instance.//w  w w .ja va 2  s.  com
 */
private FastVector convertInstance(Instance instance, boolean training) throws Exception {

    FastVector vector = new FastVector();

    if (m_Debug) {
        System.err.println("-- Converting instance");
    }

    // Get the key phrases for the document
    HashMap hashKeyphrases = null;
    HashMap hashKeysEval = null;
    if (!instance.isMissing(m_KeyphrasesAtt)) {
        String keyphrases = instance.stringValue(m_KeyphrasesAtt);
        hashKeyphrases = getGivenKeyphrases(keyphrases, false);
        hashKeysEval = getGivenKeyphrases(keyphrases, true);
    }

    // Get the phrases for the document
    HashMap hash = new HashMap();
    int length = getPhrases(hash, instance.stringValue(m_DocumentAtt));

    // Compute number of extra attributes
    int numFeatures = 5;
    if (m_Debug) {
        if (m_KFused) {
            numFeatures = numFeatures + 1;
        }
    }

    // Set indices of key attributes
    int phraseAttIndex = m_DocumentAtt;
    int tfidfAttIndex = m_DocumentAtt + 2;
    int distAttIndex = m_DocumentAtt + 3;
    int probsAttIndex = m_DocumentAtt + numFeatures - 1;

    // Go through the phrases and convert them into instances
    Iterator it = hash.keySet().iterator();
    while (it.hasNext()) {
        String phrase = (String) it.next();
        FastVector phraseInfo = (FastVector) hash.get(phrase);
        double[] vals = featVals(phrase, phraseInfo, training, hashKeysEval, hashKeyphrases, length);
        Instance inst = new Instance(instance.weight(), vals);
        inst.setDataset(m_ClassifierData);

        // Get probability of phrase being key phrase
        double[] probs = m_Classifier.distributionForInstance(inst);
        double prob = probs[1];

        // Compute attribute values for final instance
        double[] newInst = new double[instance.numAttributes() + numFeatures];
        int pos = 0;
        for (int i = 0; i < instance.numAttributes(); i++) {
            if (i == m_DocumentAtt) {

                // Add phrase
                int index = outputFormatPeek().attribute(pos).addStringValue(phrase);
                newInst[pos++] = index;

                // Add original version
                index = outputFormatPeek().attribute(pos).addStringValue((String) phraseInfo.elementAt(2));
                newInst[pos++] = index;

                // Add TFxIDF
                newInst[pos++] = inst.value(m_TfidfIndex);

                // Add distance
                newInst[pos++] = inst.value(m_FirstOccurIndex);

                // Add other features
                if (m_Debug) {
                    if (m_KFused) {
                        newInst[pos++] = inst.value(m_KeyFreqIndex);
                    }
                }

                // Add probability 
                probsAttIndex = pos;
                newInst[pos++] = prob;

                // Set rank to missing (computed below)
                newInst[pos++] = Instance.missingValue();
            } else if (i == m_KeyphrasesAtt) {
                newInst[pos++] = inst.classValue();
            } else {
                newInst[pos++] = instance.value(i);
            }
        }
        Instance ins = new Instance(instance.weight(), newInst);
        ins.setDataset(outputFormatPeek());
        vector.addElement(ins);
    }

    // Add dummy instances for keyphrases that don't occur
    // in the document
    if (hashKeysEval != null) {
        Iterator phrases = hashKeysEval.keySet().iterator();
        while (phrases.hasNext()) {
            String phrase = (String) phrases.next();
            double[] newInst = new double[instance.numAttributes() + numFeatures];
            int pos = 0;
            for (int i = 0; i < instance.numAttributes(); i++) {
                if (i == m_DocumentAtt) {

                    // Add phrase
                    int index = outputFormatPeek().attribute(pos).addStringValue(phrase);
                    newInst[pos++] = (double) index;

                    // Add original version
                    index = outputFormatPeek().attribute(pos).addStringValue((String) hashKeysEval.get(phrase));
                    newInst[pos++] = (double) index;

                    // Add TFxIDF
                    newInst[pos++] = Instance.missingValue();

                    // Add distance
                    newInst[pos++] = Instance.missingValue();

                    // Add other features
                    if (m_Debug) {
                        if (m_KFused) {
                            newInst[pos++] = Instance.missingValue();
                        }
                    }

                    // Add probability and rank
                    newInst[pos++] = -Double.MAX_VALUE;
                    newInst[pos++] = Instance.missingValue();
                } else if (i == m_KeyphrasesAtt) {
                    newInst[pos++] = 1; // Keyphrase
                } else {
                    newInst[pos++] = instance.value(i);
                }
            }
            Instance inst = new Instance(instance.weight(), newInst);
            inst.setDataset(outputFormatPeek());
            vector.addElement(inst);
        }
    }

    // Sort phrases according to their distance (stable sort)
    double[] vals = new double[vector.size()];
    for (int i = 0; i < vals.length; i++) {
        vals[i] = ((Instance) vector.elementAt(i)).value(distAttIndex);
    }
    FastVector newVector = new FastVector(vector.size());
    int[] sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their tfxidf value (stable sort)
    for (int i = 0; i < vals.length; i++) {
        vals[i] = -((Instance) vector.elementAt(i)).value(tfidfAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their probability (stable sort)
    for (int i = 0; i < vals.length; i++) {
        vals[i] = 1 - ((Instance) vector.elementAt(i)).value(probsAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Compute rank of phrases. Check for subphrases that are ranked
    // lower than superphrases and assign probability -1 and set the
    // rank to Integer.MAX_VALUE
    int rank = 1;
    for (int i = 0; i < vals.length; i++) {
        Instance currentInstance = (Instance) vector.elementAt(i);

        // Short cut: if phrase very unlikely make rank very low and continue
        if (Utils.grOrEq(vals[i], 1.0)) {
            currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE);
            continue;
        }

        // Otherwise look for super phrase starting with first phrase
        // in list that has same probability, TFxIDF value, and distance as
        // current phrase. We do this to catch all superphrases
        // that have same probability, TFxIDF value and distance as current phrase.
        int startInd = i;
        while (startInd < vals.length) {
            Instance inst = (Instance) vector.elementAt(startInd);
            if ((inst.value(tfidfAttIndex) != currentInstance.value(tfidfAttIndex))
                    || (inst.value(probsAttIndex) != currentInstance.value(probsAttIndex))
                    || (inst.value(distAttIndex) != currentInstance.value(distAttIndex))) {
                break;
            }
            startInd++;
        }
        String val = currentInstance.stringValue(phraseAttIndex);
        boolean foundSuperphrase = false;
        for (int j = startInd - 1; j >= 0; j--) {
            if (j != i) {
                Instance candidate = (Instance) vector.elementAt(j);
                String potSuperphrase = candidate.stringValue(phraseAttIndex);
                if (val.length() <= potSuperphrase.length()) {
                    if (KEAFilter.contains(val, potSuperphrase)) {
                        foundSuperphrase = true;
                        break;
                    }
                }
            }
        }
        if (foundSuperphrase) {
            currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE);
        } else {
            currentInstance.setValue(probsAttIndex + 1, rank++);
        }
    }
    return vector;
}

From source file:kea.KEAPhraseFilter.java

License:Open Source License

/** 
 * Converts an instance by removing all non-alphanumeric characters
 * from its string attribute values./*w ww . j ava  2s .c o  m*/
 */
private void convertInstance(Instance instance) throws Exception {

    double[] instVals = new double[instance.numAttributes()];

    for (int i = 0; i < instance.numAttributes(); i++) {
        if (!instance.attribute(i).isString() || instance.isMissing(i)) {
            instVals[i] = instance.value(i);
        } else {
            if (!m_SelectCols.isInRange(i)) {
                int index = getOutputFormat().attribute(i).addStringValue(instance.stringValue(i));
                instVals[i] = (double) index;
                continue;
            }
            String str = instance.stringValue(i);
            StringBuffer resultStr = new StringBuffer();
            int j = 0;
            boolean phraseStart = true;
            boolean seenNewLine = false;
            boolean haveSeenHyphen = false;
            boolean haveSeenSlash = false;
            while (j < str.length()) {
                boolean isWord = false;
                boolean potNumber = false;
                int startj = j;
                while (j < str.length()) {
                    char ch = str.charAt(j);
                    if (Character.isLetterOrDigit(ch)) {
                        potNumber = true;
                        if (Character.isLetter(ch)) {
                            isWord = true;
                        }
                        j++;
                    } else if ((!m_DisallowInternalPeriods && (ch == '.')) || (ch == '@') || (ch == '_')
                            || (ch == '&') || (ch == '/') || (ch == '-')) {
                        if ((j > 0) && (j + 1 < str.length()) && Character.isLetterOrDigit(str.charAt(j - 1))
                                && Character.isLetterOrDigit(str.charAt(j + 1))) {
                            j++;
                        } else {
                            break;
                        }
                    } else if (ch == '\'') {
                        if ((j > 0) && Character.isLetterOrDigit(str.charAt(j - 1))) {
                            j++;
                        } else {
                            break;
                        }
                    } else {
                        break;
                    }
                }
                if (isWord == true) {
                    if (!phraseStart) {
                        if (haveSeenHyphen) {
                            resultStr.append('-');
                        } else if (haveSeenSlash) {
                            resultStr.append('/');
                        } else {
                            resultStr.append(' ');
                        }
                    }
                    resultStr.append(str.substring(startj, j));
                    if (j == str.length()) {
                        break;
                    }
                    phraseStart = false;
                    seenNewLine = false;
                    haveSeenHyphen = false;
                    haveSeenSlash = false;
                    if (Character.isWhitespace(str.charAt(j))) {
                        if (str.charAt(j) == '\n') {
                            seenNewLine = true;
                        }
                    } else if (str.charAt(j) == '-') {
                        haveSeenHyphen = true;
                    } else if (str.charAt(j) == '/') {
                        haveSeenSlash = true;
                    } else {
                        phraseStart = true;
                        resultStr.append('\n');
                    }
                    j++;
                } else if (j == str.length()) {
                    break;
                } else if (str.charAt(j) == '\n') {
                    if (seenNewLine) {
                        if (phraseStart == false) {
                            resultStr.append('\n');
                            phraseStart = true;
                        }
                    } else if (potNumber) {
                        if (phraseStart == false) {
                            phraseStart = true;
                            resultStr.append('\n');
                        }
                    }
                    seenNewLine = true;
                    j++;
                } else if (Character.isWhitespace(str.charAt(j))) {
                    if (potNumber) {
                        if (phraseStart == false) {
                            phraseStart = true;
                            resultStr.append('\n');
                        }
                    }
                    j++;
                } else {
                    if (phraseStart == false) {
                        resultStr.append('\n');
                        phraseStart = true;
                    }
                    j++;
                }
            }
            int index = getOutputFormat().attribute(i).addStringValue(resultStr.toString());
            instVals[i] = (double) index;
        }
    }
    Instance inst = new Instance(instance.weight(), instVals);
    inst.setDataset(getOutputFormat());
    push(inst);
}

From source file:kea.NumbersFilter.java

License:Open Source License

/** 
 * Converts an instance. A phrase boundary is inserted where
 * a number is found.//from   w  w  w  .  ja  va  2 s.  c  o m
 */
private void convertInstance(Instance instance) throws Exception {

    double[] instVals = new double[instance.numAttributes()];

    for (int i = 0; i < instance.numAttributes(); i++) {
        if ((!instance.attribute(i).isString()) || instance.isMissing(i)) {
            instVals[i] = instance.value(i);
        } else {
            String str = instance.stringValue(i);
            StringBuffer resultStr = new StringBuffer();
            StringTokenizer tok = new StringTokenizer(str, " \t\n", true);
            while (tok.hasMoreTokens()) {
                String token = tok.nextToken();

                // Everything that doesn't contain at least
                // one letter is considered to be a number
                boolean isNumber = true;
                for (int j = 0; j < token.length(); j++) {
                    if (Character.isLetter(token.charAt(j))) {
                        isNumber = false;
                        break;
                    }
                }
                if (!isNumber) {
                    resultStr.append(token);
                } else {
                    if (token.equals(" ") || token.equals("\t") || token.equals("\n")) {
                        resultStr.append(token);
                    } else {
                        resultStr.append(" \n ");
                    }
                }
            }
            int index = getOutputFormat().attribute(i).addStringValue(resultStr.toString());
            instVals[i] = (double) index;
        }
    }
    Instance inst = new Instance(instance.weight(), instVals);
    inst.setDataset(getOutputFormat());
    push(inst);
}

From source file:kmeans.MyKMeans.java

void updateCentroidForNominal(int numCentroid, int numAttr) {
    // System.out.println("Update centroid "+numCentroid+" attr "+dataSource.attribute(numAttr)+"|"+numAttr);
    int distinctValue = dataSource.attribute(numAttr).numValues();
    int[] countInst = new int[distinctValue];
    for (int i = 0; i < distinctValue; i++)
        countInst[i]++;//from ww  w .  j  av a2  s. c o  m
    Attribute attr = dataSource.attribute(numAttr);
    List<Integer> listInst = listClusteredInstance.get(numCentroid);
    //Mencari nilai attribut paling banyak dalam 1 cluster
    for (int i = 0; i < listInst.size(); i++) {
        Instance inst = dataSource.get(listInst.get(i));
        if (!inst.isMissing(attr)) {
            String attrValue = inst.toString(attr);
            int indexValue = attr.indexOfValue(attrValue);
            // System.out.println(inst+"|"+attrValue+"|"+indexValue);
            countInst[indexValue]++;
        }
    }
    int max = -1, idxMax = -1;
    for (int i = 0; i < distinctValue; i++) {
        if (countInst[i] > max) {
            idxMax = i;
            max = countInst[i];
        }
    }
    String newValue = attr.value(idxMax);
    Instance tempCentroid = centroid.get(numCentroid);
    tempCentroid.setValue(attr, newValue);
    centroid.set(numCentroid, tempCentroid);
}

From source file:lattice.Lattice.java

License:Open Source License

/**
 * Constructor of a lattice over the given variables of the dataset.
 * /*from   w w w  .  java 2s  .com*/
 * @param dataset
 */
public Lattice(Instances dataset) {

    // ~ initialise internal structure for counting (TID sets)
    this.nbInstances = dataset.numInstances();
    this.nbVariables = dataset.numAttributes();

    BitSet[][] presence = new BitSet[nbVariables][];

    TreeSet<Integer> allAttributesNumbers = new TreeSet<Integer>();
    int[] nbValuesForAttribute = new int[nbVariables];
    for (int a = 0; a < nbVariables; a++) {
        nbValuesForAttribute[a] = dataset.numDistinctValues(a) + 1; //+1 for missing
        presence[a] = new BitSet[nbValuesForAttribute[a]];
        allAttributesNumbers.add(a);
        for (int v = 0; v < presence[a].length; v++) {
            presence[a][v] = new BitSet();
        }
    }

    for (int i = 0; i < nbInstances; i++) {
        Instance row = dataset.instance(i);
        for (int a = 0; a < nbVariables; a++) {

            int indexOfValue;
            if (row.isMissing(a)) {
                //               indexOfValue = (int) dataset.meanOrMode(a);
                indexOfValue = dataset.numDistinctValues(a); //missing at the end
            } else {
                String value = row.stringValue(a);
                indexOfValue = row.attribute(a).indexOfValue(value);
            }
            presence[a][indexOfValue].set(i);

        }
    }

    // initialise the first nodes of the lattice (i.e., the ones
    // corresponding to single variables
    this.all = new LatticeNode(this, nbValuesForAttribute);
    this.singleNodes = new LatticeNode[nbVariables];
    for (int a = 0; a < nbVariables; a++) {
        int[] variablesNumbers = { a };
        LatticeNode node = new LatticeNode(this, variablesNumbers, nbValuesForAttribute, presence[a], all);
        singleNodes[a] = node;
    }

}

From source file:lattice.Lattice.java

License:Open Source License

public Lattice(Instances structure, ArffReader loader) throws IOException {
    // ~ initialise internal structure for counting (TID sets)
    this.nbInstances = 0;
    this.nbVariables = structure.numAttributes();

    BitSet[][] presence = new BitSet[nbVariables][];

    TreeSet<Integer> allAttributesNumbers = new TreeSet<Integer>();
    int[] nbValuesForAttribute = new int[nbVariables];
    for (int a = 0; a < nbVariables; a++) {
        nbValuesForAttribute[a] = structure.numDistinctValues(a) + 1;//+1 for missing
        presence[a] = new BitSet[nbValuesForAttribute[a]];
        allAttributesNumbers.add(a);/*from  ww  w  . j  av a2  s .  c  o m*/
        for (int v = 0; v < presence[a].length; v++) {
            presence[a][v] = new BitSet();
        }
    }

    Instance row;
    while ((row = loader.readInstance(structure)) != null) {
        for (int a = 0; a < nbVariables; a++) {
            int indexOfValue;
            if (row.isMissing(a)) {
                indexOfValue = structure.numDistinctValues(a);//missing at the end
            } else {
                String value = row.stringValue(a);
                indexOfValue = row.attribute(a).indexOfValue(value);
            }
            presence[a][indexOfValue].set(this.nbInstances);

        }
        this.nbInstances++;
    }

    // initialise the first nodes of the lattice (i.e., the ones
    // corresponding to single variables
    this.all = new LatticeNode(this, nbValuesForAttribute);
    this.singleNodes = new LatticeNode[nbVariables];
    for (int a = 0; a < nbVariables; a++) {
        int[] variablesNumbers = { a };
        LatticeNode node = new LatticeNode(this, variablesNumbers, nbValuesForAttribute, presence[a], all);
        singleNodes[a] = node;
    }
}

From source file:lu.lippmann.cdb.common.gui.ts.TimeSeriesChartUtil.java

License:Open Source License

private static void fillWithSingleAxis(final Instances dataSet, final int dateIdx,
        final TimeSeriesCollection tsDataset) {
    final int numInstances = dataSet.numInstances();

    final Calendar cal = Calendar.getInstance();
    for (final Integer i : WekaDataStatsUtil.getNumericAttributesIndexes(dataSet)) {
        if (dataSet.attributeStats(i).missingCount == dataSet.numInstances()) {
            System.out.println("TimeSeriesChartUtil: Only missing values for '" + dataSet.attribute(i).name()
                    + "', so skip it!");
            continue;
        }//from w  ww  . j a va  2 s  .  co m
        final TimeSeries ts = new TimeSeries(dataSet.attribute(i).name());
        for (int k = 0; k < numInstances; k++) {
            final Instance instancek = dataSet.instance(k);
            final long timeInMilliSec = (long) instancek.value(dateIdx);
            cal.setTimeInMillis(timeInMilliSec);

            if (instancek.isMissing(i)) {
                ts.addOrUpdate(new Millisecond(cal.getTime()), null);
            } else {
                ts.addOrUpdate(new Millisecond(cal.getTime()), instancek.value(i));
            }
        }
        if (!ts.isEmpty())
            tsDataset.addSeries(ts);
    }
}

From source file:lu.lippmann.cdb.common.gui.ts.TimeSeriesChartUtil.java

License:Open Source License

private static void fillWithSingleAxisInterval(final Instances dataSet, final int dateIdx,
        final YIntervalSeriesCollection tsDataset, final double deviation, final int deviatedAttrIdx) {
    final int numInstances = dataSet.numInstances();

    for (final Integer i : WekaDataStatsUtil.getNumericAttributesIndexes(dataSet)) {
        if (dataSet.attributeStats(i).missingCount == dataSet.numInstances()) {
            System.out.println("TimeSeriesChartUtil: Only missing values for '" + dataSet.attribute(i).name()
                    + "', so skip it!");
            continue;
        }/*from ww w.j  a  v a  2  s .c  o m*/
        final YIntervalSeries ts = new YIntervalSeries(dataSet.attribute(i).name());
        for (int k = 0; k < numInstances; k++) {
            final Instance instancek = dataSet.instance(k);
            final long timeInMilliSec = (long) instancek.value(dateIdx);

            if (instancek.isMissing(i)) {
                //ts.add(timeInMilliSec,null,0d,0d);               
            } else {
                if (i == deviatedAttrIdx && k > 0 && k < (numInstances - 1)) {
                    System.out.println(numInstances + " " + k + " " + instancek.value(i) + " "
                            + (instancek.value(i) - deviation) + " " + (instancek.value(i) + deviation));
                    ts.add(timeInMilliSec, instancek.value(i), instancek.value(i) - deviation,
                            instancek.value(i) + deviation);
                } else {
                    ts.add(timeInMilliSec, instancek.value(i), instancek.value(i), instancek.value(i));
                }
                //System.out.println(instancek.value(i)+" "+(instancek.value(i)-deviation)+" "+(instancek.value(i)+deviation));
            }
        }
        if (!ts.isEmpty())
            tsDataset.addSeries(ts);
    }
}

From source file:lu.lippmann.cdb.ext.hydviga.ui.GapFillingFrame.java

License:Open Source License

private Instances buildCorrectedDataset(final Instances diff) {
    //System.out.println("Build a corrected dataset ...");

    final Instances correctedDataSet = new Instances(dataSet);
    final int corrNumInstances = correctedDataSet.numInstances();

    final int diffNumInstances = diff.numInstances();
    final int diffNumAttributes = diff.numAttributes();

    final int idxInDiff = 0;

    for (int k = 0; k < diffNumInstances; k++) {
        final Instance diffInstanceK = diff.instance(k);

        if (diffInstanceK.isMissing(idxInDiff))
            continue;

        final long timestamp = (long) diffInstanceK.value(diffNumAttributes - 1);

        for (int h = 0; h < corrNumInstances; h++) {
            if ((long) correctedDataSet.instance(h).value(dateIdx) == timestamp) {
                correctedDataSet.instance(h).setValue(attr, diffInstanceK.value(idxInDiff));
                break;
            }//ww w  .j av  a  2s .  c  o  m
        }
    }

    //System.out.println("... corrected dataset built!");

    return correctedDataSet;
}

From source file:lu.lippmann.cdb.ext.hydviga.util.TransformTimeSeries.java

License:Open Source License

/**
 * Main method.//from w w  w.ja v  a  2  s.c  om
 * @param args command line arguments
 */
public static final void main(final String[] args) {
    try {
        final Instances dataSet = WekaDataAccessUtil.loadInstancesFromARFFOrCSVFile(new File("."
                + File.separatorChar + "data_fake" + File.separatorChar + "all_valid_q_series_complete2.arff"));
        System.out.println(dataSet.toSummaryString());

        final int numAttributes = dataSet.numAttributes();
        final int numInstances = dataSet.numInstances();
        for (int i = 0; i < numAttributes; i++) {
            final int i_bis = (int) (Math.random() * (double) (numAttributes - 3));
            final int i_tri = (int) (Math.random() * (double) (numAttributes - 3));

            for (int j = 0; j < numInstances; j++) {
                final Instance instance_j = dataSet.instance(j);

                if (instance_j.isMissing(i))
                    continue;
                if (instance_j.isMissing(i_bis))
                    continue;
                if (instance_j.isMissing(i_tri))
                    continue;

                final double iValue = instance_j.value(i);
                final double iBisValue = instance_j.value(i_bis);
                final double iTriValue = instance_j.value(i_tri);

                instance_j.setValue(i, (iValue + iBisValue + iTriValue));
            }
        }

        WekaDataAccessUtil.saveInstancesIntoARFFFile(dataSet, new File("." + File.separatorChar + "data_fake"
                + File.separatorChar + "all_valid_q_series_complete2_fake.arff"));
    } catch (final Exception e) {
        e.printStackTrace();
    }
}