Usage examples for `weka.core.Instance.isMissing`, which reports whether this instance's value for the given attribute is missing:
public boolean isMissing(Attribute att);
From source file:kea.KEAFilter.java
License:Open Source License
/**
 * Converts a single document instance into one output instance per candidate
 * phrase found in the document.
 *
 * <p>Each candidate is scored by the trained classifier, the resulting
 * instances are stably sorted by first-occurrence distance, then TFxIDF
 * (descending), then probability (descending), and finally a rank is assigned.
 * Candidates that have a superphrase with identical probability, TFxIDF and
 * distance are demoted to rank {@code Integer.MAX_VALUE}.
 *
 * @param instance the document instance to convert
 * @param training whether this conversion happens during training
 * @return a FastVector of converted instances, sorted as described above
 * @throws Exception if the classifier fails to score a candidate
 */
private FastVector convertInstance(Instance instance, boolean training) throws Exception {
    FastVector vector = new FastVector();
    if (m_Debug) {
        System.err.println("-- Converting instance");
    }
    // Get the manually assigned key phrases for the document, if present
    // (two variants: one for feature computation, one for evaluation).
    HashMap hashKeyphrases = null;
    HashMap hashKeysEval = null;
    if (!instance.isMissing(m_KeyphrasesAtt)) {
        String keyphrases = instance.stringValue(m_KeyphrasesAtt);
        hashKeyphrases = getGivenKeyphrases(keyphrases, false);
        hashKeysEval = getGivenKeyphrases(keyphrases, true);
    }
    // Extract all candidate phrases from the document text.
    HashMap hash = new HashMap();
    int length = getPhrases(hash, instance.stringValue(m_DocumentAtt));
    // Number of attributes added to the output format (one more when
    // keyphrase-frequency is used in debug mode).
    int numFeatures = 5;
    if (m_Debug) {
        if (m_KFused) {
            numFeatures = numFeatures + 1;
        }
    }
    // Indices of the key output attributes, relative to the document attribute.
    // NOTE(review): probsAttIndex is recomputed below while filling newInst;
    // this initial value only matters if the first loop body never runs.
    int phraseAttIndex = m_DocumentAtt;
    int tfidfAttIndex = m_DocumentAtt + 2;
    int distAttIndex = m_DocumentAtt + 3;
    int probsAttIndex = m_DocumentAtt + numFeatures - 1;
    // Turn each candidate phrase into an output instance.
    Iterator it = hash.keySet().iterator();
    while (it.hasNext()) {
        String phrase = (String) it.next();
        FastVector phraseInfo = (FastVector) hash.get(phrase);
        double[] vals = featVals(phrase, phraseInfo, training, hashKeysEval, hashKeyphrases, length);
        Instance inst = new Instance(instance.weight(), vals);
        inst.setDataset(m_ClassifierData);
        // Probability of the phrase being a keyphrase (class index 1).
        double[] probs = m_Classifier.distributionForInstance(inst);
        double prob = probs[1];
        // Compute attribute values for the final instance.
        double[] newInst = new double[instance.numAttributes() + numFeatures];
        int pos = 0;
        for (int i = 0; i < instance.numAttributes(); i++) {
            if (i == m_DocumentAtt) {
                // Add phrase (stemmed form).
                int index = outputFormatPeek().attribute(pos).addStringValue(phrase);
                newInst[pos++] = index;
                // Add original (unstemmed) version of the phrase.
                index = outputFormatPeek().attribute(pos).addStringValue((String) phraseInfo.elementAt(2));
                newInst[pos++] = index;
                // Add TFxIDF.
                newInst[pos++] = inst.value(m_TfidfIndex);
                // Add distance of first occurrence.
                newInst[pos++] = inst.value(m_FirstOccurIndex);
                // Add keyphrase-frequency feature when enabled.
                if (m_Debug) {
                    if (m_KFused) {
                        newInst[pos++] = inst.value(m_KeyFreqIndex);
                    }
                }
                // Add probability; remember its actual position for the sorts below.
                probsAttIndex = pos;
                newInst[pos++] = prob;
                // Set rank to missing (computed below).
                newInst[pos++] = Instance.missingValue();
            } else if (i == m_KeyphrasesAtt) {
                newInst[pos++] = inst.classValue();
            } else {
                newInst[pos++] = instance.value(i);
            }
        }
        Instance ins = new Instance(instance.weight(), newInst);
        ins.setDataset(outputFormatPeek());
        vector.addElement(ins);
    }
    // Add dummy instances for assigned keyphrases that don't occur
    // in the document (needed so evaluation can see them).
    if (hashKeysEval != null) {
        Iterator phrases = hashKeysEval.keySet().iterator();
        while (phrases.hasNext()) {
            String phrase = (String) phrases.next();
            double[] newInst = new double[instance.numAttributes() + numFeatures];
            int pos = 0;
            for (int i = 0; i < instance.numAttributes(); i++) {
                if (i == m_DocumentAtt) {
                    // Add phrase.
                    int index = outputFormatPeek().attribute(pos).addStringValue(phrase);
                    newInst[pos++] = (double) index;
                    // Add original version.
                    index = outputFormatPeek().attribute(pos).addStringValue((String) hashKeysEval.get(phrase));
                    newInst[pos++] = (double) index;
                    // TFxIDF and distance are unknown for phrases not in the document.
                    newInst[pos++] = Instance.missingValue();
                    newInst[pos++] = Instance.missingValue();
                    if (m_Debug) {
                        if (m_KFused) {
                            newInst[pos++] = Instance.missingValue();
                        }
                    }
                    // Lowest possible probability so these sort last; rank missing.
                    newInst[pos++] = -Double.MAX_VALUE;
                    newInst[pos++] = Instance.missingValue();
                } else if (i == m_KeyphrasesAtt) {
                    newInst[pos++] = 1; // Keyphrase
                } else {
                    newInst[pos++] = instance.value(i);
                }
            }
            Instance inst = new Instance(instance.weight(), newInst);
            inst.setDataset(outputFormatPeek());
            vector.addElement(inst);
        }
    }
    // Sort phrases according to their distance (stable sort).
    double[] vals = new double[vector.size()];
    for (int i = 0; i < vals.length; i++) {
        vals[i] = ((Instance) vector.elementAt(i)).value(distAttIndex);
    }
    FastVector newVector = new FastVector(vector.size());
    int[] sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;
    // Sort phrases according to their TFxIDF value, descending (stable sort).
    for (int i = 0; i < vals.length; i++) {
        vals[i] = -((Instance) vector.elementAt(i)).value(tfidfAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;
    // Sort phrases according to their probability, descending (stable sort);
    // vals[i] == 1 - prob, so vals[i] >= 1.0 means prob <= 0.
    for (int i = 0; i < vals.length; i++) {
        vals[i] = 1 - ((Instance) vector.elementAt(i)).value(probsAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;
    // Compute rank of phrases. Check for subphrases that are ranked
    // lower than superphrases and set their rank to Integer.MAX_VALUE.
    int rank = 1;
    for (int i = 0; i < vals.length; i++) {
        Instance currentInstance = (Instance) vector.elementAt(i);
        // Short cut: if phrase very unlikely make rank very low and continue.
        if (Utils.grOrEq(vals[i], 1.0)) {
            currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE);
            continue;
        }
        // Otherwise look for a superphrase among the run of phrases that share
        // the same probability, TFxIDF value, and distance as the current one.
        int startInd = i;
        while (startInd < vals.length) {
            Instance inst = (Instance) vector.elementAt(startInd);
            if ((inst.value(tfidfAttIndex) != currentInstance.value(tfidfAttIndex))
                    || (inst.value(probsAttIndex) != currentInstance.value(probsAttIndex))
                    || (inst.value(distAttIndex) != currentInstance.value(distAttIndex))) {
                break;
            }
            startInd++;
        }
        String val = currentInstance.stringValue(phraseAttIndex);
        boolean foundSuperphrase = false;
        for (int j = startInd - 1; j >= 0; j--) {
            if (j != i) {
                Instance candidate = (Instance) vector.elementAt(j);
                String potSuperphrase = candidate.stringValue(phraseAttIndex);
                if (val.length() <= potSuperphrase.length()) {
                    if (KEAFilter.contains(val, potSuperphrase)) {
                        foundSuperphrase = true;
                        break;
                    }
                }
            }
        }
        if (foundSuperphrase) {
            currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE);
        } else {
            currentInstance.setValue(probsAttIndex + 1, rank++);
        }
    }
    return vector;
}
From source file:kea.KEAPhraseFilter.java
License:Open Source License
/**
 * Converts an instance by cleaning all of its (in-range) string attribute
 * values: words are kept, joined by the separator that connected them in the
 * source text (' ', '-' or '/'), and phrase boundaries are emitted as '\n'
 * wherever non-word material (punctuation, numbers, blank lines) is found.
 *
 * @param instance the instance to convert; the cleaned copy is pushed to the
 *                 output queue
 * @throws Exception if adding a string value to the output format fails
 */
private void convertInstance(Instance instance) throws Exception {
    double[] instVals = new double[instance.numAttributes()];
    for (int i = 0; i < instance.numAttributes(); i++) {
        if (!instance.attribute(i).isString() || instance.isMissing(i)) {
            // Non-string and missing attributes pass through unchanged.
            instVals[i] = instance.value(i);
        } else {
            if (!m_SelectCols.isInRange(i)) {
                // String attribute not selected for filtering: copy verbatim.
                int index = getOutputFormat().attribute(i).addStringValue(instance.stringValue(i));
                instVals[i] = (double) index;
                continue;
            }
            String str = instance.stringValue(i);
            StringBuffer resultStr = new StringBuffer();
            int j = 0;
            boolean phraseStart = true;      // true while no word has been emitted for the current phrase
            boolean seenNewLine = false;     // last separator seen was a newline
            boolean haveSeenHyphen = false;  // last separator seen was '-'
            boolean haveSeenSlash = false;   // last separator seen was '/'
            while (j < str.length()) {
                boolean isWord = false;      // token contains at least one letter
                boolean potNumber = false;   // token contains at least one letter or digit
                int startj = j;
                // Scan one token: letters/digits, plus internal '.', '@', '_',
                // '&', '/', '-' (only when surrounded by alphanumerics) and
                // apostrophes preceded by an alphanumeric.
                while (j < str.length()) {
                    char ch = str.charAt(j);
                    if (Character.isLetterOrDigit(ch)) {
                        potNumber = true;
                        if (Character.isLetter(ch)) {
                            isWord = true;
                        }
                        j++;
                    } else if ((!m_DisallowInternalPeriods && (ch == '.')) || (ch == '@') || (ch == '_')
                            || (ch == '&') || (ch == '/') || (ch == '-')) {
                        if ((j > 0) && (j + 1 < str.length()) && Character.isLetterOrDigit(str.charAt(j - 1))
                                && Character.isLetterOrDigit(str.charAt(j + 1))) {
                            j++;
                        } else {
                            break;
                        }
                    } else if (ch == '\'') {
                        if ((j > 0) && Character.isLetterOrDigit(str.charAt(j - 1))) {
                            j++;
                        } else {
                            break;
                        }
                    } else {
                        break;
                    }
                }
                if (isWord == true) {
                    // Emit the word, preceded by the separator that linked it
                    // to the previous word of the same phrase.
                    if (!phraseStart) {
                        if (haveSeenHyphen) {
                            resultStr.append('-');
                        } else if (haveSeenSlash) {
                            resultStr.append('/');
                        } else {
                            resultStr.append(' ');
                        }
                    }
                    resultStr.append(str.substring(startj, j));
                    if (j == str.length()) {
                        break;
                    }
                    // Classify the character that terminated the token.
                    phraseStart = false;
                    seenNewLine = false;
                    haveSeenHyphen = false;
                    haveSeenSlash = false;
                    if (Character.isWhitespace(str.charAt(j))) {
                        if (str.charAt(j) == '\n') {
                            seenNewLine = true;
                        }
                    } else if (str.charAt(j) == '-') {
                        haveSeenHyphen = true;
                    } else if (str.charAt(j) == '/') {
                        haveSeenSlash = true;
                    } else {
                        // Any other character ends the current phrase.
                        phraseStart = true;
                        resultStr.append('\n');
                    }
                    j++;
                } else if (j == str.length()) {
                    break;
                } else if (str.charAt(j) == '\n') {
                    // A second consecutive newline, or a newline after a
                    // number-like token, ends the current phrase.
                    if (seenNewLine) {
                        if (phraseStart == false) {
                            resultStr.append('\n');
                            phraseStart = true;
                        }
                    } else if (potNumber) {
                        if (phraseStart == false) {
                            phraseStart = true;
                            resultStr.append('\n');
                        }
                    }
                    seenNewLine = true;
                    j++;
                } else if (Character.isWhitespace(str.charAt(j))) {
                    // Whitespace after a number-like token ends the phrase.
                    if (potNumber) {
                        if (phraseStart == false) {
                            phraseStart = true;
                            resultStr.append('\n');
                        }
                    }
                    j++;
                } else {
                    // Any other non-token character ends the phrase.
                    if (phraseStart == false) {
                        resultStr.append('\n');
                        phraseStart = true;
                    }
                    j++;
                }
            }
            int index = getOutputFormat().attribute(i).addStringValue(resultStr.toString());
            instVals[i] = (double) index;
        }
    }
    Instance inst = new Instance(instance.weight(), instVals);
    inst.setDataset(getOutputFormat());
    push(inst);
}
From source file:kea.NumbersFilter.java
License:Open Source License
/** * Converts an instance. A phrase boundary is inserted where * a number is found.//from w w w . ja va 2 s. c o m */ private void convertInstance(Instance instance) throws Exception { double[] instVals = new double[instance.numAttributes()]; for (int i = 0; i < instance.numAttributes(); i++) { if ((!instance.attribute(i).isString()) || instance.isMissing(i)) { instVals[i] = instance.value(i); } else { String str = instance.stringValue(i); StringBuffer resultStr = new StringBuffer(); StringTokenizer tok = new StringTokenizer(str, " \t\n", true); while (tok.hasMoreTokens()) { String token = tok.nextToken(); // Everything that doesn't contain at least // one letter is considered to be a number boolean isNumber = true; for (int j = 0; j < token.length(); j++) { if (Character.isLetter(token.charAt(j))) { isNumber = false; break; } } if (!isNumber) { resultStr.append(token); } else { if (token.equals(" ") || token.equals("\t") || token.equals("\n")) { resultStr.append(token); } else { resultStr.append(" \n "); } } } int index = getOutputFormat().attribute(i).addStringValue(resultStr.toString()); instVals[i] = (double) index; } } Instance inst = new Instance(instance.weight(), instVals); inst.setDataset(getOutputFormat()); push(inst); }
From source file:kmeans.MyKMeans.java
void updateCentroidForNominal(int numCentroid, int numAttr) { // System.out.println("Update centroid "+numCentroid+" attr "+dataSource.attribute(numAttr)+"|"+numAttr); int distinctValue = dataSource.attribute(numAttr).numValues(); int[] countInst = new int[distinctValue]; for (int i = 0; i < distinctValue; i++) countInst[i]++;//from ww w . j av a2 s. c o m Attribute attr = dataSource.attribute(numAttr); List<Integer> listInst = listClusteredInstance.get(numCentroid); //Mencari nilai attribut paling banyak dalam 1 cluster for (int i = 0; i < listInst.size(); i++) { Instance inst = dataSource.get(listInst.get(i)); if (!inst.isMissing(attr)) { String attrValue = inst.toString(attr); int indexValue = attr.indexOfValue(attrValue); // System.out.println(inst+"|"+attrValue+"|"+indexValue); countInst[indexValue]++; } } int max = -1, idxMax = -1; for (int i = 0; i < distinctValue; i++) { if (countInst[i] > max) { idxMax = i; max = countInst[i]; } } String newValue = attr.value(idxMax); Instance tempCentroid = centroid.get(numCentroid); tempCentroid.setValue(attr, newValue); centroid.set(numCentroid, tempCentroid); }
From source file:lattice.Lattice.java
License:Open Source License
/**
 * Constructor of a lattice over the given variables of the dataset.
 *
 * <p>Builds, for every attribute, one BitSet per attribute value (plus one
 * extra slot for missing values) marking which rows take that value, then
 * creates the root node and one lattice node per single variable.
 *
 * @param dataset the (fully loaded) instances to build the lattice from
 */
public Lattice(Instances dataset) {
    // Initialise internal structure for counting (TID sets).
    this.nbInstances = dataset.numInstances();
    this.nbVariables = dataset.numAttributes();
    BitSet[][] presence = new BitSet[nbVariables][];
    TreeSet<Integer> allAttributesNumbers = new TreeSet<Integer>();
    int[] nbValuesForAttribute = new int[nbVariables];
    for (int a = 0; a < nbVariables; a++) {
        nbValuesForAttribute[a] = dataset.numDistinctValues(a) + 1; // +1 for missing
        presence[a] = new BitSet[nbValuesForAttribute[a]];
        allAttributesNumbers.add(a);
        for (int v = 0; v < presence[a].length; v++) {
            presence[a][v] = new BitSet();
        }
    }
    // Mark, for each row and attribute, which value index the row takes.
    for (int i = 0; i < nbInstances; i++) {
        Instance row = dataset.instance(i);
        for (int a = 0; a < nbVariables; a++) {
            int indexOfValue;
            if (row.isMissing(a)) {
                indexOfValue = dataset.numDistinctValues(a); // missing bucket at the end
            } else {
                String value = row.stringValue(a);
                indexOfValue = row.attribute(a).indexOfValue(value);
            }
            presence[a][indexOfValue].set(i);
        }
    }
    // Initialise the first nodes of the lattice, i.e. the ones
    // corresponding to single variables.
    this.all = new LatticeNode(this, nbValuesForAttribute);
    this.singleNodes = new LatticeNode[nbVariables];
    for (int a = 0; a < nbVariables; a++) {
        int[] variablesNumbers = { a };
        LatticeNode node = new LatticeNode(this, variablesNumbers, nbValuesForAttribute, presence[a], all);
        singleNodes[a] = node;
    }
}
From source file:lattice.Lattice.java
License:Open Source License
/**
 * Constructor of a lattice that streams instances from an ARFF reader instead
 * of requiring a fully loaded dataset.
 *
 * <p>Same TID-set construction as the dataset-based constructor, but rows are
 * read one at a time from {@code loader} and counted as they arrive, so only
 * the dataset structure (header) is needed up front.
 *
 * @param structure the dataset header describing the attributes
 * @param loader    the reader supplying instances one by one
 * @throws IOException if reading an instance fails
 */
public Lattice(Instances structure, ArffReader loader) throws IOException {
    // Initialise internal structure for counting (TID sets).
    this.nbInstances = 0;
    this.nbVariables = structure.numAttributes();
    BitSet[][] presence = new BitSet[nbVariables][];
    TreeSet<Integer> allAttributesNumbers = new TreeSet<Integer>();
    int[] nbValuesForAttribute = new int[nbVariables];
    for (int a = 0; a < nbVariables; a++) {
        nbValuesForAttribute[a] = structure.numDistinctValues(a) + 1; // +1 for missing
        presence[a] = new BitSet[nbValuesForAttribute[a]];
        allAttributesNumbers.add(a);
        for (int v = 0; v < presence[a].length; v++) {
            presence[a][v] = new BitSet();
        }
    }
    // Stream rows, marking which value index each row takes per attribute.
    Instance row;
    while ((row = loader.readInstance(structure)) != null) {
        for (int a = 0; a < nbVariables; a++) {
            int indexOfValue;
            if (row.isMissing(a)) {
                indexOfValue = structure.numDistinctValues(a); // missing bucket at the end
            } else {
                String value = row.stringValue(a);
                indexOfValue = row.attribute(a).indexOfValue(value);
            }
            presence[a][indexOfValue].set(this.nbInstances);
        }
        this.nbInstances++;
    }
    // Initialise the first nodes of the lattice, i.e. the ones
    // corresponding to single variables.
    this.all = new LatticeNode(this, nbValuesForAttribute);
    this.singleNodes = new LatticeNode[nbVariables];
    for (int a = 0; a < nbVariables; a++) {
        int[] variablesNumbers = { a };
        LatticeNode node = new LatticeNode(this, variablesNumbers, nbValuesForAttribute, presence[a], all);
        singleNodes[a] = node;
    }
}
From source file:lu.lippmann.cdb.common.gui.ts.TimeSeriesChartUtil.java
License:Open Source License
private static void fillWithSingleAxis(final Instances dataSet, final int dateIdx, final TimeSeriesCollection tsDataset) { final int numInstances = dataSet.numInstances(); final Calendar cal = Calendar.getInstance(); for (final Integer i : WekaDataStatsUtil.getNumericAttributesIndexes(dataSet)) { if (dataSet.attributeStats(i).missingCount == dataSet.numInstances()) { System.out.println("TimeSeriesChartUtil: Only missing values for '" + dataSet.attribute(i).name() + "', so skip it!"); continue; }//from w ww . j a va 2 s . co m final TimeSeries ts = new TimeSeries(dataSet.attribute(i).name()); for (int k = 0; k < numInstances; k++) { final Instance instancek = dataSet.instance(k); final long timeInMilliSec = (long) instancek.value(dateIdx); cal.setTimeInMillis(timeInMilliSec); if (instancek.isMissing(i)) { ts.addOrUpdate(new Millisecond(cal.getTime()), null); } else { ts.addOrUpdate(new Millisecond(cal.getTime()), instancek.value(i)); } } if (!ts.isEmpty()) tsDataset.addSeries(ts); } }
From source file:lu.lippmann.cdb.common.gui.ts.TimeSeriesChartUtil.java
License:Open Source License
private static void fillWithSingleAxisInterval(final Instances dataSet, final int dateIdx, final YIntervalSeriesCollection tsDataset, final double deviation, final int deviatedAttrIdx) { final int numInstances = dataSet.numInstances(); for (final Integer i : WekaDataStatsUtil.getNumericAttributesIndexes(dataSet)) { if (dataSet.attributeStats(i).missingCount == dataSet.numInstances()) { System.out.println("TimeSeriesChartUtil: Only missing values for '" + dataSet.attribute(i).name() + "', so skip it!"); continue; }/*from ww w.j a v a 2 s .c o m*/ final YIntervalSeries ts = new YIntervalSeries(dataSet.attribute(i).name()); for (int k = 0; k < numInstances; k++) { final Instance instancek = dataSet.instance(k); final long timeInMilliSec = (long) instancek.value(dateIdx); if (instancek.isMissing(i)) { //ts.add(timeInMilliSec,null,0d,0d); } else { if (i == deviatedAttrIdx && k > 0 && k < (numInstances - 1)) { System.out.println(numInstances + " " + k + " " + instancek.value(i) + " " + (instancek.value(i) - deviation) + " " + (instancek.value(i) + deviation)); ts.add(timeInMilliSec, instancek.value(i), instancek.value(i) - deviation, instancek.value(i) + deviation); } else { ts.add(timeInMilliSec, instancek.value(i), instancek.value(i), instancek.value(i)); } //System.out.println(instancek.value(i)+" "+(instancek.value(i)-deviation)+" "+(instancek.value(i)+deviation)); } } if (!ts.isEmpty()) tsDataset.addSeries(ts); } }
From source file:lu.lippmann.cdb.ext.hydviga.ui.GapFillingFrame.java
License:Open Source License
private Instances buildCorrectedDataset(final Instances diff) { //System.out.println("Build a corrected dataset ..."); final Instances correctedDataSet = new Instances(dataSet); final int corrNumInstances = correctedDataSet.numInstances(); final int diffNumInstances = diff.numInstances(); final int diffNumAttributes = diff.numAttributes(); final int idxInDiff = 0; for (int k = 0; k < diffNumInstances; k++) { final Instance diffInstanceK = diff.instance(k); if (diffInstanceK.isMissing(idxInDiff)) continue; final long timestamp = (long) diffInstanceK.value(diffNumAttributes - 1); for (int h = 0; h < corrNumInstances; h++) { if ((long) correctedDataSet.instance(h).value(dateIdx) == timestamp) { correctedDataSet.instance(h).setValue(attr, diffInstanceK.value(idxInDiff)); break; }//ww w .j av a 2s . c o m } } //System.out.println("... corrected dataset built!"); return correctedDataSet; }
From source file:lu.lippmann.cdb.ext.hydviga.util.TransformTimeSeries.java
License:Open Source License
/**
 * Main method. Loads a time-series ARFF file, perturbs every attribute by
 * adding to it the values of two randomly chosen other attributes (rows where
 * any of the three values is missing are left untouched), and saves the
 * result as a "fake" ARFF file.
 *
 * @param args command line arguments (unused)
 */
public static final void main(final String[] args) {
    try {
        final Instances dataSet = WekaDataAccessUtil.loadInstancesFromARFFOrCSVFile(new File("."
                + File.separatorChar + "data_fake" + File.separatorChar + "all_valid_q_series_complete2.arff"));
        System.out.println(dataSet.toSummaryString());
        final int attrCount = dataSet.numAttributes();
        final int rowCount = dataSet.numInstances();
        for (int a = 0; a < attrCount; a++) {
            // Two random partner attributes drawn from the first attrCount-3
            // columns (keeps the trailing columns out of the mix).
            final int partnerOne = (int) (Math.random() * (double) (attrCount - 3));
            final int partnerTwo = (int) (Math.random() * (double) (attrCount - 3));
            for (int r = 0; r < rowCount; r++) {
                final Instance row = dataSet.instance(r);
                if (row.isMissing(a) || row.isMissing(partnerOne) || row.isMissing(partnerTwo)) {
                    continue;
                }
                row.setValue(a, row.value(a) + row.value(partnerOne) + row.value(partnerTwo));
            }
        }
        WekaDataAccessUtil.saveInstancesIntoARFFFile(dataSet, new File("." + File.separatorChar
                + "data_fake" + File.separatorChar + "all_valid_q_series_complete2_fake.arff"));
    } catch (final Exception e) {
        e.printStackTrace();
    }
}