List of usage examples for weka.core.Instance.isMissing

Tests if a specific value is "missing".

public boolean isMissing(int attIndex);
public boolean isMissing(Attribute att);
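The examples below come from real projects and appear to target the older Weka 3.x API, where Instance is a concrete class and FastVector is still in use (from 3.7 on, Instance is an interface and DenseInstance is the usual implementation). As a minimal, self-contained sketch of both overloads (attribute names and values are invented for illustration):

import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;

public class IsMissingDemo {
    public static void main(String[] args) {
        // A tiny two-attribute dataset, built only for this sketch.
        FastVector atts = new FastVector();
        atts.addElement(new Attribute("temperature"));
        atts.addElement(new Attribute("humidity"));
        Instances data = new Instances("demo", atts, 0);

        Instance inst = new Instance(data.numAttributes()); // all values start out missing
        inst.setDataset(data);
        inst.setValue(0, 21.5); // temperature gets a value; humidity stays missing

        System.out.println(inst.isMissing(0));                 // false
        System.out.println(inst.isMissing(1));                 // true
        System.out.println(inst.isMissing(data.attribute(1))); // true (Attribute overload)
    }
}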
From source file: wekimini.DataManager.java

public void setOutputValue(int index, int whichOutput, double val) {
    Instance i = allInstances.instance(index);
    if (i == null) {
        return;
    }
    // Setting a value where one was missing grows the example count for this output.
    boolean changesNumberOfInstances = i.isMissing(numMetaData + numInputs + whichOutput);
    if (isDiscrete[whichOutput]) {
        int v = (int) val;
        Attribute a = i.attribute(numMetaData + numInputs + whichOutput);
        if (a.isNominal() && v >= 0 && v <= numClasses[whichOutput]) {
            i.setValue(numMetaData + numInputs + whichOutput, v);
        } else {
            logger.log(Level.SEVERE, "Attribute value out of range"); //TODO: Check this
        }
    } else {
        //TODO insert error checking / range limiting for this version!
        i.setValue(numMetaData + numInputs + whichOutput, val);
    }
    if (changesNumberOfInstances) {
        setNumExamplesPerOutput(whichOutput, getNumExamplesPerOutput(whichOutput) + 1);
    }
}
From source file: wekimini.DataManager.java

public void setOutputMissing(int index, int outputNum) {
    //if (paramNum >= 0 && paramNum < numParams) {
    Instance i = allInstances.instance(index);
    if (!i.isMissing(numMetaData + numInputs + outputNum)) {
        i.setMissing(numMetaData + numInputs + outputNum);
        setNumExamplesPerOutput(outputNum, getNumExamplesPerOutput(outputNum) - 1);
    }
    //Need to recompute numOutputs!
    //}
}
From source file: wekimini.DataManager.java

public boolean isOutputMissing(int index, int outputNum) {
    Instance i = allInstances.instance(index);
    return (i.isMissing(numMetaData + numInputs + outputNum));
}
From source file: wekimini.DataManager.java

public double getOutputValue(int index, int whichOutput) {
    Instance i = allInstances.instance(index);
    if (i == null || i.numAttributes() < (numInputs + numMetaData + whichOutput)) {
        return Double.NaN;
    }
    if (i.isMissing(numMetaData + numInputs + whichOutput)) {
        return Double.NaN;
    }
    return i.value(numMetaData + numInputs + whichOutput);
    /*
    if (i.attribute(numMetaData + numInputs + whichOutput).isNumeric()) {
        return i.value(numMetaData + numInputs + whichOutput);
    } else {
        //What we need to do if we allow classes that don't start at 1:
        //return Double.parseDouble(i.attribute(numMetaData + numInputs + whichOutput).value((int) i.value(numMetaData + numInputs + whichOutput)));
        return i.value(numMetaData + numInputs + whichOutput) + 1;
    }
    */
}
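The four DataManager methods above share one pattern: an output lives at column numMetaData + numInputs + whichOutput, and every read or write of that column is guarded by isMissing. A condensed sketch of the guarded read, with the surrounding class and field layout as illustrative stand-ins for the real ones:

import weka.core.Instance;
import weka.core.Instances;

class OutputColumnReader {
    // Mirrors getOutputValue() above: report a missing output as NaN
    // instead of exposing Weka's internal missing-value encoding.
    static double readOutput(Instances allInstances, int row, int whichOutput,
            int numMetaData, int numInputs) {
        Instance inst = allInstances.instance(row);
        int col = numMetaData + numInputs + whichOutput;
        return inst.isMissing(col) ? Double.NaN : inst.value(col);
    }
}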
From source file: wekimini.gui.WekiArffLoader.java

private void receivedConfiguration(int[] selectedIndices, boolean overwrite, boolean ignoreWithNoOutputs) {
    //Now load the data. TODO
    //For each instance:
    //Slow, not great, but should work:
    //addImportedData(double[] inputs, double[][] outputs, boolean[] inputMask, boolean[] outputMask) {
    //w.getSupervisedLearningManager().addBundleToTraining(null, outputs, recordingMask);
    //w.getDataManager().addToTraining(inputs, outputs, recordingMask, recordingRound);
    w.getSupervisedLearningManager().incrementRecordingRound();
    if (overwrite) {
        w.getSupervisedLearningManager().deleteAllExamples();
    }
    boolean[] inputMaskForSet = createInputMaskForSet(selectedIndices);
    boolean[] outputMaskForSet = createOutputMaskForSet(selectedIndices);
    try {
        //Get enumerator for instances...
        Instance nextInstance = af.getNextInstance(structure);
        int numInputs = inputMaskForSet.length;
        int numOutputs = outputMaskForSet.length;
        while (nextInstance != null) {
            double[] inputs = new double[inputMaskForSet.length];
            double[] outputs = new double[outputMaskForSet.length];
            boolean[] inputMask = new boolean[inputMaskForSet.length];
            System.arraycopy(inputMaskForSet, 0, inputMask, 0, inputMask.length);
            boolean[] outputMask = new boolean[outputMaskForSet.length];
            System.arraycopy(outputMaskForSet, 0, outputMask, 0, outputMask.length);
            int numOutputsMissing = 0;
            for (int i = 0; i < selectedIndices.length; i++) {
                //selectedIndices[i] says which input/output corresponds to the ith attribute
                int projectIndexForCol = projectIndicesPerColumn.get(i).get(selectedIndices[i]);
                if (projectIndexForCol == 0) {
                    //do nothing: ignore it
                } else if (projectIndexForCol <= inputs.length) {
                    //it's an input
                    if (nextInstance.isMissing(i)) {
                        inputs[projectIndexForCol - 1] = 0;
                        inputMask[projectIndexForCol - 1] = false;
                    } else {
                        inputs[projectIndexForCol - 1] = nextInstance.value(i);
                    }
                } else {
                    //it's an output
                    if (nextInstance.isMissing(i)) {
                        outputs[projectIndexForCol - 1 - numInputs] = 0;
                        outputMask[projectIndexForCol - 1 - numInputs] = false;
                        numOutputsMissing++;
                    } else {
                        double val = nextInstance.value(i);
                        outputs[projectIndexForCol - 1 - numInputs] = val;
                    }
                }
            }
            if (!ignoreWithNoOutputs || numOutputsMissing < numOutputs) {
                w.getSupervisedLearningManager().addToTraining(inputs, outputs, inputMask, outputMask);
            }
            nextInstance = af.getNextInstance(structure);
        }
    } catch (IOException ex) {
        w.getStatusUpdateCenter().warn(this, "Encountered error in reading from ARFF file.");
        Logger.getLogger(WekiArffLoader.class.getName()).log(Level.SEVERE, null, ex);
        recv.completed();
    }
    //TODO: Prevent this from being available when in DTW mode.
    recv.completed();
}
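Stripped of the project-specific masking, the loop above is Weka's incremental-loading idiom: read the header with getStructure(), then pull instances one at a time and test each attribute with isMissing before trusting its value. A minimal standalone version (the file name is a placeholder):

import java.io.File;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.converters.ArffLoader;

public class MissingScan {
    public static void main(String[] args) throws Exception {
        ArffLoader loader = new ArffLoader();
        loader.setSource(new File("data.arff")); // placeholder path
        Instances structure = loader.getStructure();

        Instance inst;
        while ((inst = loader.getNextInstance(structure)) != null) {
            for (int j = 0; j < inst.numAttributes(); j++) {
                if (inst.isMissing(j)) {
                    System.out.println("missing value in attribute: " + structure.attribute(j).name());
                }
            }
        }
    }
}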
From source file: zhaop.textmining.proj.MyStringToWordVector.java
License: Open Source License

/**
 * Determines the dictionary.
 */
private void determineDictionary() {
    // initialize stopwords
    Stopwords stopwords = new Stopwords();
    if (getUseStoplist()) {
        try {
            if (getStopwords().exists() && !getStopwords().isDirectory())
                stopwords.read(getStopwords());
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    // Operate on a per-class basis if class attribute is set
    int classInd = getInputFormat().classIndex();
    int values = 1;
    if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {
        values = getInputFormat().attribute(classInd).numValues();
    }

    TreeMap[] dictionaryArr = new TreeMap[values];
    for (int i = 0; i < values; i++) {
        dictionaryArr[i] = new TreeMap();
    }

    // Make sure we know which fields to convert
    determineSelectedRange();

    // Tokenize all training text into an orderedMap of "words".
    long pruneRate = Math.round((m_PeriodicPruningRate / 100.0) * getInputFormat().numInstances());
    for (int i = 0; i < getInputFormat().numInstances(); i++) {
        Instance instance = getInputFormat().instance(i);
        int vInd = 0;
        if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {
            vInd = (int) instance.classValue();
        }

        // Iterate through all relevant string attributes of the current instance
        Hashtable h = new Hashtable();
        for (int j = 0; j < instance.numAttributes(); j++) {
            if (m_SelectedRange.isInRange(j) && (instance.isMissing(j) == false)) {
                // Get tokenizer
                m_Tokenizer.tokenize(instance.stringValue(j));

                // Iterate through tokens, perform stemming, and remove stopwords
                // (if required)
                while (m_Tokenizer.hasMoreElements()) {
                    String word = ((String) m_Tokenizer.nextElement()).intern();
                    if (this.m_lowerCaseTokens == true)
                        word = word.toLowerCase();
                    // stop first.
                    if (this.m_useStoplist == true)
                        if (stopwords.is(word)) {
                            continue;
                        }
                    // stem next
                    word = m_Stemmer.stem(word);
                    if (!(h.contains(word)))
                        h.put(word, new Integer(0));
                    Count count = (Count) dictionaryArr[vInd].get(word);
                    if (count == null) {
                        dictionaryArr[vInd].put(word, new Count(1));
                    } else {
                        count.count++;
                    }
                }
            }
        }

        // Update the docCount for the words that have occurred in this
        // instance (document).
        Enumeration e = h.keys();
        while (e.hasMoreElements()) {
            String word = (String) e.nextElement();
            Count c = (Count) dictionaryArr[vInd].get(word);
            if (c != null) {
                c.docCount++;
            } else
                System.err.println("Warning: A word should definitely be in the "
                        + "dictionary. Please check the code");
        }

        if (pruneRate > 0) {
            if (i % pruneRate == 0 && i > 0) {
                for (int z = 0; z < values; z++) {
                    Vector d = new Vector(1000);
                    Iterator it = dictionaryArr[z].keySet().iterator();
                    while (it.hasNext()) {
                        String word = (String) it.next();
                        Count count = (Count) dictionaryArr[z].get(word);
                        if (count.count <= 1) {
                            d.add(word);
                        }
                    }
                    Iterator iter = d.iterator();
                    while (iter.hasNext()) {
                        String word = (String) iter.next();
                        dictionaryArr[z].remove(word);
                    }
                }
            }
        }
    }

    // Figure out the minimum required word frequency
    int totalsize = 0;
    int prune[] = new int[values];
    for (int z = 0; z < values; z++) {
        totalsize += dictionaryArr[z].size();
        int array[] = new int[dictionaryArr[z].size()];
        int pos = 0;
        Iterator it = dictionaryArr[z].keySet().iterator();
        while (it.hasNext()) {
            String word = (String) it.next();
            Count count = (Count) dictionaryArr[z].get(word);
            array[pos] = count.count;
            pos++;
        }
        // sort the array
        sortArray(array);
        if (array.length < m_WordsToKeep) {
            // if there aren't enough words, set the threshold to minFreq
            prune[z] = m_minTermFreq;
        } else {
            // otherwise set it to be at least minFreq
            prune[z] = Math.max(m_minTermFreq, array[array.length - m_WordsToKeep]);
        }
    }

    // Convert the dictionary into an attribute index
    // and create one attribute per word
    FastVector attributes = new FastVector(totalsize + getInputFormat().numAttributes());

    // Add the non-converted attributes
    int classIndex = -1;
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
        if (!m_SelectedRange.isInRange(i)) {
            if (getInputFormat().classIndex() == i) {
                classIndex = attributes.size();
            }
            attributes.addElement(getInputFormat().attribute(i).copy());
        }
    }

    // Add the word vector attributes (eliminating duplicates
    // that occur in multiple classes)
    TreeMap newDictionary = new TreeMap();
    int index = attributes.size();
    for (int z = 0; z < values; z++) {
        Iterator it = dictionaryArr[z].keySet().iterator();
        while (it.hasNext()) {
            String word = (String) it.next();
            Count count = (Count) dictionaryArr[z].get(word);
            if (count.count >= prune[z]) {
                if (newDictionary.get(word) == null) {
                    newDictionary.put(word, new Integer(index++));
                    attributes.addElement(new Attribute(m_Prefix + word));
                }
            }
        }
    }

    // Compute document frequencies
    m_DocsCounts = new int[attributes.size()];
    Iterator it = newDictionary.keySet().iterator();
    while (it.hasNext()) {
        String word = (String) it.next();
        int idx = ((Integer) newDictionary.get(word)).intValue();
        int docsCount = 0;
        for (int j = 0; j < values; j++) {
            Count c = (Count) dictionaryArr[j].get(word);
            if (c != null)
                docsCount += c.docCount;
        }
        m_DocsCounts[idx] = docsCount;
    }

    // Trim vector and set instance variables
    attributes.trimToSize();
    m_Dictionary = newDictionary;
    m_NumInstances = getInputFormat().numInstances();

    // Set the filter's output format
    Instances outputFormat = new Instances(getInputFormat().relationName(), attributes, 0);
    outputFormat.setClassIndex(classIndex);
    setOutputFormat(outputFormat);
}
From source file: zhaop.textmining.proj.MyStringToWordVector.java
License: Open Source License

/**
 * Converts the instance w/o normalization.
 *
 * @param instance the instance to convert
 * @param v the vector the converted instance is added to
 * @return the index of the first word attribute (the number of non-converted attributes copied)
 */
private int convertInstancewoDocNorm(Instance instance, FastVector v) {
    // Convert the instance into a sorted set of indexes
    TreeMap contained = new TreeMap();

    // Copy all non-converted attributes from input to output
    int firstCopy = 0;
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
        if (!m_SelectedRange.isInRange(i)) {
            if (getInputFormat().attribute(i).type() != Attribute.STRING) {
                // Add simple nominal and numeric attributes directly
                if (instance.value(i) != 0.0) {
                    contained.put(new Integer(firstCopy), new Double(instance.value(i)));
                }
            } else {
                if (instance.isMissing(i)) {
                    contained.put(new Integer(firstCopy), new Double(Instance.missingValue()));
                } else {
                    // If this is a string attribute, we have to first add
                    // this value to the range of possible values, then add
                    // its new internal index.
                    if (outputFormatPeek().attribute(firstCopy).numValues() == 0) {
                        // Note that the first string value in a
                        // SparseInstance doesn't get printed.
                        outputFormatPeek().attribute(firstCopy)
                                .addStringValue("Hack to defeat SparseInstance bug");
                    }
                    int newIndex = outputFormatPeek().attribute(firstCopy)
                            .addStringValue(instance.stringValue(i));
                    contained.put(new Integer(firstCopy), new Double(newIndex));
                }
            }
            firstCopy++;
        }
    }

    for (int j = 0; j < instance.numAttributes(); j++) {
        if (m_SelectedRange.isInRange(j) && (instance.isMissing(j) == false)) {
            m_Tokenizer.tokenize(instance.stringValue(j));
            while (m_Tokenizer.hasMoreElements()) {
                String word = (String) m_Tokenizer.nextElement();
                if (this.m_lowerCaseTokens == true)
                    word = word.toLowerCase();
                word = m_Stemmer.stem(word);
                Integer index = (Integer) m_Dictionary.get(word);
                if (index != null) {
                    if (m_OutputCounts) {
                        // Separate if here rather than two lines down
                        // to avoid hashtable lookup
                        Double count = (Double) contained.get(index);
                        if (count != null) {
                            contained.put(index, new Double(count.doubleValue() + 1.0));
                        } else {
                            contained.put(index, new Double(1));
                        }
                    } else {
                        contained.put(index, new Double(1));
                    }
                }
            }
        }
    }

    // Doing TFTransform
    if (m_TFTransform == true) {
        Iterator it = contained.keySet().iterator();
        for (int i = 0; it.hasNext(); i++) {
            Integer index = (Integer) it.next();
            if (index.intValue() >= firstCopy) {
                double val = ((Double) contained.get(index)).doubleValue();
                val = Math.log(val + 1);
                contained.put(index, new Double(val));
            }
        }
    }

    // Doing IDFTransform
    if (m_IDFTransform == true) {
        Iterator it = contained.keySet().iterator();
        for (int i = 0; it.hasNext(); i++) {
            Integer index = (Integer) it.next();
            if (index.intValue() >= firstCopy) {
                double val = ((Double) contained.get(index)).doubleValue();
                val = val * Math.log(m_NumInstances / (double) m_DocsCounts[index.intValue()]);
                contained.put(index, new Double(val));
            }
        }
    }

    // Convert the set to structures needed to create a sparse instance.
    double[] values = new double[contained.size()];
    int[] indices = new int[contained.size()];
    Iterator it = contained.keySet().iterator();
    for (int i = 0; it.hasNext(); i++) {
        Integer index = (Integer) it.next();
        Double value = (Double) contained.get(index);
        values[i] = value.doubleValue();
        indices[i] = index.intValue();
    }
    Instance inst = new SparseInstance(instance.weight(), values, indices, outputFormatPeek().numAttributes());
    inst.setDataset(outputFormatPeek());
    v.addElement(inst);
    return firstCopy;
}
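One detail worth noting in the conversion above: missing values have to be stored in the sparse representation explicitly (as Instance.missingValue()), because a SparseInstance treats an absent index as 0.0, not as missing. A small illustration of that distinction, again using the older Weka 3.x API:

import weka.core.Instance;
import weka.core.SparseInstance;

public class SparseMissingDemo {
    public static void main(String[] args) {
        // Five attributes; index 0 holds 1.0, index 2 holds an explicit missing value.
        double[] values = { 1.0, Instance.missingValue() };
        int[] indices = { 0, 2 };
        Instance inst = new SparseInstance(1.0, values, indices, 5);

        System.out.println(inst.isMissing(2)); // true: explicitly stored as missing
        System.out.println(inst.isMissing(1)); // false: absent index reads as 0.0
    }
}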