List of usage examples for weka.core.Instance.classValue()
public double classValue();
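classValue() returns an instance's class value as a double: for a nominal class attribute it is the index of the class label, for a numeric class attribute it is the target value itself, and it is Double.NaN when the class value is missing. The sketch below is not taken from any of the source files listed here; "data.arff" is a placeholder file name and the class is assumed to be the last, nominal attribute.

import weka.core.Instance;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class ClassValueDemo {
  public static void main(String[] args) throws Exception {
    Instances data = DataSource.read("data.arff");  // placeholder path
    data.setClassIndex(data.numAttributes() - 1);   // classValue() needs a class index

    for (int i = 0; i < data.numInstances(); i++) {
      Instance inst = data.instance(i);
      if (inst.classIsMissing()) {
        continue; // classValue() would return Double.NaN here
      }
      // For a nominal class, classValue() is the label index as a double.
      String label = data.classAttribute().value((int) inst.classValue());
      System.out.println(i + ": " + label);
    }
  }
}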
From source file: cn.edu.xjtu.dbmine.source.NaiveBayes.java
License: Open Source License
/**
 * Updates the classifier with the given instance.
 *
 * @param instance the new training instance to include in the model
 * @exception Exception if the instance could not be incorporated in the model.
 */
public void updateClassifier(Instance instance) throws Exception {
  if (!instance.classIsMissing()) {
    Enumeration enumAtts = m_Instances.enumerateAttributes();
    int attIndex = 0;
    while (enumAtts.hasMoreElements()) {
      Attribute attribute = (Attribute) enumAtts.nextElement();
      if (!instance.isMissing(attribute)) {
        m_Distributions[attIndex][(int) instance.classValue()].addValue(instance.value(attribute),
            instance.weight());
      }
      attIndex++;
    }
    m_ClassDistribution.addValue(instance.classValue(), instance.weight());
  }
}
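In the example above, (int) instance.classValue() turns the nominal class value into an array index. A minimal sketch of the same pattern, assuming data is an Instances object with a nominal class attribute and its class index already set:

// Count how many instances fall into each class label.
int[] classCounts = new int[data.numClasses()];
for (int i = 0; i < data.numInstances(); i++) {
  Instance inst = data.instance(i);
  if (!inst.classIsMissing()) {
    classCounts[(int) inst.classValue()]++; // label index used as array index
  }
}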
From source file: cn.edu.xjtu.dbmine.StringToWordVector.java
License: Open Source License
/** * determines the dictionary./*ww w . j a v a 2 s .c o m*/ */ private void determineDictionary() { // initialize stopwords Stopwords stopwords = new Stopwords(); if (getUseStoplist()) { try { if (getStopwords().exists() && !getStopwords().isDirectory()) stopwords.read(getStopwords()); } catch (Exception e) { e.printStackTrace(); } } // Operate on a per-class basis if class attribute is set int classInd = getInputFormat().classIndex(); int values = 1; if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) { values = getInputFormat().attribute(classInd).numValues(); // System.out.println("number of class:"+getInputFormat().numClasses()+" "+getInputFormat().attribute(classInd).value(0)); } // TreeMap dictionaryArr [] = new TreeMap[values]; TreeMap[] dictionaryArr = new TreeMap[values]; for (int i = 0; i < values; i++) { dictionaryArr[i] = new TreeMap(); } // Make sure we know which fields to convert determineSelectedRange(); // Tokenize all training text into an orderedMap of "words". long pruneRate = Math.round((m_PeriodicPruningRate / 100.0) * getInputFormat().numInstances()); for (int i = 0; i < getInputFormat().numInstances(); i++) { Instance instance = getInputFormat().instance(i); int vInd = 0; if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) { vInd = (int) instance.classValue(); } // Iterate through all relevant string attributes of the current // instance Hashtable h = new Hashtable(); for (int j = 0; j < instance.numAttributes(); j++) { if (m_SelectedRange.isInRange(j) && (instance.isMissing(j) == false)) { // Get tokenizer m_Tokenizer.tokenize(instance.stringValue(j)); // Iterate through tokens, perform stemming, and remove // stopwords // (if required) while (m_Tokenizer.hasMoreElements()) { String word = ((String) m_Tokenizer.nextElement()).intern(); if (this.m_lowerCaseTokens == true) word = word.toLowerCase(); word = m_Stemmer.stem(word); if (this.m_useStoplist == true) if (stopwords.is(word)) continue; if (!(h.contains(word))) h.put(word, new Integer(0)); Count count = (Count) dictionaryArr[vInd].get(word); if (count == null) { dictionaryArr[vInd].put(word, new Count(1)); } else { count.count++; } } } } // updating the docCount for the words that have occurred in this // instance(document). 
Enumeration e = h.keys(); while (e.hasMoreElements()) { String word = (String) e.nextElement(); Count c = (Count) dictionaryArr[vInd].get(word); if (c != null) { c.docCount++; // c.doclist.add(vInd); } else System.err.println( "Warning: A word should definitely be in the " + "dictionary.Please check the code"); } if (pruneRate > 0) { if (i % pruneRate == 0 && i > 0) { for (int z = 0; z < values; z++) { Vector d = new Vector(1000); Iterator it = dictionaryArr[z].keySet().iterator(); while (it.hasNext()) { String word = (String) it.next(); Count count = (Count) dictionaryArr[z].get(word); if (count.count <= 1) { d.add(word); } } Iterator iter = d.iterator(); while (iter.hasNext()) { String word = (String) iter.next(); dictionaryArr[z].remove(word); } } } } } // Figure out the minimum required word frequency int totalsize = 0; int prune[] = new int[values]; for (int z = 0; z < values; z++) { totalsize += dictionaryArr[z].size(); int array[] = new int[dictionaryArr[z].size()]; int pos = 0; Iterator it = dictionaryArr[z].keySet().iterator(); while (it.hasNext()) { String word = (String) it.next(); Count count = (Count) dictionaryArr[z].get(word); array[pos] = count.count; pos++; } // sort the array sortArray(array); if (array.length < m_WordsToKeep) { // if there aren't enough words, set the threshold to // minFreq prune[z] = m_minTermFreq; } else { // otherwise set it to be at least minFreq prune[z] = Math.max(m_minTermFreq, array[array.length - m_WordsToKeep]); } } // Convert the dictionary into an attribute index // and create one attribute per word FastVector attributes = new FastVector(totalsize + getInputFormat().numAttributes()); // Add the non-converted attributes int classIndex = -1; for (int i = 0; i < getInputFormat().numAttributes(); i++) { if (!m_SelectedRange.isInRange(i)) { if (getInputFormat().classIndex() == i) { classIndex = attributes.size(); } attributes.addElement(getInputFormat().attribute(i).copy()); } } // Add the word vector attributes (eliminating duplicates // that occur in multiple classes) TreeMap newDictionary = new TreeMap(); int index = attributes.size(); for (int z = 0; z < values; z++) { Iterator it = dictionaryArr[z].keySet().iterator(); while (it.hasNext()) { String word = (String) it.next(); Count count = (Count) dictionaryArr[z].get(word); if (count.count >= prune[z]) { if (newDictionary.get(word) == null) { newDictionary.put(word, new Integer(index++)); attributes.addElement(new Attribute(m_Prefix + word)); } } } } // Compute document frequencies m_DocsCounts = new int[attributes.size()]; Iterator it = newDictionary.keySet().iterator(); while (it.hasNext()) { String word = (String) it.next(); int idx = ((Integer) newDictionary.get(word)).intValue(); int docsCount = 0; for (int j = 0; j < values; j++) { Count c = (Count) dictionaryArr[j].get(word); if (c != null) docsCount += c.docCount; /* * if(!ctd.containsKey(j)){ Map<Integer,Integer> ma = new * HashMap<Integer,Integer>(); ctd.put(j, ma); } */ // if(ctd.get(j)==null) // ctd.get(j).put(idx, c); // int tt = ctd.get(j).get(idx); /* * for(int kk = 0;kk<c.doclist.size();kk++) { * //if(getInputFormat * ().instance(c.doclist.get(kk)).value(idx)>0) * ctd.get(j).put(idx, tt++); } */} m_DocsCounts[idx] = docsCount; } // Trim vector and set instance variables attributes.trimToSize(); m_Dictionary = newDictionary; m_NumInstances = getInputFormat().numInstances(); // Set the filter's output format Instances outputFormat = new Instances(getInputFormat().relationName(), attributes, 0); 
outputFormat.setClassIndex(classIndex); setOutputFormat(outputFormat); }
From source file: cn.ict.zyq.bestConf.COMT2.COMT2.java
License: Open Source License
/**
 * @param samplePoint : some attributes are flexible; for such attributes, we use values of the samplepoint
 * @return
 * @throws Exception
 */
public Instance getInstanceWithPossibleMaxY(Instance samplePoint) throws Exception {
  Instance retval = null;
  // we actually have the model
  if (models != null) {
    ArrayList<Branch2>[] branchLists = new ArrayList[ModelNum];
    for (int m = 0; m < ModelNum; m++) {
      branchLists[m] = getLeavesInfoForM5P(models[m]);
    }
    // now we intersect each leaf
    ArrayList<Branch2> combined = branchLists[0];
    for (int m = 1; m < ModelNum; m++) {
      combined = intersectBranch2Lists(combined, branchLists[m]);
    }
    // now we find the best in the combined list
    Instance temp;
    for (Branch2 branch : combined) {
      temp = branch.maxPoint(samplePoint.dataset());
      if (retval == null || retval.classValue() < temp.classValue()) {
        retval = temp;
        System.out.println("Current best performance is : " + retval.classValue());
      }
    }
  }
  return retval;
}
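Here classValue() values are compared directly because the class attribute is numeric, so the returned double is the target value rather than a label index. A minimal sketch of that comparison pattern; candidates is an illustrative name for any Instances object whose numeric class index is set:

// Keep the instance with the largest class (target) value.
Instance best = null;
for (int i = 0; i < candidates.numInstances(); i++) {
  Instance cand = candidates.instance(i);
  if (best == null || cand.classValue() > best.classValue()) {
    best = cand;
  }
}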
From source file: com.entopix.maui.filters.MauiFilter.java
License: Open Source License
/** * Converts an instance./*from w w w . j ava2 s. co m*/ */ private FastVector convertInstance(Instance instance, boolean training) { FastVector vector = new FastVector(); String fileName = instance.stringValue(fileNameAtt); if (debugMode) { log.info("-- Converting instance for document " + fileName); } // Get the key phrases for the document HashMap<String, Counter> hashKeyphrases = null; if (!instance.isMissing(keyphrasesAtt)) { String keyphrases = instance.stringValue(keyphrasesAtt); hashKeyphrases = getGivenKeyphrases(keyphrases); } // Get the document text String documentText = instance.stringValue(documentAtt); // Compute the candidate topics HashMap<String, Candidate> candidateList; if (allCandidates != null && allCandidates.containsKey(instance)) { candidateList = allCandidates.get(instance); } else { candidateList = getCandidates(documentText); } if (debugMode) { log.info(candidateList.size() + " candidates "); } // Set indices for key attributes int tfidfAttIndex = documentAtt + 2; int distAttIndex = documentAtt + 3; int probsAttIndex = documentAtt + numFeatures; int countPos = 0; int countNeg = 0; // Go through the phrases and convert them into instances for (Candidate candidate : candidateList.values()) { if (candidate.getFrequency() < minOccurFrequency) { continue; } String name = candidate.getName(); String orig = candidate.getBestFullForm(); if (!vocabularyName.equals("none")) { orig = candidate.getTitle(); } double[] vals = computeFeatureValues(candidate, training, hashKeyphrases, candidateList); Instance inst = new Instance(instance.weight(), vals); inst.setDataset(classifierData); double[] probs = null; try { // Get probability of a phrase being key phrase probs = classifier.distributionForInstance(inst); } catch (Exception e) { log.error("Exception while getting probability for candidate " + candidate.getName()); continue; } double prob = probs[0]; if (nominalClassValue) { prob = probs[1]; } // Compute attribute values for final instance double[] newInst = new double[instance.numAttributes() + numFeatures + 2]; int pos = 0; for (int i = 1; i < instance.numAttributes(); i++) { if (i == documentAtt) { // output of values for a given phrase: // 0 Add phrase int index = outputFormatPeek().attribute(pos).addStringValue(name); newInst[pos++] = index; // 1 Add original version if (orig != null) { index = outputFormatPeek().attribute(pos).addStringValue(orig); } else { index = outputFormatPeek().attribute(pos).addStringValue(name); } // 2 newInst[pos++] = index; // Add features newInst[pos++] = inst.value(tfIndex); // 3 newInst[pos++] = inst.value(idfIndex); // 4 newInst[pos++] = inst.value(tfidfIndex); // 5 newInst[pos++] = inst.value(firstOccurIndex); // 6 newInst[pos++] = inst.value(lastOccurIndex); // 7 newInst[pos++] = inst.value(spreadOccurIndex); // 8 newInst[pos++] = inst.value(domainKeyphIndex); // 9 newInst[pos++] = inst.value(lengthIndex); // 10 newInst[pos++] = inst.value(generalityIndex); // 11 newInst[pos++] = inst.value(nodeDegreeIndex); // 12 newInst[pos++] = inst.value(invWikipFreqIndex); // 13 newInst[pos++] = inst.value(totalWikipKeyphrIndex); // 14 newInst[pos++] = inst.value(wikipGeneralityIndex); // 15 // Add probability probsAttIndex = pos; newInst[pos++] = prob; // 16 // Set rank to missing (computed below) newInst[pos++] = Instance.missingValue(); // 17 } else if (i == keyphrasesAtt) { newInst[pos++] = inst.classValue(); } else { newInst[pos++] = instance.value(i); } } Instance ins = new Instance(instance.weight(), newInst); 
ins.setDataset(outputFormatPeek()); vector.addElement(ins); if (inst.classValue() == 0) { countNeg++; } else { countPos++; } } if (debugMode) { log.info(countPos + " positive; " + countNeg + " negative instances"); } // Sort phrases according to their distance (stable sort) double[] vals = new double[vector.size()]; for (int i = 0; i < vals.length; i++) { vals[i] = ((Instance) vector.elementAt(i)).value(distAttIndex); } FastVector newVector = new FastVector(vector.size()); int[] sortedIndices = Utils.stableSort(vals); for (int i = 0; i < vals.length; i++) { newVector.addElement(vector.elementAt(sortedIndices[i])); } vector = newVector; // Sort phrases according to their tfxidf value (stable sort) for (int i = 0; i < vals.length; i++) { vals[i] = -((Instance) vector.elementAt(i)).value(tfidfAttIndex); } newVector = new FastVector(vector.size()); sortedIndices = Utils.stableSort(vals); for (int i = 0; i < vals.length; i++) { newVector.addElement(vector.elementAt(sortedIndices[i])); } vector = newVector; // Sort phrases according to their probability (stable sort) for (int i = 0; i < vals.length; i++) { vals[i] = 1 - ((Instance) vector.elementAt(i)).value(probsAttIndex); } newVector = new FastVector(vector.size()); sortedIndices = Utils.stableSort(vals); for (int i = 0; i < vals.length; i++) { newVector.addElement(vector.elementAt(sortedIndices[i])); } vector = newVector; // Compute rank of phrases. Check for subphrases that are ranked // lower than superphrases and assign probability -1 and set the // rank to Integer.MAX_VALUE int rank = 1; for (int i = 0; i < vals.length; i++) { Instance currentInstance = (Instance) vector.elementAt(i); // log.info(vals[i] + "\t" + currentInstance); // Short cut: if phrase very unlikely make rank very low and // continue if (Utils.grOrEq(vals[i], 1.0)) { currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE); continue; } // Otherwise look for super phrase starting with first phrase // in list that has same probability, TFxIDF value, and distance as // current phrase. We do this to catch all superphrases // that have same probability, TFxIDF value and distance as current // phrase. int startInd = i; while (startInd < vals.length) { Instance inst = (Instance) vector.elementAt(startInd); if ((inst.value(tfidfAttIndex) != currentInstance.value(tfidfAttIndex)) || (inst.value(probsAttIndex) != currentInstance.value(probsAttIndex)) || (inst.value(distAttIndex) != currentInstance.value(distAttIndex))) { break; } startInd++; } currentInstance.setValue(probsAttIndex + 1, rank++); } return vector; }
From source file: com.esda.util.StringToWordVector.java
License: Open Source License
/** * determines the dictionary./*from www .j a v a2s. co m*/ */ private void determineDictionary() { // initialize stopwords Stopwords stopwords = new Stopwords(); if (getUseStoplist()) { try { if (getStopwords().exists() && !getStopwords().isDirectory()) stopwords.read(getStopwords()); } catch (Exception e) { e.printStackTrace(); } } // Operate on a per-class basis if class attribute is set int classInd = getInputFormat().classIndex(); int values = 1; if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) { values = getInputFormat().attribute(classInd).numValues(); } // TreeMap dictionaryArr [] = new TreeMap[values]; TreeMap[] dictionaryArr = new TreeMap[values]; for (int i = 0; i < values; i++) { dictionaryArr[i] = new TreeMap(); } // Make sure we know which fields to convert determineSelectedRange(); // Tokenize all training text into an orderedMap of "words". long pruneRate = Math.round((m_PeriodicPruningRate / 100.0) * getInputFormat().numInstances()); for (int i = 0; i < getInputFormat().numInstances(); i++) { Instance instance = getInputFormat().instance(i); int vInd = 0; if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) { vInd = (int) instance.classValue(); } // Iterate through all relevant string attributes of the current // instance Hashtable h = new Hashtable(); for (int j = 0; j < instance.numAttributes(); j++) { if (m_SelectedRange.isInRange(j) && (instance.isMissing(j) == false)) { // Get tokenizer m_Tokenizer.tokenize(instance.stringValue(j)); // Iterate through tokens, perform stemming, and remove // stopwords // (if required) while (m_Tokenizer.hasMoreElements()) { String word = ((String) m_Tokenizer.nextElement()).intern(); if (this.m_lowerCaseTokens == true) word = word.toLowerCase(); String[] wordsArr = word.split(" "); StringBuilder stemmedStr = new StringBuilder(); for (String wordStr : wordsArr) { if (!this.m_useStoplist || !stopwords.is(wordStr)) { stemmedStr.append(m_Stemmer.stem(wordStr)); stemmedStr.append(" "); } } /*for (int icounter = 0; icounter < wordsArr.length; icounter++) { stemmedStr += m_Stemmer.stem(wordsArr[icounter]); if (icounter + 1 < wordsArr.length) stemmedStr += " "; }*/ word = stemmedStr.toString().trim(); if (!(h.containsKey(word))) h.put(word, new Integer(0)); Count count = (Count) dictionaryArr[vInd].get(word); if (count == null) { dictionaryArr[vInd].put(word, new Count(1)); } else { count.count++; } } } } // updating the docCount for the words that have occurred in this // instance(document). 
Enumeration e = h.keys(); while (e.hasMoreElements()) { String word = (String) e.nextElement(); Count c = (Count) dictionaryArr[vInd].get(word); if (c != null) { c.docCount++; } else System.err.println( "Warning: A word should definitely be in the " + "dictionary.Please check the code"); } if (pruneRate > 0) { if (i % pruneRate == 0 && i > 0) { for (int z = 0; z < values; z++) { Vector d = new Vector(1000); Iterator it = dictionaryArr[z].keySet().iterator(); while (it.hasNext()) { String word = (String) it.next(); Count count = (Count) dictionaryArr[z].get(word); if (count.count <= 1) { d.add(word); } } Iterator iter = d.iterator(); while (iter.hasNext()) { String word = (String) iter.next(); dictionaryArr[z].remove(word); } } } } } // Figure out the minimum required word frequency int totalsize = 0; int prune[] = new int[values]; for (int z = 0; z < values; z++) { totalsize += dictionaryArr[z].size(); int array[] = new int[dictionaryArr[z].size()]; int pos = 0; Iterator it = dictionaryArr[z].keySet().iterator(); while (it.hasNext()) { String word = (String) it.next(); Count count = (Count) dictionaryArr[z].get(word); array[pos] = count.count; pos++; } // sort the array sortArray(array); if (array.length < m_WordsToKeep) { // if there aren't enough words, set the threshold to // minFreq prune[z] = m_minTermFreq; } else { // otherwise set it to be at least minFreq prune[z] = Math.max(m_minTermFreq, array[array.length - m_WordsToKeep]); } } // Convert the dictionary into an attribute index // and create one attribute per word FastVector attributes = new FastVector(totalsize + getInputFormat().numAttributes()); // Add the non-converted attributes int classIndex = -1; for (int i = 0; i < getInputFormat().numAttributes(); i++) { if (!m_SelectedRange.isInRange(i)) { if (getInputFormat().classIndex() == i) { classIndex = attributes.size(); } attributes.addElement(getInputFormat().attribute(i).copy()); } } // Add the word vector attributes (eliminating duplicates // that occur in multiple classes) TreeMap newDictionary = new TreeMap(); int index = attributes.size(); for (int z = 0; z < values; z++) { Iterator it = dictionaryArr[z].keySet().iterator(); while (it.hasNext()) { String word = (String) it.next(); Count count = (Count) dictionaryArr[z].get(word); if (count.count >= prune[z]) { if (newDictionary.get(word) == null) { newDictionary.put(word, new Integer(index++)); attributes.addElement(new Attribute(m_Prefix + word)); } } } } // Compute document frequencies m_DocsCounts = new int[attributes.size()]; Iterator it = newDictionary.keySet().iterator(); while (it.hasNext()) { String word = (String) it.next(); int idx = ((Integer) newDictionary.get(word)).intValue(); int docsCount = 0; for (int j = 0; j < values; j++) { Count c = (Count) dictionaryArr[j].get(word); if (c != null) docsCount += c.docCount; } m_DocsCounts[idx] = docsCount; } // Trim vector and set instance variables attributes.trimToSize(); m_Dictionary = newDictionary; m_NumInstances = getInputFormat().numInstances(); // Set the filter's output format Instances outputFormat = new Instances(getInputFormat().relationName(), attributes, 0); outputFormat.setClassIndex(classIndex); setOutputFormat(outputFormat); }
From source file: com.openkm.kea.filter.KEAFilter.java
License: Open Source License
/** * Converts an instance.// w ww . j a va 2s . c o m */ private FastVector convertInstance(Instance instance, boolean training) throws Exception { FastVector vector = new FastVector(); if (m_Debug) { log.info("-- Converting instance"); } // Get the key phrases for the document HashMap<String, Counter> hashKeyphrases = null; HashMap<String, Counter> hashKeysEval = null; if (!instance.isMissing(m_KeyphrasesAtt)) { String keyphrases = instance.stringValue(m_KeyphrasesAtt); hashKeyphrases = getGivenKeyphrases(keyphrases, false); hashKeysEval = getGivenKeyphrases(keyphrases, true); } // Get the phrases for the document HashMap<String, FastVector> hash = new HashMap<String, FastVector>(); int length = getPhrases(hash, instance.stringValue(m_DocumentAtt)); // hash = getComposits(hash); /* Experimental: To compute how many of the manual keyphrases appear in the documents: log.info("Doc phrases found " + hash.size()); log.info("Manual keyphrases: "); Iterator iter = hashKeyphrases.keySet().iterator(); int count = 0; while (iter.hasNext()) { String id = (String)iter.next(); if (hash.containsKey(id)) { count++; } } double max_recall = (double)count/(double)hashKeyphrases.size(); m_max_recall += max_recall; doc++; double avg_m_max_recall = m_max_recall/(double)doc; String file = instance.stringValue(2); log.info(count + " out of " + hashKeyphrases.size() + " are in the document "); log.info("Max recall : " + avg_m_max_recall + " on " + doc + " documents "); */ // Compute number of extra attributes int numFeatures = 5; if (m_Debug) { if (m_KFused) { numFeatures = numFeatures + 1; } } if (m_STDEVfeature) { numFeatures = numFeatures + 1; } if (m_NODEfeature) { numFeatures = numFeatures + 1; } if (m_LENGTHfeature) { numFeatures = numFeatures + 1; } // Set indices of key attributes //int phraseAttIndex = m_DocumentAtt; int tfidfAttIndex = m_DocumentAtt + 2; int distAttIndex = m_DocumentAtt + 3; int probsAttIndex = m_DocumentAtt + numFeatures - 1; //int classAttIndex = numFeatures; // Go through the phrases and convert them into instances Iterator<String> it = hash.keySet().iterator(); while (it.hasNext()) { String id = it.next(); FastVector phraseInfo = (FastVector) hash.get(id); double[] vals = featVals(id, phraseInfo, training, hashKeysEval, hashKeyphrases, length, hash); Instance inst = new Instance(instance.weight(), vals); inst.setDataset(m_ClassifierData); // Get probability of a phrase being key phrase double[] probs = m_Classifier.distributionForInstance(inst); // If simple Naive Bayes used, change here to //double prob = probs[1]; double prob = probs[0]; // Compute attribute values for final instance double[] newInst = new double[instance.numAttributes() + numFeatures]; int pos = 0; for (int i = 0; i < instance.numAttributes(); i++) { if (i == m_DocumentAtt) { // output of values for a given phrase: // Add phrase int index = outputFormatPeek().attribute(pos).addStringValue(id); newInst[pos++] = index; // Add original version String orig = (String) phraseInfo.elementAt(2); if (orig != null) { index = outputFormatPeek().attribute(pos).addStringValue(orig); } else { index = outputFormatPeek().attribute(pos).addStringValue(id); } newInst[pos++] = index; // Add TFxIDF newInst[pos++] = inst.value(m_TfidfIndex); // Add distance newInst[pos++] = inst.value(m_FirstOccurIndex); // Add other features if (m_Debug) { if (m_KFused) { newInst[pos++] = inst.value(m_KeyFreqIndex); } } if (m_STDEVfeature) { newInst[pos++] = inst.value(m_STDEVIndex); } if (m_NODEfeature) { newInst[pos++] = inst.value(m_NodeIndex); 
} if (m_LENGTHfeature) { newInst[pos++] = inst.value(m_LengthIndex); } // Add probability probsAttIndex = pos; newInst[pos++] = prob; // Set rank to missing (computed below) newInst[pos++] = Instance.missingValue(); } else if (i == m_KeyphrasesAtt) { newInst[pos++] = inst.classValue(); } else { newInst[pos++] = instance.value(i); } } Instance ins = new Instance(instance.weight(), newInst); ins.setDataset(outputFormatPeek()); vector.addElement(ins); } // Add dummy instances for keyphrases that don't occur // in the document if (hashKeysEval != null) { Iterator<String> phrases = hashKeysEval.keySet().iterator(); while (phrases.hasNext()) { String phrase = phrases.next(); double[] newInst = new double[instance.numAttributes() + numFeatures]; int pos = 0; for (int i = 0; i < instance.numAttributes(); i++) { if (i == m_DocumentAtt) { // log.info("Here: " + phrase); // Add phrase int index = outputFormatPeek().attribute(pos).addStringValue(phrase); newInst[pos++] = (double) index; // Add original version index = outputFormatPeek().attribute(pos).addStringValue(phrase); newInst[pos++] = (double) index; // Add TFxIDF newInst[pos++] = Instance.missingValue(); // Add distance newInst[pos++] = Instance.missingValue(); // Add other features if (m_Debug) { if (m_KFused) { newInst[pos++] = Instance.missingValue(); } } if (m_STDEVfeature) { newInst[pos++] = Instance.missingValue(); } if (m_NODEfeature) { newInst[pos++] = Instance.missingValue(); } if (m_LENGTHfeature) { newInst[pos++] = Instance.missingValue(); } // Add probability and rank newInst[pos++] = -Double.MAX_VALUE; // newInst[pos++] = Instance.missingValue(); } else if (i == m_KeyphrasesAtt) { newInst[pos++] = 1; // Keyphrase } else { newInst[pos++] = instance.value(i); } Instance inst = new Instance(instance.weight(), newInst); inst.setDataset(outputFormatPeek()); vector.addElement(inst); } } } // Sort phrases according to their distance (stable sort) double[] vals = new double[vector.size()]; for (int i = 0; i < vals.length; i++) { vals[i] = ((Instance) vector.elementAt(i)).value(distAttIndex); } FastVector newVector = new FastVector(vector.size()); int[] sortedIndices = Utils.stableSort(vals); for (int i = 0; i < vals.length; i++) { newVector.addElement(vector.elementAt(sortedIndices[i])); } vector = newVector; // Sort phrases according to their tfxidf value (stable sort) for (int i = 0; i < vals.length; i++) { vals[i] = -((Instance) vector.elementAt(i)).value(tfidfAttIndex); } newVector = new FastVector(vector.size()); sortedIndices = Utils.stableSort(vals); for (int i = 0; i < vals.length; i++) { newVector.addElement(vector.elementAt(sortedIndices[i])); } vector = newVector; // Sort phrases according to their probability (stable sort) for (int i = 0; i < vals.length; i++) { vals[i] = 1 - ((Instance) vector.elementAt(i)).value(probsAttIndex); } newVector = new FastVector(vector.size()); sortedIndices = Utils.stableSort(vals); for (int i = 0; i < vals.length; i++) { newVector.addElement(vector.elementAt(sortedIndices[i])); } vector = newVector; // Compute rank of phrases. 
// Check for subphrases that are ranked lower than superphrases and
// assign probability -1 and set the rank to Integer.MAX_VALUE
int rank = 1;
for (int i = 0; i < vals.length; i++) {
  Instance currentInstance = (Instance) vector.elementAt(i);
  // Short cut: if phrase very unlikely make rank very low and continue
  if (Utils.grOrEq(vals[i], 1.0)) {
    currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE);
    continue;
  }
  // Otherwise look for super phrase starting with first phrase
  // in list that has same probability, TFxIDF value, and distance as
  // current phrase. We do this to catch all superphrases
  // that have same probability, TFxIDF value and distance as current phrase.
  int startInd = i;
  while (startInd < vals.length) {
    Instance inst = (Instance) vector.elementAt(startInd);
    if ((inst.value(tfidfAttIndex) != currentInstance.value(tfidfAttIndex))
        || (inst.value(probsAttIndex) != currentInstance.value(probsAttIndex))
        || (inst.value(distAttIndex) != currentInstance.value(distAttIndex))) {
      break;
    }
    startInd++;
  }
  currentInstance.setValue(probsAttIndex + 1, rank++);
}
return vector;
}
From source file: com.spread.experiment.tempuntilofficialrelease.ClassificationViaClustering108.java
License: Open Source License
/**
 * builds the classifier
 *
 * @param data the training instances
 * @throws Exception if something goes wrong
 */
@Override
public void buildClassifier(Instances data) throws Exception {
  // can classifier handle the data?
  getCapabilities().testWithFail(data);

  // save original header (needed for clusters to classes output)
  m_OriginalHeader = data.stringFreeStructure();

  // remove class attribute for clusterer
  Instances clusterData = new Instances(data);
  clusterData.setClassIndex(-1);
  clusterData.deleteAttributeAt(data.classIndex());
  m_ClusteringHeader = clusterData.stringFreeStructure();

  if (m_ClusteringHeader.numAttributes() == 0) {
    System.err.println("Data contains only class attribute, defaulting to ZeroR model.");
    m_ZeroR = new ZeroR();
    m_ZeroR.buildClassifier(data);
  } else {
    m_ZeroR = null;

    // build clusterer
    m_ActualClusterer = AbstractClusterer.makeCopy(m_Clusterer);
    m_ActualClusterer.buildClusterer(clusterData);

    if (!getLabelAllClusters()) {
      // determine classes-to-clusters mapping
      ClusterEvaluation eval = new ClusterEvaluation();
      eval.setClusterer(m_ActualClusterer);
      eval.evaluateClusterer(clusterData);
      double[] clusterAssignments = eval.getClusterAssignments();
      int[][] counts = new int[eval.getNumClusters()][m_OriginalHeader.numClasses()];
      int[] clusterTotals = new int[eval.getNumClusters()];
      double[] best = new double[eval.getNumClusters() + 1];
      double[] current = new double[eval.getNumClusters() + 1];
      for (int i = 0; i < data.numInstances(); i++) {
        Instance instance = data.instance(i);
        if (!instance.classIsMissing()) {
          counts[(int) clusterAssignments[i]][(int) instance.classValue()]++;
          clusterTotals[(int) clusterAssignments[i]]++;
        }
      }
      best[eval.getNumClusters()] = Double.MAX_VALUE;
      ClusterEvaluation.mapClasses(eval.getNumClusters(), 0, counts, clusterTotals, current, best, 0);
      m_ClustersToClasses = new double[best.length];
      System.arraycopy(best, 0, m_ClustersToClasses, 0, best.length);
    } else {
      m_ClusterClassProbs = new double[m_ActualClusterer.numberOfClusters()][data.numClasses()];
      for (int i = 0; i < data.numInstances(); i++) {
        Instance clusterInstance = clusterData.instance(i);
        Instance originalInstance = data.instance(i);
        if (!originalInstance.classIsMissing()) {
          double[] probs = m_ActualClusterer.distributionForInstance(clusterInstance);
          for (int j = 0; j < probs.length; j++) {
            m_ClusterClassProbs[j][(int) originalInstance.classValue()] += probs[j];
          }
        }
      }
      for (int i = 0; i < m_ClusterClassProbs.length; i++) {
        Utils.normalize(m_ClusterClassProbs[i]);
      }
    }
  }
}
From source file: com.yahoo.labs.samoa.instances.WekaToSamoaInstanceConverter.java
License: Apache License
/**
 * Samoa instance from weka instance.
 *
 * @param inst the inst
 * @return the instance
 */
public Instance samoaInstance(weka.core.Instance inst) {
  Instance samoaInstance;
  if (inst instanceof weka.core.SparseInstance) {
    double[] attributeValues = new double[inst.numValues()];
    int[] indexValues = new int[inst.numValues()];
    for (int i = 0; i < inst.numValues(); i++) {
      if (inst.index(i) != inst.classIndex()) {
        attributeValues[i] = inst.valueSparse(i);
        indexValues[i] = inst.index(i);
      }
    }
    samoaInstance = new SparseInstance(inst.weight(), attributeValues, indexValues, inst.numAttributes());
  } else {
    samoaInstance = new DenseInstance(inst.weight(), inst.toDoubleArray());
    // samoaInstance.deleteAttributeAt(inst.classIndex());
  }
  if (this.samoaInstanceInformation == null) {
    this.samoaInstanceInformation = this.samoaInstancesInformation(inst.dataset());
  }
  samoaInstance.setDataset(samoaInstanceInformation);
  samoaInstance.setClassValue(inst.classValue());
  return samoaInstance;
}
From source file: core.classifier.MyFirstClassifier.java
License: Open Source License
/**
 * Method for building the classifier. Implements a one-against-one
 * wrapper for multi-class problems.
 *
 * @param insts the set of training instances
 * @throws Exception if the classifier can't be built successfully
 */
public void buildClassifier(Instances insts) throws Exception {

  if (!m_checksTurnedOff) {
    // can classifier handle the data?
    getCapabilities().testWithFail(insts);

    // remove instances with missing class
    insts = new Instances(insts);
    insts.deleteWithMissingClass();

    /* Removes all the instances with weight equal to 0.
       MUST be done since condition (8) of Keerthi's paper
       is made with the assertion Ci > 0 (See equation (3a). */
    Instances data = new Instances(insts, insts.numInstances());
    for (int i = 0; i < insts.numInstances(); i++) {
      if (insts.instance(i).weight() > 0)
        data.add(insts.instance(i));
    }
    if (data.numInstances() == 0) {
      throw new Exception("No training instances left after removing " + "instances with weight 0!");
    }
    insts = data;
  }

  if (!m_checksTurnedOff) {
    m_Missing = new ReplaceMissingValues();
    m_Missing.setInputFormat(insts);
    insts = Filter.useFilter(insts, m_Missing);
  } else {
    m_Missing = null;
  }

  if (getCapabilities().handles(Capability.NUMERIC_ATTRIBUTES)) {
    boolean onlyNumeric = true;
    if (!m_checksTurnedOff) {
      for (int i = 0; i < insts.numAttributes(); i++) {
        if (i != insts.classIndex()) {
          if (!insts.attribute(i).isNumeric()) {
            onlyNumeric = false;
            break;
          }
        }
      }
    }
    if (!onlyNumeric) {
      m_NominalToBinary = new NominalToBinary();
      m_NominalToBinary.setInputFormat(insts);
      insts = Filter.useFilter(insts, m_NominalToBinary);
    } else {
      m_NominalToBinary = null;
    }
  } else {
    m_NominalToBinary = null;
  }

  if (m_filterType == FILTER_STANDARDIZE) {
    m_Filter = new Standardize();
    m_Filter.setInputFormat(insts);
    insts = Filter.useFilter(insts, m_Filter);
  } else if (m_filterType == FILTER_NORMALIZE) {
    m_Filter = new Normalize();
    m_Filter.setInputFormat(insts);
    insts = Filter.useFilter(insts, m_Filter);
  } else {
    m_Filter = null;
  }

  m_classIndex = insts.classIndex();
  m_classAttribute = insts.classAttribute();
  m_KernelIsLinear = (m_kernel instanceof PolyKernel) && (((PolyKernel) m_kernel).getExponent() == 1.0);

  // Generate subsets representing each class
  Instances[] subsets = new Instances[insts.numClasses()];
  for (int i = 0; i < insts.numClasses(); i++) {
    subsets[i] = new Instances(insts, insts.numInstances());
  }
  for (int j = 0; j < insts.numInstances(); j++) {
    Instance inst = insts.instance(j);
    subsets[(int) inst.classValue()].add(inst);
  }
  for (int i = 0; i < insts.numClasses(); i++) {
    subsets[i].compactify();
  }

  // Build the binary classifiers
  Random rand = new Random(m_randomSeed);
  m_classifiers = new BinarySMO[insts.numClasses()][insts.numClasses()];
  for (int i = 0; i < insts.numClasses(); i++) {
    for (int j = i + 1; j < insts.numClasses(); j++) {
      m_classifiers[i][j] = new BinarySMO();
      m_classifiers[i][j].setKernel(Kernel.makeCopy(getKernel()));
      Instances data = new Instances(insts, insts.numInstances());
      for (int k = 0; k < subsets[i].numInstances(); k++) {
        data.add(subsets[i].instance(k));
      }
      for (int k = 0; k < subsets[j].numInstances(); k++) {
        data.add(subsets[j].instance(k));
      }
      data.compactify();
      data.randomize(rand);
      m_classifiers[i][j].buildClassifier(data, i, j, m_fitLogisticModels, m_numFolds, m_randomSeed);
    }
  }
}
From source file: core.ClusterEvaluationEX.java
License: Open Source License
/**
 * Evaluates cluster assignments with respect to actual class labels.
 * Assumes that m_Clusterer has been trained and tested on
 * inst (minus the class).
 *
 * @param inst the instances (including class) to evaluate with respect to
 * @param fileName the name of the test file for incremental testing,
 *        if "" or null then not used
 * @throws Exception if something goes wrong
 */
private void evaluateClustersWithRespectToClass(Instances inst, String fileName) throws Exception {
  int numClasses = inst.classAttribute().numValues();
  int[][] counts = new int[m_numClusters][numClasses];
  int[] clusterTotals = new int[m_numClusters];
  double[] best = new double[m_numClusters + 1];
  double[] current = new double[m_numClusters + 1];
  DataSource source = null;
  Instances instances = null;
  Instance instance = null;
  int i;
  int numInstances;

  if (fileName == null)
    fileName = "";

  if (fileName.length() != 0) {
    source = new DataSource(fileName);
  } else
    source = new DataSource(inst);
  instances = source.getStructure(inst.classIndex());

  i = 0;
  while (source.hasMoreElements(instances)) {
    instance = source.nextElement(instances);
    if (m_clusterAssignments[i] >= 0) {
      counts[(int) m_clusterAssignments[i]][(int) instance.classValue()]++;
      clusterTotals[(int) m_clusterAssignments[i]]++;
    }
    i++;
  }
  numInstances = i;

  best[m_numClusters] = Double.MAX_VALUE;
  mapClasses(m_numClusters, 0, counts, clusterTotals, current, best, 0);

  m_clusteringResults.append("\n\nClass attribute: " + inst.classAttribute().name() + "\n");
  m_clusteringResults.append("Classes to Clusters:\n");
  String matrixString = toMatrixString(counts, clusterTotals, new Instances(inst, 0));
  m_clusteringResults.append(matrixString).append("\n");

  int Cwidth = 1 + (int) (Math.log(m_numClusters) / Math.log(10));
  // add the minimum error assignment
  for (i = 0; i < m_numClusters; i++) {
    if (clusterTotals[i] > 0) {
      m_clusteringResults.append("Cluster " + Utils.doubleToString((double) i, Cwidth, 0));
      m_clusteringResults.append(" <-- ");
      if (best[i] < 0) {
        m_clusteringResults.append("No class\n");
      } else {
        m_clusteringResults.append(inst.classAttribute().value((int) best[i])).append("\n");
      }
    }
  }

  m_clusteringResults.append("\nIncorrectly clustered instances :\t" + best[m_numClusters] + "\t"
      + (Utils.doubleToString((best[m_numClusters] / numInstances * 100.0), 8, 4)) + " %\n");

  // copy the class assignments
  m_classToCluster = new int[m_numClusters];
  for (i = 0; i < m_numClusters; i++) {
    m_classToCluster[i] = (int) best[i];
  }
}