List of usage examples for weka.core.Instance.setDataset
public void setDataset(Instances instances);
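The examples below all follow the same pattern: build an Instance from raw values, then attach a header with setDataset so that attribute, class, and string lookups resolve. A minimal self-contained sketch of that pattern against the Weka 3.7+ API; the attribute names and values here are illustrative, not taken from any example below:

import java.util.ArrayList;
import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;

public class SetDatasetDemo {
    public static void main(String[] args) {
        // Build a header: two numeric attributes and a nominal class.
        ArrayList<Attribute> atts = new ArrayList<Attribute>();
        atts.add(new Attribute("width"));
        atts.add(new Attribute("height"));
        ArrayList<String> labels = new ArrayList<String>();
        labels.add("small");
        labels.add("large");
        atts.add(new Attribute("size", labels));
        Instances header = new Instances("demo", atts, 0);
        header.setClassIndex(header.numAttributes() - 1);

        // A freshly built instance carries no header; attribute lookups would fail.
        Instance inst = new DenseInstance(1.0, new double[] { 2.0, 3.0, 0.0 });
        inst.setDataset(header); // attach the schema
        System.out.println(inst.classAttribute().name());            // prints "size"
        System.out.println(inst.stringValue(inst.classAttribute())); // prints "small"
    }
}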
From source file:com.actelion.research.orbit.imageAnalysis.tasks.ObjectClassificationWorker.java
License:Open Source License
@Override
protected void doWork() {
    try {
        if (rf.getObjectSegmentationList() == null) {
            logger.error("objectSegmentation is null. Please create an objectSegmentationList first.");
            return;
        }
        if (rf.getClassImage() == null) {
            logger.error("classImage is null. Please do a tissue classification first.");
            return;
        }
        if (classifier == null) {
            logger.error("classifier is null, so classification cannot be done.");
            return;
        }
        classCount = new HashMap<Integer, Integer>();
        for (int idx = 0; idx < model.getClassShapes().size(); idx++) {
            classCount.put(idx, 0);
        }
        int[] classificationList = new int[rf.getObjectSegmentationList().size()];
        Instance inst;
        int i = 0;
        Rectangle bounds;
        boolean inTileList = true;
        logger.trace("starting the classification of " + rf.getObjectSegmentationList().size() + " cells");
        //logger.trace("classifier: " + classifier);
        //logger.trace("dataSet: " + dataset);
        List<Rectangle> tileBounds = null;
        if (tileList != null && tileList.size() > 0) {
            tileBounds = new ArrayList<Rectangle>(tileList.size());
            for (Point tile : tileList) {
                if (tile.x != -1 && tile.y != -1) { // all tiles marker
                    tileBounds.add(rf.bimg.getImage().getTileRect(tile.x, tile.y));
                }
            }
        }
        for (Shape shape : rf.getObjectSegmentationList()) {
            if (shape.getBounds().width * shape.getBounds().height < 5) { // at least 2x2 pixel
                if (logger.isTraceEnabled())
                    logger.trace("skipping shape: " + shape.getBounds());
                classificationList[i] = -1;
                i++;
                continue;
            }
            // max 3 because alpha should be ignored (was always 1 before!)
            int sampleSize = Math.min(3, rf.bimg.getImage().getSampleModel().getNumBands());
            double[] featsAll = new ObjectFeatureBuilderTiled(model).buildFeatures(shape, Double.NaN, rf,
                    rf.getClassImage(), sampleSize, 0, 0);
            // skip some non-relevant attributes like centerX/Y
            double[] feats = Arrays.copyOfRange(featsAll, 0,
                    featsAll.length - ObjectFeatureBuilderTiled.SkipTailForClassification);
            inst = new DenseInstance(1.0d, feats);
            inst.setDataset(dataset);
            double classVal = Double.NaN;
            try {
                classVal = classifier.classifyInstance(inst);
            } catch (Exception e) {
                e.printStackTrace();
                classificationList[i] = -1;
                logger.error("The classifier is not a cell classification classifier.");
                return;
            }
            if (!Double.isNaN(classVal)) { // classVal != Double.NaN would always be true
                classificationList[i] = (int) classVal;
                if (!classCount.containsKey((int) classVal))
                    classCount.put((int) classVal, 0);
                classCount.put((int) classVal, classCount.get((int) classVal) + 1);
            } else {
                classificationList[i] = -1;
                logger.trace("Class is null");
            }
            i++;
        } // shape
        rf.setObjectClassificationList(classificationList); // not saved!
        if (originalFrame != null) {
            if (rf.getObjectClassificationList() != null) {
                this.originalFrame.setObjectClassificationList(rf.getObjectClassificationList().clone());
            }
            this.originalFrame.setClassShapes(oldClassShapes);
            this.originalFrame.setClassImage(rf.getClassImage());
        }
        if (oldClassShapes != null) {
            rf.setClassShapes(oldClassShapes);
            logger.debug("ClassShape Workaround: Old classShapes set.");
        }
        if (generateTaskResult) {
            StringBuilder sb = new StringBuilder("Finished classifying " + i + " objects.\n");
            sb.append("Class Frequencies:\n");
            for (int c = 0; c < rf.getClassShapes().size(); c++) {
                int cnt = classCount.containsKey(c) ? classCount.get(c) : 0;
                String cName = rf.getClassShapes().get(c).getName();
                sb.append(cName + ": " + cnt + "\n");
            }
            taskResult = new TaskResult("Cell Classification Result", sb.toString());
            logger.debug(sb.toString());
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
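Condensed, the classification idiom in the example above is the following sketch, where feats, dataset, and classifier stand for the feature vector, training header, and trained model. Note that a test like classVal != Double.NaN is always true, because NaN compares unequal to everything; Double.isNaN is the correct check:

Instance inst = new DenseInstance(1.0d, feats);
inst.setDataset(dataset); // required before classifyInstance
double classVal = classifier.classifyInstance(inst);
if (!Double.isNaN(classVal)) { // not classVal != Double.NaN
    int predictedClass = (int) classVal;
}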
From source file:com.dhamacher.sentimentanalysis4tweets.preprocessing.TweetClassifier.java
License:Apache License
/**
 * Method that converts a text message into an instance.
 *
 * @param text the message content to convert
 * @param data the header information
 * @return the generated Instance
 */
private Instance makeInstance(String text, Instances data) {
    Instance instance = new Instance(2);
    Attribute messageAtt = data.attribute("content");
    instance.setValue(messageAtt, messageAtt.addStringValue(text));
    instance.setDataset(data);
    return instance;
}
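This example targets the pre-3.7 Weka API, in which Instance was a concrete class. A sketch of the same method against the 3.7+ API, where Instance is an interface and DenseInstance the standard implementation (same surrounding class and weka.core imports assumed):

private Instance makeInstance(String text, Instances data) {
    Instance instance = new DenseInstance(2); // both values missing initially
    Attribute messageAtt = data.attribute("content");
    instance.setValue(messageAtt, messageAtt.addStringValue(text));
    instance.setDataset(data);
    return instance;
}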
From source file:com.entopix.maui.filters.MauiFilter.java
License:Open Source License
/**
 * Converts an instance.
 */
private FastVector convertInstance(Instance instance, boolean training) {
    FastVector vector = new FastVector();
    String fileName = instance.stringValue(fileNameAtt);
    if (debugMode) {
        log.info("-- Converting instance for document " + fileName);
    }
    // Get the key phrases for the document
    HashMap<String, Counter> hashKeyphrases = null;
    if (!instance.isMissing(keyphrasesAtt)) {
        String keyphrases = instance.stringValue(keyphrasesAtt);
        hashKeyphrases = getGivenKeyphrases(keyphrases);
    }
    // Get the document text
    String documentText = instance.stringValue(documentAtt);
    // Compute the candidate topics
    HashMap<String, Candidate> candidateList;
    if (allCandidates != null && allCandidates.containsKey(instance)) {
        candidateList = allCandidates.get(instance);
    } else {
        candidateList = getCandidates(documentText);
    }
    if (debugMode) {
        log.info(candidateList.size() + " candidates ");
    }
    // Set indices for key attributes
    int tfidfAttIndex = documentAtt + 2;
    int distAttIndex = documentAtt + 3;
    int probsAttIndex = documentAtt + numFeatures;
    int countPos = 0;
    int countNeg = 0;
    // Go through the phrases and convert them into instances
    for (Candidate candidate : candidateList.values()) {
        if (candidate.getFrequency() < minOccurFrequency) {
            continue;
        }
        String name = candidate.getName();
        String orig = candidate.getBestFullForm();
        if (!vocabularyName.equals("none")) {
            orig = candidate.getTitle();
        }
        double[] vals = computeFeatureValues(candidate, training, hashKeyphrases, candidateList);
        Instance inst = new Instance(instance.weight(), vals);
        inst.setDataset(classifierData);
        double[] probs = null;
        try {
            // Get probability of a phrase being key phrase
            probs = classifier.distributionForInstance(inst);
        } catch (Exception e) {
            log.error("Exception while getting probability for candidate " + candidate.getName());
            continue;
        }
        double prob = probs[0];
        if (nominalClassValue) {
            prob = probs[1];
        }
        // Compute attribute values for final instance
        double[] newInst = new double[instance.numAttributes() + numFeatures + 2];
        int pos = 0;
        for (int i = 1; i < instance.numAttributes(); i++) {
            if (i == documentAtt) {
                // output of values for a given phrase:
                // Add phrase
                int index = outputFormatPeek().attribute(pos).addStringValue(name);
                newInst[pos++] = index; // 0
                // Add original version
                if (orig != null) {
                    index = outputFormatPeek().attribute(pos).addStringValue(orig);
                } else {
                    index = outputFormatPeek().attribute(pos).addStringValue(name);
                }
                newInst[pos++] = index; // 1
                // Add features
                newInst[pos++] = inst.value(tfIndex);               // 2
                newInst[pos++] = inst.value(idfIndex);              // 3
                newInst[pos++] = inst.value(tfidfIndex);            // 4
                newInst[pos++] = inst.value(firstOccurIndex);       // 5
                newInst[pos++] = inst.value(lastOccurIndex);        // 6
                newInst[pos++] = inst.value(spreadOccurIndex);      // 7
                newInst[pos++] = inst.value(domainKeyphIndex);      // 8
                newInst[pos++] = inst.value(lengthIndex);           // 9
                newInst[pos++] = inst.value(generalityIndex);       // 10
                newInst[pos++] = inst.value(nodeDegreeIndex);       // 11
                newInst[pos++] = inst.value(invWikipFreqIndex);     // 12
                newInst[pos++] = inst.value(totalWikipKeyphrIndex); // 13
                newInst[pos++] = inst.value(wikipGeneralityIndex);  // 14
                // Add probability
                probsAttIndex = pos;
                newInst[pos++] = prob;                              // 15
                // Set rank to missing (computed below)
                newInst[pos++] = Instance.missingValue();           // 16
            } else if (i == keyphrasesAtt) {
                newInst[pos++] = inst.classValue();
            } else {
                newInst[pos++] = instance.value(i);
            }
        }
        Instance ins = new Instance(instance.weight(), newInst);
        ins.setDataset(outputFormatPeek());
        vector.addElement(ins);
        if (inst.classValue() == 0) {
            countNeg++;
        } else {
            countPos++;
        }
    }
    if (debugMode) {
        log.info(countPos + " positive; " + countNeg + " negative instances");
    }
    // Sort phrases according to their distance (stable sort)
    double[] vals = new double[vector.size()];
    for (int i = 0; i < vals.length; i++) {
        vals[i] = ((Instance) vector.elementAt(i)).value(distAttIndex);
    }
    FastVector newVector = new FastVector(vector.size());
    int[] sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;
    // Sort phrases according to their TFxIDF value (stable sort)
    for (int i = 0; i < vals.length; i++) {
        vals[i] = -((Instance) vector.elementAt(i)).value(tfidfAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;
    // Sort phrases according to their probability (stable sort)
    for (int i = 0; i < vals.length; i++) {
        vals[i] = 1 - ((Instance) vector.elementAt(i)).value(probsAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;
    // Compute rank of phrases. Check for subphrases that are ranked
    // lower than superphrases, assign probability -1, and set the
    // rank to Integer.MAX_VALUE
    int rank = 1;
    for (int i = 0; i < vals.length; i++) {
        Instance currentInstance = (Instance) vector.elementAt(i);
        // Short cut: if phrase very unlikely make rank very low and continue
        if (Utils.grOrEq(vals[i], 1.0)) {
            currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE);
            continue;
        }
        // Otherwise look for super phrase starting with first phrase
        // in list that has same probability, TFxIDF value, and distance
        // as current phrase. We do this to catch all superphrases
        // that have same probability, TFxIDF value, and distance as the
        // current phrase.
        int startInd = i;
        while (startInd < vals.length) {
            Instance inst = (Instance) vector.elementAt(startInd);
            if ((inst.value(tfidfAttIndex) != currentInstance.value(tfidfAttIndex))
                    || (inst.value(probsAttIndex) != currentInstance.value(probsAttIndex))
                    || (inst.value(distAttIndex) != currentInstance.value(distAttIndex))) {
                break;
            }
            startInd++;
        }
        currentInstance.setValue(probsAttIndex + 1, rank++);
    }
    return vector;
}
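A classifier will silently misread an instance whose attached header differs structurally from the data it was trained on. Where that is a risk, compatibility can be checked up front; a sketch using the classifierData header from above:

// inst.dataset() is whatever header was attached via setDataset
if (!classifierData.equalHeaders(inst.dataset())) {
    throw new IllegalStateException("train and test headers differ");
}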
From source file:com.esda.util.StringToWordVector.java
License:Open Source License
/**
 * Converts the instance w/o normalization.
 *
 * @param instance the instance to convert
 * @param v the vector the converted instance is added to
 * @return the index of the first copied (non-converted) attribute
 */
private int convertInstancewoDocNorm(Instance instance, FastVector v) {
    // Convert the instance into a sorted set of indexes
    TreeMap contained = new TreeMap();
    // Copy all non-converted attributes from input to output
    int firstCopy = 0;
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
        if (!m_SelectedRange.isInRange(i)) {
            if (getInputFormat().attribute(i).type() != Attribute.STRING
                    && getInputFormat().attribute(i).type() != Attribute.RELATIONAL) {
                // Add simple nominal and numeric attributes directly
                if (instance.value(i) != 0.0) {
                    contained.put(new Integer(firstCopy), new Double(instance.value(i)));
                }
            } else {
                if (instance.isMissing(i)) {
                    contained.put(new Integer(firstCopy), new Double(Double.NaN));
                } else if (getInputFormat().attribute(i).type() == Attribute.STRING) {
                    // If this is a string attribute, we have to first add
                    // this value to the range of possible values, then add
                    // its new internal index.
                    if (outputFormatPeek().attribute(firstCopy).numValues() == 0) {
                        // Note that the first string value in a
                        // SparseInstance doesn't get printed.
                        outputFormatPeek().attribute(firstCopy)
                                .addStringValue("Hack to defeat SparseInstance bug");
                    }
                    int newIndex = outputFormatPeek().attribute(firstCopy)
                            .addStringValue(instance.stringValue(i));
                    contained.put(new Integer(firstCopy), new Double(newIndex));
                } else {
                    // relational
                    if (outputFormatPeek().attribute(firstCopy).numValues() == 0) {
                        Instances relationalHeader = outputFormatPeek().attribute(firstCopy).relation();
                        // hack to defeat sparse instances bug
                        outputFormatPeek().attribute(firstCopy).addRelation(relationalHeader);
                    }
                    int newIndex = outputFormatPeek().attribute(firstCopy)
                            .addRelation(instance.relationalValue(i));
                    contained.put(new Integer(firstCopy), new Double(newIndex));
                }
            }
            firstCopy++;
        }
    }
    for (int j = 0; j < instance.numAttributes(); j++) {
        if (m_SelectedRange.isInRange(j) && (instance.isMissing(j) == false)) {
            m_Tokenizer.tokenize(instance.stringValue(j));
            while (m_Tokenizer.hasMoreElements()) {
                String word = (String) m_Tokenizer.nextElement();
                if (this.m_lowerCaseTokens == true)
                    word = word.toLowerCase();
                word = m_Stemmer.stem(word);
                Integer index = (Integer) m_Dictionary.get(word);
                if (index != null) {
                    if (m_OutputCounts) {
                        // Separate if here rather than two lines down
                        // to avoid a hashtable lookup
                        Double count = (Double) contained.get(index);
                        if (count != null) {
                            contained.put(index, new Double(count.doubleValue() + 1.0));
                        } else {
                            contained.put(index, new Double(1));
                        }
                    } else {
                        contained.put(index, new Double(1));
                    }
                }
            }
        }
    }
    // Doing TFTransform
    if (m_TFTransform == true) {
        Iterator it = contained.keySet().iterator();
        for (int i = 0; it.hasNext(); i++) {
            Integer index = (Integer) it.next();
            if (index.intValue() >= firstCopy) {
                double val = ((Double) contained.get(index)).doubleValue();
                val = Math.log(val + 1);
                contained.put(index, new Double(val));
            }
        }
    }
    // Doing IDFTransform
    if (m_IDFTransform == true) {
        Iterator it = contained.keySet().iterator();
        for (int i = 0; it.hasNext(); i++) {
            Integer index = (Integer) it.next();
            if (index.intValue() >= firstCopy) {
                double val = ((Double) contained.get(index)).doubleValue();
                val = val * Math.log(m_NumInstances / (double) m_DocsCounts[index.intValue()]);
                contained.put(index, new Double(val));
            }
        }
    }
    // Convert the set to the structures needed to create a sparse instance
    double[] values = new double[contained.size()];
    int[] indices = new int[contained.size()];
    Iterator it = contained.keySet().iterator();
    for (int i = 0; it.hasNext(); i++) {
        Integer index = (Integer) it.next();
        Double value = (Double) contained.get(index);
        values[i] = value.doubleValue();
        indices[i] = index.intValue();
    }
    Instance inst = new SparseInstance(instance.weight(), values, indices,
            outputFormatPeek().numAttributes());
    inst.setDataset(outputFormatPeek());
    v.addElement(inst);
    return firstCopy;
}
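The sparse variant of the idiom in isolation, as a sketch; format stands for the outputFormatPeek() header above:

double[] values = { 3.0, 1.0 };  // non-zero values only
int[] indices = { 0, 5 };        // attribute indices of those values
Instance inst = new SparseInstance(1.0, values, indices, format.numAttributes());
inst.setDataset(format);         // a SparseInstance needs a header too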
From source file:com.openkm.kea.filter.KEAFilter.java
License:Open Source License
/**
 * Converts an instance.
 */
private FastVector convertInstance(Instance instance, boolean training) throws Exception {
    FastVector vector = new FastVector();
    if (m_Debug) {
        log.info("-- Converting instance");
    }
    // Get the key phrases for the document
    HashMap<String, Counter> hashKeyphrases = null;
    HashMap<String, Counter> hashKeysEval = null;
    if (!instance.isMissing(m_KeyphrasesAtt)) {
        String keyphrases = instance.stringValue(m_KeyphrasesAtt);
        hashKeyphrases = getGivenKeyphrases(keyphrases, false);
        hashKeysEval = getGivenKeyphrases(keyphrases, true);
    }
    // Get the phrases for the document
    HashMap<String, FastVector> hash = new HashMap<String, FastVector>();
    int length = getPhrases(hash, instance.stringValue(m_DocumentAtt));
    // hash = getComposits(hash);
    /* Experimental: To compute how many of the manual keyphrases
       appear in the documents:
    log.info("Doc phrases found " + hash.size());
    log.info("Manual keyphrases: ");
    Iterator iter = hashKeyphrases.keySet().iterator();
    int count = 0;
    while (iter.hasNext()) {
        String id = (String) iter.next();
        if (hash.containsKey(id)) {
            count++;
        }
    }
    double max_recall = (double) count / (double) hashKeyphrases.size();
    m_max_recall += max_recall;
    doc++;
    double avg_m_max_recall = m_max_recall / (double) doc;
    String file = instance.stringValue(2);
    log.info(count + " out of " + hashKeyphrases.size() + " are in the document ");
    log.info("Max recall : " + avg_m_max_recall + " on " + doc + " documents ");
    */
    // Compute number of extra attributes
    int numFeatures = 5;
    if (m_Debug) {
        if (m_KFused) {
            numFeatures = numFeatures + 1;
        }
    }
    if (m_STDEVfeature) {
        numFeatures = numFeatures + 1;
    }
    if (m_NODEfeature) {
        numFeatures = numFeatures + 1;
    }
    if (m_LENGTHfeature) {
        numFeatures = numFeatures + 1;
    }
    // Set indices of key attributes
    //int phraseAttIndex = m_DocumentAtt;
    int tfidfAttIndex = m_DocumentAtt + 2;
    int distAttIndex = m_DocumentAtt + 3;
    int probsAttIndex = m_DocumentAtt + numFeatures - 1;
    //int classAttIndex = numFeatures;
    // Go through the phrases and convert them into instances
    Iterator<String> it = hash.keySet().iterator();
    while (it.hasNext()) {
        String id = it.next();
        FastVector phraseInfo = (FastVector) hash.get(id);
        double[] vals = featVals(id, phraseInfo, training, hashKeysEval, hashKeyphrases, length, hash);
        Instance inst = new Instance(instance.weight(), vals);
        inst.setDataset(m_ClassifierData);
        // Get probability of a phrase being key phrase
        double[] probs = m_Classifier.distributionForInstance(inst);
        // If simple Naive Bayes used, change here to
        //double prob = probs[1];
        double prob = probs[0];
        // Compute attribute values for final instance
        double[] newInst = new double[instance.numAttributes() + numFeatures];
        int pos = 0;
        for (int i = 0; i < instance.numAttributes(); i++) {
            if (i == m_DocumentAtt) {
                // output of values for a given phrase:
                // Add phrase
                int index = outputFormatPeek().attribute(pos).addStringValue(id);
                newInst[pos++] = index;
                // Add original version
                String orig = (String) phraseInfo.elementAt(2);
                if (orig != null) {
                    index = outputFormatPeek().attribute(pos).addStringValue(orig);
                } else {
                    index = outputFormatPeek().attribute(pos).addStringValue(id);
                }
                newInst[pos++] = index;
                // Add TFxIDF
                newInst[pos++] = inst.value(m_TfidfIndex);
                // Add distance
                newInst[pos++] = inst.value(m_FirstOccurIndex);
                // Add other features
                if (m_Debug) {
                    if (m_KFused) {
                        newInst[pos++] = inst.value(m_KeyFreqIndex);
                    }
                }
                if (m_STDEVfeature) {
                    newInst[pos++] = inst.value(m_STDEVIndex);
                }
                if (m_NODEfeature) {
                    newInst[pos++] = inst.value(m_NodeIndex);
                }
                if (m_LENGTHfeature) {
                    newInst[pos++] = inst.value(m_LengthIndex);
                }
                // Add probability
                probsAttIndex = pos;
                newInst[pos++] = prob;
                // Set rank to missing (computed below)
                newInst[pos++] = Instance.missingValue();
            } else if (i == m_KeyphrasesAtt) {
                newInst[pos++] = inst.classValue();
            } else {
                newInst[pos++] = instance.value(i);
            }
        }
        Instance ins = new Instance(instance.weight(), newInst);
        ins.setDataset(outputFormatPeek());
        vector.addElement(ins);
    }
    // Add dummy instances for keyphrases that don't occur
    // in the document
    if (hashKeysEval != null) {
        Iterator<String> phrases = hashKeysEval.keySet().iterator();
        while (phrases.hasNext()) {
            String phrase = phrases.next();
            double[] newInst = new double[instance.numAttributes() + numFeatures];
            int pos = 0;
            for (int i = 0; i < instance.numAttributes(); i++) {
                if (i == m_DocumentAtt) {
                    // log.info("Here: " + phrase);
                    // Add phrase
                    int index = outputFormatPeek().attribute(pos).addStringValue(phrase);
                    newInst[pos++] = (double) index;
                    // Add original version
                    index = outputFormatPeek().attribute(pos).addStringValue(phrase);
                    newInst[pos++] = (double) index;
                    // Add TFxIDF
                    newInst[pos++] = Instance.missingValue();
                    // Add distance
                    newInst[pos++] = Instance.missingValue();
                    // Add other features
                    if (m_Debug) {
                        if (m_KFused) {
                            newInst[pos++] = Instance.missingValue();
                        }
                    }
                    if (m_STDEVfeature) {
                        newInst[pos++] = Instance.missingValue();
                    }
                    if (m_NODEfeature) {
                        newInst[pos++] = Instance.missingValue();
                    }
                    if (m_LENGTHfeature) {
                        newInst[pos++] = Instance.missingValue();
                    }
                    // Add probability and rank
                    newInst[pos++] = -Double.MAX_VALUE;
                    // newInst[pos++] = Instance.missingValue();
                } else if (i == m_KeyphrasesAtt) {
                    newInst[pos++] = 1; // Keyphrase
                } else {
                    newInst[pos++] = instance.value(i);
                }
                Instance inst = new Instance(instance.weight(), newInst);
                inst.setDataset(outputFormatPeek());
                vector.addElement(inst);
            }
        }
    }
    // Sort phrases according to their distance (stable sort)
    double[] vals = new double[vector.size()];
    for (int i = 0; i < vals.length; i++) {
        vals[i] = ((Instance) vector.elementAt(i)).value(distAttIndex);
    }
    FastVector newVector = new FastVector(vector.size());
    int[] sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;
    // Sort phrases according to their TFxIDF value (stable sort)
    for (int i = 0; i < vals.length; i++) {
        vals[i] = -((Instance) vector.elementAt(i)).value(tfidfAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;
    // Sort phrases according to their probability (stable sort)
    for (int i = 0; i < vals.length; i++) {
        vals[i] = 1 - ((Instance) vector.elementAt(i)).value(probsAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;
    // Compute rank of phrases. Check for subphrases that are ranked
    // lower than superphrases, assign probability -1, and set the
    // rank to Integer.MAX_VALUE
    int rank = 1;
    for (int i = 0; i < vals.length; i++) {
        Instance currentInstance = (Instance) vector.elementAt(i);
        // Short cut: if phrase very unlikely make rank very low and continue
        if (Utils.grOrEq(vals[i], 1.0)) {
            currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE);
            continue;
        }
        // Otherwise look for super phrase starting with first phrase
        // in list that has same probability, TFxIDF value, and distance
        // as current phrase. We do this to catch all superphrases
        // that have same probability, TFxIDF value, and distance as the
        // current phrase.
        int startInd = i;
        while (startInd < vals.length) {
            Instance inst = (Instance) vector.elementAt(startInd);
            if ((inst.value(tfidfAttIndex) != currentInstance.value(tfidfAttIndex))
                    || (inst.value(probsAttIndex) != currentInstance.value(probsAttIndex))
                    || (inst.value(distAttIndex) != currentInstance.value(distAttIndex))) {
                break;
            }
            startInd++;
        }
        currentInstance.setValue(probsAttIndex + 1, rank++);
    }
    return vector;
}
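A porting note: the static Instance.missingValue() used throughout this example is 3.6-era API. In Weka 3.7+ the same constant lives elsewhere:

double missing = weka.core.Utils.missingValue(); // Weka 3.7+; returns Double.NaN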
From source file:com.openkm.kea.filter.KEAPhraseFilter.java
License:Open Source License
/**
 * Converts an instance by removing all non-alphanumeric characters
 * from its string attribute values.
 */
private void convertInstance(Instance instance) throws Exception {
    double[] instVals = new double[instance.numAttributes()];
    for (int i = 0; i < instance.numAttributes(); i++) {
        if (!instance.attribute(i).isString() || instance.isMissing(i)) {
            instVals[i] = instance.value(i);
        } else {
            if (!m_SelectCols.isInRange(i)) {
                int index = getOutputFormat().attribute(i).addStringValue(instance.stringValue(i));
                instVals[i] = (double) index;
                continue;
            }
            // str = text of the document
            String str = instance.stringValue(i);
            // tokenized = the cleaned version of str
            String tokenized = tokenize(str);
            int index = getOutputFormat().attribute(i).addStringValue(tokenized);
            instVals[i] = (double) index;
        }
    }
    Instance inst = new Instance(instance.weight(), instVals);
    inst.setDataset(getOutputFormat());
    push(inst);
}
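For context, convertInstance here is the per-row hook of a weka.filters.Filter subclass. A typical driver looks like this sketch, with the filter's configuration options omitted (weka.filters.Filter imported):

KEAPhraseFilter filter = new KEAPhraseFilter();
filter.setInputFormat(rawData);                        // rawData: the unfiltered Instances
Instances cleaned = Filter.useFilter(rawData, filter); // ends up calling convertInstance per row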
From source file:com.openkm.kea.filter.NumbersFilter.java
License:Open Source License
/**
 * Converts an instance. A phrase boundary is inserted where
 * a number is found.
 */
private void convertInstance(Instance instance) throws Exception {
    double[] instVals = new double[instance.numAttributes()];
    for (int i = 0; i < instance.numAttributes(); i++) {
        if ((!instance.attribute(i).isString()) || instance.isMissing(i)) {
            instVals[i] = instance.value(i);
        } else {
            String str = instance.stringValue(i);
            StringBuffer resultStr = new StringBuffer();
            StringTokenizer tok = new StringTokenizer(str, " \t\n", true);
            while (tok.hasMoreTokens()) {
                String token = tok.nextToken();
                // Everything that doesn't contain at least
                // one letter is considered to be a number
                boolean isNumber = true;
                for (int j = 0; j < token.length(); j++) {
                    if (Character.isLetter(token.charAt(j))) {
                        isNumber = false;
                        break;
                    }
                }
                if (!isNumber) {
                    resultStr.append(token);
                } else {
                    if (token.equals(" ") || token.equals("\t") || token.equals("\n")) {
                        resultStr.append(token);
                    } else {
                        resultStr.append(" \n ");
                    }
                }
            }
            int index = getOutputFormat().attribute(i).addStringValue(resultStr.toString());
            instVals[i] = (double) index;
        }
    }
    Instance inst = new Instance(instance.weight(), instVals);
    inst.setDataset(getOutputFormat());
    push(inst);
}
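The effect of the boundary rule above: any whitespace-delimited token containing no letter counts as a number and is replaced by the marker " \n ", so an input string such as "sold 500 units" comes out as "sold  \n  units", with the number turned into a phrase boundary.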
From source file:com.rapidminer.tools.WekaInstancesAdaptor.java
License:Open Source License
/** Gets an example and creates a Weka instance. */
private Instance toWekaInstance(Example example) {
    int numberOfRegularValues = example.getAttributes().size();
    int numberOfValues = numberOfRegularValues + (labelAttribute != null ? 1 : 0);
    double[] values = new double[numberOfValues];
    // set regular attribute values
    if (taskType == ASSOCIATION_RULE_MINING) {
        int a = 0;
        for (Attribute attribute : exampleSet.getAttributes()) {
            double value = example.getValue(attribute);
            if (attribute.isNominal()) {
                if (value == mostFrequent[a])
                    value = Double.NaN; // sets the most frequent value to missing for association learning
            }
            values[a] = value;
            a++;
        }
    } else {
        int[] nonDefaultIndices = exampleTransform.getNonDefaultAttributeIndices(example);
        double[] nonDefaultValues = exampleTransform.getNonDefaultAttributeValues(example, nonDefaultIndices);
        for (int a = 0; a < nonDefaultIndices.length; a++) {
            values[nonDefaultIndices[a]] = nonDefaultValues[a];
        }
    }
    // set label value if necessary
    switch (taskType) {
    case LEARNING:
        values[values.length - 1] = example.getValue(labelAttribute);
        break;
    case PREDICTING:
        values[values.length - 1] = Double.NaN;
        break;
    case WEIGHTING:
        if (labelAttribute != null)
            values[values.length - 1] = example.getValue(labelAttribute);
        break;
    default:
        break;
    }
    // get instance weight
    double weight = 1.0d;
    if (this.weightAttribute != null)
        weight = example.getValue(this.weightAttribute);
    // create new instance
    Instance instance = new Instance(weight, values);
    instance.setDataset(this);
    return instance;
}
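Here setDataset(this) works because the adaptor class itself apparently extends weka.core.Instances, so it doubles as the header. The weight could equally be set after construction; a sketch in the same 3.6-era API:

Instance instance = new Instance(1.0, values); // default weight
instance.setWeight(weight);                    // same effect as new Instance(weight, values)
instance.setDataset(this);                     // the adaptor is the Instances header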
From source file:com.spread.experiment.tempuntilofficialrelease.ClassificationViaClustering108.java
License:Open Source License
/**
 * Returns class probability distribution for the given instance.
 *
 * @param instance the instance to be classified
 * @return the class probabilities
 * @throws Exception if an error occurred during the prediction
 */
@Override
public double[] distributionForInstance(Instance instance) throws Exception {
    if (m_ZeroR != null) {
        return m_ZeroR.distributionForInstance(instance);
    } else {
        double[] result = new double[instance.numClasses()];
        if (m_ActualClusterer != null) {
            // build new instance
            Instances tempData = m_ClusteringHeader.stringFreeStructure();
            double[] values = new double[tempData.numAttributes()];
            int n = 0;
            for (int i = 0; i < instance.numAttributes(); i++) {
                if (i == instance.classIndex()) {
                    continue;
                }
                if (instance.attribute(i).isString()) {
                    values[n] = tempData.attribute(n).addStringValue(instance.stringValue(i));
                } else if (instance.attribute(i).isRelationValued()) {
                    values[n] = tempData.attribute(n).addRelation(instance.relationalValue(i));
                } else {
                    values[n] = instance.value(i);
                }
                n++;
            }
            Instance newInst = new DenseInstance(instance.weight(), values);
            newInst.setDataset(tempData);
            if (!getLabelAllClusters()) {
                // determine cluster/class
                double r = m_ClustersToClasses[m_ActualClusterer.clusterInstance(newInst)];
                if (r == -1) {
                    return result; // unclassified
                } else {
                    result[(int) r] = 1.0;
                    return result;
                }
            } else {
                double[] classProbs = new double[instance.numClasses()];
                double[] dist = m_ActualClusterer.distributionForInstance(newInst);
                for (int i = 0; i < dist.length; i++) {
                    for (int j = 0; j < instance.numClasses(); j++) {
                        classProbs[j] += dist[i] * m_ClusterClassProbs[i][j];
                    }
                }
                Utils.normalize(classProbs);
                return classProbs;
            }
        } else {
            return result; // unclassified
        }
    }
}
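The per-attribute copy above is the safe way to move a value vector onto a fresh header when string or relational attributes may be present. For purely numeric and nominal headers the idiom collapses to this sketch (trainingHeader and orig are assumed names, not from the source):

Instances header = trainingHeader.stringFreeStructure(); // empty copy of the schema
Instance copy = new DenseInstance(orig.weight(), orig.toDoubleArray());
copy.setDataset(header);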
From source file:com.yahoo.labs.samoa.instances.SamoaToWekaInstanceConverter.java
License:Apache License
/**
 * Converts a SAMOA instance into a Weka instance.
 *
 * @param inst the instance to convert
 * @return the weka.core.Instance
 */
public weka.core.Instance wekaInstance(Instance inst) {
    weka.core.Instance wekaInstance;
    if (((InstanceImpl) inst).instanceData instanceof SparseInstanceData) {
        InstanceImpl instance = (InstanceImpl) inst;
        SparseInstanceData sparseInstanceData = (SparseInstanceData) instance.instanceData;
        wekaInstance = new weka.core.SparseInstance(instance.weight(),
                sparseInstanceData.getAttributeValues(), sparseInstanceData.getIndexValues(),
                sparseInstanceData.getNumberAttributes());
    } else {
        Instance instance = inst;
        wekaInstance = new weka.core.DenseInstance(instance.weight(), instance.toDoubleArray());
    }
    if (this.wekaInstanceInformation == null) {
        this.wekaInstanceInformation = this.wekaInstancesInformation(inst.dataset());
    }
    // We suppose that the class is the last attribute; other layouts are not handled here.
    wekaInstance.setDataset(wekaInstanceInformation);
    if (inst.numOutputAttributes() == 1) {
        wekaInstance.setClassValue(inst.classValue());
    }
    return wekaInstance;
}
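A typical call site for this converter, as a sketch; the no-argument constructor and the trained Weka classifier are assumptions, not shown in the source:

SamoaToWekaInstanceConverter converter = new SamoaToWekaInstanceConverter();
weka.core.Instance wekaInst = converter.wekaInstance(samoaInst); // header attached inside
double prediction = wekaClassifier.classifyInstance(wekaInst);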