Example usage for weka.core Instance setDataset

Introduction

This page collects example usages of the weka.core Instance method setDataset.

Prototype

public void setDataset(Instances instances);

Source Link

Document

Sets the reference to the dataset.

Usage

From source file:com.actelion.research.orbit.imageAnalysis.tasks.ObjectClassificationWorker.java

License:Open Source License

/**
 * Classifies every shape of the frame's object segmentation list and stores one
 * class index per shape through {@code rf.setObjectClassificationList}.
 *
 * <p>Shapes whose bounding box covers fewer than 5 pixels are skipped and marked
 * with {@code -1}; a classifier result of {@code NaN} is marked with {@code -1}
 * as well. Per-class hit counts are collected in {@code classCount} and, when
 * {@code generateTaskResult} is set, summarised into {@code taskResult}.
 *
 * <p>The method logs an error and returns early when the segmentation list, the
 * class image or the classifier is missing, or when the classifier throws for an
 * instance. Any other exception is logged and not propagated.
 */
@Override
protected void doWork() {
    try {
        if (rf.getObjectSegmentationList() == null) {
            logger.error("objectSegmentation is null. Please create a objectSegmentationList first.");
            return;
        }

        if (rf.getClassImage() == null) {
            logger.error("classImage is null. Please do a tissue classification first.");
            return;
        }

        if (classifier == null) {
            logger.error("classifier is null, so classification cannot be done.");
            return;
        }

        // Start every known class at zero so the report lists classes without hits, too.
        classCount = new HashMap<Integer, Integer>();
        for (int idx = 0; idx < model.getClassShapes().size(); idx++) {
            classCount.put(idx, 0);
        }

        int[] classificationList = new int[rf.getObjectSegmentationList().size()];
        int i = 0;
        logger.trace("starting the classification of " + rf.getObjectSegmentationList().size() + " cells");
        //logger.trace("classifier: "+classifier);
        //logger.trace("dataSet: "+dataset);

        // NOTE(review): tileBounds is filled here but never read again in this method;
        // kept unchanged in case the tile lookup is relied upon — confirm and remove if not.
        List<Rectangle> tileBounds = null;
        if (tileList != null && tileList.size() > 0) {
            tileBounds = new ArrayList<Rectangle>(tileList.size());
            for (Point tile : tileList) {
                if (tile.x != -1 && tile.y != -1) { // all tiles marker
                    tileBounds.add(rf.bimg.getImage().getTileRect(tile.x, tile.y));
                }
            }
        }

        for (Shape shape : rf.getObjectSegmentationList()) {
            if (shape.getBounds().width * shape.getBounds().height < 5) { // at least 2x2 pixel
                if (logger.isTraceEnabled())
                    logger.trace("skipping shape: " + shape.getBounds());
                classificationList[i] = -1;
                i++;
                continue;
            }
            int sampleSize = Math.min(3, rf.bimg.getImage().getSampleModel().getNumBands()); // was always 1 before! (max 3 because alpha should be ignored)
            double[] featsAll = new ObjectFeatureBuilderTiled(model).buildFeatures(shape, Double.NaN, rf,
                    rf.getClassImage(), sampleSize, 0, 0);
            double[] feats = Arrays.copyOfRange(featsAll, 0,
                    featsAll.length - ObjectFeatureBuilderTiled.SkipTailForClassification); // skip some non relevant attributes like centerX/Y
            Instance inst = new DenseInstance(1.0d, feats);
            inst.setDataset(dataset);

            double classVal;
            try {
                classVal = classifier.classifyInstance(inst);
            } catch (Exception e) {
                classificationList[i] = -1;
                logger.error("The classifier is not a cell classification classifier.", e);
                return;
            }

            // NaN never compares equal to anything (itself included), so it has to be
            // detected with Double.isNaN; "classVal != Double.NaN" is always true.
            if (!Double.isNaN(classVal)) {
                //if (logger.isTraceEnabled()) logger.trace("Obj "+i+": Class "+classVal);
                int classIdx = (int) classVal;
                classificationList[i] = classIdx;
                Integer previous = classCount.get(classIdx);
                classCount.put(classIdx, previous == null ? 1 : previous + 1);
            } else {
                classificationList[i] = -1;
                logger.trace("Class is null");
            }
            i++;
        } // shape
        rf.setObjectClassificationList(classificationList); // not save!

        if (originalFrame != null) {
            if (rf.getObjectClassificationList() != null) {
                this.originalFrame.setObjectClassificationList(rf.getObjectClassificationList().clone());
            }
            this.originalFrame.setClassShapes(oldClassShapes);
            this.originalFrame.setClassImage(rf.getClassImage());
        }

        if (oldClassShapes != null) {
            rf.setClassShapes(oldClassShapes);
            logger.debug("ClassShape Workaround: Old classShapes set.");
        }

        if (generateTaskResult) {
            StringBuilder sb = new StringBuilder("Finished classifying " + i + " objects.\n");
            sb.append("Class Frequencies:\n");
            for (int c = 0; c < rf.getClassShapes().size(); c++) {
                // Classes without an entry are reported with a count of zero.
                Integer counted = classCount.get(c);
                int cnt = (counted != null) ? counted : 0;
                String cName = rf.getClassShapes().get(c).getName();
                sb.append(cName + ": " + cnt + "\n");
            }
            taskResult = new TaskResult("Cell Classification Result", sb.toString());
            logger.debug(sb.toString());
        }
    } catch (Exception e) {
        logger.error("Object classification failed.", e);
    }
}

From source file:com.dhamacher.sentimentanalysis4tweets.preprocessing.TweetClassifier.java

License:Apache License

/**
 * Wraps a text message in a two-attribute instance that is bound to the
 * given header.
 *
 * @param text the message content to convert
 * @param data the header information; its "content" attribute receives the text
 * @return the generated Instance
 */
private Instance makeInstance(String text, Instances data) {
    final Instance result = new Instance(2);
    final Attribute contentAttribute = data.attribute("content");

    // Register the text in the attribute's string table and store its index.
    final int stringIndex = contentAttribute.addStringValue(text);
    result.setValue(contentAttribute, stringIndex);

    result.setDataset(data);
    return result;
}

From source file:com.entopix.maui.filters.MauiFilter.java

License:Open Source License

/**
 * Converts a document instance into a list of per-candidate output instances.
 *
 * <p>For every candidate whose frequency reaches {@code minOccurFrequency} the
 * feature values are computed, the classifier's distribution is queried, and an
 * output row (phrase, original form, features, probability, rank) is built.
 * The rows are then stably sorted by distance, by descending TFxIDF and by
 * descending probability, and a rank is written to each row.
 *
 * @param instance the input instance holding file name, document text and
 *                 (optionally) the given key phrases
 * @param training flag handed on to {@code computeFeatureValues}
 * @return a vector of output instances in ranked order
 */
private FastVector convertInstance(Instance instance, boolean training) {

    FastVector vector = new FastVector();

    String fileName = instance.stringValue(fileNameAtt);

    if (debugMode) {
        log.info("-- Converting instance for document " + fileName);
    }

    // Get the key phrases for the document
    // (stays null when the key phrase attribute is missing)
    HashMap<String, Counter> hashKeyphrases = null;

    if (!instance.isMissing(keyphrasesAtt)) {
        String keyphrases = instance.stringValue(keyphrasesAtt);
        hashKeyphrases = getGivenKeyphrases(keyphrases);
    }

    // Get the document text
    String documentText = instance.stringValue(documentAtt);

    // Compute the candidate topics
    // (reuse the cached candidates for this instance when available)
    HashMap<String, Candidate> candidateList;
    if (allCandidates != null && allCandidates.containsKey(instance)) {
        candidateList = allCandidates.get(instance);
    } else {
        candidateList = getCandidates(documentText);
    }
    if (debugMode) {
        log.info(candidateList.size() + " candidates ");
    }

    // Set indices for key attributes
    // (probsAttIndex is overwritten below with the actual position of the probability)
    int tfidfAttIndex = documentAtt + 2;
    int distAttIndex = documentAtt + 3;
    int probsAttIndex = documentAtt + numFeatures;

    int countPos = 0;
    int countNeg = 0;

    // Go through the phrases and convert them into instances
    for (Candidate candidate : candidateList.values()) {

        // Skip candidates that occur too rarely
        if (candidate.getFrequency() < minOccurFrequency) {
            continue;
        }

        String name = candidate.getName();
        String orig = candidate.getBestFullForm();
        if (!vocabularyName.equals("none")) {
            orig = candidate.getTitle();
        }

        double[] vals = computeFeatureValues(candidate, training, hashKeyphrases, candidateList);

        Instance inst = new Instance(instance.weight(), vals);

        inst.setDataset(classifierData);

        double[] probs = null;
        try {
            // Get probability of a phrase being key phrase
            probs = classifier.distributionForInstance(inst);
        } catch (Exception e) {
            // NOTE(review): the exception itself is not logged; the candidate is dropped
            log.error("Exception while getting probability for candidate " + candidate.getName());
            continue;
        }

        // Index 1 of the distribution is used when the class is nominal, index 0 otherwise
        double prob = probs[0];
        if (nominalClassValue) {
            prob = probs[1];
        }

        // Compute attribute values for final instance
        double[] newInst = new double[instance.numAttributes() + numFeatures + 2];

        int pos = 0;
        // NOTE(review): iteration starts at attribute 1, so attribute 0 of the
        // input is not copied to the output row — confirm this is intended
        for (int i = 1; i < instance.numAttributes(); i++) {

            if (i == documentAtt) {

                // output of values for a given phrase:

                // 0 Add phrase
                int index = outputFormatPeek().attribute(pos).addStringValue(name);
                newInst[pos++] = index;

                // 1 Add original version
                if (orig != null) {
                    index = outputFormatPeek().attribute(pos).addStringValue(orig);
                } else {
                    index = outputFormatPeek().attribute(pos).addStringValue(name);
                }

                // 2
                newInst[pos++] = index;

                // Add features
                newInst[pos++] = inst.value(tfIndex); // 3
                newInst[pos++] = inst.value(idfIndex); // 4
                newInst[pos++] = inst.value(tfidfIndex); // 5
                newInst[pos++] = inst.value(firstOccurIndex); // 6
                newInst[pos++] = inst.value(lastOccurIndex); // 7
                newInst[pos++] = inst.value(spreadOccurIndex); // 8
                newInst[pos++] = inst.value(domainKeyphIndex); // 9
                newInst[pos++] = inst.value(lengthIndex); // 10
                newInst[pos++] = inst.value(generalityIndex); // 11
                newInst[pos++] = inst.value(nodeDegreeIndex); // 12
                newInst[pos++] = inst.value(invWikipFreqIndex); // 13
                newInst[pos++] = inst.value(totalWikipKeyphrIndex); // 14
                newInst[pos++] = inst.value(wikipGeneralityIndex); // 15

                // Add probability
                // (remember where it was stored; the rank goes into the next slot)
                probsAttIndex = pos;
                newInst[pos++] = prob; // 16

                // Set rank to missing (computed below)
                newInst[pos++] = Instance.missingValue(); // 17

            } else if (i == keyphrasesAtt) {
                // The key phrase attribute is replaced by the class value of the feature instance
                newInst[pos++] = inst.classValue();
            } else {
                // All other attributes are copied unchanged
                newInst[pos++] = instance.value(i);
            }
        }

        Instance ins = new Instance(instance.weight(), newInst);
        ins.setDataset(outputFormatPeek());
        vector.addElement(ins);

        // Class value 0 is counted as negative, everything else as positive
        if (inst.classValue() == 0) {
            countNeg++;
        } else {
            countPos++;
        }

    }
    if (debugMode) {
        log.info(countPos + " positive; " + countNeg + " negative instances");
    }

    // Sort phrases according to their distance (stable sort)
    double[] vals = new double[vector.size()];
    for (int i = 0; i < vals.length; i++) {
        vals[i] = ((Instance) vector.elementAt(i)).value(distAttIndex);
    }
    FastVector newVector = new FastVector(vector.size());
    int[] sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their tfxidf value (stable sort)
    // (values are negated so that larger TFxIDF values come first)
    for (int i = 0; i < vals.length; i++) {
        vals[i] = -((Instance) vector.elementAt(i)).value(tfidfAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their probability (stable sort)
    // (1 - probability so that larger probabilities come first)
    for (int i = 0; i < vals.length; i++) {
        vals[i] = 1 - ((Instance) vector.elementAt(i)).value(probsAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Compute rank of phrases. Check for subphrases that are ranked
    // lower than superphrases and assign probability -1 and set the
    // rank to Integer.MAX_VALUE
    // NOTE(review): vals was filled from the vector before the last reordering,
    // while currentInstance is read from the reordered vector — confirm that
    // vals[i] is meant to describe currentInstance (depends on Utils.stableSort).
    int rank = 1;
    for (int i = 0; i < vals.length; i++) {
        Instance currentInstance = (Instance) vector.elementAt(i);

        // log.info(vals[i] + "\t" + currentInstance);

        // Short cut: if phrase very unlikely make rank very low and
        // continue
        if (Utils.grOrEq(vals[i], 1.0)) {
            currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE);
            continue;
        }

        // Otherwise look for super phrase starting with first phrase
        // in list that has same probability, TFxIDF value, and distance as
        // current phrase. We do this to catch all superphrases
        // that have same probability, TFxIDF value and distance as current
        // phrase.
        // NOTE(review): startInd is computed but never used afterwards, so this
        // scan currently has no effect on the assigned rank.
        int startInd = i;
        while (startInd < vals.length) {
            Instance inst = (Instance) vector.elementAt(startInd);
            if ((inst.value(tfidfAttIndex) != currentInstance.value(tfidfAttIndex))
                    || (inst.value(probsAttIndex) != currentInstance.value(probsAttIndex))
                    || (inst.value(distAttIndex) != currentInstance.value(distAttIndex))) {
                break;
            }
            startInd++;
        }
        currentInstance.setValue(probsAttIndex + 1, rank++);

    }

    return vector;
}

From source file:com.esda.util.StringToWordVector.java

License:Open Source License

/**
 * Converts the instance w/o normalization and appends the resulting sparse
 * instance to the given vector.
 *
 * @param instance the instance to convert
 * @param v the vector the converted instance is added to
 * @return the number of attributes outside the selected range, i.e. the index
 *         of the first word attribute in the converted instance
 */
private int convertInstancewoDocNorm(Instance instance, FastVector v) {

    // Convert the instance into a sorted set of indexes
    // (output attribute index -> value; only non-zero entries are stored)
    TreeMap contained = new TreeMap();

    // Copy all non-converted attributes from input to output
    int firstCopy = 0;
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
        if (!m_SelectedRange.isInRange(i)) {
            if (getInputFormat().attribute(i).type() != Attribute.STRING
                    && getInputFormat().attribute(i).type() != Attribute.RELATIONAL) {
                // Add simple nominal and numeric attributes directly
                if (instance.value(i) != 0.0) {
                    contained.put(new Integer(firstCopy), new Double(instance.value(i)));
                }
            } else {
                if (instance.isMissing(i)) {
                    // Missing string/relational values are stored as NaN
                    contained.put(new Integer(firstCopy), new Double(Double.NaN));
                } else if (getInputFormat().attribute(i).type() == Attribute.STRING) {

                    // If this is a string attribute, we have to first add
                    // this value to the range of possible values, then add
                    // its new internal index.
                    if (outputFormatPeek().attribute(firstCopy).numValues() == 0) {
                        // Note that the first string value in a
                        // SparseInstance doesn't get printed.
                        outputFormatPeek().attribute(firstCopy)
                                .addStringValue("Hack to defeat SparseInstance bug");
                    }
                    int newIndex = outputFormatPeek().attribute(firstCopy)
                            .addStringValue(instance.stringValue(i));
                    contained.put(new Integer(firstCopy), new Double(newIndex));
                } else {
                    // relational
                    if (outputFormatPeek().attribute(firstCopy).numValues() == 0) {
                        Instances relationalHeader = outputFormatPeek().attribute(firstCopy).relation();

                        // hack to defeat sparse instances bug
                        outputFormatPeek().attribute(firstCopy).addRelation(relationalHeader);
                    }
                    int newIndex = outputFormatPeek().attribute(firstCopy)
                            .addRelation(instance.relationalValue(i));
                    contained.put(new Integer(firstCopy), new Double(newIndex));
                }
            }
            firstCopy++;
        }
    }

    // Tokenize every selected, non-missing attribute and record the words
    // that are present in the dictionary
    for (int j = 0; j < instance.numAttributes(); j++) {
        // if ((getInputFormat().attribute(j).type() == Attribute.STRING)
        if (m_SelectedRange.isInRange(j) && (instance.isMissing(j) == false)) {

            m_Tokenizer.tokenize(instance.stringValue(j));

            while (m_Tokenizer.hasMoreElements()) {
                String word = (String) m_Tokenizer.nextElement();
                if (this.m_lowerCaseTokens == true)
                    word = word.toLowerCase();
                word = m_Stemmer.stem(word);
                Integer index = (Integer) m_Dictionary.get(word);
                // Words that are not in the dictionary are ignored
                if (index != null) {
                    if (m_OutputCounts) { // Separate if here rather than
                        // two lines down to avoid
                        // hashtable lookup
                        Double count = (Double) contained.get(index);
                        if (count != null) {
                            contained.put(index, new Double(count.doubleValue() + 1.0));
                        } else {
                            contained.put(index, new Double(1));
                        }
                    } else {
                        // Presence only: the value is 1 however often the word occurs
                        contained.put(index, new Double(1));
                    }
                }
            }
        }
    }

    // Doing TFTransform
    // (log(1 + value), applied to word attributes only, i.e. indices >= firstCopy)
    if (m_TFTransform == true) {
        Iterator it = contained.keySet().iterator();
        for (int i = 0; it.hasNext(); i++) {
            Integer index = (Integer) it.next();
            if (index.intValue() >= firstCopy) {
                double val = ((Double) contained.get(index)).doubleValue();
                val = Math.log(val + 1);
                contained.put(index, new Double(val));
            }
        }
    }

    // Doing IDFTransform
    // (value * log(m_NumInstances / m_DocsCounts[index]), word attributes only)
    if (m_IDFTransform == true) {
        Iterator it = contained.keySet().iterator();
        for (int i = 0; it.hasNext(); i++) {
            Integer index = (Integer) it.next();
            if (index.intValue() >= firstCopy) {
                double val = ((Double) contained.get(index)).doubleValue();
                val = val * Math.log(m_NumInstances / (double) m_DocsCounts[index.intValue()]);
                contained.put(index, new Double(val));
            }
        }
    }

    // Convert the set to structures needed to create a sparse instance.
    // (the TreeMap iterates in ascending key order, so indices come out sorted)
    double[] values = new double[contained.size()];
    int[] indices = new int[contained.size()];
    Iterator it = contained.keySet().iterator();
    for (int i = 0; it.hasNext(); i++) {
        Integer index = (Integer) it.next();
        Double value = (Double) contained.get(index);
        values[i] = value.doubleValue();
        indices[i] = index.intValue();
    }

    Instance inst = new SparseInstance(instance.weight(), values, indices, outputFormatPeek().numAttributes());
    inst.setDataset(outputFormatPeek());

    v.addElement(inst);

    return firstCopy;
}

From source file:com.openkm.kea.filter.KEAFilter.java

License:Open Source License

/**
 * Converts a document instance into a list of per-phrase output instances.
 *
 * <p>For every phrase found in the document the feature values are computed,
 * the classifier's distribution is queried, and an output row (phrase,
 * original form, TFxIDF, distance, optional features, probability, rank) is
 * built. When key phrases are given, one additional dummy row per entry of the
 * evaluation key phrase map is added. The rows are then stably sorted by
 * distance, by descending TFxIDF and by descending probability, and a rank is
 * written to each row.
 *
 * @param instance the input instance holding the document text and
 *                 (optionally) the given key phrases
 * @param training flag handed on to {@code featVals}
 * @return a vector of output instances in ranked order
 * @throws Exception if computing the features or querying the classifier fails
 */
private FastVector convertInstance(Instance instance, boolean training) throws Exception {

    FastVector vector = new FastVector();

    if (m_Debug) {
        log.info("-- Converting instance");
    }

    // Get the key phrases for the document
    // (both maps stay null when the key phrase attribute is missing)
    HashMap<String, Counter> hashKeyphrases = null;
    HashMap<String, Counter> hashKeysEval = null;
    if (!instance.isMissing(m_KeyphrasesAtt)) {
        String keyphrases = instance.stringValue(m_KeyphrasesAtt);
        hashKeyphrases = getGivenKeyphrases(keyphrases, false);
        hashKeysEval = getGivenKeyphrases(keyphrases, true);
    }

    // Get the phrases for the document
    HashMap<String, FastVector> hash = new HashMap<String, FastVector>();
    int length = getPhrases(hash, instance.stringValue(m_DocumentAtt));
    //   hash = getComposits(hash);

    /* Experimental:
     To compute how many of the manual keyphrases appear in the documents:

    log.info("Doc phrases found " + hash.size());
    log.info("Manual keyphrases: ");
    Iterator iter = hashKeyphrases.keySet().iterator();
    int count = 0;
    while (iter.hasNext()) {
       String id = (String)iter.next();
       if (hash.containsKey(id)) {
    count++;
       }
    }

    double max_recall = (double)count/(double)hashKeyphrases.size();


    m_max_recall += max_recall;
    doc++;
    double avg_m_max_recall = m_max_recall/(double)doc;

    String file = instance.stringValue(2);
    log.info(count + " out of " + hashKeyphrases.size() + " are in the document ");
    log.info("Max recall : " + avg_m_max_recall + " on " + doc + " documents ");
    */

    // Compute number of extra attributes
    int numFeatures = 5;
    if (m_Debug) {
        if (m_KFused) {
            numFeatures = numFeatures + 1;
        }
    }
    if (m_STDEVfeature) {
        numFeatures = numFeatures + 1;
    }
    if (m_NODEfeature) {
        numFeatures = numFeatures + 1;
    }
    if (m_LENGTHfeature) {
        numFeatures = numFeatures + 1;
    }

    // Set indices of key attributes
    // (probsAttIndex is overwritten below with the actual position of the probability)
    //int phraseAttIndex = m_DocumentAtt;
    int tfidfAttIndex = m_DocumentAtt + 2;
    int distAttIndex = m_DocumentAtt + 3;
    int probsAttIndex = m_DocumentAtt + numFeatures - 1;
    //int classAttIndex = numFeatures;

    // Go through the phrases and convert them into instances
    Iterator<String> it = hash.keySet().iterator();
    while (it.hasNext()) {
        String id = it.next();
        FastVector phraseInfo = (FastVector) hash.get(id);

        double[] vals = featVals(id, phraseInfo, training, hashKeysEval, hashKeyphrases, length, hash);

        Instance inst = new Instance(instance.weight(), vals);

        inst.setDataset(m_ClassifierData);

        // Get probability of a phrase being key phrase
        double[] probs = m_Classifier.distributionForInstance(inst);

        // If simple Naive Bayes used, change here to
        //double prob = probs[1];
        double prob = probs[0];

        // Compute attribute values for final instance
        double[] newInst = new double[instance.numAttributes() + numFeatures];
        int pos = 0;
        for (int i = 0; i < instance.numAttributes(); i++) {
            if (i == m_DocumentAtt) {

                // output of values for a given phrase:

                // Add phrase
                int index = outputFormatPeek().attribute(pos).addStringValue(id);
                newInst[pos++] = index;

                // Add original version
                String orig = (String) phraseInfo.elementAt(2);

                if (orig != null) {
                    index = outputFormatPeek().attribute(pos).addStringValue(orig);
                } else {
                    index = outputFormatPeek().attribute(pos).addStringValue(id);
                }
                newInst[pos++] = index;

                // Add TFxIDF
                newInst[pos++] = inst.value(m_TfidfIndex);

                // Add distance
                newInst[pos++] = inst.value(m_FirstOccurIndex);

                // Add other features
                if (m_Debug) {
                    if (m_KFused) {
                        newInst[pos++] = inst.value(m_KeyFreqIndex);
                    }
                }
                if (m_STDEVfeature) {
                    newInst[pos++] = inst.value(m_STDEVIndex);
                }
                if (m_NODEfeature) {
                    newInst[pos++] = inst.value(m_NodeIndex);
                }
                if (m_LENGTHfeature) {
                    newInst[pos++] = inst.value(m_LengthIndex);
                }

                // Add probability
                // (remember where it was stored; the rank goes into the next slot)
                probsAttIndex = pos;
                newInst[pos++] = prob;

                // Set rank to missing (computed below)
                newInst[pos++] = Instance.missingValue();

            } else if (i == m_KeyphrasesAtt) {
                // The key phrase attribute is replaced by the class value of the feature instance
                newInst[pos++] = inst.classValue();
            } else {
                // All other attributes are copied unchanged
                newInst[pos++] = instance.value(i);
            }
        }
        Instance ins = new Instance(instance.weight(), newInst);
        ins.setDataset(outputFormatPeek());
        vector.addElement(ins);
    }

    // Add dummy instances for keyphrases that don't occur
    // in the document
    if (hashKeysEval != null) {
        Iterator<String> phrases = hashKeysEval.keySet().iterator();
        while (phrases.hasNext()) {
            String phrase = phrases.next();
            double[] newInst = new double[instance.numAttributes() + numFeatures];
            int pos = 0;
            for (int i = 0; i < instance.numAttributes(); i++) {
                if (i == m_DocumentAtt) {
                    // log.info("Here: " + phrase);
                    // Add phrase
                    int index = outputFormatPeek().attribute(pos).addStringValue(phrase);
                    newInst[pos++] = (double) index;

                    // Add original version
                    index = outputFormatPeek().attribute(pos).addStringValue(phrase);
                    newInst[pos++] = (double) index;

                    // Add TFxIDF
                    newInst[pos++] = Instance.missingValue();

                    // Add distance
                    newInst[pos++] = Instance.missingValue();

                    // Add other features
                    if (m_Debug) {
                        if (m_KFused) {
                            newInst[pos++] = Instance.missingValue();
                        }
                    }
                    if (m_STDEVfeature) {
                        newInst[pos++] = Instance.missingValue();
                    }
                    if (m_NODEfeature) {
                        newInst[pos++] = Instance.missingValue();
                    }
                    if (m_LENGTHfeature) {
                        newInst[pos++] = Instance.missingValue();
                    }

                    // Add probability and rank
                    newInst[pos++] = -Double.MAX_VALUE;
                    // newInst[pos++] = Instance.missingValue();
                } else if (i == m_KeyphrasesAtt) {
                    newInst[pos++] = 1; // Keyphrase
                } else {
                    newInst[pos++] = instance.value(i);
                }
            }

            // Add exactly one dummy instance per phrase, after all attribute
            // values are filled in. Creating it inside the attribute loop
            // added one instance per attribute of the input instance.
            Instance inst = new Instance(instance.weight(), newInst);
            inst.setDataset(outputFormatPeek());
            vector.addElement(inst);
        }
    }

    // Sort phrases according to their distance (stable sort)
    double[] vals = new double[vector.size()];
    for (int i = 0; i < vals.length; i++) {
        vals[i] = ((Instance) vector.elementAt(i)).value(distAttIndex);
    }
    FastVector newVector = new FastVector(vector.size());
    int[] sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their tfxidf value (stable sort)
    // (values are negated so that larger TFxIDF values come first)
    for (int i = 0; i < vals.length; i++) {
        vals[i] = -((Instance) vector.elementAt(i)).value(tfidfAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their probability (stable sort)
    // (1 - probability so that larger probabilities come first)
    for (int i = 0; i < vals.length; i++) {
        vals[i] = 1 - ((Instance) vector.elementAt(i)).value(probsAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Compute rank of phrases. Check for subphrases that are ranked
    // lower than superphrases and assign probability -1 and set the
    // rank to Integer.MAX_VALUE
    // NOTE(review): vals was filled from the vector before the last reordering,
    // while currentInstance is read from the reordered vector — confirm that
    // vals[i] is meant to describe currentInstance (depends on Utils.stableSort).
    int rank = 1;
    for (int i = 0; i < vals.length; i++) {
        Instance currentInstance = (Instance) vector.elementAt(i);
        // Short cut: if phrase very unlikely make rank very low and continue
        if (Utils.grOrEq(vals[i], 1.0)) {
            currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE);
            continue;
        }

        // Otherwise look for super phrase starting with first phrase
        // in list that has same probability, TFxIDF value, and distance as
        // current phrase. We do this to catch all superphrases
        // that have same probability, TFxIDF value and distance as current phrase.
        // NOTE(review): startInd is computed but never used afterwards, so this
        // scan currently has no effect on the assigned rank.
        int startInd = i;
        while (startInd < vals.length) {
            Instance inst = (Instance) vector.elementAt(startInd);
            if ((inst.value(tfidfAttIndex) != currentInstance.value(tfidfAttIndex))
                    || (inst.value(probsAttIndex) != currentInstance.value(probsAttIndex))
                    || (inst.value(distAttIndex) != currentInstance.value(distAttIndex))) {
                break;
            }
            startInd++;
        }
        currentInstance.setValue(probsAttIndex + 1, rank++);

    }
    return vector;
}

From source file:com.openkm.kea.filter.KEAPhraseFilter.java

License:Open Source License

/** 
 * Converts an instance by removing all non-alphanumeric characters
 * from its string attribute values./*from ww w. j a  v  a 2s. c o m*/
 */
private void convertInstance(Instance instance) throws Exception {

    double[] instVals = new double[instance.numAttributes()];

    for (int i = 0; i < instance.numAttributes(); i++) {
        if (!instance.attribute(i).isString() || instance.isMissing(i)) {
            instVals[i] = instance.value(i);
        } else {
            if (!m_SelectCols.isInRange(i)) {
                int index = getOutputFormat().attribute(i).addStringValue(instance.stringValue(i));
                instVals[i] = (double) index;
                continue;
            }
            // aly: str = text of the document
            String str = instance.stringValue(i);

            String tokenized = tokenize(str);

            // aly: resultStr is the clean version of str
            // log.info(resultStr.toString());
            int index = getOutputFormat().attribute(i).addStringValue(tokenized);
            instVals[i] = (double) index;
        }
    }
    Instance inst = new Instance(instance.weight(), instVals);
    inst.setDataset(getOutputFormat());
    push(inst);
}

From source file:com.openkm.kea.filter.NumbersFilter.java

License:Open Source License

/**
 * Converts an instance: in every non-missing string attribute, each token
 * that contains no letter (and is not itself a whitespace delimiter) is
 * replaced by a phrase boundary. The converted copy is handed to {@code push}.
 */
private void convertInstance(Instance instance) throws Exception {

    double[] converted = new double[instance.numAttributes()];

    for (int att = 0; att < instance.numAttributes(); att++) {

        // Non-string and missing values are carried over as they are.
        if ((!instance.attribute(att).isString()) || instance.isMissing(att)) {
            converted[att] = instance.value(att);
            continue;
        }

        String text = instance.stringValue(att);
        StringBuffer rewritten = new StringBuffer();

        // Delimiters are returned as tokens so the original spacing is kept.
        StringTokenizer tokens = new StringTokenizer(text, " \t\n", true);
        while (tokens.hasMoreTokens()) {
            String token = tokens.nextToken();

            // A token counts as a number unless it contains at least one letter.
            boolean hasLetter = false;
            for (int pos = 0; pos < token.length() && !hasLetter; pos++) {
                hasLetter = Character.isLetter(token.charAt(pos));
            }

            boolean isDelimiter = token.equals(" ") || token.equals("\t") || token.equals("\n");
            if (hasLetter || isDelimiter) {
                rewritten.append(token);
            } else {
                // Number: insert a phrase boundary instead.
                rewritten.append(" \n ");
            }
        }

        int stringIndex = getOutputFormat().attribute(att).addStringValue(rewritten.toString());
        converted[att] = (double) stringIndex;
    }

    Instance result = new Instance(instance.weight(), converted);
    result.setDataset(getOutputFormat());
    push(result);
}

From source file:com.rapidminer.tools.WekaInstancesAdaptor.java

License:Open Source License

/** Builds the Weka instance that corresponds to the given example. */
private Instance toWekaInstance(Example example) {
    final boolean hasLabel = labelAttribute != null;
    // one slot per regular attribute, plus a trailing slot for the label if there is one
    double[] values = new double[example.getAttributes().size() + (hasLabel ? 1 : 0)];

    // fill in the regular attribute values
    if (taskType != ASSOCIATION_RULE_MINING) {
        int[] indices = exampleTransform.getNonDefaultAttributeIndices(example);
        double[] nonDefaults = exampleTransform.getNonDefaultAttributeValues(example, indices);
        for (int k = 0; k < indices.length; k++) {
            values[indices[k]] = nonDefaults[k];
        }
    } else {
        int position = 0;
        for (Attribute attribute : exampleSet.getAttributes()) {
            double value = example.getValue(attribute);
            // for association learning the most frequent nominal value
            // is turned into a missing value
            boolean isMostFrequent = attribute.isNominal() && value == mostFrequent[position];
            values[position] = isMostFrequent ? Double.NaN : value;
            position++;
        }
    }

    // fill in the label slot where the task requires it
    if (taskType == LEARNING) {
        values[values.length - 1] = example.getValue(labelAttribute);
    } else if (taskType == PREDICTING) {
        values[values.length - 1] = Double.NaN;
    } else if (taskType == WEIGHTING && hasLabel) {
        values[values.length - 1] = example.getValue(labelAttribute);
    }

    // instance weight: 1.0 unless a weight attribute is present
    double weight = (this.weightAttribute == null) ? 1.0d : example.getValue(this.weightAttribute);

    // assemble the instance and attach it to this adaptor
    Instance result = new Instance(weight, values);
    result.setDataset(this);
    return result;
}

From source file:com.spread.experiment.tempuntilofficialrelease.ClassificationViaClustering108.java

License:Open Source License

/**
 * Returns class probability distribution for the given instance.
 * /*from   ww w .  j  a  v  a 2 s  .  c o m*/
 * @param instance the instance to be classified
 * @return the class probabilities
 * @throws Exception if an error occurred during the prediction
 */
@Override
public double[] distributionForInstance(Instance instance) throws Exception {

    if (m_ZeroR != null) {
        return m_ZeroR.distributionForInstance(instance);
    } else {
        double[] result = new double[instance.numClasses()];

        if (m_ActualClusterer != null) {
            // build new instance
            Instances tempData = m_ClusteringHeader.stringFreeStructure();
            double[] values = new double[tempData.numAttributes()];
            int n = 0;
            for (int i = 0; i < instance.numAttributes(); i++) {
                if (i == instance.classIndex()) {
                    continue;
                }
                if (instance.attribute(i).isString()) {
                    values[n] = tempData.attribute(n).addStringValue(instance.stringValue(i));
                } else if (instance.attribute(i).isRelationValued()) {
                    values[n] = tempData.attribute(n).addRelation(instance.relationalValue(i));
                } else {
                    values[n] = instance.value(i);
                }
                n++;
            }
            Instance newInst = new DenseInstance(instance.weight(), values);
            newInst.setDataset(tempData);

            if (!getLabelAllClusters()) {

                // determine cluster/class
                double r = m_ClustersToClasses[m_ActualClusterer.clusterInstance(newInst)];
                if (r == -1) {
                    return result; // Unclassified
                } else {
                    result[(int) r] = 1.0;
                    return result;
                }
            } else {
                double[] classProbs = new double[instance.numClasses()];
                double[] dist = m_ActualClusterer.distributionForInstance(newInst);
                for (int i = 0; i < dist.length; i++) {
                    for (int j = 0; j < instance.numClasses(); j++) {
                        classProbs[j] += dist[i] * m_ClusterClassProbs[i][j];
                    }
                }
                Utils.normalize(classProbs);
                return classProbs;
            }
        } else {
            return result; // Unclassified
        }
    }
}

From source file:com.yahoo.labs.samoa.instances.SamoaToWekaInstanceConverter.java

License:Apache License

/**
* Converts a SAMOA instance into the equivalent Weka instance.
*
* @param inst the SAMOA instance to convert
* @return the corresponding weka.core.Instance
*/
public weka.core.Instance wekaInstance(Instance inst) {
    InstanceImpl impl = (InstanceImpl) inst;

    // Keep the storage layout: sparse data stays sparse, anything else becomes dense.
    weka.core.Instance converted;
    if (impl.instanceData instanceof SparseInstanceData) {
        SparseInstanceData sparse = (SparseInstanceData) impl.instanceData;
        converted = new weka.core.SparseInstance(impl.weight(), sparse.getAttributeValues(),
                sparse.getIndexValues(), sparse.getNumberAttributes());
    } else {
        converted = new weka.core.DenseInstance(inst.weight(), inst.toDoubleArray());
    }

    // The Weka header is built on first use from the instance's dataset and cached.
    if (this.wekaInstanceInformation == null) {
        this.wekaInstanceInformation = this.wekaInstancesInformation(inst.dataset());
    }
    converted.setDataset(wekaInstanceInformation);

    // The class value is only copied when there is exactly one output attribute.
    if (inst.numOutputAttributes() == 1) {
        converted.setClassValue(inst.classValue());
    }

    return converted;
}