Example usage for weka.core Instance setDataset

List of usage examples for weka.core Instance setDataset

Introduction

In this page you can find the example usage for weka.core Instance setDataset.

Prototype

public void setDataset(Instances instances);

Source Link

Document

Sets the reference to the dataset.

Usage

From source file:com.actelion.research.orbit.imageAnalysis.tasks.ObjectClassificationWorker.java

License:Open Source License

@Override
protected void doWork() {
    try {//from  w ww  . j  a  v  a 2s .  co  m
        if (rf.getObjectSegmentationList() == null) {
            logger.error("objectSegmentation is null. Please create a objectSegmentationList first.");
            return;
        }

        if (rf.getClassImage() == null) {
            logger.error("classImage is null. Please do a tissue classification first.");
            return;
        }

        if (classifier == null) {
            logger.error("classifier is null, so classification cannot be done.");
            return;
        }

        classCount = new HashMap<Integer, Integer>();
        for (int idx = 0; idx < model.getClassShapes().size(); idx++) {
            classCount.put(idx, 0);
        }

        int[] classificationList = new int[rf.getObjectSegmentationList().size()];
        Instance inst;
        int i = 0;
        Rectangle bounds;
        boolean inTileList = true;
        logger.trace("starting the classification of " + rf.getObjectSegmentationList().size() + " cells");
        //logger.trace("classifier: "+classifier);
        //logger.trace("dataSet: "+dataset);
        List<Rectangle> tileBounds = null;
        if (tileList != null && tileList.size() > 0) {
            tileBounds = new ArrayList<Rectangle>(tileList.size());
            for (Point tile : tileList) {
                if (tile.x != -1 && tile.y != -1) { // all tiles marker
                    tileBounds.add(rf.bimg.getImage().getTileRect(tile.x, tile.y));
                }
            }
        }

        for (Shape shape : rf.getObjectSegmentationList()) {
            if (shape.getBounds().width * shape.getBounds().height < 5) { // at least 2x2 pixel
                if (logger.isTraceEnabled())
                    logger.trace("skipping shape: " + shape.getBounds());
                classificationList[i] = -1;
                i++;
                continue;
            }
            int sampleSize = Math.min(3, rf.bimg.getImage().getSampleModel().getNumBands()); // was always 1 before! (max 3 because alpha should be ignored)
            double[] featsAll = new ObjectFeatureBuilderTiled(model).buildFeatures(shape, Double.NaN, rf,
                    rf.getClassImage(), sampleSize, 0, 0);
            double[] feats = Arrays.copyOfRange(featsAll, 0,
                    featsAll.length - ObjectFeatureBuilderTiled.SkipTailForClassification); // skip some non relevant attributes like centerX/Y
            inst = new DenseInstance(1.0d, feats);
            inst.setDataset(dataset);

            double classVal = Double.NaN;
            try {
                classVal = classifier.classifyInstance(inst);
            } catch (Exception e) {
                e.printStackTrace();
                classificationList[i] = -1;
                logger.error("The classifier is not a cell classification classifier.");
                return;
            }
            if (classVal != Double.NaN) {
                //if (logger.isTraceEnabled()) logger.trace("Obj "+i+": Class "+classVal);
                classificationList[i] = (int) classVal;
                if (!classCount.containsKey((int) classVal))
                    classCount.put((int) classVal, 0);
                classCount.put((int) classVal, classCount.get((int) classVal) + 1);
            } else {
                classificationList[i] = -1;
                logger.trace("Class is null");
            }
            i++;
        } // shape
        rf.setObjectClassificationList(classificationList); // not save!

        if (originalFrame != null) {
            if (rf.getObjectClassificationList() != null) {
                this.originalFrame.setObjectClassificationList(rf.getObjectClassificationList().clone());
            }
            this.originalFrame.setClassShapes(oldClassShapes);
            this.originalFrame.setClassImage(rf.getClassImage());
        }

        if (oldClassShapes != null) {
            rf.setClassShapes(oldClassShapes);
            logger.debug("ClassShape Workaround: Old classShapes set.");
        }

        if (generateTaskResult) {
            StringBuilder sb = new StringBuilder("Finished classifying " + i + " objects.\n");
            sb.append("Class Frequencies:\n");
            for (int c = 0; c < rf.getClassShapes().size(); c++) {
                if (classCount.containsKey(c)) {
                    int cnt = classCount.get(c);
                    String cName = rf.getClassShapes().get(c).getName();
                    sb.append(cName + ": " + cnt + "\n");
                } else {
                    int cnt = 0;
                    String cName = rf.getClassShapes().get(c).getName();
                    sb.append(cName + ": " + cnt + "\n");
                }
            }
            taskResult = new TaskResult("Cell Classification Result", sb.toString());
            logger.debug(sb.toString());
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}

From source file:com.dhamacher.sentimentanalysis4tweets.preprocessing.TweetClassifier.java

License:Apache License

/**
 * Method that converts a text message into an instance.
 *
 * @param text the message content to convert
 * @param data the header information//from  w  w  w.ja v a 2s  .com
 * @return the generated Instance
 */
private Instance makeInstance(String text, Instances data) {
    Instance instance = new Instance(2);
    Attribute messageAtt = data.attribute("content");
    instance.setValue(messageAtt, messageAtt.addStringValue(text));
    instance.setDataset(data);
    return instance;
}

From source file:com.entopix.maui.filters.MauiFilter.java

License:Open Source License

/**
 * Converts an instance.//from w  w  w.  j a va  2 s.  c om
 */
private FastVector convertInstance(Instance instance, boolean training) {

    FastVector vector = new FastVector();

    String fileName = instance.stringValue(fileNameAtt);

    if (debugMode) {
        log.info("-- Converting instance for document " + fileName);
    }

    // Get the key phrases for the document
    HashMap<String, Counter> hashKeyphrases = null;

    if (!instance.isMissing(keyphrasesAtt)) {
        String keyphrases = instance.stringValue(keyphrasesAtt);
        hashKeyphrases = getGivenKeyphrases(keyphrases);
    }

    // Get the document text
    String documentText = instance.stringValue(documentAtt);

    // Compute the candidate topics
    HashMap<String, Candidate> candidateList;
    if (allCandidates != null && allCandidates.containsKey(instance)) {
        candidateList = allCandidates.get(instance);
    } else {
        candidateList = getCandidates(documentText);
    }
    if (debugMode) {
        log.info(candidateList.size() + " candidates ");
    }

    // Set indices for key attributes
    int tfidfAttIndex = documentAtt + 2;
    int distAttIndex = documentAtt + 3;
    int probsAttIndex = documentAtt + numFeatures;

    int countPos = 0;
    int countNeg = 0;

    // Go through the phrases and convert them into instances
    for (Candidate candidate : candidateList.values()) {

        if (candidate.getFrequency() < minOccurFrequency) {
            continue;
        }

        String name = candidate.getName();
        String orig = candidate.getBestFullForm();
        if (!vocabularyName.equals("none")) {
            orig = candidate.getTitle();
        }

        double[] vals = computeFeatureValues(candidate, training, hashKeyphrases, candidateList);

        Instance inst = new Instance(instance.weight(), vals);

        inst.setDataset(classifierData);

        double[] probs = null;
        try {
            // Get probability of a phrase being key phrase
            probs = classifier.distributionForInstance(inst);
        } catch (Exception e) {
            log.error("Exception while getting probability for candidate " + candidate.getName());
            continue;
        }

        double prob = probs[0];
        if (nominalClassValue) {
            prob = probs[1];
        }

        // Compute attribute values for final instance
        double[] newInst = new double[instance.numAttributes() + numFeatures + 2];

        int pos = 0;
        for (int i = 1; i < instance.numAttributes(); i++) {

            if (i == documentAtt) {

                // output of values for a given phrase:

                // 0 Add phrase
                int index = outputFormatPeek().attribute(pos).addStringValue(name);
                newInst[pos++] = index;

                // 1 Add original version
                if (orig != null) {
                    index = outputFormatPeek().attribute(pos).addStringValue(orig);
                } else {
                    index = outputFormatPeek().attribute(pos).addStringValue(name);
                }

                // 2
                newInst[pos++] = index;

                // Add features
                newInst[pos++] = inst.value(tfIndex); // 3
                newInst[pos++] = inst.value(idfIndex); // 4
                newInst[pos++] = inst.value(tfidfIndex); // 5
                newInst[pos++] = inst.value(firstOccurIndex); // 6
                newInst[pos++] = inst.value(lastOccurIndex); // 7
                newInst[pos++] = inst.value(spreadOccurIndex); // 8
                newInst[pos++] = inst.value(domainKeyphIndex); // 9
                newInst[pos++] = inst.value(lengthIndex); // 10 
                newInst[pos++] = inst.value(generalityIndex); // 11
                newInst[pos++] = inst.value(nodeDegreeIndex); // 12
                newInst[pos++] = inst.value(invWikipFreqIndex); // 13
                newInst[pos++] = inst.value(totalWikipKeyphrIndex); // 14
                newInst[pos++] = inst.value(wikipGeneralityIndex); // 15

                // Add probability
                probsAttIndex = pos;
                newInst[pos++] = prob; // 16

                // Set rank to missing (computed below)
                newInst[pos++] = Instance.missingValue(); // 17

            } else if (i == keyphrasesAtt) {
                newInst[pos++] = inst.classValue();
            } else {
                newInst[pos++] = instance.value(i);
            }
        }

        Instance ins = new Instance(instance.weight(), newInst);
        ins.setDataset(outputFormatPeek());
        vector.addElement(ins);

        if (inst.classValue() == 0) {
            countNeg++;
        } else {
            countPos++;
        }

    }
    if (debugMode) {
        log.info(countPos + " positive; " + countNeg + " negative instances");
    }

    // Sort phrases according to their distance (stable sort)
    double[] vals = new double[vector.size()];
    for (int i = 0; i < vals.length; i++) {
        vals[i] = ((Instance) vector.elementAt(i)).value(distAttIndex);
    }
    FastVector newVector = new FastVector(vector.size());
    int[] sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their tfxidf value (stable sort)
    for (int i = 0; i < vals.length; i++) {
        vals[i] = -((Instance) vector.elementAt(i)).value(tfidfAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their probability (stable sort)
    for (int i = 0; i < vals.length; i++) {
        vals[i] = 1 - ((Instance) vector.elementAt(i)).value(probsAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Compute rank of phrases. Check for subphrases that are ranked
    // lower than superphrases and assign probability -1 and set the
    // rank to Integer.MAX_VALUE
    int rank = 1;
    for (int i = 0; i < vals.length; i++) {
        Instance currentInstance = (Instance) vector.elementAt(i);

        // log.info(vals[i] + "\t" + currentInstance);

        // Short cut: if phrase very unlikely make rank very low and
        // continue
        if (Utils.grOrEq(vals[i], 1.0)) {
            currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE);
            continue;
        }

        // Otherwise look for super phrase starting with first phrase
        // in list that has same probability, TFxIDF value, and distance as
        // current phrase. We do this to catch all superphrases
        // that have same probability, TFxIDF value and distance as current
        // phrase.
        int startInd = i;
        while (startInd < vals.length) {
            Instance inst = (Instance) vector.elementAt(startInd);
            if ((inst.value(tfidfAttIndex) != currentInstance.value(tfidfAttIndex))
                    || (inst.value(probsAttIndex) != currentInstance.value(probsAttIndex))
                    || (inst.value(distAttIndex) != currentInstance.value(distAttIndex))) {
                break;
            }
            startInd++;
        }
        currentInstance.setValue(probsAttIndex + 1, rank++);

    }

    return vector;
}

From source file:com.esda.util.StringToWordVector.java

License:Open Source License

/**
 * Converts the instance w/o normalization.
 *
 * @oaram instance the instance to convert
 * @param v//from w w w. j  a v  a2 s .  co  m
 * @return the conerted instance
 */
private int convertInstancewoDocNorm(Instance instance, FastVector v) {

    // Convert the instance into a sorted set of indexes
    TreeMap contained = new TreeMap();

    // Copy all non-converted attributes from input to output
    int firstCopy = 0;
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
        if (!m_SelectedRange.isInRange(i)) {
            if (getInputFormat().attribute(i).type() != Attribute.STRING
                    && getInputFormat().attribute(i).type() != Attribute.RELATIONAL) {
                // Add simple nominal and numeric attributes directly
                if (instance.value(i) != 0.0) {
                    contained.put(new Integer(firstCopy), new Double(instance.value(i)));
                }
            } else {
                if (instance.isMissing(i)) {
                    contained.put(new Integer(firstCopy), new Double(Double.NaN));
                } else if (getInputFormat().attribute(i).type() == Attribute.STRING) {

                    // If this is a string attribute, we have to first add
                    // this value to the range of possible values, then add
                    // its new internal index.
                    if (outputFormatPeek().attribute(firstCopy).numValues() == 0) {
                        // Note that the first string value in a
                        // SparseInstance doesn't get printed.
                        outputFormatPeek().attribute(firstCopy)
                                .addStringValue("Hack to defeat SparseInstance bug");
                    }
                    int newIndex = outputFormatPeek().attribute(firstCopy)
                            .addStringValue(instance.stringValue(i));
                    contained.put(new Integer(firstCopy), new Double(newIndex));
                } else {
                    // relational
                    if (outputFormatPeek().attribute(firstCopy).numValues() == 0) {
                        Instances relationalHeader = outputFormatPeek().attribute(firstCopy).relation();

                        // hack to defeat sparse instances bug
                        outputFormatPeek().attribute(firstCopy).addRelation(relationalHeader);
                    }
                    int newIndex = outputFormatPeek().attribute(firstCopy)
                            .addRelation(instance.relationalValue(i));
                    contained.put(new Integer(firstCopy), new Double(newIndex));
                }
            }
            firstCopy++;
        }
    }

    for (int j = 0; j < instance.numAttributes(); j++) {
        // if ((getInputFormat().attribute(j).type() == Attribute.STRING)
        if (m_SelectedRange.isInRange(j) && (instance.isMissing(j) == false)) {

            m_Tokenizer.tokenize(instance.stringValue(j));

            while (m_Tokenizer.hasMoreElements()) {
                String word = (String) m_Tokenizer.nextElement();
                if (this.m_lowerCaseTokens == true)
                    word = word.toLowerCase();
                word = m_Stemmer.stem(word);
                Integer index = (Integer) m_Dictionary.get(word);
                if (index != null) {
                    if (m_OutputCounts) { // Separate if here rather than
                        // two lines down to avoid
                        // hashtable lookup
                        Double count = (Double) contained.get(index);
                        if (count != null) {
                            contained.put(index, new Double(count.doubleValue() + 1.0));
                        } else {
                            contained.put(index, new Double(1));
                        }
                    } else {
                        contained.put(index, new Double(1));
                    }
                }
            }
        }
    }

    // Doing TFTransform
    if (m_TFTransform == true) {
        Iterator it = contained.keySet().iterator();
        for (int i = 0; it.hasNext(); i++) {
            Integer index = (Integer) it.next();
            if (index.intValue() >= firstCopy) {
                double val = ((Double) contained.get(index)).doubleValue();
                val = Math.log(val + 1);
                contained.put(index, new Double(val));
            }
        }
    }

    // Doing IDFTransform
    if (m_IDFTransform == true) {
        Iterator it = contained.keySet().iterator();
        for (int i = 0; it.hasNext(); i++) {
            Integer index = (Integer) it.next();
            if (index.intValue() >= firstCopy) {
                double val = ((Double) contained.get(index)).doubleValue();
                val = val * Math.log(m_NumInstances / (double) m_DocsCounts[index.intValue()]);
                contained.put(index, new Double(val));
            }
        }
    }

    // Convert the set to structures needed to create a sparse instance.
    double[] values = new double[contained.size()];
    int[] indices = new int[contained.size()];
    Iterator it = contained.keySet().iterator();
    for (int i = 0; it.hasNext(); i++) {
        Integer index = (Integer) it.next();
        Double value = (Double) contained.get(index);
        values[i] = value.doubleValue();
        indices[i] = index.intValue();
    }

    Instance inst = new SparseInstance(instance.weight(), values, indices, outputFormatPeek().numAttributes());
    inst.setDataset(outputFormatPeek());

    v.addElement(inst);

    return firstCopy;
}

From source file:com.openkm.kea.filter.KEAFilter.java

License:Open Source License

/**
 * Converts an instance.//from  www .  j a v a2  s .c o m
 */
private FastVector convertInstance(Instance instance, boolean training) throws Exception {

    FastVector vector = new FastVector();

    if (m_Debug) {
        log.info("-- Converting instance");
    }

    // Get the key phrases for the document
    HashMap<String, Counter> hashKeyphrases = null;
    HashMap<String, Counter> hashKeysEval = null;
    if (!instance.isMissing(m_KeyphrasesAtt)) {
        String keyphrases = instance.stringValue(m_KeyphrasesAtt);
        hashKeyphrases = getGivenKeyphrases(keyphrases, false);
        hashKeysEval = getGivenKeyphrases(keyphrases, true);
    }

    // Get the phrases for the document
    HashMap<String, FastVector> hash = new HashMap<String, FastVector>();
    int length = getPhrases(hash, instance.stringValue(m_DocumentAtt));
    //   hash = getComposits(hash);

    /* Experimental:
     To compute how many of the manual keyphrases appear in the documents:
            
    log.info("Doc phrases found " + hash.size());
    log.info("Manual keyphrases: ");
    Iterator iter = hashKeyphrases.keySet().iterator();
    int count = 0;
    while (iter.hasNext()) {
       String id = (String)iter.next();
       if (hash.containsKey(id)) {
    count++;
       }
    }
            
    double max_recall = (double)count/(double)hashKeyphrases.size();
            
            
    m_max_recall += max_recall;
    doc++;
    double avg_m_max_recall = m_max_recall/(double)doc;
            
    String file = instance.stringValue(2);
    log.info(count + " out of " + hashKeyphrases.size() + " are in the document ");
    log.info("Max recall : " + avg_m_max_recall + " on " + doc + " documents ");
    */

    // Compute number of extra attributes
    int numFeatures = 5;
    if (m_Debug) {
        if (m_KFused) {
            numFeatures = numFeatures + 1;
        }
    }
    if (m_STDEVfeature) {
        numFeatures = numFeatures + 1;
    }
    if (m_NODEfeature) {
        numFeatures = numFeatures + 1;
    }
    if (m_LENGTHfeature) {
        numFeatures = numFeatures + 1;
    }

    // Set indices of key attributes
    //int phraseAttIndex = m_DocumentAtt;
    int tfidfAttIndex = m_DocumentAtt + 2;
    int distAttIndex = m_DocumentAtt + 3;
    int probsAttIndex = m_DocumentAtt + numFeatures - 1;
    //int classAttIndex = numFeatures;

    // Go through the phrases and convert them into instances
    Iterator<String> it = hash.keySet().iterator();
    while (it.hasNext()) {
        String id = it.next();
        FastVector phraseInfo = (FastVector) hash.get(id);

        double[] vals = featVals(id, phraseInfo, training, hashKeysEval, hashKeyphrases, length, hash);

        Instance inst = new Instance(instance.weight(), vals);

        inst.setDataset(m_ClassifierData);

        // Get probability of a phrase being key phrase
        double[] probs = m_Classifier.distributionForInstance(inst);

        // If simple Naive Bayes used, change here to
        //double prob = probs[1];
        double prob = probs[0];

        // Compute attribute values for final instance
        double[] newInst = new double[instance.numAttributes() + numFeatures];
        int pos = 0;
        for (int i = 0; i < instance.numAttributes(); i++) {
            if (i == m_DocumentAtt) {

                // output of values for a given phrase:

                // Add phrase
                int index = outputFormatPeek().attribute(pos).addStringValue(id);
                newInst[pos++] = index;

                // Add original version
                String orig = (String) phraseInfo.elementAt(2);

                if (orig != null) {
                    index = outputFormatPeek().attribute(pos).addStringValue(orig);
                } else {
                    index = outputFormatPeek().attribute(pos).addStringValue(id);
                }
                newInst[pos++] = index;

                // Add TFxIDF
                newInst[pos++] = inst.value(m_TfidfIndex);

                // Add distance
                newInst[pos++] = inst.value(m_FirstOccurIndex);

                // Add other features
                if (m_Debug) {
                    if (m_KFused) {
                        newInst[pos++] = inst.value(m_KeyFreqIndex);
                    }
                }
                if (m_STDEVfeature) {
                    newInst[pos++] = inst.value(m_STDEVIndex);
                }
                if (m_NODEfeature) {
                    newInst[pos++] = inst.value(m_NodeIndex);
                }
                if (m_LENGTHfeature) {
                    newInst[pos++] = inst.value(m_LengthIndex);
                }

                // Add probability 
                probsAttIndex = pos;
                newInst[pos++] = prob;

                // Set rank to missing (computed below)
                newInst[pos++] = Instance.missingValue();

            } else if (i == m_KeyphrasesAtt) {
                newInst[pos++] = inst.classValue();
            } else {
                newInst[pos++] = instance.value(i);
            }
        }
        Instance ins = new Instance(instance.weight(), newInst);
        ins.setDataset(outputFormatPeek());
        vector.addElement(ins);
    }

    // Add dummy instances for keyphrases that don't occur
    // in the document
    if (hashKeysEval != null) {
        Iterator<String> phrases = hashKeysEval.keySet().iterator();
        while (phrases.hasNext()) {
            String phrase = phrases.next();
            double[] newInst = new double[instance.numAttributes() + numFeatures];
            int pos = 0;
            for (int i = 0; i < instance.numAttributes(); i++) {
                if (i == m_DocumentAtt) {
                    // log.info("Here: " + phrase);
                    // Add phrase
                    int index = outputFormatPeek().attribute(pos).addStringValue(phrase);
                    newInst[pos++] = (double) index;

                    // Add original version
                    index = outputFormatPeek().attribute(pos).addStringValue(phrase);
                    newInst[pos++] = (double) index;

                    // Add TFxIDF
                    newInst[pos++] = Instance.missingValue();

                    // Add distance
                    newInst[pos++] = Instance.missingValue();

                    // Add other features
                    if (m_Debug) {
                        if (m_KFused) {
                            newInst[pos++] = Instance.missingValue();
                        }
                    }
                    if (m_STDEVfeature) {
                        newInst[pos++] = Instance.missingValue();
                    }
                    if (m_NODEfeature) {
                        newInst[pos++] = Instance.missingValue();
                    }
                    if (m_LENGTHfeature) {
                        newInst[pos++] = Instance.missingValue();
                    }

                    // Add probability and rank
                    newInst[pos++] = -Double.MAX_VALUE;
                    // newInst[pos++] = Instance.missingValue();
                } else if (i == m_KeyphrasesAtt) {
                    newInst[pos++] = 1; // Keyphrase
                } else {
                    newInst[pos++] = instance.value(i);
                }

                Instance inst = new Instance(instance.weight(), newInst);
                inst.setDataset(outputFormatPeek());
                vector.addElement(inst);
            }

        }
    }

    // Sort phrases according to their distance (stable sort)
    double[] vals = new double[vector.size()];
    for (int i = 0; i < vals.length; i++) {
        vals[i] = ((Instance) vector.elementAt(i)).value(distAttIndex);
    }
    FastVector newVector = new FastVector(vector.size());
    int[] sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their tfxidf value (stable sort)
    for (int i = 0; i < vals.length; i++) {
        vals[i] = -((Instance) vector.elementAt(i)).value(tfidfAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their probability (stable sort)
    for (int i = 0; i < vals.length; i++) {
        vals[i] = 1 - ((Instance) vector.elementAt(i)).value(probsAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Compute rank of phrases. Check for subphrases that are ranked
    // lower than superphrases and assign probability -1 and set the
    // rank to Integer.MAX_VALUE
    int rank = 1;
    for (int i = 0; i < vals.length; i++) {
        Instance currentInstance = (Instance) vector.elementAt(i);
        // Short cut: if phrase very unlikely make rank very low and continue
        if (Utils.grOrEq(vals[i], 1.0)) {
            currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE);
            continue;
        }

        // Otherwise look for super phrase starting with first phrase
        // in list that has same probability, TFxIDF value, and distance as
        // current phrase. We do this to catch all superphrases
        // that have same probability, TFxIDF value and distance as current phrase.
        int startInd = i;
        while (startInd < vals.length) {
            Instance inst = (Instance) vector.elementAt(startInd);
            if ((inst.value(tfidfAttIndex) != currentInstance.value(tfidfAttIndex))
                    || (inst.value(probsAttIndex) != currentInstance.value(probsAttIndex))
                    || (inst.value(distAttIndex) != currentInstance.value(distAttIndex))) {
                break;
            }
            startInd++;
        }
        currentInstance.setValue(probsAttIndex + 1, rank++);

    }
    return vector;
}

From source file:com.openkm.kea.filter.KEAPhraseFilter.java

License:Open Source License

/** 
 * Converts an instance by removing all non-alphanumeric characters
 * from its string attribute values./*from ww w. j a  v  a 2s. c o m*/
 */
private void convertInstance(Instance instance) throws Exception {

    double[] instVals = new double[instance.numAttributes()];

    for (int i = 0; i < instance.numAttributes(); i++) {
        if (!instance.attribute(i).isString() || instance.isMissing(i)) {
            instVals[i] = instance.value(i);
        } else {
            if (!m_SelectCols.isInRange(i)) {
                int index = getOutputFormat().attribute(i).addStringValue(instance.stringValue(i));
                instVals[i] = (double) index;
                continue;
            }
            // aly: str = text of the document
            String str = instance.stringValue(i);

            String tokenized = tokenize(str);

            // aly: resultStr is the clean version of str
            // log.info(resultStr.toString());
            int index = getOutputFormat().attribute(i).addStringValue(tokenized);
            instVals[i] = (double) index;
        }
    }
    Instance inst = new Instance(instance.weight(), instVals);
    inst.setDataset(getOutputFormat());
    push(inst);
}

From source file:com.openkm.kea.filter.NumbersFilter.java

License:Open Source License

/** 
 * Converts an instance. A phrase boundary is inserted where
 * a number is found.// ww w.  ja v  a2s  .c om
 */
private void convertInstance(Instance instance) throws Exception {

    double[] instVals = new double[instance.numAttributes()];

    for (int i = 0; i < instance.numAttributes(); i++) {
        if ((!instance.attribute(i).isString()) || instance.isMissing(i)) {
            instVals[i] = instance.value(i);
        } else {
            String str = instance.stringValue(i);

            StringBuffer resultStr = new StringBuffer();
            StringTokenizer tok = new StringTokenizer(str, " \t\n", true);
            while (tok.hasMoreTokens()) {
                String token = tok.nextToken();

                // Everything that doesn't contain at least
                // one letter is considered to be a number
                boolean isNumber = true;
                for (int j = 0; j < token.length(); j++) {
                    if (Character.isLetter(token.charAt(j))) {
                        isNumber = false;
                        break;
                    }
                }
                if (!isNumber) {
                    resultStr.append(token);
                } else {
                    if (token.equals(" ") || token.equals("\t") || token.equals("\n")) {
                        resultStr.append(token);
                    } else {
                        resultStr.append(" \n ");
                    }
                }
            }
            int index = getOutputFormat().attribute(i).addStringValue(resultStr.toString());
            instVals[i] = (double) index;
        }
    }
    Instance inst = new Instance(instance.weight(), instVals);
    inst.setDataset(getOutputFormat());
    push(inst);
}

From source file:com.rapidminer.tools.WekaInstancesAdaptor.java

License:Open Source License

/** Gets an example and creates a Weka instance. */
private Instance toWekaInstance(Example example) {
    int numberOfRegularValues = example.getAttributes().size();
    int numberOfValues = numberOfRegularValues + (labelAttribute != null ? 1 : 0);
    double[] values = new double[numberOfValues];

    // set regular attribute values
    if (taskType == ASSOCIATION_RULE_MINING) {
        int a = 0;
        for (Attribute attribute : exampleSet.getAttributes()) {
            double value = example.getValue(attribute);
            if (attribute.isNominal()) {
                if (value == mostFrequent[a])
                    value = Double.NaN;
                // sets the most frequent value to missing
                // for association learning
            }//from  w  w w .j a  v a2  s .  c  o m
            values[a] = value;
            a++;
        }
    } else {
        int[] nonDefaultIndices = exampleTransform.getNonDefaultAttributeIndices(example);
        double[] nonDefaultValues = exampleTransform.getNonDefaultAttributeValues(example, nonDefaultIndices);
        for (int a = 0; a < nonDefaultIndices.length; a++) {
            values[nonDefaultIndices[a]] = nonDefaultValues[a];
        }
    }

    // set label value if necessary
    switch (taskType) {
    case LEARNING:
        values[values.length - 1] = example.getValue(labelAttribute);
        break;
    case PREDICTING:
        values[values.length - 1] = Double.NaN;
        break;
    case WEIGHTING:
        if (labelAttribute != null)
            values[values.length - 1] = example.getValue(labelAttribute);
        break;
    default:
        break;
    }

    // get instance weight
    double weight = 1.0d;
    if (this.weightAttribute != null)
        weight = example.getValue(this.weightAttribute);

    // create new instance
    Instance instance = new Instance(weight, values);
    instance.setDataset(this);
    return instance;
}

From source file:com.spread.experiment.tempuntilofficialrelease.ClassificationViaClustering108.java

License:Open Source License

/**
 * Returns class probability distribution for the given instance.
 * /*from   ww w .  j  a  v  a 2 s  .  c o m*/
 * @param instance the instance to be classified
 * @return the class probabilities
 * @throws Exception if an error occurred during the prediction
 */
@Override
public double[] distributionForInstance(Instance instance) throws Exception {

    if (m_ZeroR != null) {
        return m_ZeroR.distributionForInstance(instance);
    } else {
        double[] result = new double[instance.numClasses()];

        if (m_ActualClusterer != null) {
            // build new instance
            Instances tempData = m_ClusteringHeader.stringFreeStructure();
            double[] values = new double[tempData.numAttributes()];
            int n = 0;
            for (int i = 0; i < instance.numAttributes(); i++) {
                if (i == instance.classIndex()) {
                    continue;
                }
                if (instance.attribute(i).isString()) {
                    values[n] = tempData.attribute(n).addStringValue(instance.stringValue(i));
                } else if (instance.attribute(i).isRelationValued()) {
                    values[n] = tempData.attribute(n).addRelation(instance.relationalValue(i));
                } else {
                    values[n] = instance.value(i);
                }
                n++;
            }
            Instance newInst = new DenseInstance(instance.weight(), values);
            newInst.setDataset(tempData);

            if (!getLabelAllClusters()) {

                // determine cluster/class
                double r = m_ClustersToClasses[m_ActualClusterer.clusterInstance(newInst)];
                if (r == -1) {
                    return result; // Unclassified
                } else {
                    result[(int) r] = 1.0;
                    return result;
                }
            } else {
                double[] classProbs = new double[instance.numClasses()];
                double[] dist = m_ActualClusterer.distributionForInstance(newInst);
                for (int i = 0; i < dist.length; i++) {
                    for (int j = 0; j < instance.numClasses(); j++) {
                        classProbs[j] += dist[i] * m_ClusterClassProbs[i][j];
                    }
                }
                Utils.normalize(classProbs);
                return classProbs;
            }
        } else {
            return result; // Unclassified
        }
    }
}

From source file:com.yahoo.labs.samoa.instances.SamoaToWekaInstanceConverter.java

License:Apache License

/**
* Weka instance.//from w  ww.  j  a v a 2 s.c o m
*
* @param inst the inst
* @return the weka.core. instance
*/
public weka.core.Instance wekaInstance(Instance inst) {
    weka.core.Instance wekaInstance;
    if (((InstanceImpl) inst).instanceData instanceof SparseInstanceData) {
        InstanceImpl instance = (InstanceImpl) inst;
        SparseInstanceData sparseInstanceData = (SparseInstanceData) instance.instanceData;
        wekaInstance = new weka.core.SparseInstance(instance.weight(), sparseInstanceData.getAttributeValues(),
                sparseInstanceData.getIndexValues(), sparseInstanceData.getNumberAttributes());
        /*if (this.wekaInstanceInformation == null) {
        this.wekaInstanceInformation = this.wekaInstancesInformation(inst.dataset());
        }
        wekaInstance.insertAttributeAt(inst.classIndex());
        wekaInstance.setDataset(wekaInstanceInformation);
        wekaInstance.setClassValue(inst.classValue());
        //wekaInstance.setValueSparse(wekaInstance.numAttributes(), inst.classValue());*/
    } else {
        Instance instance = inst;
        wekaInstance = new weka.core.DenseInstance(instance.weight(), instance.toDoubleArray());
        /* if (this.wekaInstanceInformation == null) {
        this.wekaInstanceInformation = this.wekaInstancesInformation(inst.dataset());
         }
         //We suppose that the class is the last attibute. We should deal when this is not the case
         wekaInstance.insertAttributeAt(inst.classIndex());
         wekaInstance.setDataset(wekaInstanceInformation);
         wekaInstance.setClassValue(inst.classValue());*/
    }
    if (this.wekaInstanceInformation == null) {
        this.wekaInstanceInformation = this.wekaInstancesInformation(inst.dataset());
    }
    //wekaInstance.insertAttributeAt(inst.classIndex());
    wekaInstance.setDataset(wekaInstanceInformation);
    if (inst.numOutputAttributes() == 1) {
        wekaInstance.setClassValue(inst.classValue());
    }

    return wekaInstance;
}