Example usage for weka.core Instances attribute

List of usage examples for weka.core Instances attribute

Introduction

This page shows example usage of the weka.core.Instances.attribute(String) method, drawn from the source files listed below.

Prototype

public Attribute attribute(String name)

Document

Returns the attribute with the given name, or null if no attribute with that name exists.
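
As a quick illustration, the following is a minimal, self-contained sketch of the usual lookup pattern. It is not taken from the sources below; the file name "iris.arff" and the attribute name "petalwidth" are placeholder assumptions.

import weka.core.Attribute;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class AttributeByNameDemo {
    public static void main(String[] args) throws Exception {
        // Load a dataset; "iris.arff" is a placeholder path.
        Instances data = DataSource.read("iris.arff");

        // Look up an attribute by name; null means no such attribute.
        Attribute att = data.attribute("petalwidth");
        if (att != null) {
            System.out.println("name:    " + att.name());
            System.out.println("index:   " + att.index()); // 0-based
            System.out.println("numeric: " + att.isNumeric());
        }
    }
}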

Usage

From source file:mao.datamining.DataSetPair.java

/**
 * Drops the same useless columns from the test dataset, if one exists.
 */
private void processTestDataSet() {
    if (!new File(testSourceFileName).exists())
        return;

    try {
        Instances orangeTestDataSet = ConverterUtils.DataSource.read(testSourceFileName);
        Remove remove = new Remove();
        StringBuilder indexBuffer = new StringBuilder();
        for (String attrName : finalTrainAttrList) {
            int attrIndex = orangeTestDataSet.attribute(attrName).index();
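            // The Remove filter expects 1-based attribute indices, hence the +1.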
            indexBuffer.append(attrIndex + 1).append(",");
        }
        Main.logging("Attribute Indices: \n" + indexBuffer.toString());
        remove.setAttributeIndices(indexBuffer.toString());
        remove.setInvertSelection(true);

        remove.setInputFormat(orangeTestDataSet);
        Instances testNewDataSet = Filter.useFilter(orangeTestDataSet, remove);

        try (BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(this.testFileName)))) {
            writer.write(testNewDataSet.toString());
        }

        //set the final test dataset
        finalTestDataSet = testNewDataSet;
        finalTestDataSet.setClassIndex(finalTestDataSet.numAttributes() - 1);
        Main.logging("test dataset class attr: " + finalTestDataSet.classAttribute().toString());
    } catch (Exception e) {
        Main.logging(null, e);
    }
}

From source file:mao.datamining.RemoveUselessColumnsByMissingValues.java

License:Open Source License

/**
 * Signify that this batch of input to the filter is finished.
 *
 * @return true if there are instances pending output
 * @throws Exception if no input format defined
 */
public boolean batchFinished() throws Exception {

    if (getInputFormat() == null) {
        throw new IllegalStateException("No input instance format defined");
    }
    if (m_removeFilter == null) {

        // establish attributes to remove from first batch

        Instances toFilter = getInputFormat();
        int[] attsToDelete = new int[toFilter.numAttributes()];
        int numToDelete = 0;
        for (int i = 0; i < toFilter.numAttributes(); i++) {
            if (i == toFilter.classIndex())
                continue; // skip class
            AttributeStats stats = toFilter.attributeStats(i);

            // remove attributes with a high percentage of missing values
            if ((stats.missingCount * 100) / stats.totalCount > m_maxMissingPercentage) {
                attsToDelete[numToDelete++] = i;
            }
            // remove columns flagged for deletion by manual inspection
            if (this.column2DeleteSet.contains(toFilter.attribute(i).name())) {
                attsToDelete[numToDelete++] = i;
            }
        }

        int[] finalAttsToDelete = new int[numToDelete];
        System.arraycopy(attsToDelete, 0, finalAttsToDelete, 0, numToDelete);

        m_removeFilter = new Remove();
        m_removeFilter.setAttributeIndicesArray(finalAttsToDelete);
        m_removeFilter.setInvertSelection(false);
        m_removeFilter.setInputFormat(toFilter);

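        // Standard weka filter protocol: feed each instance with input(),
        // signal the end of the batch, then drain the filtered output.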
        for (int i = 0; i < toFilter.numInstances(); i++) {
            m_removeFilter.input(toFilter.instance(i));
        }
        m_removeFilter.batchFinished();

        Instance processed;
        Instances outputDataset = m_removeFilter.getOutputFormat();

        // restore old relation name to hide attribute filter stamp
        outputDataset.setRelationName(toFilter.relationName());

        setOutputFormat(outputDataset);
        while ((processed = m_removeFilter.output()) != null) {
            processed.setDataset(outputDataset);
            push(processed);
        }
    }
    flushInput();

    m_NewBatch = true;
    return (numPendingOutput() != 0);
}

From source file:mao.datamining.Util.java

/**
 * Transforms the listed attributes from numeric to nominal type, using
 * weka.filters.unsupervised.attribute.NumericToNominal -R first-last
 * @param newData the dataset to transform
 * @param columns2Nominal names of the attributes to convert
 * @return the transformed dataset
 */
public static Instances transformNum2Nominal(Instances newData, String[] columns2Nominal) {
    StringBuilder indexArrayStr = new StringBuilder();
    for (int i = 0; i < columns2Nominal.length; i++) {
        String attrName = columns2Nominal[i];
        Attribute attr = newData.attribute(attrName);
        if (attr != null) {
            indexArrayStr.append(attr.index() + 1).append(",");
        }
    }
    try {
        NumericToNominal transform = new NumericToNominal();
        // set filter options before setInputFormat(), as weka filters expect
        transform.setAttributeIndices(indexArrayStr.toString());
        transform.setInputFormat(newData);
        newData = Filter.useFilter(newData, transform);
    } catch (Exception e) {
        Main.logging(null, e);
    }
    //        Main.logging("== New Data after transforming numeric data : ===\n" + newData.toSummaryString());
    return newData;
}

From source file:marytts.tools.voiceimport.PauseDurationTrainer.java

License:Open Source License

private Instance createInstance(Instances data, FeatureDefinition fd, FeatureVector fv) {
    // relevant features + one target
    Instance currInst = new DenseInstance(data.numAttributes());
    currInst.setDataset(data);

    // read only relevant features
    for (String attName : this.featureNames) {
        int featNr = fd.getFeatureIndex(attName);

        String value = fv.getFeatureAsString(featNr, fd);
        currInst.setValue(data.attribute(attName), value);
    }

    return currInst;
}

From source file:maui.main.MauiModelBuilder.java

License:Open Source License

/**
 * Builds the model from the training data
 */
public void buildModel(HashSet<String> fileNames, VocabularyStore store) throws Exception {

    // Check whether there is actually any data
    if (fileNames.size() == 0) {
        throw new Exception("Couldn't find any data in " + inputDirectoryName);
    }

    System.err.println("-- Building the model... ");

    FastVector atts = new FastVector(3);
    atts.addElement(new Attribute("filename", (FastVector) null));
    atts.addElement(new Attribute("document", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    // Build model
    mauiFilter = new MauiFilter();

    mauiFilter.setDebug(getDebug());
    mauiFilter.setMaxPhraseLength(getMaxPhraseLength());
    mauiFilter.setMinPhraseLength(getMinPhraseLength());
    mauiFilter.setMinNumOccur(getMinNumOccur());
    mauiFilter.setStemmer(getStemmer());
    mauiFilter.setDocumentLanguage(getDocumentLanguage());
    mauiFilter.setVocabularyName(getVocabularyName());
    mauiFilter.setVocabularyFormat(getVocabularyFormat());
    mauiFilter.setStopwords(getStopwords());

    if (wikipedia != null) {
        mauiFilter.setWikipedia(wikipedia);
    } else if (wikipediaServer.equals("localhost") && wikipediaDatabase.equals("database")) {
        mauiFilter.setWikipedia(wikipedia);
    } else {
        mauiFilter.setWikipedia(wikipediaServer, wikipediaDatabase, cacheWikipediaData, wikipediaDataDirectory);
    }

    if (classifier != null) {
        mauiFilter.setClassifier(classifier);
    }

    mauiFilter.setInputFormat(data);

    // set features configurations
    mauiFilter.setBasicFeatures(useBasicFeatures);
    mauiFilter.setKeyphrasenessFeature(useKeyphrasenessFeature);
    mauiFilter.setFrequencyFeatures(useFrequencyFeatures);
    mauiFilter.setPositionsFeatures(usePositionsFeatures);
    mauiFilter.setLengthFeature(useLengthFeature);
    mauiFilter.setThesaurusFeatures(useNodeDegreeFeature);
    mauiFilter.setBasicWikipediaFeatures(useBasicWikipediaFeatures);
    mauiFilter.setAllWikipediaFeatures(useAllWikipediaFeatures);

    mauiFilter.setClassifier(classifier);

    mauiFilter.setContextSize(contextSize);
    mauiFilter.setMinKeyphraseness(minKeyphraseness);
    mauiFilter.setMinSenseProbability(minSenseProbability);

    if (!vocabularyName.equals("none") && !vocabularyName.equals("wikipedia")) {
        mauiFilter.loadThesaurus(getStemmer(), getStopwords(), store);
    }

    System.err.println("-- Reading the input documents... ");

    for (String fileName : fileNames) {

        double[] newInst = new double[3];

        newInst[0] = (double) data.attribute(0).addStringValue(fileName);

        File documentTextFile = new File(inputDirectoryName + "/" + fileName + ".txt");
        File documentTopicsFile = new File(inputDirectoryName + "/" + fileName + ".key");

        try {

            InputStreamReader is;
            if (!documentEncoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(documentTextFile), documentEncoding);
            } else {
                is = new InputStreamReader(new FileInputStream(documentTextFile));
            }

            // Reading the file content
            StringBuffer txtStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                txtStr.append((char) c);
            }
            is.close();

            // Adding the text of the document to the instance
            newInst[1] = (double) data.attribute(1).addStringValue(txtStr.toString());

        } catch (Exception e) {

            System.err.println("Problem with reading " + documentTextFile);
            e.printStackTrace();
            newInst[1] = Instance.missingValue();
        }

        try {

            InputStreamReader is;
            if (!documentEncoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(documentTopicsFile), documentEncoding);
            } else {
                is = new InputStreamReader(new FileInputStream(documentTopicsFile));
            }

            // Reading the content of the keyphrase file
            StringBuffer keyStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                keyStr.append((char) c);
            }
            is.close();

            // Adding the topics to the instance
            newInst[2] = (double) data.attribute(2).addStringValue(keyStr.toString());

        } catch (Exception e) {

            System.err.println("Problem with reading " + documentTopicsFile);
            e.printStackTrace();
            newInst[2] = Instance.missingValue();
        }

        data.add(new Instance(1.0, newInst));

        mauiFilter.input(data.instance(0));
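        // Keep only an empty copy of the header so string values do not accumulate.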
        data = data.stringFreeStructure();
    }
    mauiFilter.batchFinished();

    while (mauiFilter.output() != null) {
    }
}

From source file:maui.main.MauiTopicExtractor.java

License:Open Source License

/**
 * Extracts keyphrases from the documents in the given files.
 */
public void extractKeyphrases(HashSet<String> fileNames, VocabularyStore store) throws Exception {

    // Check whether there is actually any data
    if (fileNames.size() == 0) {
        throw new Exception("Couldn't find any data in " + inputDirectoryName);
    }

    mauiFilter.setVocabularyName(getVocabularyName());
    mauiFilter.setVocabularyFormat(getVocabularyFormat());
    mauiFilter.setDocumentLanguage(getDocumentLanguage());
    mauiFilter.setStemmer(getStemmer());
    mauiFilter.setStopwords(getStopwords());
    if (wikipedia != null) {
        mauiFilter.setWikipedia(wikipedia);
    } else if (wikipediaServer.equals("localhost") && wikipediaDatabase.equals("database")) {
        mauiFilter.setWikipedia(wikipedia);
    } else {
        mauiFilter.setWikipedia(wikipediaServer, wikipediaDatabase, cacheWikipediaData, wikipediaDataDirectory);
    }
    if (!vocabularyName.equals("none") && !vocabularyName.equals("wikipedia")) {
        mauiFilter.loadThesaurus(getStemmer(), getStopwords(), store);
    }

    FastVector atts = new FastVector(3);
    atts.addElement(new Attribute("filename", (FastVector) null));
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    System.err.println("-- Extracting keyphrases... ");

    Vector<Double> correctStatistics = new Vector<Double>();
    Vector<Double> precisionStatistics = new Vector<Double>();
    Vector<Double> recallStatistics = new Vector<Double>();

    for (String fileName : fileNames) {

        double[] newInst = new double[3];

        newInst[0] = (double) data.attribute(0).addStringValue(fileName);

        File documentTextFile = new File(inputDirectoryName + "/" + fileName + ".txt");
        File documentTopicsFile = new File(inputDirectoryName + "/" + fileName + ".key");

        try {

            InputStreamReader is;
            if (!documentEncoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(documentTextFile), documentEncoding);
            } else {
                is = new InputStreamReader(new FileInputStream(documentTextFile));
            }

            // Reading the file content
            StringBuffer txtStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                txtStr.append((char) c);
            }
            is.close();

            // Adding the text of the document to the instance
            newInst[1] = (double) data.attribute(1).addStringValue(txtStr.toString());

        } catch (Exception e) {
            System.err.println("Problem with reading " + documentTextFile);
            e.printStackTrace();
            newInst[1] = Instance.missingValue();
        }

        try {

            InputStreamReader is;
            if (!documentEncoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(documentTopicsFile), documentEncoding);
            } else {
                is = new InputStreamReader(new FileInputStream(documentTopicsFile));
            }

            // Reading the content of the keyphrase file
            StringBuffer keyStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                keyStr.append((char) c);
            }
            is.close();

            // Adding the topics to the instance
            newInst[2] = (double) data.attribute(2).addStringValue(keyStr.toString());

        } catch (Exception e) {
            if (debugMode) {
                System.err.println("No existing topics for " + documentTextFile);
            }
            newInst[2] = Instance.missingValue();
        }

        data.add(new Instance(1.0, newInst));

        mauiFilter.input(data.instance(0));

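        // Keep only an empty copy of the header so string values do not accumulate.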
        data = data.stringFreeStructure();
        if (debugMode) {
            System.err.println("-- Processing document: " + fileName);
        }
        Instance[] topRankedInstances = new Instance[topicsPerDocument];
        Instance inst;

        // Iterating over all extracted keyphrases (inst)
        while ((inst = mauiFilter.output()) != null) {

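            // Rank values are 1-based; shift to a 0-based array index.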
            int index = (int) inst.value(mauiFilter.getRankIndex()) - 1;

            if (index < topicsPerDocument) {
                topRankedInstances[index] = inst;
            }
        }

        if (debugMode) {
            System.err.println("-- Keyphrases and feature values:");
        }
        FileOutputStream out = null;
        PrintWriter printer = null;

        if (!documentTopicsFile.exists()) {
            out = new FileOutputStream(documentTopicsFile);
            if (!documentEncoding.equals("default")) {
                printer = new PrintWriter(new OutputStreamWriter(out, documentEncoding));
            } else {
                printer = new PrintWriter(out);
            }
        }

        double numExtracted = 0, numCorrect = 0;
        wikipedia = mauiFilter.getWikipedia();

        HashMap<Article, Integer> topics = null;

        if (printGraph) {
            topics = new HashMap<Article, Integer>();
        }

        int p = 0;
        String root = "";
        for (int i = 0; i < topicsPerDocument; i++) {
            if (topRankedInstances[i] != null) {
                if (!topRankedInstances[i].isMissing(topRankedInstances[i].numAttributes() - 1)) {
                    numExtracted += 1.0;
                }
                if ((int) topRankedInstances[i].value(topRankedInstances[i].numAttributes() - 1) == 1) {
                    numCorrect += 1.0;
                }
                if (printer != null) {
                    String topic = topRankedInstances[i].stringValue(mauiFilter.getOutputFormIndex());
                    printer.print(topic);

                    if (printGraph) {

                        Article article = wikipedia.getArticleByTitle(topic);
                        if (article == null) {
                            article = wikipedia.getMostLikelyArticle(topic, new CaseFolder());
                        }
                        if (article != null) {
                            if (root.isEmpty()) {
                                root = article.getTitle();
                            }
                            topics.put(article, new Integer(p));
                        } else {
                            if (debugMode) {
                                System.err.println(
                                        "Couldn't find article for " + topic + " in " + documentTopicsFile);
                            }
                        }
                        p++;
                    }
                    if (additionalInfo) {
                        printer.print("\t");
                        printer.print(topRankedInstances[i].stringValue(mauiFilter.getNormalizedFormIndex()));
                        printer.print("\t");
                        printer.print(Utils.doubleToString(
                                topRankedInstances[i].value(mauiFilter.getProbabilityIndex()), 4));
                    }
                    printer.println();
                }
                if (debugMode) {
                    System.err.println(topRankedInstances[i]);
                }
            }
        }

        if (printGraph) {
            String graphFile = documentTopicsFile.getAbsolutePath().replace(".key", ".gv");
            computeGraph(topics, root, graphFile);
        }
        if (numExtracted > 0) {
            if (debugMode) {
                System.err.println("-- " + numCorrect + " correct");
            }
            double totalCorrect = mauiFilter.getTotalCorrect();
            correctStatistics.addElement(new Double(numCorrect));
            precisionStatistics.addElement(new Double(numCorrect / numExtracted));
            recallStatistics.addElement(new Double(numCorrect / totalCorrect));

        }
        if (printer != null) {
            printer.flush();
            printer.close();
            out.close();
        }
    }

    if (correctStatistics.size() != 0) {

        double[] st = new double[correctStatistics.size()];
        for (int i = 0; i < correctStatistics.size(); i++) {
            st[i] = correctStatistics.elementAt(i).doubleValue();
        }
        double avg = Utils.mean(st);
        double stdDev = Math.sqrt(Utils.variance(st));

        if (correctStatistics.size() == 1) {
            System.err.println("\n-- Evaluation results based on 1 document:");

        } else {
            System.err.println("\n-- Evaluation results based on " + correctStatistics.size() + " documents:");
        }
        System.err.println("Avg. number of correct keyphrases per document: " + Utils.doubleToString(avg, 2)
                + " +/- " + Utils.doubleToString(stdDev, 2));

        st = new double[precisionStatistics.size()];
        for (int i = 0; i < precisionStatistics.size(); i++) {
            st[i] = precisionStatistics.elementAt(i).doubleValue();
        }
        double avgPrecision = Utils.mean(st);
        double stdDevPrecision = Math.sqrt(Utils.variance(st));

        System.err.println("Precision: " + Utils.doubleToString(avgPrecision * 100, 2) + " +/- "
                + Utils.doubleToString(stdDevPrecision * 100, 2));

        st = new double[recallStatistics.size()];
        for (int i = 0; i < recallStatistics.size(); i++) {
            st[i] = recallStatistics.elementAt(i).doubleValue();
        }
        double avgRecall = Utils.mean(st);
        double stdDevRecall = Math.sqrt(Utils.variance(st));

        System.err.println("Recall: " + Utils.doubleToString(avgRecall * 100, 2) + " +/- "
                + Utils.doubleToString(stdDevRecall * 100, 2));

        double fMeasure = 2 * avgRecall * avgPrecision / (avgRecall + avgPrecision);
        System.err.println("F-Measure: " + Utils.doubleToString(fMeasure * 100, 2));

        System.err.println("");
    }
    mauiFilter.batchFinished();
}

From source file:meddle.PredictByDomainOS.java

License:Open Source License

public static boolean loadAllModels(String className) {
    domainOSModel = new HashMap<String, Classifier>();
    domainOSFeature = new HashMap<String, Map<String, Integer>>();
    domainOSStruct = new HashMap<String, Instances>();
    try {
        File modelFolder = new File(RConfig.modelFolder);
        File[] models = modelFolder.listFiles();
        if (models != null)
            for (int i = 0; i < models.length; i++) {
                String fn = models[i].getName();
                if (!fn.endsWith(className + ".model"))
                    continue;
                String domainOS = fn.substring(0, fn.length() - className.length() - ".model".length() - 1);
                Classifier classifier;
                classifier = (Classifier) (Class.forName(className).newInstance());
                classifier = (Classifier) SerializationHelper.read(RConfig.modelFolder + fn);
                // System.out.println(domainOS);
                domainOSModel.put(domainOS, classifier);

                ArffLoader loader = new ArffLoader();
                String arffStructureFile = RConfig.arffFolder + domainOS + ".arff";
                File af = new File(arffStructureFile);
                if (!af.exists())
                    continue;
                loader.setFile(new File(arffStructureFile));
                Instances structure;
                try {
                    structure = loader.getStructure();
                } catch (Exception e) {
                    continue;
                }
                structure.setClassIndex(structure.numAttributes() - 1);
                domainOSStruct.put(domainOS, structure);
                Map<String, Integer> fi = new HashMap<String, Integer>();
                for (int j = 0; j < structure.numAttributes(); j++) {
                    fi.put(structure.attribute(j).name(), j);
                }
                domainOSFeature.put(domainOS, fi);
            }
    } catch (InstantiationException | IllegalAccessException | ClassNotFoundException e) {
        e.printStackTrace();
        return false;
    } catch (Exception e) {
        e.printStackTrace();
        return false;
    }
    isModelLoaded = true;
    return true;
}

From source file:meka.classifiers.multilabel.AbstractMultiLabelClassifier.java

License:Open Source License

/**
 * TestCapabilities.
 * Make sure the training data is suitable.
 * @param D   the data
 */
public void testCapabilities(Instances D) throws Exception {
    // get the classifier's capabilities, enable all class attributes and do the usual test
    Capabilities cap = getCapabilities();
    cap.enableAllClasses();
    //getCapabilities().testWithFail(D);
    // get the capabilities again, test class attributes individually
    int L = D.classIndex();
    for (int j = 0; j < L; j++) {
        Attribute c = D.attribute(j);
        cap.testWithFail(c, true);
    }
}

From source file:meka.classifiers.multilabel.Evaluation.java

License:Open Source License

/**
 * IsMT - see if dataset D is multi-target (else only multi-label)
 * @param   D   data
 * @return   true iff D is multi-target only (else false)
 */
public static boolean isMT(Instances D) {
    int L = D.classIndex();
    for (int j = 0; j < L; j++) {
        if (D.attribute(j).isNominal()) {
            // Classification
            if (D.attribute(j).numValues() > 2) {
                // Multi-class
                return true;
            }
        } else {
            // Regression?
            System.err.println("[Warning] Found a non-nominal class -- not sure how this happened?");
        }
    }
    return false;
}

From source file:meka.classifiers.multilabel.PCC.java

License:Open Source License

/**
 * GetKs - return [K_1,K_2,...,K_L] where each Y_j \in {1,...,K_j}.
 * In the multi-label case, K[j] = 2 for all j = 1,...,L.
 * @param   D   a dataset
 * @return   an array of the number of values that each label can take
 */
private static int[] getKs(Instances D) {
    int L = D.classIndex();
    int[] K = new int[L];
    for (int k = 0; k < L; k++) {
        K[k] = D.attribute(k).numValues();
    }
    return K;
}