Example usage for weka.filters.unsupervised.attribute StringToWordVector setIDFTransform

List of usage examples for weka.filters.unsupervised.attribute StringToWordVector setIDFTransform

Introduction

In this page you can find the example usage for weka.filters.unsupervised.attribute StringToWordVector setIDFTransform.

Prototype

public void setIDFTransform(boolean IDFTransform) 

Source Link

Document

Sets whether the word frequencies in a document should be transformed into:
f_ij * log(num of Docs / num of Docs containing word i)
where f_ij is the frequency of word i in document (instance) j.

Usage

From source file:nl.uva.expose.classification.WekaClassification.java

/**
 * Converts raw string instances into a TF-IDF weighted word-vector representation.
 *
 * <p>NOTE(review): the original version assigned the result to the {@code dFiltered}
 * parameter, which is a no-op for the caller — Java passes object references by value,
 * so reassigning a parameter never escapes the method. The filtered dataset is now
 * returned instead; the parameter is retained so existing call sites still compile.
 *
 * @param dRaw      raw instances whose string attributes should be vectorized
 * @param dFiltered unused; retained only for call-site compatibility (see note above)
 * @return the filtered (word-vector) instances
 * @throws Exception if the filter cannot be initialized or applied to {@code dRaw}
 */
private Instances getWordVector(Instances dRaw, Instances dFiltered) throws Exception {
    StringToWordVector filter = new StringToWordVector();
    filter.setAttributeIndices("first-last");
    filter.setIDFTransform(true);
    filter.setLowerCaseTokens(true); // was set twice in the original; duplicate removed
    filter.setMinTermFreq(2);
    filter.setNormalizeDocLength(
            new SelectedTag(StringToWordVector.FILTER_NORMALIZE_ALL, StringToWordVector.TAGS_FILTER));
    filter.setOutputWordCounts(true);
    // Options must be configured before setInputFormat(): Weka determines the
    // filter's output format when the input format is set.
    filter.setInputFormat(dRaw);
    return Filter.useFilter(dRaw, filter);
}

From source file:org.montp2.m1decol.ter.gramms.filters.FilterTokenizerIDFT.java

License:Open Source License

/**
 * Loads an ARFF file, converts its string attributes into a TF-IDF weighted
 * word vector (word tokenizer, lower-casing, stop-list, minimum term frequency 3,
 * top 200 words kept) and writes the resulting instances to {@code outPath}.
 *
 * @param inPath  path of the input ARFF file
 * @param outPath path where the vectorized instances are written
 * @throws Exception if loading, filtering, or writing fails
 */
public void indexingToTokenizer(String inPath, String outPath) throws Exception {

    WordTokenizer wordTokenizer = new WordTokenizer();
    wordTokenizer.setDelimiters("\r \t.,;:'\"()?!");

    Instances inputInstances = WekaUtils.loadARFF(inPath);

    StringToWordVector filter = new StringToWordVector();
    filter.setIDFTransform(true);
    filter.setTFTransform(true);
    filter.setDoNotOperateOnPerClassBasis(false);
    filter.setInvertSelection(false);
    filter.setLowerCaseTokens(true);
    filter.setMinTermFreq(3);
    filter.setOutputWordCounts(true);
    filter.setTokenizer(wordTokenizer);
    filter.setUseStoplist(true);
    filter.setWordsToKeep(200);
    // FIX: setInputFormat() must be called AFTER all option setters. Weka
    // determines the filter's output format when the input format is set, so in
    // the original (setInputFormat first) every option above was silently ignored.
    filter.setInputFormat(inputInstances);

    Instances outputInstances = Filter.useFilter(inputInstances, filter);

    OutputStreamUtils.writeSimple(outputInstances.toString(), outPath);
}

From source file:util.FeatureExtract.java

/**
 * Reads a text-directory corpus, vectorizes it with IDF weighting, saves the
 * result as an ARFF file, then trains a SimpleCart classifier on the vectorized
 * data and reports its 10-fold cross-validation summary. Any failure is logged
 * at SEVERE level rather than rethrown.
 *
 * @param directory root directory of the text corpus (one sub-directory per class)
 */
public static void createArff(String directory) {
    TextDirectoryLoader corpusLoader = new TextDirectoryLoader();
    try {
        // Load every document below the directory as one Weka dataset.
        corpusLoader.setDirectory(new File(directory));
        Instances rawData = corpusLoader.getDataSet();

        // Convert string attributes to a word vector with IDF weighting.
        StringToWordVector vectorizer = new StringToWordVector();
        vectorizer.setIDFTransform(true);
        vectorizer.setInputFormat(rawData);
        Instances vectorized = Filter.useFilter(rawData, vectorizer);

        // Persist the vectorized dataset in ARFF format.
        ArffSaver arffSaver = new ArffSaver();
        arffSaver.setInstances(vectorized);
        arffSaver.setFile(new File(SpamFilterConfig.getArffFilePath()));
        arffSaver.writeBatch();

        // Train a SimpleCart decision tree on the full dataset.
        SimpleCart cart = new SimpleCart();
        cart.buildClassifier(vectorized);
        System.out.println("\n\nClassifier model:\n\n" + cart.toString());

        // Evaluate via 10-fold cross-validation with a fixed seed.
        Evaluation evaluation = new Evaluation(vectorized);
        evaluation.crossValidateModel(cart, vectorized, 10, new Random(1));
        System.out.println("\n\nCross fold:\n\n" + evaluation.toSummaryString());
    } catch (Exception ex) {
        Logger.getLogger(FeatureExtract.class.getName()).log(Level.SEVERE, null, ex);
    }
}