Example usage for weka.filters.unsupervised.attribute.StringToWordVector.setMinTermFreq

List of usage examples for weka.filters.unsupervised.attribute.StringToWordVector.setMinTermFreq

Introduction

On this page you can find example usage for weka.filters.unsupervised.attribute.StringToWordVector.setMinTermFreq.

Prototype

public void setMinTermFreq(int newMinTermFreq) 

Source Link

Document

Sets the minimum term frequency (the MinTermFreq option) used by the filter.

Usage

From source file:nl.uva.expose.classification.WekaClassification.java

/**
 * Configures a {@link StringToWordVector} filter and runs it over {@code dRaw}.
 *
 * <p>The filter is applied to the attribute range {@code "first-last"}, lower-cases tokens,
 * enables the IDF transform, outputs word counts, normalizes document length with
 * {@code FILTER_NORMALIZE_ALL}, and uses a minimum term frequency of 2. No tokenizer and no
 * words-to-keep limit are set here, so the filter's own defaults apply for both.
 *
 * <p>NOTE(review): the filtered data is only assigned to the {@code dFiltered} parameter. Java
 * passes references by value, so the caller never observes that assignment and the result is
 * effectively discarded. A real fix needs a signature change (for example returning the
 * filtered {@code Instances}) together with the call sites, which are not visible here.
 *
 * @param dRaw the raw instances handed to the filter
 * @param dFiltered intended to receive the result; the caller's reference is NOT updated
 * @throws Exception propagated from {@code setInputFormat} or {@code Filter.useFilter}
 */
private void getWordVector(Instances dRaw, Instances dFiltered) throws Exception {
    StringToWordVector filter = new StringToWordVector();
    filter.setAttributeIndices("first-last");
    filter.setLowerCaseTokens(true);
    filter.setIDFTransform(true);
    filter.setOutputWordCounts(true);
    filter.setMinTermFreq(2);
    filter.setNormalizeDocLength(
            new SelectedTag(StringToWordVector.FILTER_NORMALIZE_ALL, StringToWordVector.TAGS_FILTER));

    // All options are final at this point; the input format is set last, right before filtering.
    filter.setInputFormat(dRaw);

    // NOTE(review): dead store -- rebinding the parameter does not reach the caller.
    dFiltered = Filter.useFilter(dRaw, filter);
}

From source file:org.montp2.m1decol.ter.gramms.filters.FilterTokenizerIDFT.java

License:Open Source License

/**
 * Loads the ARFF file at {@code inPath}, runs a {@link StringToWordVector} filter with the TF
 * and IDF transforms enabled over it, and writes the filtered instances (their
 * {@code toString()} form) to {@code outPath}.
 *
 * <p>Filter configuration: lower-cased tokens, word counts as output, stoplist enabled,
 * minimum term frequency 3, at most 200 words to keep, and a {@link WordTokenizer} with a
 * custom delimiter set.
 *
 * @param inPath path of the ARFF file passed to {@code WekaUtils.loadARFF}
 * @param outPath path passed to {@code OutputStreamUtils.writeSimple} for the result
 * @throws Exception propagated from loading, filtering, or writing
 */
public void indexingToTokenizer(String inPath, String outPath) throws Exception {
    WordTokenizer wordTokenizer = new WordTokenizer();
    // NOTE(review): the delimiter set contains '\r' but no '\n' -- confirm this is intentional.
    wordTokenizer.setDelimiters("\r \t.,;:'\"()?!");

    Instances inputInstances = WekaUtils.loadARFF(inPath);

    StringToWordVector filter = new StringToWordVector();
    filter.setIDFTransform(true);
    filter.setTFTransform(true);
    filter.setDoNotOperateOnPerClassBasis(false);
    filter.setInvertSelection(false);
    filter.setLowerCaseTokens(true);
    filter.setMinTermFreq(3);
    filter.setOutputWordCounts(true);
    filter.setTokenizer(wordTokenizer);
    filter.setUseStoplist(true);
    filter.setWordsToKeep(200);
    // Weka requires setInputFormat to be the last call before the filter is applied, because
    // the filter fixes its output format from the options that are set at that moment.
    filter.setInputFormat(inputInstances);

    Instances outputInstances = Filter.useFilter(inputInstances, filter);

    OutputStreamUtils.writeSimple(outputInstances.toString(), outPath);
}

From source file:org.montp2.m1decol.ter.gramms.filters.FilterTokenizerVector.java

License:Open Source License

/**
 * Loads the ARFF file at {@code inPath}, runs a {@link StringToWordVector} filter over it, and
 * writes the filtered instances (their {@code toString()} form) to {@code outPath}.
 *
 * <p>Filter configuration: lower-cased tokens, word counts as output, stoplist enabled,
 * minimum term frequency 3, at most 200 words to keep, and a {@link WordTokenizer} with a
 * custom delimiter set. Neither the TF nor the IDF transform is switched on here.
 *
 * @param inPath path of the ARFF file passed to {@code WekaUtils.loadARFF}
 * @param outPath path passed to {@code OutputStreamUtils.writeSimple} for the result
 * @throws Exception propagated from loading, filtering, or writing
 */
public void indexingToTokenizer(String inPath, String outPath) throws Exception {
    WordTokenizer wordTokenizer = new WordTokenizer();
    // NOTE(review): the delimiter set contains '\r' but no '\n' -- confirm this is intentional.
    wordTokenizer.setDelimiters("\r \t.,;:'\"()?!");

    Instances inputInstances = WekaUtils.loadARFF(inPath);

    StringToWordVector filter = new StringToWordVector();
    filter.setDoNotOperateOnPerClassBasis(false);
    filter.setInvertSelection(false);
    filter.setLowerCaseTokens(true);
    filter.setMinTermFreq(3);
    filter.setOutputWordCounts(true);
    filter.setTokenizer(wordTokenizer);
    filter.setUseStoplist(true);
    filter.setWordsToKeep(200);
    // Weka requires setInputFormat to be the last call before the filter is applied, because
    // the filter fixes its output format from the options that are set at that moment.
    filter.setInputFormat(inputInstances);

    Instances outputInstances = Filter.useFilter(inputInstances, filter);

    OutputStreamUtils.writeSimple(outputInstances.toString(), outPath);
}