Example usage for weka.filters.unsupervised.attribute StringToWordVector TAGS_FILTER

List of usage examples for weka.filters.unsupervised.attribute StringToWordVector TAGS_FILTER

Introduction

On this page you can find example usages of weka.filters.unsupervised.attribute StringToWordVector TAGS_FILTER.

Prototype

Tag[] TAGS_FILTER

To view the source code for weka.filters.unsupervised.attribute StringToWordVector TAGS_FILTER, click the Source Link.

Document

Specifies whether a document's (instance's) word frequencies are to be normalized.

Usage

From source file:etc.aloe.cscw2013.FeatureGenerationImpl.java

License:Open Source License

/**
 * Builds a bag-of-words filter, initialises it from the given examples and
 * replaces the examples' instances with the filtered output.
 *
 * @param examples the example set whose instances are used as the filter's
 *                 input format; its instances are replaced by the filtered ones
 * @return the configured filter, after it has been applied to {@code examples}
 * @throws Exception propagated from configuring or applying the filter
 */
protected Filter getBagOfWordsFilter(ExampleSet examples) throws Exception {
    final SimpleStringToWordVector bagOfWords = new SimpleStringToWordVector();
    bagOfWords.setAttributeNamePrefix(BAG_OF_WORDS_FEATURE_PREFIX);
    bagOfWords.setStringAttributeName(ExampleSet.MESSAGE_ATTR_NAME);

    // A minimum term frequency (e.g. setMinTermFreq(20)) is deliberately not
    // set: a fixed threshold depends on how much data is used.

    bagOfWords.setDoNotOperateOnPerClassBasis(true);
    bagOfWords.setWordsToKeep(800);
    bagOfWords.setLowerCaseTokens(true);

    // Stem tokens and drop "nonsense".
    bagOfWords.setStemmer(new NoNonsenseStemmer(true));

    bagOfWords.setTFTransform(true);
    bagOfWords.setIDFTransform(true);

    final SelectedTag normalizeAll =
            new SelectedTag(StringToWordVector.FILTER_NORMALIZE_ALL, StringToWordVector.TAGS_FILTER);
    bagOfWords.setNormalizeDocLength(normalizeAll);

    bagOfWords.setOutputWordCounts(true);

    // The input format is set only after all options above have been applied.
    bagOfWords.setInputFormat(examples.getInstances());
    examples.setInstances(Filter.useFilter(examples.getInstances(), bagOfWords));

    return bagOfWords;
}

From source file:etc.aloe.oilspill2010.FeatureGenerationImpl.java

License:Open Source License

/**
 * Get a bag of words filter based on the provided examples.
 *
 * @param examples/*from  ww  w .j  av  a 2s  . co  m*/
 * @return
 * @throws Exception
 */
@Override
protected Filter getBagOfWordsFilter(ExampleSet examples) throws Exception {
    SimpleStringToWordVector filter = new SimpleStringToWordVector();
    filter.setAttributeNamePrefix(BAG_OF_WORDS_FEATURE_PREFIX);
    filter.setStringAttributeName(ExampleSet.MESSAGE_ATTR_NAME);

    //This is stupid because it depends on how much data you use
    //bagger.setMinTermFreq(20);

    filter.setDoNotOperateOnPerClassBasis(true);
    filter.setWordsToKeep(3000);
    filter.setLowerCaseTokens(true);

    //use stemming and remove "nonsense"
    filter.setStemmer(new NoNonsenseStemmer(true));

    filter.setTFTransform(true);
    filter.setIDFTransform(true);
    filter.setNormalizeDocLength(
            new SelectedTag(StringToWordVector.FILTER_NORMALIZE_ALL, StringToWordVector.TAGS_FILTER));

    filter.setOutputWordCounts(true);

    filter.setInputFormat(examples.getInstances());
    Instances filtered = Filter.useFilter(examples.getInstances(), filter);
    examples.setInstances(filtered);

    return filter;
}

From source file:nl.uva.expose.classification.WekaClassification.java

/**
 * Runs a {@link StringToWordVector} filter over {@code dRaw} (attribute range
 * "first-last") and returns the filtered instances.
 *
 * <p>The result is returned rather than assigned to {@code dFiltered}: Java
 * passes references by value, so reassigning the parameter inside this method
 * is invisible to the caller and the filtered data would be discarded.
 *
 * @param dRaw      the raw instances; used both as the filter's input format
 *                  and as the data to filter
 * @param dFiltered unused; kept only so existing call sites still compile
 * @return the instances produced by applying the filter to {@code dRaw}
 * @throws Exception propagated from configuring or applying the filter
 */
private Instances getWordVector(Instances dRaw, Instances dFiltered) throws Exception {
    StringToWordVector filter = new StringToWordVector();
    filter.setAttributeIndices("first-last");
    filter.setIDFTransform(true);
    filter.setLowerCaseTokens(true);
    filter.setMinTermFreq(2);
    filter.setNormalizeDocLength(
            new SelectedTag(StringToWordVector.FILTER_NORMALIZE_ALL, StringToWordVector.TAGS_FILTER));
    filter.setOutputWordCounts(true);
    // Tokenizer and words-to-keep are not set here.

    // The input format is set only after all options above have been applied.
    filter.setInputFormat(dRaw);
    return Filter.useFilter(dRaw, filter);
}