Example usage for weka.filters.unsupervised.attribute StringToWordVector FILTER_NORMALIZE_ALL

List of usage examples for weka.filters.unsupervised.attribute StringToWordVector FILTER_NORMALIZE_ALL

Introduction

On this page you can find example usages of weka.filters.unsupervised.attribute StringToWordVector FILTER_NORMALIZE_ALL.

Prototype

int FILTER_NORMALIZE_ALL

To view the source code for weka.filters.unsupervised.attribute StringToWordVector FILTER_NORMALIZE_ALL, click Source Link.

Document

normalization: Normalize all data.

Usage

From source file: etc.aloe.cscw2013.FeatureGenerationImpl.java

License: Open Source License

/**
 * Get a bag of words filter based on the provided examples.
 *
 * @param examples/*from  w  ww  .  ja  va  2 s.  c  om*/
 * @return
 * @throws Exception
 */
protected Filter getBagOfWordsFilter(ExampleSet examples) throws Exception {
    SimpleStringToWordVector filter = new SimpleStringToWordVector();
    filter.setAttributeNamePrefix(BAG_OF_WORDS_FEATURE_PREFIX);
    filter.setStringAttributeName(ExampleSet.MESSAGE_ATTR_NAME);

    //This is stupid because it depends on how much data you use
    //bagger.setMinTermFreq(20);

    filter.setDoNotOperateOnPerClassBasis(true);
    filter.setWordsToKeep(800);
    filter.setLowerCaseTokens(true);

    //use stemming and remove "nonsense"
    filter.setStemmer(new NoNonsenseStemmer(true));

    filter.setTFTransform(true);
    filter.setIDFTransform(true);
    filter.setNormalizeDocLength(
            new SelectedTag(StringToWordVector.FILTER_NORMALIZE_ALL, StringToWordVector.TAGS_FILTER));

    filter.setOutputWordCounts(true);

    filter.setInputFormat(examples.getInstances());
    Instances filtered = Filter.useFilter(examples.getInstances(), filter);
    examples.setInstances(filtered);

    return filter;
}

From source file:etc.aloe.oilspill2010.FeatureGenerationImpl.java

License:Open Source License

/**
 * Get a bag of words filter based on the provided examples.
 *
 * @param examples/*from   ww  w.ja  v a  2 s  .  com*/
 * @return
 * @throws Exception
 */
@Override
protected Filter getBagOfWordsFilter(ExampleSet examples) throws Exception {
    SimpleStringToWordVector filter = new SimpleStringToWordVector();
    filter.setAttributeNamePrefix(BAG_OF_WORDS_FEATURE_PREFIX);
    filter.setStringAttributeName(ExampleSet.MESSAGE_ATTR_NAME);

    //This is stupid because it depends on how much data you use
    //bagger.setMinTermFreq(20);

    filter.setDoNotOperateOnPerClassBasis(true);
    filter.setWordsToKeep(3000);
    filter.setLowerCaseTokens(true);

    //use stemming and remove "nonsense"
    filter.setStemmer(new NoNonsenseStemmer(true));

    filter.setTFTransform(true);
    filter.setIDFTransform(true);
    filter.setNormalizeDocLength(
            new SelectedTag(StringToWordVector.FILTER_NORMALIZE_ALL, StringToWordVector.TAGS_FILTER));

    filter.setOutputWordCounts(true);

    filter.setInputFormat(examples.getInstances());
    Instances filtered = Filter.useFilter(examples.getInstances(), filter);
    examples.setInstances(filtered);

    return filter;
}

From source file:nl.uva.expose.classification.WekaClassification.java

/**
 * Runs a {@code StringToWordVector} filter over {@code dRaw} and returns the result.
 *
 * <p>The filter covers attributes {@code "first-last"}, enables the IDF transform,
 * lower-cases tokens, uses a minimum term frequency of 2, outputs word counts,
 * and normalizes document length with
 * {@code StringToWordVector.FILTER_NORMALIZE_ALL}. The tokenizer and the
 * words-to-keep setting are not changed here.
 *
 * <p>The original version stored the filtered data in the {@code dFiltered}
 * parameter. Java passes object references by value, so that assignment was
 * never visible to the caller and the filtered data was lost. The result is now
 * returned instead; callers that ignore the return value still compile.
 *
 * @param dRaw      the instances to filter; also used as the filter's input format
 * @param dFiltered unused; retained only so existing call sites keep compiling
 * @return the instances produced by applying the filter to {@code dRaw}
 * @throws Exception propagated from setting the input format or applying the filter
 */
private Instances getWordVector(Instances dRaw, Instances dFiltered) throws Exception {
    StringToWordVector filter = new StringToWordVector();
    filter.setAttributeIndices("first-last");
    filter.setIDFTransform(true);
    filter.setLowerCaseTokens(true);
    filter.setMinTermFreq(2);

    SelectedTag normalizeAll =
            new SelectedTag(StringToWordVector.FILTER_NORMALIZE_ALL, StringToWordVector.TAGS_FILTER);
    filter.setNormalizeDocLength(normalizeAll);
    filter.setOutputWordCounts(true);

    // The input format is set only after all options have been configured.
    filter.setInputFormat(dRaw);
    return Filter.useFilter(dRaw, filter);
}