List of usage examples for weka.filters.unsupervised.attribute.StringToWordVector.TAGS_FILTER
Tag[] TAGS_FILTER
To view the source code for weka.filters.unsupervised.attribute.StringToWordVector.TAGS_FILTER, click the Source Link.
From source file:etc.aloe.cscw2013.FeatureGenerationImpl.java
License:Open Source License
/** * Get a bag of words filter based on the provided examples. * * @param examples// ww w. ja v a 2s . com * @return * @throws Exception */ protected Filter getBagOfWordsFilter(ExampleSet examples) throws Exception { SimpleStringToWordVector filter = new SimpleStringToWordVector(); filter.setAttributeNamePrefix(BAG_OF_WORDS_FEATURE_PREFIX); filter.setStringAttributeName(ExampleSet.MESSAGE_ATTR_NAME); //This is stupid because it depends on how much data you use //bagger.setMinTermFreq(20); filter.setDoNotOperateOnPerClassBasis(true); filter.setWordsToKeep(800); filter.setLowerCaseTokens(true); //use stemming and remove "nonsense" filter.setStemmer(new NoNonsenseStemmer(true)); filter.setTFTransform(true); filter.setIDFTransform(true); filter.setNormalizeDocLength( new SelectedTag(StringToWordVector.FILTER_NORMALIZE_ALL, StringToWordVector.TAGS_FILTER)); filter.setOutputWordCounts(true); filter.setInputFormat(examples.getInstances()); Instances filtered = Filter.useFilter(examples.getInstances(), filter); examples.setInstances(filtered); return filter; }
From source file:etc.aloe.oilspill2010.FeatureGenerationImpl.java
License:Open Source License
/** * Get a bag of words filter based on the provided examples. * * @param examples/*from ww w .j av a 2s . co m*/ * @return * @throws Exception */ @Override protected Filter getBagOfWordsFilter(ExampleSet examples) throws Exception { SimpleStringToWordVector filter = new SimpleStringToWordVector(); filter.setAttributeNamePrefix(BAG_OF_WORDS_FEATURE_PREFIX); filter.setStringAttributeName(ExampleSet.MESSAGE_ATTR_NAME); //This is stupid because it depends on how much data you use //bagger.setMinTermFreq(20); filter.setDoNotOperateOnPerClassBasis(true); filter.setWordsToKeep(3000); filter.setLowerCaseTokens(true); //use stemming and remove "nonsense" filter.setStemmer(new NoNonsenseStemmer(true)); filter.setTFTransform(true); filter.setIDFTransform(true); filter.setNormalizeDocLength( new SelectedTag(StringToWordVector.FILTER_NORMALIZE_ALL, StringToWordVector.TAGS_FILTER)); filter.setOutputWordCounts(true); filter.setInputFormat(examples.getInstances()); Instances filtered = Filter.useFilter(examples.getInstances(), filter); examples.setInstances(filtered); return filter; }
From source file:nl.uva.expose.classification.WekaClassification.java
private void getWordVector(Instances dRaw, Instances dFiltered) throws Exception { StringToWordVector filter = new StringToWordVector(); filter.setAttributeIndices("first-last"); filter.setIDFTransform(true);//from w w w.j a v a 2 s.c o m filter.setLowerCaseTokens(true); filter.setMinTermFreq(2); filter.setLowerCaseTokens(true); filter.setNormalizeDocLength( new SelectedTag(StringToWordVector.FILTER_NORMALIZE_ALL, StringToWordVector.TAGS_FILTER)); filter.setOutputWordCounts(true); // filter.setTokenizer(); // filter.setWordsToKeep(); filter.setInputFormat(dRaw); dFiltered = Filter.useFilter(dRaw, filter); }