Java tutorial
package etc.aloe.oilspill2010; import etc.aloe.data.ExampleSet; import etc.aloe.data.FeatureSpecification; import etc.aloe.filters.SimpleStringToWordVector; import etc.aloe.filters.WordFeaturesExtractor; import etc.aloe.oilspill2010.FeatureGenerationImpl; import java.util.List; import java.util.regex.Pattern; import weka.core.Instances; import weka.core.SelectedTag; import weka.filters.Filter; import weka.filters.unsupervised.attribute.RemoveByName; import weka.filters.unsupervised.attribute.StringToWordVector; /* * To change this license header, choose License Headers in Project Properties. * To change this template file, choose Tools | Templates * and open the template in the editor. */ /** * * @author mjbrooks */ public class BigramFeatureGenerationImpl extends FeatureGenerationImpl { public BigramFeatureGenerationImpl(List<String> emoticonDictionary) { super(emoticonDictionary); } @Override public FeatureSpecification generateFeatures(ExampleSet basicExamples) { ExampleSet examples = basicExamples.copy(); FeatureSpecification spec = new FeatureSpecification(); System.out.print("Configuring features over " + examples.size() + " examples... "); try { spec.addFilter(getPronounsFilter(examples)); spec.addFilter(getPunctuationFilter(examples)); spec.addFilter(getSpecialWordsFilter(examples)); spec.addFilter(getSpellingFilter(examples)); spec.addFilter(getEmoticonsFilter(examples)); spec.addFilter(getUnigramBigramFilter(examples)); spec.addFilter(getParticipantsFilter(examples)); spec.addFilter(getRemoveIDFilter(examples)); spec.addFilter(getRemoveMessageFilter(examples)); //spec.addFilter(getSparseToNonsparseFilter(examples)); //spec.addFilter(getFeatureSelectionFilter(examples)); Instances output = spec.getOutputFormat(); int numAttrs = output.numAttributes(); System.out.println("generated " + (numAttrs - 1) + " features."); } catch (Exception e) { System.err.println("Error generating features."); System.err.println("\t" + e.getMessage()); } return spec; } /** * Get a bag of words filter based on the provided examples. * * @param examples * @return * @throws Exception */ protected Filter getUnigramBigramFilter(ExampleSet examples) throws Exception { WordFeaturesExtractor filter = new WordFeaturesExtractor(); filter.setSelectedAttributeName(ExampleSet.MESSAGE_ATTR_NAME); filter.setLowerCaseTokens(true); //use stemming and remove "nonsense" filter.setStemmer(new SimpleStringToWordVector.NoNonsenseStemmer(false)); filter.setUseBigrams(true); filter.setInputFormat(examples.getInstances()); Instances filtered = Filter.useFilter(examples.getInstances(), filter); examples.setInstances(filtered); return filter; } /** * Get a filter that removes the id attribute from the data set, necessary * before training. * * @param examples * @return * @throws Exception */ protected Filter getRemoveMessageFilter(ExampleSet examples) throws Exception { RemoveByName filter = new RemoveByName(); filter.setExpression(Pattern.quote(ExampleSet.MESSAGE_ATTR_NAME)); filter.setInputFormat(examples.getInstances()); Instances filtered = Filter.useFilter(examples.getInstances(), filter); examples.setInstances(filtered); return filter; } }