// Java tutorial
/* * This file is part of ALOE. * * ALOE is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * ALOE is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * You should have received a copy of the GNU General Public License * along with ALOE. If not, see <http://www.gnu.org/licenses/>. * * Copyright (c) 2012 SCCL, University of Washington (http://depts.washington.edu/sccl) */ package etc.aloe.cscw2013; import etc.aloe.data.ExampleSet; import etc.aloe.data.FeatureSpecification; import etc.aloe.filters.PronounRegexFilter; import etc.aloe.filters.PunctuationRegexFilter; import etc.aloe.filters.SimpleStringToWordVector; import etc.aloe.filters.SimpleStringToWordVector.NoNonsenseStemmer; import etc.aloe.filters.SpecialRegexFilter; import etc.aloe.filters.SpellingRegexFilter; import etc.aloe.filters.StringToDictionaryVector; import etc.aloe.processes.FeatureGeneration; import java.util.List; import java.util.regex.Pattern; import weka.core.Instances; import weka.core.SelectedTag; import weka.filters.Filter; import weka.filters.unsupervised.attribute.RemoveByName; import weka.filters.unsupervised.attribute.StringToWordVector; /** * Generates a set of filters that extract the desired features from message * texts. * * Features include words, emoticons, pronouns, punctuations, and other strings. 
* * @author Michael Brooks <mjbrooks@uw.edu> */ public class FeatureGenerationImpl implements FeatureGeneration { protected static final boolean COUNT_REGEX_LENGTHS = true; protected static final String EMOTICON_FEATURE_PREFIX = "#"; protected static final String BAG_OF_WORDS_FEATURE_PREFIX = "_"; protected static final String PARTICIPANT_FEATURE_PREFIX = "."; protected final List<String> emoticonDictionary; protected int participantFeatures = 0; /** * Construct a new FeatureGeneration implementation. * * @param emoticonDictionary The list of emoticons to look for in the * messages. */ public FeatureGenerationImpl(List<String> emoticonDictionary) { this.emoticonDictionary = emoticonDictionary; } public int getParticipantFeatureCount() { return participantFeatures; } /** * Set whether the number of different participants to convert into unigram features. * If set to 0 (the default), no participant features will be used. * * @param participantFeatures */ public void setParticipantFeatureCount(int participantFeatures) { this.participantFeatures = participantFeatures; } @Override public FeatureSpecification generateFeatures(ExampleSet basicExamples) { ExampleSet examples = basicExamples.copy(); FeatureSpecification spec = new FeatureSpecification(); System.out.print("Configuring features over " + examples.size() + " examples... 
"); try { spec.addFilter(getPronounsFilter(examples)); spec.addFilter(getPunctuationFilter(examples)); spec.addFilter(getSpecialWordsFilter(examples)); spec.addFilter(getSpellingFilter(examples)); spec.addFilter(getEmoticonsFilter(examples)); spec.addFilter(getBagOfWordsFilter(examples)); spec.addFilter(getRemoveIDFilter(examples)); if (this.getParticipantFeatureCount() > 0) { spec.addFilter(getParticipantsFilter(examples)); } else { spec.addFilter(getRemoveParticipantFilter(examples)); } Instances output = spec.getOutputFormat(); int numAttrs = output.numAttributes(); System.out.println("generated " + (numAttrs - 1) + " features."); } catch (Exception e) { System.err.println("Error generating features."); System.err.println("\t" + e.getMessage()); } return spec; } /** * Configure the special words filter with the provided data.. * * @param examples * @return * @throws Exception */ protected Filter getSpecialWordsFilter(ExampleSet examples) throws Exception { SpecialRegexFilter filter = new SpecialRegexFilter(ExampleSet.MESSAGE_ATTR_NAME); filter.setInputFormat(examples.getInstances()); Instances filtered = Filter.useFilter(examples.getInstances(), filter); examples.setInstances(filtered); return filter; } /** * Configure the spelling filter to work with the provided data. * * @param examples * @return * @throws Exception */ protected Filter getSpellingFilter(ExampleSet examples) throws Exception { SpellingRegexFilter filter = new SpellingRegexFilter(ExampleSet.MESSAGE_ATTR_NAME); filter.setCountRegexLengths(COUNT_REGEX_LENGTHS); filter.setInputFormat(examples.getInstances()); Instances filtered = Filter.useFilter(examples.getInstances(), filter); examples.setInstances(filtered); return filter; } /** * Configure the punctuation filter to work with the provided data. 
* * @param examples * @return * @throws Exception */ protected Filter getPunctuationFilter(ExampleSet examples) throws Exception { PunctuationRegexFilter filter = new PunctuationRegexFilter(ExampleSet.MESSAGE_ATTR_NAME); filter.setCountRegexLengths(COUNT_REGEX_LENGTHS); filter.setInputFormat(examples.getInstances()); Instances filtered = Filter.useFilter(examples.getInstances(), filter); examples.setInstances(filtered); return filter; } /** * Configure the pronouns filter to work with the provided data. * * @param examples * @return * @throws Exception */ protected Filter getPronounsFilter(ExampleSet examples) throws Exception { PronounRegexFilter filter = new PronounRegexFilter(ExampleSet.MESSAGE_ATTR_NAME); filter.setInputFormat(examples.getInstances()); Instances filtered = Filter.useFilter(examples.getInstances(), filter); examples.setInstances(filtered); return filter; } /** * Configure the emoticons filter to work with the provided examples. * * @param examples * @return * @throws Exception */ protected Filter getEmoticonsFilter(ExampleSet examples) throws Exception { StringToDictionaryVector filter = new StringToDictionaryVector(); filter.setAttributeNamePrefix(EMOTICON_FEATURE_PREFIX); filter.setTermList(emoticonDictionary); filter.setStringAttribute(ExampleSet.MESSAGE_ATTR_NAME); filter.setWordsToKeep(100); //filter.setMinTermFreq(10); filter.setDoNotOperateOnPerClassBasis(true); filter.setOutputWordCounts(true); filter.setInputFormat(examples.getInstances()); Instances filtered = Filter.useFilter(examples.getInstances(), filter); examples.setInstances(filtered); return filter; } /** * Get a bag of words filter based on the provided examples. 
* * @param examples * @return * @throws Exception */ protected Filter getBagOfWordsFilter(ExampleSet examples) throws Exception { SimpleStringToWordVector filter = new SimpleStringToWordVector(); filter.setAttributeNamePrefix(BAG_OF_WORDS_FEATURE_PREFIX); filter.setStringAttributeName(ExampleSet.MESSAGE_ATTR_NAME); //This is stupid because it depends on how much data you use //bagger.setMinTermFreq(20); filter.setDoNotOperateOnPerClassBasis(true); filter.setWordsToKeep(800); filter.setLowerCaseTokens(true); //use stemming and remove "nonsense" filter.setStemmer(new NoNonsenseStemmer(true)); filter.setTFTransform(true); filter.setIDFTransform(true); filter.setNormalizeDocLength( new SelectedTag(StringToWordVector.FILTER_NORMALIZE_ALL, StringToWordVector.TAGS_FILTER)); filter.setOutputWordCounts(true); filter.setInputFormat(examples.getInstances()); Instances filtered = Filter.useFilter(examples.getInstances(), filter); examples.setInstances(filtered); return filter; } /** * Get a filter that removes the id attribute from the data set, necessary * before training. * * @param examples * @return * @throws Exception */ protected Filter getRemoveIDFilter(ExampleSet examples) throws Exception { RemoveByName filter = new RemoveByName(); filter.setExpression(Pattern.quote(ExampleSet.ID_ATTR_NAME)); filter.setInputFormat(examples.getInstances()); Instances filtered = Filter.useFilter(examples.getInstances(), filter); examples.setInstances(filtered); return filter; } /** * Get a filter that removes the id attribute from the data set, necessary * before training. 
* * @param examples * @return * @throws Exception */ protected Filter getRemoveParticipantFilter(ExampleSet examples) throws Exception { RemoveByName filter = new RemoveByName(); filter.setExpression(Pattern.quote(ExampleSet.PARTICIPANT_ATTR_NAME)); filter.setInputFormat(examples.getInstances()); Instances filtered = Filter.useFilter(examples.getInstances(), filter); examples.setInstances(filtered); return filter; } /** * Get a bag of words filter for participants based on the provided examples. * * @param examples * @return * @throws Exception */ protected Filter getParticipantsFilter(ExampleSet examples) throws Exception { SimpleStringToWordVector filter = new SimpleStringToWordVector(); filter.setAttributeNamePrefix(PARTICIPANT_FEATURE_PREFIX); filter.setStringAttributeName(ExampleSet.PARTICIPANT_ATTR_NAME); filter.setDoNotOperateOnPerClassBasis(true); filter.setWordsToKeep(getParticipantFeatureCount()); filter.setLowerCaseTokens(true); //use stemming and remove "nonsense" filter.setStemmer(null); filter.setOutputWordCounts(false); filter.setInputFormat(examples.getInstances()); Instances filtered = Filter.useFilter(examples.getInstances(), filter); examples.setInstances(filtered); return filter; } }