etc.aloe.oilspill2010.BigramFeatureGenerationImpl.java Source code

Java tutorial

Introduction

Here is the source code for etc.aloe.oilspill2010.BigramFeatureGenerationImpl.java

Source

package etc.aloe.oilspill2010;

import etc.aloe.data.ExampleSet;
import etc.aloe.data.FeatureSpecification;
import etc.aloe.filters.SimpleStringToWordVector;
import etc.aloe.filters.WordFeaturesExtractor;
import etc.aloe.oilspill2010.FeatureGenerationImpl;
import java.util.List;
import java.util.regex.Pattern;
import weka.core.Instances;
import weka.core.SelectedTag;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.RemoveByName;
import weka.filters.unsupervised.attribute.StringToWordVector;

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */

/**
 * Feature generation that adds combined unigram/bigram word features on top of
 * the filters inherited from {@link FeatureGenerationImpl}.
 *
 * <p>The inherited filter factories (pronouns, punctuation, special words,
 * spelling, emoticons, participants, id removal) are reused unchanged; this
 * class contributes the word-feature extractor with bigrams enabled and a
 * filter that drops the raw message attribute once features have been
 * extracted from it.
 *
 * @author mjbrooks
 */
public class BigramFeatureGenerationImpl extends FeatureGenerationImpl {

    /**
     * Creates a generator that uses the given emoticon dictionary.
     *
     * @param emoticonDictionary the emoticon dictionary, passed straight to the
     * superclass constructor
     */
    public BigramFeatureGenerationImpl(List<String> emoticonDictionary) {
        super(emoticonDictionary);
    }

    /**
     * Builds a feature specification by configuring a fixed sequence of
     * filters against the provided examples.
     *
     * <p>The work is done on {@code basicExamples.copy()}, and each filter
     * factory defined in this class replaces that copy's instances with its
     * own filtered output, so later filters are configured against the format
     * produced by the earlier ones. The order of the {@code addFilter} calls
     * therefore matters.
     *
     * <p>Failures are reported on {@code System.err} and are not rethrown: if
     * any step throws, the specification is returned with only the filters
     * that were added before the failure.
     *
     * @param basicExamples the examples used to configure the filters
     * @return the feature specification; possibly incomplete if an error
     * occurred while configuring the filters
     */
    @Override
    public FeatureSpecification generateFeatures(ExampleSet basicExamples) {

        ExampleSet examples = basicExamples.copy();
        FeatureSpecification spec = new FeatureSpecification();

        System.out.print("Configuring features over " + examples.size() + " examples... ");

        try {
            spec.addFilter(getPronounsFilter(examples));
            spec.addFilter(getPunctuationFilter(examples));
            spec.addFilter(getSpecialWordsFilter(examples));
            spec.addFilter(getSpellingFilter(examples));

            spec.addFilter(getEmoticonsFilter(examples));
            spec.addFilter(getUnigramBigramFilter(examples));
            spec.addFilter(getParticipantsFilter(examples));
            spec.addFilter(getRemoveIDFilter(examples));
            // The message attribute is removed only after the word features
            // above have been extracted from it.
            spec.addFilter(getRemoveMessageFilter(examples));
            // Currently disabled:
            //spec.addFilter(getSparseToNonsparseFilter(examples));
            //spec.addFilter(getFeatureSelectionFilter(examples));

            Instances output = spec.getOutputFormat();
            int numAttrs = output.numAttributes();
            // One attribute is not counted as a feature (presumably the class
            // label -- confirm against FeatureSpecification).
            System.out.println("generated " + (numAttrs - 1) + " features.");
        } catch (Exception e) {
            System.err.println("Error generating features.");
            // getMessage() is null for exceptions created without a message;
            // fall back to toString() so the exception type is still reported
            // instead of the literal text "null".
            String detail = (e.getMessage() != null) ? e.getMessage() : e.toString();
            System.err.println("\t" + detail);
        }

        return spec;
    }

    /**
     * Get a word-features filter (with bigrams enabled) configured on the
     * message attribute of the provided examples.
     *
     * <p>Side effect: the filter is applied to {@code examples} and the
     * examples' instances are replaced with the filtered result.
     *
     * @param examples the examples used to configure the filter; their
     * instances are replaced by the filtered instances
     * @return the configured filter
     * @throws Exception if the filter cannot be configured or applied
     */
    protected Filter getUnigramBigramFilter(ExampleSet examples) throws Exception {
        WordFeaturesExtractor filter = new WordFeaturesExtractor();
        filter.setSelectedAttributeName(ExampleSet.MESSAGE_ATTR_NAME);

        filter.setLowerCaseTokens(true);
        //use stemming and remove "nonsense"
        // NOTE(review): the meaning of the boolean argument is not visible
        // here -- confirm against SimpleStringToWordVector.NoNonsenseStemmer.
        filter.setStemmer(new SimpleStringToWordVector.NoNonsenseStemmer(false));

        filter.setUseBigrams(true);

        filter.setInputFormat(examples.getInstances());
        Instances filtered = Filter.useFilter(examples.getInstances(), filter);
        examples.setInstances(filtered);

        return filter;
    }

    /**
     * Get a filter that removes the message attribute
     * ({@link ExampleSet#MESSAGE_ATTR_NAME}) from the data set.
     *
     * <p>The attribute name is passed through {@link Pattern#quote(String)} so
     * that it is matched literally rather than interpreted as a regular
     * expression.
     *
     * <p>Side effect: the filter is applied to {@code examples} and the
     * examples' instances are replaced with the filtered result.
     *
     * @param examples the examples used to configure the filter; their
     * instances are replaced by the filtered instances
     * @return the configured filter
     * @throws Exception if the filter cannot be configured or applied
     */
    protected Filter getRemoveMessageFilter(ExampleSet examples) throws Exception {
        RemoveByName filter = new RemoveByName();
        filter.setExpression(Pattern.quote(ExampleSet.MESSAGE_ATTR_NAME));

        filter.setInputFormat(examples.getInstances());
        Instances filtered = Filter.useFilter(examples.getInstances(), filter);
        examples.setInstances(filtered);

        return filter;
    }
}