Example usage for weka.filters.unsupervised.attribute StringToWordVector setStopwordsHandler

List of usage examples for weka.filters.unsupervised.attribute StringToWordVector setStopwordsHandler

Introduction

In this page you can find the example usage for weka.filters.unsupervised.attribute StringToWordVector setStopwordsHandler.

Prototype

public void setStopwordsHandler(StopwordsHandler value) 

Source Link

Document

Sets the stopwords handler to use.

Usage

From source file:com.hack23.cia.service.impl.action.user.wordcount.WordCounterImpl.java

License:Apache License

@Override
public Map<String, Integer> calculateWordCount(final DocumentContentData documentContentData,
        final int maxResult) {

    final String html = documentContentData.getContent();

    final Attribute input = new Attribute("html", (ArrayList<String>) null);

    final ArrayList<Attribute> inputVec = new ArrayList<>();
    inputVec.add(input);//  w ww.j a v  a 2 s .c  om

    final Instances htmlInst = new Instances("html", inputVec, 1);

    htmlInst.add(new DenseInstance(1));
    htmlInst.instance(0).setValue(0, html);

    final StopwordsHandler StopwordsHandler = new StopwordsHandler() {

        @Override
        public boolean isStopword(final String word) {

            return word.length() < 5;
        }
    };

    final NGramTokenizer tokenizer = new NGramTokenizer();
    tokenizer.setNGramMinSize(1);
    tokenizer.setNGramMaxSize(1);
    tokenizer.setDelimiters(" \r\n\t.,;:'\"()?!'");

    final StringToWordVector filter = new StringToWordVector();
    filter.setTokenizer(tokenizer);
    filter.setStopwordsHandler(StopwordsHandler);
    filter.setLowerCaseTokens(true);
    filter.setOutputWordCounts(true);
    filter.setWordsToKeep(maxResult);

    final Map<String, Integer> result = new HashMap<>();

    try {
        filter.setInputFormat(htmlInst);
        final Instances dataFiltered = Filter.useFilter(htmlInst, filter);

        final Instance last = dataFiltered.lastInstance();

        final int numAttributes = last.numAttributes();

        for (int i = 0; i < numAttributes; i++) {
            result.put(last.attribute(i).name(), Integer.valueOf(last.toString(i)));
        }
    } catch (final Exception e) {
        LOGGER.warn("Problem calculating wordcount for : {} , exception:{}", documentContentData.getId(), e);
    }

    return result;
}