Example usage for weka.filters.unsupervised.attribute StringToWordVector setOutputWordCounts

List of usage examples for weka.filters.unsupervised.attribute StringToWordVector setOutputWordCounts

Introduction

In this page you can find the example usage for weka.filters.unsupervised.attribute StringToWordVector setOutputWordCounts.

Prototype

public void setOutputWordCounts(boolean outputWordCounts) 

Source Link

Document

Sets whether output instances contain 0 or 1 indicating word presence, or word counts.

Usage

From source file: com.hack23.cia.service.impl.action.user.wordcount.WordCounterImpl.java

License: Apache License

@Override
public Map<String, Integer> calculateWordCount(final DocumentContentData documentContentData,
        final int maxResult) {

    final String html = documentContentData.getContent();

    // Single string attribute holding the raw document content.
    final Attribute input = new Attribute("html", (ArrayList<String>) null);

    final ArrayList<Attribute> inputVec = new ArrayList<>();
    inputVec.add(input);

    // One-instance dataset wrapping the document text.
    final Instances htmlInst = new Instances("html", inputVec, 1);

    htmlInst.add(new DenseInstance(1));
    htmlInst.instance(0).setValue(0, html);

    // Treat short tokens (fewer than 5 characters) as stop words.
    // Renamed from "StopwordsHandler": a local variable must not reuse the
    // UpperCamelCase spelling of its own type (it shadowed the interface name).
    final StopwordsHandler stopwordsHandler = new StopwordsHandler() {

        @Override
        public boolean isStopword(final String word) {
            return word.length() < 5;
        }
    };

    // Unigram tokenizer (min = max = 1).
    final NGramTokenizer tokenizer = new NGramTokenizer();
    tokenizer.setNGramMinSize(1);
    tokenizer.setNGramMaxSize(1);
    tokenizer.setDelimiters(" \r\n\t.,;:'\"()?!'");

    final StringToWordVector filter = new StringToWordVector();
    filter.setTokenizer(tokenizer);
    filter.setStopwordsHandler(stopwordsHandler);
    filter.setLowerCaseTokens(true);
    filter.setOutputWordCounts(true);
    filter.setWordsToKeep(maxResult);

    final Map<String, Integer> result = new HashMap<>();

    try {
        // setInputFormat must come after all option setters and before useFilter.
        filter.setInputFormat(htmlInst);
        final Instances dataFiltered = Filter.useFilter(htmlInst, filter);

        final Instance last = dataFiltered.lastInstance();

        final int numAttributes = last.numAttributes();

        for (int i = 0; i < numAttributes; i++) {
            // Read the numeric attribute value directly instead of parsing
            // Instance.toString(i): the string rendering of a numeric (double)
            // value is not guaranteed to be a valid integer literal and would
            // make Integer.valueOf throw NumberFormatException.
            result.put(last.attribute(i).name(), Integer.valueOf((int) last.value(i)));
        }
    } catch (final Exception e) {
        LOGGER.warn("Problem calculating wordcount for : {} , exception:{}", documentContentData.getId(), e);
    }

    return result;
}

From source file: nl.uva.expose.classification.WekaClassification.java

/**
 * Builds a TF*IDF, length-normalized word-vector view of the raw instances.
 *
 * <p>Fix: the original method assigned {@code dFiltered = Filter.useFilter(...)},
 * which only rebinds the local parameter — Java passes references by value, so
 * the caller never saw the filtered data. The filtered {@link Instances} is now
 * returned instead (source-compatible: existing callers that ignored the old
 * {@code void} result still compile). The {@code dFiltered} parameter is kept
 * for signature compatibility. A duplicate {@code setLowerCaseTokens(true)}
 * call was also removed.
 *
 * @param dRaw      raw string instances to vectorize
 * @param dFiltered unused; retained for backward compatibility
 * @return the word-vector transformed instances
 * @throws Exception if the Weka filter cannot be configured or applied
 */
private Instances getWordVector(Instances dRaw, Instances dFiltered) throws Exception {
    StringToWordVector filter = new StringToWordVector();
    filter.setAttributeIndices("first-last");
    filter.setIDFTransform(true);
    filter.setLowerCaseTokens(true);
    filter.setMinTermFreq(2);
    filter.setNormalizeDocLength(
            new SelectedTag(StringToWordVector.FILTER_NORMALIZE_ALL, StringToWordVector.TAGS_FILTER));
    filter.setOutputWordCounts(true);
    // setInputFormat must be the last configuration call before useFilter.
    filter.setInputFormat(dRaw);
    return Filter.useFilter(dRaw, filter);
}

From source file: org.montp2.m1decol.ter.gramms.filters.FilterTokenizerBoolean.java

License: Open Source License

/**
 * Loads an ARFF file, converts its string attributes to a boolean
 * (presence/absence) word vector, and writes the result to {@code outPath}.
 *
 * <p>Fix: {@code setInputFormat} was originally called BEFORE the option
 * setters. StringToWordVector determines its output format when
 * {@code setInputFormat} runs, so options set afterwards were silently
 * ignored. Configuration now happens first; {@code setInputFormat} is last.
 *
 * @param inPath  path of the input ARFF file
 * @param outPath path the filtered instances are written to
 * @throws Exception if loading, filtering, or writing fails
 */
public void indexingToTokenizer(String inPath, String outPath) throws Exception {
    WordTokenizer wordTokenizer = new WordTokenizer();
    wordTokenizer.setDelimiters("\r \t.,;:'\"()?!");

    Instances inputInstances = WekaUtils.loadARFF(inPath);

    StringToWordVector filter = new StringToWordVector();
    filter.setDoNotOperateOnPerClassBasis(false);
    filter.setInvertSelection(false);
    filter.setLowerCaseTokens(true);
    // false => output 0/1 word presence rather than counts.
    filter.setOutputWordCounts(false);
    filter.setTokenizer(wordTokenizer);
    filter.setUseStoplist(true);
    filter.setWordsToKeep(wordsTokeep);
    // Must be the LAST configuration call: options set after setInputFormat
    // are ignored by StringToWordVector.
    filter.setInputFormat(inputInstances);

    Instances outputInstances = Filter.useFilter(inputInstances, filter);

    OutputStreamUtils.writeSimple(outputInstances.toString(), outPath);
}

From source file: org.montp2.m1decol.ter.gramms.filters.FilterTokenizerIDFT.java

License: Open Source License

/**
 * Loads an ARFF file, converts its string attributes to a TF*IDF-weighted
 * word vector, and writes the result to {@code outPath}.
 *
 * <p>Fix: {@code setInputFormat} was originally called BEFORE the option
 * setters. StringToWordVector determines its output format when
 * {@code setInputFormat} runs, so the TF/IDF transforms and other options set
 * afterwards were silently ignored. Configuration now happens first;
 * {@code setInputFormat} is last.
 *
 * @param inPath  path of the input ARFF file
 * @param outPath path the filtered instances are written to
 * @throws Exception if loading, filtering, or writing fails
 */
public void indexingToTokenizer(String inPath, String outPath) throws Exception {

    WordTokenizer wordTokenizer = new WordTokenizer();
    wordTokenizer.setDelimiters("\r \t.,;:'\"()?!");

    Instances inputInstances = WekaUtils.loadARFF(inPath);

    StringToWordVector filter = new StringToWordVector();
    filter.setIDFTransform(true);
    filter.setTFTransform(true);
    filter.setDoNotOperateOnPerClassBasis(false);
    filter.setInvertSelection(false);
    filter.setLowerCaseTokens(true);
    filter.setMinTermFreq(3);
    filter.setOutputWordCounts(true);
    filter.setTokenizer(wordTokenizer);
    filter.setUseStoplist(true);
    filter.setWordsToKeep(200);
    // Must be the LAST configuration call: options set after setInputFormat
    // are ignored by StringToWordVector.
    filter.setInputFormat(inputInstances);

    Instances outputInstances = Filter.useFilter(inputInstances, filter);

    OutputStreamUtils.writeSimple(outputInstances.toString(), outPath);
}

From source file: org.montp2.m1decol.ter.gramms.filters.FilterTokenizerVector.java

License: Open Source License

/**
 * Loads an ARFF file, converts its string attributes to a word-count vector,
 * and writes the result to {@code outPath}.
 *
 * <p>Fix: {@code setInputFormat} was originally called BEFORE the option
 * setters. StringToWordVector determines its output format when
 * {@code setInputFormat} runs, so options set afterwards were silently
 * ignored. Configuration now happens first; {@code setInputFormat} is last.
 *
 * @param inPath  path of the input ARFF file
 * @param outPath path the filtered instances are written to
 * @throws Exception if loading, filtering, or writing fails
 */
public void indexingToTokenizer(String inPath, String outPath) throws Exception {
    WordTokenizer wordTokenizer = new WordTokenizer();
    wordTokenizer.setDelimiters("\r \t.,;:'\"()?!");

    Instances inputInstances = WekaUtils.loadARFF(inPath);

    StringToWordVector filter = new StringToWordVector();
    filter.setDoNotOperateOnPerClassBasis(false);
    filter.setInvertSelection(false);
    filter.setLowerCaseTokens(true);
    filter.setMinTermFreq(3);
    filter.setOutputWordCounts(true);
    filter.setTokenizer(wordTokenizer);
    filter.setUseStoplist(true);
    filter.setWordsToKeep(200);
    // Must be the LAST configuration call: options set after setInputFormat
    // are ignored by StringToWordVector.
    filter.setInputFormat(inputInstances);

    Instances outputInstances = Filter.useFilter(inputInstances, filter);

    OutputStreamUtils.writeSimple(outputInstances.toString(), outPath);
}