pl.coffeepower.blog.examples.LicenseWordsAnalyzer.java Source code

Java tutorial

Introduction

Here is the source code for pl.coffeepower.blog.examples.LicenseWordsAnalyzer.java

Source

/*
 * The MIT License (MIT)
 *
 * Copyright (c) 2015 Micha Jonko
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

package pl.coffeepower.blog.examples;

import com.google.common.io.Resources;

import lombok.extern.log4j.Log4j2;

import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;

import pl.coffeepower.blog.examples.counters.SimpleWordsFrequencyCounter;

import java.io.IOException;
import java.util.Map;

/**
 * Tokenizes a text with an OpenNLP {@link Tokenizer} and reports how often each token occurs.
 *
 * <p>The tokenizer model is loaded once, in the constructor, from the resource named by
 * {@link #TOKEN_MODEL}. Counting itself is delegated to a {@code WordsFrequencyCounter}; how that
 * counter normalizes or orders words is decided by its implementation, not by this class.
 */
@Log4j2
public final class LicenseWordsAnalyzer {

    /** Name of the resource holding the OpenNLP tokenizer model. */
    public static final String TOKEN_MODEL = "en-token.bin";

    /** Number of entries {@link #main(String[])} logs after sorting by descending count. */
    public static final int TOP_WORDS_LIMIT = 3;

    /** Sample text analyzed by {@link #calculateWordsFrequency()}: the MIT license. */
    public static final String LICENSE = "The MIT License (MIT)\n"
            + "Copyright (c) 2015 Micha Jonko\n"
            + "Permission is hereby granted, free of charge, to any person obtaining a copy\n"
            + "of this software and associated documentation files (the \"Software\"), to deal\n"
            + "in the Software without restriction, including without limitation the rights\n"
            + "to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n"
            + "copies of the Software, and to permit persons to whom the Software is\n"
            + "furnished to do so, subject to the following conditions:\n"
            + "The above copyright notice and this permission notice shall be included in all\n"
            + "copies or substantial portions of the Software.\n"
            + "THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n"
            + "IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n"
            + "FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\n"
            + "AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n"
            + "LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n"
            + "OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\n"
            + "SOFTWARE.";

    private final Tokenizer tokenizer;

    /**
     * Creates an analyzer backed by a {@link TokenizerME} built from the {@link #TOKEN_MODEL}
     * resource.
     *
     * @throws IOException if the tokenizer model cannot be read
     */
    public LicenseWordsAnalyzer() throws IOException {
        this.tokenizer = new TokenizerME(new TokenizerModel(Resources.getResource(TOKEN_MODEL)));
    }

    /**
     * Logs the {@link #TOP_WORDS_LIMIT} most frequent tokens of {@link #LICENSE}, one
     * {@code token : count} line each, highest count first.
     *
     * @param args ignored
     * @throws IOException if the tokenizer model cannot be read
     */
    public static void main(String[] args) throws IOException {
        new LicenseWordsAnalyzer().calculateWordsFrequency().entrySet().stream()
                // Descending by count. Integer.compare states the ordering explicitly and,
                // unlike a subtraction-based comparator, can never wrap around.
                .sorted((entry1, entry2) -> Integer.compare(entry2.getValue(), entry1.getValue()))
                .limit(TOP_WORDS_LIMIT)
                .forEach(entry -> log.info("{} : {}", entry.getKey(), entry.getValue()));
    }

    /**
     * Counts the tokens of the built-in {@link #LICENSE} text.
     *
     * @return the frequency map reported by the counter for {@link #LICENSE}
     */
    public final Map<String, Integer> calculateWordsFrequency() {
        return calculateWordsFrequency(LICENSE);
    }

    /**
     * Tokenizes {@code text} and feeds every token, in order, to a fresh
     * {@link SimpleWordsFrequencyCounter}.
     *
     * @param text the text to analyze; must not be {@code null}
     * @return the frequency map reported by the counter after all tokens were submitted
     * @throws IllegalArgumentException if {@code text} is {@code null}
     */
    public Map<String, Integer> calculateWordsFrequency(String text) {
        if (text == null) {
            throw new IllegalArgumentException("text must not be null");
        }
        WordsFrequencyCounter wordsFrequencyCounter = new SimpleWordsFrequencyCounter();
        for (String word : tokenizer.tokenize(text)) {
            wordsFrequencyCounter.increase(word);
        }
        return wordsFrequencyCounter.wordsFrequency();
    }
}