Example usage for org.apache.commons.collections4.map MultiValueMap get

List of usage examples for org.apache.commons.collections4.map MultiValueMap get

Introduction

On this page you can find example usage of the get method of org.apache.commons.collections4.map.MultiValueMap.

Prototype

Object get(Object key);

Source Link

Document

Gets the collection of values associated with the specified key.

Usage

From source file: com.farhad.ngram.lang.detector.profile.LanguageProfile.java

/**
 * Builds one n-gram profile per corpus key and saves each through {@code saveProfile}.
 *
 * <p>The corpus is read with {@code filetools.readFile(path)}; every key maps to a list of
 * texts. Each key is used as the profile name: the profile is saved as
 * {@code <key> + ".profile"}.
 *
 * <p>For every text, counted n-grams of lengths {@code 1..ngrams} are extracted. Per key,
 * each n-gram gets a {@code MyPair} whose first component is the summed count over all
 * texts and whose second component is the number of texts containing the n-gram.
 *
 * <p>An n-gram is dropped when its summed count is {@code <= MIN_TERM_FREQUENCY} or when it
 * occurs in at least {@code Num_Docs} texts, i.e. in every text of that key. The value saved
 * for each remaining n-gram is its summed count divided by the number of remaining distinct
 * n-grams.
 *
 * <p>Side effect: the {@code Num_Docs} field is overwritten with the text count of the key
 * currently being processed.
 *
 * <p>NOTE(review): a key with exactly one text ends up with an empty profile, because every
 * n-gram then occurs in {@code >= Num_Docs} texts. Confirm this is intended.
 *
 * <p>NOTE(review): the divisor is the number of distinct n-grams, not the total n-gram
 * count. Confirm this is the intended normalisation.
 *
 * @param path location handed to {@code filetools.readFile}
 */
public void construct(String path) {
    MultiValueMap corpus = filetools.readFile(path);

    Map<String, Map<String, MyPair>> profile = new ConcurrentHashMap<>();

    // Phase 1: per corpus key, accumulate (summed count, number of texts) for every n-gram.
    for (Object corpusKey : corpus.keySet()) {
        String lang = (String) corpusKey;

        @SuppressWarnings("unchecked") // the corpus stores the texts of each key as strings
        List<String> texts = (List<String>) corpus.get(lang);
        Num_Docs = texts.size();

        Map<String, MyPair> ngramStats = new ConcurrentHashMap<>();

        for (String text : texts) {
            // Counted n-grams of this single text, for all lengths 1..ngrams.
            Map<String, Integer> grams = new HashMap<>();
            for (int n = 1; n <= ngrams; n++) {
                grams.putAll(NgramExtractor.gramLength(n).extractCountedGrams(text));
            }

            // Fold the text's counts into the statistics of the current key.
            for (Map.Entry<String, Integer> gram : grams.entrySet()) {
                MyPair pair = ngramStats.get(gram.getKey());
                if (pair == null) {
                    ngramStats.put(gram.getKey(), new MyPair(gram.getValue(), 1));
                } else {
                    pair.setFirst(pair.getFirst() + gram.getValue());
                    pair.setSecond(pair.getSecond() + 1);
                }
            }
        }

        profile.put(lang, ngramStats);
    }

    // Phase 2: per key, filter by term/document frequency, compute term frequencies, save.
    for (Map.Entry<String, Map<String, MyPair>> langProfile : profile.entrySet()) {
        String lang = langProfile.getKey();
        Map<String, MyPair> ngramStats = langProfile.getValue();

        @SuppressWarnings("unchecked") // the corpus stores the texts of each key as strings
        List<String> texts = (List<String>) corpus.get(lang);
        // Set before saveProfile so the field reflects the key being written.
        Num_Docs = texts.size();

        // Remove through the iterator so this does not depend on the map implementation.
        Iterator<MyPair> stats = ngramStats.values().iterator();
        while (stats.hasNext()) {
            MyPair freq = stats.next();
            if (freq.getFirst() <= MIN_TERM_FREQUENCY || freq.getSecond() >= Num_Docs) {
                stats.remove();
            }
        }

        // Compute the term frequency of each surviving n-gram.
        int distinctNgrams = ngramStats.size();
        Map<String, Double> termFrequencies = new HashMap<>();
        for (Map.Entry<String, MyPair> stat : ngramStats.entrySet()) {
            double tf = (double) stat.getValue().getFirst() / distinctNgrams;
            termFrequencies.put(stat.getKey(), tf);
        }

        // Write the language profile.
        String filename = lang + ".profile";
        saveProfile(filename, termFrequencies);
    }
}