List of usage examples for org.apache.commons.collections4.map.MultiValueMap#get
Object get(Object key);
From source file: com.farhad.ngram.lang.detector.profile.LanguageProfile.java
public void construct(String path) { MultiValueMap corpus = filetools.readFile(path); Iterator<String> it = corpus.keySet().iterator(); Map<String, Map> profile = new ConcurrentHashMap<>(); //iterate over each class while (it.hasNext()) { String theKey = (String) it.next(); List<String> texts = (List<String>) corpus.get(theKey); Num_Docs = texts.size();/*from w w w . ja va 2 s . co m*/ Map<Integer, Map> ngrams_lang = new HashMap<Integer, Map>(); Map<String, MyPair> ngrams_profile = new ConcurrentHashMap<>(); //iterate over each text for (int i = 0; i < texts.size(); i++) { String text = texts.get(i); Map<String, Integer> grams = new HashMap<>(); for (int n = 1; n <= ngrams; n++) { grams.putAll(NgramExtractor.gramLength(n).extractCountedGrams(text)); } ngrams_lang.put(i, grams); } Iterator<Integer> itt = ngrams_lang.keySet().iterator(); while (itt.hasNext()) { Map<String, Integer> ngram = ngrams_lang.get(itt.next()); Iterator<String> ittt = ngram.keySet().iterator(); while (ittt.hasNext()) { String ng = ittt.next(); if (ngrams_profile.containsKey(ng)) { MyPair pair = ngrams_profile.get(ng); pair.setFirst(pair.getFirst() + ngram.get(ng)); pair.setSecond(pair.getSecond() + 1); ngrams_profile.put(ng, pair); } else { MyPair pair = new MyPair(ngram.get(ng), 1); ngrams_profile.put(ng, pair); } } } profile.put(theKey, ngrams_profile); } //filter based on doc_frequency and term_frequency Iterator<String> p_it = profile.keySet().iterator(); while (p_it.hasNext()) { String lang = p_it.next(); List<String> texts = (List<String>) corpus.get(lang); Num_Docs = texts.size(); Map<String, MyPair> ngram = profile.get(lang); Iterator<String> ngram_it = ngram.keySet().iterator(); while (ngram_it.hasNext()) { String key = ngram_it.next(); MyPair freq = ngram.get(key); if (freq.getFirst() <= MIN_TERM_FREQUENCY | freq.getSecond() >= Num_Docs) { ngram.remove(key); } } } //computer the term frequecny for each n-gram p_it = profile.keySet().iterator(); while (p_it.hasNext()) { String lang = 
p_it.next(); List<String> texts = (List<String>) corpus.get(lang); Num_Docs = texts.size(); Map<String, MyPair> ngram = profile.get(lang); int N = ngram.keySet().size(); Iterator<String> ngram_it = ngram.keySet().iterator(); Map<String, Double> ngram_tfidf = new HashMap<>(); while (ngram_it.hasNext()) { String key = ngram_it.next(); MyPair freq = ngram.get(key); double tf = (double) freq.getFirst() / N; ngram_tfidf.put(key, tf); } //write the language profile String filename = lang + ".profile"; saveProfile(filename, ngram_tfidf); } }