List of usage examples for the no-argument constructor IntCounter() of edu.stanford.nlp.stats.IntCounter
public IntCounter()
From source file:ilcc.ccgparser.nnparser.IncNNParser.java
public Dataset genTrainExamples(List<CCGJSentence> sents, List<CCGJTreeNode> trees) throws IOException { int numTrans = actsList.size(); Dataset ret = new Dataset(config.numTokens, numTrans); Counter<Integer> tokPosCount = new IntCounter<>(); System.err.println(Config.SEPARATOR); System.err.println("Generate training examples..."); System.err.println("With #transitions: " + numTrans); double start = (long) (System.currentTimeMillis()), end; System.err.println("Started at: " + new Date(System.currentTimeMillis())); for (int i = 0; i < sents.size(); ++i) { if (i > 0) { //System.err.print(i + " "); if (i % 1000 == 0) System.err.print(i + " "); if (i % 10000 == 0 || i == sents.size() - 1) System.err.println(); }// w w w . ja v a 2 s . c o m CCGJSentence sent = sents.get(i); if (sent == null) continue; srparser.initVars(sent); List<ArcJAction> gActList = goldDetails.get(i + 1).getarcActs(); for (ArcJAction gAct : gActList) { ArrayList<ArcJAction> acts = getAction(srparser); ArrayList<Integer> rightPerList = null; int stacksize = srparser.stack.size(); if (srparser.incalgo && stacksize > 1) { CCGJTreeNode left = srparser.stack.get(stacksize - 2); Integer lvertex = left.getConllNode().getNodeId(); rightPerList = srparser.depGraph.getRightPer(lvertex); } List<Integer> feature = getFeatures(srparser, rightPerList, sent); List<Integer> label = new ArrayList<>(Collections.nCopies(numTrans, -1)); for (ArcJAction act : acts) { Integer id = actsMap.get(act); if (id != null) { if (act.equals(gAct)) label.set(id, 1); else label.set(id, 0); } } ret.addExample(feature, label); for (int j = 0; j < feature.size(); ++j) tokPosCount.incrementCount(feature.get(j) * feature.size() + j); srparser.applyAction(gAct); } } System.err.println("#Train Examples: " + ret.n); end = (long) System.currentTimeMillis(); System.err.println("Ended at : " + new Date(System.currentTimeMillis()) + " taking " + 0.001 * (end - start) + " secs"); List<Integer> sortedTokens = Counters.toSortedList(tokPosCount, 
false); preComputed = new ArrayList<>( sortedTokens.subList(0, Math.min(config.numPreComputed, sortedTokens.size()))); return ret; }
From source file:lv.lnb.ner.MergeEntityInformation.java
License:Open Source License
/**
 * Registers one mention of an entity under the key {@code category + "|" + normalform}.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>Words whose trimmed length is below 3 are ignored.</li>
 *   <li>If the word contains a comma (or, failing that, the sequence {@code " ."}), both the word
 *       and the normal form are split on that separator and each trimmed pair is added
 *       recursively; if the two part counts differ, a message is printed to
 *       {@code System.err} and nothing is added.</li>
 *   <li>A fixed sequence of hard-coded replacements is applied to the word, then a similar
 *       sequence to the normal form. Order matters: each step sees the result of the
 *       previous one.</li>
 *   <li>The entry is dropped if the word or the normal form is a key of {@code blacklist}, if the
 *       normal form contains {@code " un "} without containing {@code "opera"}, or if the
 *       normal form is longer than 50 characters.</li>
 *   <li>Otherwise {@code counter}, the per-key counter in {@code popular_forms} and
 *       {@code counterbydoc} are each incremented by {@code mention_count}.</li>
 * </ol>
 *
 * <p>NOTE(review): many string literals below contain {@code ?} or look like they are missing
 * letters — presumably non-ASCII characters were lost in this copy of the source. Confirm
 * against the original file before relying on them. As written, a {@code ?} inside the
 * {@code replaceAll} patterns acts as a regex quantifier.
 *
 * @param word surface form of the mention
 * @param normalform normalised form of the mention; used to build the counter key
 * @param category category label; first component of the counter key
 * @param doc document identifier; appended to the key for the per-document counter
 * @param mention_count amount added to every counter that is updated
 */
private static void add_word(String word, String normalform, String category, String doc, int mention_count) {
    // Too short to be a useful entry.
    if (word.trim().length() < 3)
        return;
    // Comma-separated list: add each (word, normal form) pair on its own.
    if (word.contains(",")) {
        String[] wordparts = word.split(",");
        String[] formparts = normalform.split(",");
        if (wordparts.length != formparts.length) {
            // The two sides split into a different number of parts; report and give up.
            System.err.println(String.format("Nesakrt komatu skaits '%s' un '%s'.", word, normalform));
            return;
        }
        for (int i = 0; i < wordparts.length; i++) {
            add_word(wordparts[i].trim(), formparts[i].trim(), category, doc, mention_count);
        }
        return;
    }
    // Same treatment for the literal separator " ." (quoted so it is not read as a regex).
    if (word.contains(" .")) {
        String[] wordparts = word.split(Pattern.quote(" ."));
        String[] formparts = normalform.split(Pattern.quote(" ."));
        if (wordparts.length != formparts.length) {
            // The two sides split into a different number of parts; report and give up.
            System.err.println(String.format("Nesakrt punktu skaits '%s' un '%s'.", word, normalform));
            return;
        }
        for (int i = 0; i < wordparts.length; i++) {
            add_word(wordparts[i].trim(), formparts[i].trim(), category, doc, mention_count);
        }
        return;
    }
    // Hard-coded substring fixes applied to the surface form
    // (presumably spelling / recognition corrections — confirm with the data owner).
    word = word.replace("CX", "CK");
    word = word.replace(" eela", " iela");
    word = word.replace("Kr .", "Kr.");
    word = word.replace("Riga", "Rga");
    word = word.replace("Kreemija", "Krievija");
    word = word.replace("Kreewija", "Krievija");
    word = word.replace("Wahzija", "V?cija");
    word = word.replace("Cehoslovakija", "ehoslovakija");
    // Regex replacements anchored at the end of the word.
    word = word.replaceAll("[aA]ugst?k?s [pP]adomes$", "Augst?k? Padome");
    word = word.replaceAll("[fF]ederatvaj? [rR]epublik?$", "Federatv? Republika");
    word = word.replaceAll("[dD]emokr?tiskaj? [rR]epublik?$", "Demokr?tisk? Republika");
    word = word.replaceAll("[kK]omunistisk?s [pP]artijas$", "Komunistisk? Partija");
    word = word.replace(" lela", " iela");
    word = word.replace(" lels", " iela");
    // NOTE(review): the next string literal is split across a line break in this copy of the
    // source — confirm the intended text.
    word = word.replace("dzv. 
", "");
    // Whole-word overrides (case-insensitive match, fixed replacement).
    if (word.equalsIgnoreCase("Hitlera"))
        word = "Hitlers";
    if (word.equalsIgnoreCase("Raia"))
        word = "Rainis";
    if (word.equalsIgnoreCase("Vtola"))
        word = "Vtols";
    if (word.equalsIgnoreCase("Mocarta"))
        word = "Mocarts";
    if (word.equalsIgnoreCase("aikovska"))
        word = "aikovskis";
    if (word.equalsIgnoreCase("Daugavpil"))
        word = "Daugavpils";
    if (word.equalsIgnoreCase("Liep?jas"))
        word = "Liep?ja";
    if (word.equalsIgnoreCase("Liep?j?"))
        word = "Liep?ja";
    if (word.equalsIgnoreCase("Csu"))
        word = "Csis";
    if (word.equalsIgnoreCase("Blaumaa"))
        word = "Blaumanis";
    if (word.equalsIgnoreCase("Lietavas"))
        word = "Lietava";
    if (word.equalsIgnoreCase("ns"))
        word = "na";
    if (word.equalsIgnoreCase("Strencis"))
        word = "Stren?i";
    if (word.equalsIgnoreCase("Rig?"))
        word = "Rga";
    if (word.equalsIgnoreCase("Kubs"))
        word = "Kuba";
    // Same check as two statements above; redundant as written.
    if (word.equalsIgnoreCase("Rig?"))
        word = "Rga";
    if (word.equalsIgnoreCase("Kijevs"))
        word = "Kijeva";
    if (word.equalsIgnoreCase("Latmija"))
        word = "Latvija";
    if (word.equalsIgnoreCase("Trbat?"))
        word = "Trbata";
    if (word.equalsIgnoreCase("Sabil"))
        word = "Sabile";
    if (word.equalsIgnoreCase("Melluzis"))
        word = "Mellui";
    if (word.equalsIgnoreCase("Polijas") || word.equalsIgnoreCase("Polij?"))
        word = "Polija";
    // Hack for inflections that were not recognised:
    // a trailing "ijas" loses its last character; a trailing "ij?" has its last character
    // replaced by "a".
    if (word.endsWith("ijas"))
        word = word.substring(0, word.length() - 1);
    if (word.endsWith("ij?"))
        word = word.substring(0, word.length() - 1) + "a";
    // Corresponding fixes for the normal form (lower-case variants; also strips '|',
    // which is the separator used in the counter key below).
    normalform = normalform.replace("CX", "CK");
    normalform = normalform.replace("|", "");
    normalform = normalform.replace(" eela", " iela");
    normalform = normalform.replace("Kr .", "Kr.");
    normalform = normalform.replace("riga", "rga");
    normalform = normalform.replace("kreewija", "krievija");
    normalform = normalform.replace("kreemija", "krievija");
    normalform = normalform.replace("wahzija", "v?cija");
    normalform = normalform.replace(" lels", " iela");
    // NOTE(review): same split string literal as in the word clean-up above.
    normalform = normalform.replace("dzv. 
", "");
    if (normalform.equalsIgnoreCase("Hitlera"))
        normalform = "hitlers";
    if (normalform.equalsIgnoreCase("Raia"))
        normalform = "rainis";
    if (normalform.equalsIgnoreCase("Vtola"))
        normalform = "vtols";
    if (normalform.equalsIgnoreCase("Mocarta"))
        normalform = "mocarts";
    if (normalform.equalsIgnoreCase("aikovska"))
        normalform = "?aikovskis";
    if (normalform.equalsIgnoreCase("Daugavpil"))
        normalform = "daugavpils";
    if (normalform.equalsIgnoreCase("Liep?jas"))
        normalform = "liep?ja";
    if (normalform.equalsIgnoreCase("Liep?j?"))
        normalform = "liep?ja";
    if (normalform.equalsIgnoreCase("blaumaa"))
        normalform = "blaumanis";
    if (normalform.equalsIgnoreCase("ns"))
        normalform = "na";
    if (normalform.equalsIgnoreCase("Polijas") || normalform.equalsIgnoreCase("Polijs"))
        normalform = "polija";
    // Same suffix hack as for the word.
    if (normalform.endsWith("ijas"))
        normalform = normalform.substring(0, normalform.length() - 1);
    if (normalform.endsWith("ij?"))
        normalform = normalform.substring(0, normalform.length() - 1) + "a";
    // Filters: blacklisted forms, " un " phrases (unless they contain "opera"),
    // and overly long normal forms are not counted.
    if (blacklist.containsKey(word))
        return;
    if (blacklist.containsKey(normalform))
        return;
    if (normalform.contains(" un ") && !normalform.contains("opera"))
        return;
    if (normalform.length() > 50)
        return;
    String key = category + "|" + normalform;
    counter.incrementCount(key, mention_count);
    // Per-key counter of surface forms; created on first use.
    Counter<String> forms = popular_forms.get(key);
    if (forms == null) {
        forms = new IntCounter<String>();
        popular_forms.put(key, forms);
    }
    forms.incrementCount(word, mention_count);
    counterbydoc.incrementCount(key + "|" + doc, mention_count);
}
From source file:nate.reading.SlotInducer.java
/** * Creates an overall count of arguments summed across all of the given predicates. * The predicates are a list of IDs, indexing positions in the given names list. * @param ids A set of indices into the names list. * @param names The list of tokens that have arguments in our corpus. * @param argCounts The map from token to argument counts. */// w w w .j ava 2 s .c o m private Counter<String> sumArgs(Collection<String> names, VerbArgCounts argCounts) { Counter<String> sum = new IntCounter<String>(); if (names != null && argCounts != null) { for (String slotname : names) { Map<String, Integer> subcounts = argCounts.getArgsForSlot(slotname); if (subcounts != null) { for (Map.Entry<String, Integer> entry : subcounts.entrySet()) sum.incrementCount(entry.getKey(), entry.getValue()); } } } return sum; }