Example usage for edu.stanford.nlp.stats IntCounter IntCounter

List of usage examples for edu.stanford.nlp.stats IntCounter IntCounter

Introduction

This page collects usage examples for the no-argument constructor IntCounter() of the class edu.stanford.nlp.stats.IntCounter.

Prototype

public IntCounter() 

Source Link

Document

Constructs a new (empty) Counter.

Usage

From source file:ilcc.ccgparser.nnparser.IncNNParser.java

/**
 * Builds the training dataset by replaying, for every sentence, its gold action sequence
 * through {@code srparser} and recording one example per gold action.
 *
 * <p>For each gold action the features of the current parser state are extracted and a label
 * list of length {@code actsList.size()} is built: 1 for the gold action, 0 for every other
 * action returned by {@code getAction} that has an id in {@code actsMap}, and -1 elsewhere.
 * Progress and timing information is written to {@code System.err}.
 *
 * <p>Side effect: {@code preComputed} is replaced by the first {@code config.numPreComputed}
 * keys of the (feature value, position) counter as ordered by {@code Counters.toSortedList}
 * (presumably most frequent first, given the {@code false} flag; confirm against the library).
 *
 * @param sents sentences to process; {@code null} entries are skipped
 * @param trees not read by this method
 * @return the dataset holding every generated example
 * @throws IOException declared by the signature; no throw site is visible in this method,
 *         so it presumably originates in a helper (confirm)
 */
public Dataset genTrainExamples(List<CCGJSentence> sents, List<CCGJTreeNode> trees) throws IOException {
    int transitionCount = actsList.size();
    Dataset examples = new Dataset(config.numTokens, transitionCount);
    Counter<Integer> featurePositionCounts = new IntCounter<>();

    System.err.println(Config.SEPARATOR);
    System.err.println("Generate training examples...");
    System.err.println("With #transitions: " + transitionCount);
    long startMillis = System.currentTimeMillis();
    System.err.println("Started at: " + new Date(System.currentTimeMillis()));

    for (int sentIdx = 0; sentIdx < sents.size(); sentIdx++) {
        // Progress output: the index every 1000 sentences, a line break every 10000
        // sentences and after the last one. Nothing is printed for the first sentence.
        if (sentIdx > 0) {
            if (sentIdx % 1000 == 0) {
                System.err.print(sentIdx + " ");
            }
            boolean endOfRow = sentIdx % 10000 == 0 || sentIdx == sents.size() - 1;
            if (endOfRow) {
                System.err.println();
            }
        }

        CCGJSentence sentence = sents.get(sentIdx);
        if (sentence == null) {
            continue;
        }
        srparser.initVars(sentence);

        // Gold details are looked up with a key that is one past the list index.
        List<ArcJAction> goldActions = goldDetails.get(sentIdx + 1).getarcActs();
        for (ArcJAction goldAction : goldActions) {
            ArrayList<ArcJAction> candidates = getAction(srparser);

            // Only filled in when the parser runs incrementally and the stack holds
            // at least two nodes; otherwise the feature extractor receives null.
            ArrayList<Integer> rightPerList = null;
            int stackDepth = srparser.stack.size();
            if (srparser.incalgo && stackDepth > 1) {
                CCGJTreeNode secondFromTop = srparser.stack.get(stackDepth - 2);
                Integer leftVertex = secondFromTop.getConllNode().getNodeId();
                rightPerList = srparser.depGraph.getRightPer(leftVertex);
            }
            List<Integer> features = getFeatures(srparser, rightPerList, sentence);

            // Start with every transition marked -1, then mark known candidates 0 / 1.
            List<Integer> labels = new ArrayList<>(Collections.nCopies(transitionCount, -1));
            for (ArcJAction candidate : candidates) {
                Integer actionId = actsMap.get(candidate);
                if (actionId == null) {
                    continue;
                }
                labels.set(actionId, candidate.equals(goldAction) ? 1 : 0);
            }
            examples.addExample(features, labels);

            // Count each (feature value, position) pair, packed into a single int key.
            int featureCount = features.size();
            for (int pos = 0; pos < featureCount; pos++) {
                featurePositionCounts.incrementCount(features.get(pos) * featureCount + pos);
            }

            srparser.applyAction(goldAction);
        }
    }

    System.err.println("#Train Examples: " + examples.n);
    long endMillis = System.currentTimeMillis();
    System.err.println("Ended at : " + new Date(System.currentTimeMillis()) + " taking "
            + 0.001 * (endMillis - startMillis) + " secs");

    List<Integer> rankedKeys = Counters.toSortedList(featurePositionCounts, false);
    int keep = Math.min(config.numPreComputed, rankedKeys.size());
    preComputed = new ArrayList<>(rankedKeys.subList(0, keep));

    return examples;
}

From source file:lv.lnb.ner.MergeEntityInformation.java

License:Open Source License

/**
 * Normalizes one entity mention and adds its count to the shared counters.
 *
 * <p>A mention whose trimmed {@code word} is shorter than 3 characters is ignored. A mention
 * containing "," or " ." is split on that separator and every (word, normal form) pair is
 * processed recursively; if the two strings split into a different number of parts, a
 * message is printed to stderr and the mention is dropped. Otherwise a fixed sequence of
 * corrections is applied to {@code word} and to {@code normalform}, filtered forms are
 * discarded, and {@code mention_count} is added to {@code counter}, to the per-key entry of
 * {@code popular_forms}, and to {@code counterbydoc}.
 *
 * <p>NOTE(review): several literals below contain '?' or appear to be missing letters. This
 * is presumably lost non-ASCII (Latvian) text; confirm against the original file encoding.
 * Inside the {@code replaceAll} patterns a '?' is interpreted as a regex quantifier.
 *
 * @param word surface form of the mention
 * @param normalform normalized form paired with {@code word}
 * @param category category name; becomes the prefix of the counter key
 * @param doc document identifier; appended to the key for the per-document counter
 * @param mention_count amount added to each counter
 */
private static void add_word(String word, String normalform, String category, String doc, int mention_count) {
    // Skip very short mentions.
    if (word.trim().length() < 3)
        return;
    // Comma-separated list: handle each element as its own mention.
    if (word.contains(",")) {
        String[] wordparts = word.split(",");
        String[] formparts = normalform.split(",");
        if (wordparts.length != formparts.length) {
            System.err.println(String.format("Nesakrt komatu skaits '%s' un '%s'.", word, normalform));
            return;
        }
        for (int i = 0; i < wordparts.length; i++) {
            add_word(wordparts[i].trim(), formparts[i].trim(), category, doc, mention_count);
        }
        return;
    }
    // Same treatment for mentions separated by " ." (a space followed by a period).
    if (word.contains(" .")) {
        String[] wordparts = word.split(Pattern.quote(" ."));
        String[] formparts = normalform.split(Pattern.quote(" ."));
        if (wordparts.length != formparts.length) {
            System.err.println(String.format("Nesakrt punktu skaits '%s' un '%s'.", word, normalform));
            return;
        }
        for (int i = 0; i < wordparts.length; i++) {
            add_word(wordparts[i].trim(), formparts[i].trim(), category, doc, mention_count);
        }
        return;
    }

    // Corrections to the surface form. They run in sequence, so each step operates on the
    // result of the previous ones.
    word = word.replace("CX", "CK");
    word = word.replace(" eela", " iela");
    word = word.replace("Kr .", "Kr.");
    word = word.replace("Riga", "Rga");
    word = word.replace("Kreemija", "Krievija");
    word = word.replace("Kreewija", "Krievija");
    word = word.replace("Wahzija", "V?cija");
    word = word.replace("Cehoslovakija", "ehoslovakija");
    // Regex replacements anchored at the end of the string ('$').
    word = word.replaceAll("[aA]ugst?k?s [pP]adomes$", "Augst?k? Padome");
    word = word.replaceAll("[fF]ederatvaj? [rR]epublik?$", "Federatv? Republika");
    word = word.replaceAll("[dD]emokr?tiskaj? [rR]epublik?$", "Demokr?tisk? Republika");
    word = word.replaceAll("[kK]omunistisk?s [pP]artijas$", "Komunistisk? Partija");
    word = word.replace(" lela", " iela");
    word = word.replace(" lels", " iela");
    word = word.replace("dzv. ", "");
    // Whole-word, case-insensitive substitutions for specific names.
    if (word.equalsIgnoreCase("Hitlera"))
        word = "Hitlers";
    if (word.equalsIgnoreCase("Raia"))
        word = "Rainis";
    if (word.equalsIgnoreCase("Vtola"))
        word = "Vtols";
    if (word.equalsIgnoreCase("Mocarta"))
        word = "Mocarts";
    if (word.equalsIgnoreCase("aikovska"))
        word = "aikovskis";
    if (word.equalsIgnoreCase("Daugavpil"))
        word = "Daugavpils";
    if (word.equalsIgnoreCase("Liep?jas"))
        word = "Liep?ja";
    if (word.equalsIgnoreCase("Liep?j?"))
        word = "Liep?ja";
    if (word.equalsIgnoreCase("Csu"))
        word = "Csis";
    if (word.equalsIgnoreCase("Blaumaa"))
        word = "Blaumanis";
    if (word.equalsIgnoreCase("Lietavas"))
        word = "Lietava";
    if (word.equalsIgnoreCase("ns"))
        word = "na";
    if (word.equalsIgnoreCase("Strencis"))
        word = "Stren?i";
    if (word.equalsIgnoreCase("Rig?"))
        word = "Rga";
    if (word.equalsIgnoreCase("Kubs"))
        word = "Kuba";
    // NOTE(review): same condition and assignment as the "Rig?" check two statements above;
    // it has no additional effect.
    if (word.equalsIgnoreCase("Rig?"))
        word = "Rga";
    if (word.equalsIgnoreCase("Kijevs"))
        word = "Kijeva";
    if (word.equalsIgnoreCase("Latmija"))
        word = "Latvija";
    if (word.equalsIgnoreCase("Trbat?"))
        word = "Trbata";
    if (word.equalsIgnoreCase("Sabil"))
        word = "Sabile";
    if (word.equalsIgnoreCase("Melluzis"))
        word = "Mellui";
    if (word.equalsIgnoreCase("Polijas") || word.equalsIgnoreCase("Polij?"))
        word = "Polija";
    // Hack for inflected forms that were not recognized (translated from Latvian):
    // a trailing "ijas" loses its last character, a trailing "ij?" gets its last
    // character replaced by "a".
    if (word.endsWith("ijas"))
        word = word.substring(0, word.length() - 1);
    if (word.endsWith("ij?"))
        word = word.substring(0, word.length() - 1) + "a";

    // Corresponding corrections to the normal form, with lower-case replacement values.
    // The list is similar to, but not the same as, the one applied to the surface form.
    normalform = normalform.replace("CX", "CK");
    normalform = normalform.replace("|", "");
    normalform = normalform.replace(" eela", " iela");
    normalform = normalform.replace("Kr .", "Kr.");
    normalform = normalform.replace("riga", "rga");
    normalform = normalform.replace("kreewija", "krievija");
    normalform = normalform.replace("kreemija", "krievija");
    normalform = normalform.replace("wahzija", "v?cija");
    normalform = normalform.replace(" lels", " iela");
    normalform = normalform.replace("dzv. ", "");
    if (normalform.equalsIgnoreCase("Hitlera"))
        normalform = "hitlers";
    if (normalform.equalsIgnoreCase("Raia"))
        normalform = "rainis";
    if (normalform.equalsIgnoreCase("Vtola"))
        normalform = "vtols";
    if (normalform.equalsIgnoreCase("Mocarta"))
        normalform = "mocarts";
    if (normalform.equalsIgnoreCase("aikovska"))
        normalform = "?aikovskis";
    if (normalform.equalsIgnoreCase("Daugavpil"))
        normalform = "daugavpils";
    if (normalform.equalsIgnoreCase("Liep?jas"))
        normalform = "liep?ja";
    if (normalform.equalsIgnoreCase("Liep?j?"))
        normalform = "liep?ja";
    if (normalform.equalsIgnoreCase("blaumaa"))
        normalform = "blaumanis";
    if (normalform.equalsIgnoreCase("ns"))
        normalform = "na";
    // NOTE(review): the surface-form check above tests "Polij?" where this one tests
    // "Polijs"; confirm whether the difference is intended.
    if (normalform.equalsIgnoreCase("Polijas") || normalform.equalsIgnoreCase("Polijs"))
        normalform = "polija";
    if (normalform.endsWith("ijas"))
        normalform = normalform.substring(0, normalform.length() - 1);
    if (normalform.endsWith("ij?"))
        normalform = normalform.substring(0, normalform.length() - 1) + "a";

    // Filters: drop blacklisted forms, normal forms containing " un " (unless they also
    // contain "opera"), and normal forms longer than 50 characters.
    if (blacklist.containsKey(word))
        return;
    if (blacklist.containsKey(normalform))
        return;
    if (normalform.contains(" un ") && !normalform.contains("opera"))
        return;
    if (normalform.length() > 50)
        return;

    // Overall count per "category|normalform" key.
    String key = category + "|" + normalform;
    counter.incrementCount(key, mention_count);

    // Per-key counter of the surface forms seen for this entity, created on first use.
    Counter<String> forms = popular_forms.get(key);
    if (forms == null) {
        forms = new IntCounter<String>();
        popular_forms.put(key, forms);
    }
    forms.incrementCount(word, mention_count);

    // Count per "category|normalform|doc" key.
    counterbydoc.incrementCount(key + "|" + doc, mention_count);
}

From source file:nate.reading.SlotInducer.java

/**
 * Sums argument counts across all of the given slot names.
 *
 * <p>For every name in {@code names}, the argument counts returned by
 * {@code argCounts.getArgsForSlot(name)} are added into a single counter. Names for which
 * that lookup returns {@code null} contribute nothing.
 *
 * @param names the slot names whose argument counts should be combined; may be {@code null}
 * @param argCounts source of the per-slot argument counts; may be {@code null}
 * @return a new counter holding the summed count of each argument; empty when either
 *         parameter is {@code null}
 */
private Counter<String> sumArgs(Collection<String> names, VerbArgCounts argCounts) {
    Counter<String> total = new IntCounter<String>();
    if (names == null || argCounts == null) {
        return total;
    }

    for (String slot : names) {
        Map<String, Integer> countsForSlot = argCounts.getArgsForSlot(slot);
        if (countsForSlot == null) {
            continue;
        }
        for (Map.Entry<String, Integer> argEntry : countsForSlot.entrySet()) {
            total.incrementCount(argEntry.getKey(), argEntry.getValue());
        }
    }
    return total;
}