List of usage examples for edu.stanford.nlp.util Generics newHashMap
public static <K, V> Map<K, V> newHashMap()
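The method returns an empty java.util.HashMap whose key and value types are inferred from the assignment target, so call sites do not repeat the type arguments. Before the project examples below, here is a minimal, self-contained sketch (the class name GenericsNewHashMapDemo is an illustrative placeholder, not part of CoreNLP); it assumes the Stanford CoreNLP jar is on the classpath:

import java.util.Map;

import edu.stanford.nlp.util.Generics;

public class GenericsNewHashMapDemo {
    public static void main(String[] args) {
        // Key and value types are inferred from the declared variable,
        // so there is no need to spell them out on the right-hand side.
        Map<String, Integer> counts = Generics.newHashMap();
        counts.put("newHashMap", 1);
        System.out.println(counts); // prints {newHashMap=1}
    }
}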
From source file:BuildBinarizedDataset.java
/**
 * Turns a text file into trees for use in a RNTN classifier such as
 * the treebank used in the Sentiment project.
 * <br>
 * The expected input file is one sentence per line, with sentences
 * separated by blank lines. The first line has the main label of the
 * sentence together with the full sentence. Lines after the first
 * sentence line but before the blank line will be treated as labeled
 * sub-phrases. The labels should start with the label and then contain
 * a list of tokens the label applies to. All phrases that do not have
 * their own label will take on the main sentence label!
 * For example:
 * <br>
 * <code>
 * 1 Today is not a good day.<br>
 * 3 good<br>
 * 3 good day <br>
 * 3 a good day <br>
 * <br>
 * (next block starts here) <br>
 * </code>
 * By default the englishPCFG parser is used. This can be changed
 * with the <code>-parserModel</code> flag. Specify an input file
 * with <code>-input</code>.
 * <br>
 * If a sentiment model is provided with -sentimentModel, that model
 * will be used to prelabel the sentences. Any spans with given
 * labels will then be used to adjust those labels.
 */
public static void main(String[] arg) throws IOException {
    CollapseUnaryTransformer transformer = new CollapseUnaryTransformer();
    // FileWriter writer = new FileWriter("D:\\dataset\\train.txt", true);

    String parserModel = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
    String args[] = { "-input", "D:\\parse.txt", "-sentimentModel",
            "edu/stanford/nlp/models/sentiment/sentiment.ser.gz" };
    String inputPath = "D:\\dataset\\good.txt";
    String sentimentModelPath = "edu/stanford/nlp/models/sentiment/sentiment.ser.gz";
    SentimentModel sentimentModel = null;

    /*
    for (int argIndex = 0; argIndex < args.length; ) {
        if (args[argIndex].equalsIgnoreCase("-input")) {
            inputPath = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-parserModel")) {
            parserModel = args[argIndex + 1];
            argIndex += 2;
        } else if (args[argIndex].equalsIgnoreCase("-sentimentModel")) {
            sentimentModelPath = args[argIndex + 1];
            argIndex += 2;
        } else {
            System.err.println("Unknown argument " + args[argIndex]);
            System.exit(2);
        }
    }
    */

    if (inputPath == null) {
        throw new IllegalArgumentException("Must specify input file with -input");
    }

    LexicalizedParser parser = LexicalizedParser.loadModel(parserModel);
    TreeBinarizer binarizer = TreeBinarizer.simpleTreeBinarizer(parser.getTLPParams().headFinder(),
            parser.treebankLanguagePack());

    if (sentimentModelPath != null) {
        sentimentModel = SentimentModel.loadSerialized(sentimentModelPath);
    }

    String text = IOUtils.slurpFileNoExceptions(inputPath);
    String[] chunks = text.split("\\n\\s*\\n+"); // need blank line to make a new chunk

    for (String chunk : chunks) {
        if (chunk.trim().isEmpty()) {
            continue;
        }
        // The expected format is that line 0 will be the text of the
        // sentence, and each subsequent line, if any, will be a value
        // followed by the sequence of tokens that get that value.

        // Here we take the first line and tokenize it as one sentence.
        String[] lines = chunk.trim().split("\\n");
        String sentence = lines[0];
        StringReader sin = new StringReader(sentence);
        DocumentPreprocessor document = new DocumentPreprocessor(sin);
        document.setSentenceFinalPuncWords(new String[] { "\n" });
        List<HasWord> tokens = document.iterator().next();
        Integer mainLabel = new Integer(tokens.get(0).word());
        //System.out.print("Main Sentence Label: " + mainLabel.toString() + "; ");
        tokens = tokens.subList(1, tokens.size());
        //System.err.println(tokens);

        Map<Pair<Integer, Integer>, String> spanToLabels = Generics.newHashMap();
        for (int i = 1; i < lines.length; ++i) {
            extractLabels(spanToLabels, tokens, lines[i]);
        }

        // TODO: add an option which treats the spans as constraints when parsing
        Tree tree = parser.apply(tokens);
        Tree binarized = binarizer.transformTree(tree);
        Tree collapsedUnary = transformer.transformTree(binarized);

        // if there is a sentiment model for use in prelabeling, we
        // label here and then use the user given labels to adjust
        if (sentimentModel != null) {
            Trees.convertToCoreLabels(collapsedUnary);
            SentimentCostAndGradient scorer = new SentimentCostAndGradient(sentimentModel, null);
            scorer.forwardPropagateTree(collapsedUnary);
            setPredictedLabels(collapsedUnary);
        } else {
            setUnknownLabels(collapsedUnary, mainLabel);
            //collapsedUnary.label().setValue(mainLabel.toString());
            //System.out.println("Root" + collapsedUnary.getNodeNumber(1));
        }

        Trees.convertToCoreLabels(collapsedUnary);
        collapsedUnary.indexSpans();

        for (Map.Entry<Pair<Integer, Integer>, String> pairStringEntry : spanToLabels.entrySet()) {
            setSpanLabel(collapsedUnary, pairStringEntry.getKey(), pairStringEntry.getValue());
        }

        String x = collapsedUnary.toString();
        //x.replaceAll("\\s","");
        x = x.replace("(", "[");
        x = x.replace(")", "]");
        //writer.write(x);
        //writer.write("\r\n");
        System.out.println(x);
        //System.out.println();
    }
    //writer.close();
}
From source file:com.panot.JavaCoref.MyMUCMentionExtractor.java
License:Open Source License
@Override
public Document nextDoc() throws Exception {
    List<List<CoreLabel>> allWords = new ArrayList<List<CoreLabel>>();
    List<Tree> allTrees = new ArrayList<Tree>();
    List<List<Mention>> allGoldMentions = new ArrayList<List<Mention>>();
    List<List<Mention>> allPredictedMentions;
    List<CoreMap> allSentences = new ArrayList<CoreMap>();
    Annotation docAnno = new Annotation("");

    Pattern docPattern = Pattern.compile("<DOC>(.*?)</DOC>", Pattern.DOTALL + Pattern.CASE_INSENSITIVE);
    Pattern sentencePattern = Pattern.compile("(<s>|<hl>|<dd>|<DATELINE>)(.*?)(</s>|</hl>|</dd>|</DATELINE>)",
            Pattern.DOTALL + Pattern.CASE_INSENSITIVE);
    Matcher docMatcher = docPattern.matcher(fileContents);
    if (!docMatcher.find(currentOffset))
        return null;
    currentOffset = docMatcher.end();
    String doc = docMatcher.group(1);
    Matcher sentenceMatcher = sentencePattern.matcher(doc);
    String ner = null;

    // Maintain current document ID.
    Pattern docIDPattern = Pattern.compile("<DOCNO>(.*?)</DOCNO>", Pattern.DOTALL + Pattern.CASE_INSENSITIVE);
    Matcher docIDMatcher = docIDPattern.matcher(doc);
    if (docIDMatcher.find())
        currentDocumentID = docIDMatcher.group(1);
    else
        currentDocumentID = "documentAfter " + currentDocumentID;

    while (sentenceMatcher.find()) {
        String sentenceString = sentenceMatcher.group(2);
        List<CoreLabel> words = tokenizerFactory.getTokenizer(new StringReader(sentenceString), "invertible")
                .tokenize();

        // FIXING TOKENIZATION PROBLEMS
        for (int i = 0; i < words.size(); i++) {
            CoreLabel w = words.get(i);
            if (i > 0 && w.word().equals("$")) {
                if (!words.get(i - 1).word().endsWith("PRP") && !words.get(i - 1).word().endsWith("WP"))
                    continue;
                words.get(i - 1).set(CoreAnnotations.TextAnnotation.class, words.get(i - 1).word() + "$");
                words.remove(i);
                i--;
            } else if (w.word().equals("\\/")) {
                if (words.get(i - 1).word().equals("</COREF>"))
                    continue;
                w.set(CoreAnnotations.TextAnnotation.class,
                        words.get(i - 1).word() + "\\/" + words.get(i + 1).word());
                words.remove(i + 1);
                words.remove(i - 1);
            }
        }
        // END FIXING TOKENIZATION PROBLEMS

        List<CoreLabel> sentence = new ArrayList<CoreLabel>();
        // MUC accepts embedded coref mentions, so we need to keep a stack for the mentions currently open
        Stack<Mention> stack = new Stack<Mention>();
        List<Mention> mentions = new ArrayList<Mention>();

        allWords.add(sentence);
        allGoldMentions.add(mentions);

        for (CoreLabel word : words) {
            String w = word.get(CoreAnnotations.TextAnnotation.class);
            // found regular token: WORD/POS
            if (!w.startsWith("<") && w.contains("\\/") && w.lastIndexOf("\\/") != w.length() - 2) {
                int i = w.lastIndexOf("\\/");
                String w1 = w.substring(0, i);
                // we do NOT set POS info here. We take the POS tags from the parser!
                word.set(CoreAnnotations.TextAnnotation.class, w1);
                word.remove(CoreAnnotations.OriginalTextAnnotation.class);
                if (Constants.USE_GOLD_NE) {
                    if (ner != null) {
                        word.set(CoreAnnotations.NamedEntityTagAnnotation.class, ner);
                    } else {
                        word.set(CoreAnnotations.NamedEntityTagAnnotation.class, "O");
                    }
                }
                sentence.add(word);
            }
            // found the start SGML tag for a NE, e.g., "<ORGANIZATION>"
            else if (w.startsWith("<") && !w.startsWith("<COREF") && !w.startsWith("</")) {
                Pattern nerPattern = Pattern.compile("<(.*?)>");
                Matcher m = nerPattern.matcher(w);
                m.find();
                ner = m.group(1);
            }
            // found the end SGML tag for a NE, e.g., "</ORGANIZATION>"
            else if (w.startsWith("</") && !w.startsWith("</COREF")) {
                Pattern nerPattern = Pattern.compile("</(.*?)>");
                Matcher m = nerPattern.matcher(w);
                m.find();
                String ner1 = m.group(1);
                if (ner != null && !ner.equals(ner1))
                    throw new RuntimeException("Unmatched NE labels in MUC file: " + ner + " v. " + ner1);
                ner = null;
            }
            // found the start SGML tag for a coref mention
            else if (w.startsWith("<COREF")) {
                Mention mention = new Mention();
                // position of this mention in the sentence
                mention.startIndex = sentence.size();
                // extract GOLD info about this coref chain. needed for eval
                Pattern idPattern = Pattern.compile("ID=\"(.*?)\"");
                Pattern refPattern = Pattern.compile("REF=\"(.*?)\"");
                Matcher m = idPattern.matcher(w);
                m.find();
                mention.mentionID = Integer.valueOf(m.group(1));
                m = refPattern.matcher(w);
                if (m.find()) {
                    mention.originalRef = Integer.valueOf(m.group(1));
                }
                // open mention. keep track of all open mentions using the stack
                stack.push(mention);
            }
            // found the end SGML tag for a coref mention
            else if (w.equals("</COREF>")) {
                Mention mention = stack.pop();
                mention.endIndex = sentence.size();
                // this is a closed mention. add it to the final list of mentions
                // System.err.printf("Found MENTION: ID=%d, REF=%d\n", mention.mentionID, mention.originalRef);
                mentions.add(mention);
            } else {
                word.remove(CoreAnnotations.OriginalTextAnnotation.class);
                if (Constants.USE_GOLD_NE) {
                    if (ner != null) {
                        word.set(CoreAnnotations.NamedEntityTagAnnotation.class, ner);
                    } else {
                        word.set(CoreAnnotations.NamedEntityTagAnnotation.class, "O");
                    }
                }
                sentence.add(word);
            }
        }

        StringBuilder textContent = new StringBuilder();
        for (int i = 0; i < sentence.size(); i++) {
            CoreLabel w = sentence.get(i);
            w.set(CoreAnnotations.IndexAnnotation.class, i + 1);
            w.set(CoreAnnotations.UtteranceAnnotation.class, 0);
            if (i > 0)
                textContent.append(" ");
            textContent.append(w.getString(CoreAnnotations.TextAnnotation.class));
        }
        CoreMap sentCoreMap = new Annotation(textContent.toString());
        allSentences.add(sentCoreMap);
        sentCoreMap.set(CoreAnnotations.TokensAnnotation.class, sentence);
    }

    // assign goldCorefClusterID
    Map<Integer, Mention> idMention = Generics.newHashMap(); // temporary use
    for (List<Mention> goldMentions : allGoldMentions) {
        for (Mention m : goldMentions) {
            idMention.put(m.mentionID, m);
        }
    }
    for (List<Mention> goldMentions : allGoldMentions) {
        for (Mention m : goldMentions) {
            if (m.goldCorefClusterID == -1) {
                if (m.originalRef == -1)
                    m.goldCorefClusterID = m.mentionID;
                else {
                    int ref = m.originalRef;
                    while (true) {
                        Mention m2 = idMention.get(ref);
                        if (m2.goldCorefClusterID != -1) {
                            m.goldCorefClusterID = m2.goldCorefClusterID;
                            break;
                        } else if (m2.originalRef == -1) {
                            m2.goldCorefClusterID = m2.mentionID;
                            m.goldCorefClusterID = m2.goldCorefClusterID;
                            break;
                        } else {
                            ref = m2.originalRef;
                        }
                    }
                }
            }
        }
    }

    docAnno.set(CoreAnnotations.SentencesAnnotation.class, allSentences);
    stanfordProcessor.annotate(docAnno);

    if (allSentences.size() != allWords.size())
        throw new IllegalStateException("allSentences != allWords");
    for (int i = 0; i < allSentences.size(); i++) {
        List<CoreLabel> annotatedSent = allSentences.get(i).get(CoreAnnotations.TokensAnnotation.class);
        List<CoreLabel> unannotatedSent = allWords.get(i);
        List<Mention> mentionInSent = allGoldMentions.get(i);
        for (Mention m : mentionInSent) {
            m.dependency = allSentences.get(i)
                    .get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);
        }
        if (annotatedSent.size() != unannotatedSent.size()) {
            throw new IllegalStateException("annotatedSent != unannotatedSent");
        }
        for (int j = 0, sz = annotatedSent.size(); j < sz; j++) {
            CoreLabel annotatedWord = annotatedSent.get(j);
            CoreLabel unannotatedWord = unannotatedSent.get(j);
            if (!annotatedWord.get(CoreAnnotations.TextAnnotation.class)
                    .equals(unannotatedWord.get(CoreAnnotations.TextAnnotation.class))) {
                throw new IllegalStateException("annotatedWord != unannotatedWord");
            }
        }
        allWords.set(i, annotatedSent);
        allTrees.add(allSentences.get(i).get(TreeCoreAnnotations.TreeAnnotation.class));
    }

    // term things
    List<List<Mention>> termMentions = new ArrayList<List<Mention>>();

    if (use_term) {
        String dataCrf = "";
        System.err.print("FEAT TYPE: ");
        System.err.println(props.getProperty(MyConstants.TTE_FEATURE_GENERATOR, MyConstants.TTE_FEATURE_CORENLP));
        if (props.getProperty(MyConstants.TTE_FEATURE_GENERATOR, MyConstants.TTE_FEATURE_CORENLP)
                .equals(MyConstants.TTE_FEATURE_NLTK)) {
            dataCrf = NltkCrfFormatter.annotationToCrfString(docAnno);
        } else {
            dataCrf = CrfFormatter.annotationToCrfString(docAnno);
        }
        List<List<String>> tagResult = new ArrayList<List<String>>();
        try {
            tagResult = CrfsuiteCaller.tag(dataCrf, props.getProperty(MyConstants.TTE_MODEL));
            if (props.containsKey(MyConstants.TTE_SAVE_CRF_DATA)) {
                String crfDataFilename = props.getProperty(MyConstants.TTE_SAVE_CRF_DATA);
                File crfDataFile = new File(crfDataFilename);
                BufferedWriter bw = new BufferedWriter(new FileWriter(crfDataFile));
                bw.write(dataCrf);
                bw.close();
            }
        } catch (Exception e) {
            System.err.println("Crfsuite tag failed");
        }

        termAsMentionFinder.setTags(tagResult);
        termMentions = termAsMentionFinder.extractPredictedMentions(docAnno, maxID, dictionaries);
        maxID = termAsMentionFinder.getMaxID();
    }

    // extract predicted mentions
    allPredictedMentions = mentionFinder.extractPredictedMentions(docAnno, maxID, dictionaries);

    if (use_term && props.containsKey(MyConstants.TTE_KEEP_PRON)) {
        termMentions = injectPronoun(termMentions, allPredictedMentions);
    }

    if (experimentType != null) {
        if (experimentType.equals(MyConstants.EXP_TYPE_03_UNION)) {
            List<List<Mention>> usingMentions = unionMentions(allPredictedMentions, allGoldMentions);
            allPredictedMentions = usingMentions;
        } else if (experimentType.equals(MyConstants.EXP_TYPE_03_INTERSECT)) {
            List<List<Mention>> usingMentions = intersectMentions(allPredictedMentions, allGoldMentions);
            allPredictedMentions = usingMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_04_CHECK)) {
            allPredictedMentions = termMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_04_SUPER)) {
            List<List<Mention>> usingMentions = superstringMentions(termMentions, allPredictedMentions);
            allPredictedMentions = usingMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_04_OVERLAP)) {
            List<List<Mention>> usingMentions = overlapMentions(termMentions, allPredictedMentions);
            allPredictedMentions = usingMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_04_UNION)) {
            List<List<Mention>> usingMentions = unionMentions(termMentions, allPredictedMentions);
            allPredictedMentions = usingMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_05_SUPER)) {
            List<List<Mention>> usingMentions = superstringMentions(termMentions, allGoldMentions);
            allPredictedMentions = usingMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_05_OVERLAP)) {
            List<List<Mention>> usingMentions = overlapMentions(termMentions, allGoldMentions);
            allPredictedMentions = usingMentions;
        } else {
            System.err.println(experimentType);
            System.err.println("Unknown experiment type. Using mention detector.");
        }
    } else if (useGoldMention) {
        allPredictedMentions = allGoldMentions;
    }

    // add the relevant fields to mentions and order them for coref
    return arrange(docAnno, allWords, allTrees, allPredictedMentions, allGoldMentions, true);
}
From source file:de.iisys.ocr.pos.CustomNERFeatureFactory.java
License:Open Source License
private void initLexicon(SeqClassifierFlags flags) {
    if (flags.distSimLexicon == null) {
        return;
    }
    if (lexicon != null) {
        return;
    }
    Timing.startDoing("Loading distsim lexicon from " + flags.distSimLexicon);
    lexicon = Generics.newHashMap();
    boolean terryKoo = "terryKoo".equals(flags.distSimFileFormat);
    for (String line : ObjectBank.getLineIterator(flags.distSimLexicon, flags.inputEncoding)) {
        String word;
        String wordClass;
        if (terryKoo) {
            String[] bits = line.split("\\t");
            word = bits[1];
            wordClass = bits[0];
            if (flags.distSimMaxBits > 0 && wordClass.length() > flags.distSimMaxBits) {
                wordClass = wordClass.substring(0, flags.distSimMaxBits);
            }
        } else {
            // "alexClark"
            String[] bits = line.split("\\s+");
            word = bits[0];
            wordClass = bits[1];
        }
        if (!flags.casedDistSim) {
            word = word.toLowerCase();
        }
        if (flags.numberEquivalenceDistSim) {
            word = WordShapeClassifier.wordShape(word, WordShapeClassifier.WORDSHAPEDIGITS);
        }
        lexicon.put(word, wordClass);
    }
    Timing.endDoing();
}
From source file:de.iisys.ocr.pos.CustomNERFeatureFactory.java
License:Open Source License
public void clearMemory() {
    wordToSubstrings = Generics.newHashMap();
    lexicon = null;
}
From source file:knu.univ.lingvo.coref.ACEMentionExtractor.java
License:Open Source License
private static void printRawDoc(List<CoreMap> sentences, List<List<Mention>> allMentions, String filename,
        boolean gold) throws FileNotFoundException {
    StringBuilder doc = new StringBuilder();
    int previousOffset = 0;
    Counter<Integer> mentionCount = new ClassicCounter<Integer>();
    for (List<Mention> l : allMentions) {
        for (Mention m : l) {
            mentionCount.incrementCount(m.goldCorefClusterID);
        }
    }

    for (int i = 0; i < sentences.size(); i++) {
        CoreMap sentence = sentences.get(i);
        List<Mention> mentions = allMentions.get(i);

        String[] tokens = sentence.get(CoreAnnotations.TextAnnotation.class).split(" ");
        String sent = "";
        List<CoreLabel> t = sentence.get(CoreAnnotations.TokensAnnotation.class);
        if (previousOffset + 2 < t.get(0).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class))
            sent += "\n";
        previousOffset = t.get(t.size() - 1).get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
        Counter<Integer> startCounts = new ClassicCounter<Integer>();
        Counter<Integer> endCounts = new ClassicCounter<Integer>();
        Map<Integer, Set<Integer>> endID = Generics.newHashMap();
        for (Mention m : mentions) {
            startCounts.incrementCount(m.startIndex);
            endCounts.incrementCount(m.endIndex);
            if (!endID.containsKey(m.endIndex))
                endID.put(m.endIndex, Generics.<Integer>newHashSet());
            endID.get(m.endIndex).add(m.goldCorefClusterID);
        }
        for (int j = 0; j < tokens.length; j++) {
            if (endID.containsKey(j)) {
                for (Integer id : endID.get(j)) {
                    if (mentionCount.getCount(id) != 1 && gold)
                        sent += "]_" + id;
                    else
                        sent += "]";
                }
            }
            for (int k = 0; k < startCounts.getCount(j); k++) {
                if (!sent.endsWith("["))
                    sent += " ";
                sent += "[";
            }
            sent += " ";
            sent = sent + tokens[j];
        }
        for (int k = 0; k < endCounts.getCount(tokens.length); k++) {
            sent += "]";
        }
        sent += "\n";
        doc.append(sent);
    }
    if (gold)
        logger.fine("New DOC: (GOLD MENTIONS) ==================================================");
    else
        logger.fine("New DOC: (Predicted Mentions) ==================================================");
    logger.fine(doc.toString());
}
From source file:knu.univ.lingvo.coref.CorefChain.java
License:Open Source License
public CorefChain(CorefCluster c, Map<Mention, IntTuple> positions) {
    chainID = c.clusterID;

    // Collect mentions
    mentions = new ArrayList<CorefMention>();
    mentionMap = Generics.newHashMap();
    CorefMention represents = null;
    for (Mention m : c.getCorefMentions()) {
        CorefMention men = new CorefMention(m, positions.get(m));
        mentions.add(men);
    }
    Collections.sort(mentions, new CorefMentionComparator());

    // Find representative mention
    for (CorefMention men : mentions) {
        IntPair position = new IntPair(men.sentNum, men.headIndex);
        if (!mentionMap.containsKey(position))
            mentionMap.put(position, Generics.<CorefMention>newHashSet());
        mentionMap.get(position).add(men);
        if (men.moreRepresentativeThan(represents)) {
            represents = men;
        }
    }
    representative = represents;
}
From source file:knu.univ.lingvo.coref.Document.java
License:Open Source License
public Document() {
    positions = Generics.newHashMap();
    mentionheadPositions = Generics.newHashMap();
    roleSet = Generics.newHashSet();
    corefClusters = Generics.newHashMap();
    goldCorefClusters = null;
    allPredictedMentions = Generics.newHashMap();
    allGoldMentions = Generics.newHashMap();
    speakers = Generics.newHashMap();
    speakerPairs = Generics.newHashSet();
    incompatibles = TwoDimensionalSet.hashSet();
    incompatibleClusters = TwoDimensionalSet.hashSet();
    acronymCache = TwoDimensionalMap.hashMap();
}
From source file:knu.univ.lingvo.coref.Document.java
License:Open Source License
/** Mark twin mentions: heads of the mentions are matched */
private void findTwinMentionsRelaxed() {
    for (int sentNum = 0; sentNum < goldOrderedMentionsBySentence.size(); sentNum++) {
        List<Mention> golds = goldOrderedMentionsBySentence.get(sentNum);
        List<Mention> predicts = predictedOrderedMentionsBySentence.get(sentNum);

        Map<IntPair, Mention> goldMentionPositions = Generics.newHashMap();
        Map<Integer, LinkedList<Mention>> goldMentionHeadPositions = Generics.newHashMap();
        for (Mention g : golds) {
            goldMentionPositions.put(new IntPair(g.startIndex, g.endIndex), g);
            if (!goldMentionHeadPositions.containsKey(g.headIndex)) {
                goldMentionHeadPositions.put(g.headIndex, new LinkedList<Mention>());
            }
            goldMentionHeadPositions.get(g.headIndex).add(g);
        }

        List<Mention> remains = new ArrayList<Mention>();
        for (Mention p : predicts) {
            IntPair pos = new IntPair(p.startIndex, p.endIndex);
            if (goldMentionPositions.containsKey(pos)) {
                Mention g = goldMentionPositions.get(pos);
                p.mentionID = g.mentionID;
                p.twinless = false;
                g.twinless = false;
                goldMentionHeadPositions.get(g.headIndex).remove(g);
                if (goldMentionHeadPositions.get(g.headIndex).isEmpty()) {
                    goldMentionHeadPositions.remove(g.headIndex);
                }
            } else
                remains.add(p);
        }
        for (Mention r : remains) {
            if (goldMentionHeadPositions.containsKey(r.headIndex)) {
                Mention g = goldMentionHeadPositions.get(r.headIndex).poll();
                r.mentionID = g.mentionID;
                r.twinless = false;
                g.twinless = false;
                if (goldMentionHeadPositions.get(g.headIndex).isEmpty()) {
                    goldMentionHeadPositions.remove(g.headIndex);
                }
            }
        }
    }
}
From source file:knu.univ.lingvo.coref.Document.java
License:Open Source License
/** Extract gold coref cluster information. */
public void extractGoldCorefClusters() {
    goldCorefClusters = Generics.newHashMap();
    for (List<Mention> mentions : goldOrderedMentionsBySentence) {
        for (Mention m : mentions) {
            int id = m.goldCorefClusterID;
            if (id == -1) {
                throw new RuntimeException("No gold info");
            }
            CorefCluster c = goldCorefClusters.get(id);
            if (c == null) {
                c = new CorefCluster(id);
                goldCorefClusters.put(id, c);
            }
            c.corefMentions.add(m);
        }
    }
}
From source file:knu.univ.lingvo.coref.Document.java
License:Open Source License
/** Extract gold coref link information */
protected void extractGoldLinks() {
    // List<List<Mention>> orderedMentionsBySentence = this.getOrderedMentions();
    List<Pair<IntTuple, IntTuple>> links = new ArrayList<Pair<IntTuple, IntTuple>>();

    // position of each mention in the input matrix, by id
    Map<Integer, IntTuple> positions = Generics.newHashMap();
    // positions of antecedents
    Map<Integer, List<IntTuple>> antecedents = Generics.newHashMap();
    for (int i = 0; i < goldOrderedMentionsBySentence.size(); i++) {
        for (int j = 0; j < goldOrderedMentionsBySentence.get(i).size(); j++) {
            Mention m = goldOrderedMentionsBySentence.get(i).get(j);
            int id = m.mentionID;
            IntTuple pos = new IntTuple(2);
            pos.set(0, i);
            pos.set(1, j);
            positions.put(id, pos);
            antecedents.put(id, new ArrayList<IntTuple>());
        }
    }

    // SieveCoreferenceSystem.debugPrintMentions(System.err, "", goldOrderedMentionsBySentence);
    for (List<Mention> mentions : goldOrderedMentionsBySentence) {
        for (Mention m : mentions) {
            int id = m.mentionID;
            IntTuple src = positions.get(id);
            assert (src != null);
            if (m.originalRef >= 0) {
                IntTuple dst = positions.get(m.originalRef);
                if (dst == null) {
                    throw new RuntimeException("Cannot find gold mention with ID=" + m.originalRef);
                }

                // to deal with cataphoric annotation
                while (dst.get(0) > src.get(0) || (dst.get(0) == src.get(0) && dst.get(1) > src.get(1))) {
                    Mention dstMention = goldOrderedMentionsBySentence.get(dst.get(0)).get(dst.get(1));
                    m.originalRef = dstMention.originalRef;
                    dstMention.originalRef = id;
                    if (m.originalRef < 0)
                        break;
                    dst = positions.get(m.originalRef);
                }
                if (m.originalRef < 0)
                    continue;

                // A B C: if A<-B, A<-C => make a link B<-C
                for (int k = dst.get(0); k <= src.get(0); k++) {
                    for (int l = 0; l < goldOrderedMentionsBySentence.get(k).size(); l++) {
                        if (k == dst.get(0) && l < dst.get(1))
                            continue;
                        if (k == src.get(0) && l > src.get(1))
                            break;
                        IntTuple missed = new IntTuple(2);
                        missed.set(0, k);
                        missed.set(1, l);
                        if (links.contains(new Pair<IntTuple, IntTuple>(missed, dst))) {
                            antecedents.get(id).add(missed);
                            links.add(new Pair<IntTuple, IntTuple>(src, missed));
                        }
                    }
                }

                links.add(new Pair<IntTuple, IntTuple>(src, dst));

                assert (antecedents.get(id) != null);
                antecedents.get(id).add(dst);

                List<IntTuple> ants = antecedents.get(m.originalRef);
                assert (ants != null);
                for (IntTuple ant : ants) {
                    antecedents.get(id).add(ant);
                    links.add(new Pair<IntTuple, IntTuple>(src, ant));
                }
            }
        }
    }
    goldLinks = links;
}