Example usage for edu.stanford.nlp.util Generics newHashMap

Introduction

This page collects example usages of edu.stanford.nlp.util.Generics.newHashMap.

Prototype

public static <K, V> Map<K, V> newHashMap() 
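
Generics.newHashMap() is a typed factory for java.util.HashMap: the key and value types are inferred from the assignment target, a convenience that predates the Java 7 diamond operator. A minimal sketch, assuming CoreNLP is on the classpath (the class and variable names are illustrative):

import java.util.Map;

import edu.stanford.nlp.util.Generics;

public class NewHashMapDemo {
    public static void main(String[] args) {
        // K and V are inferred from the declared type of the variable.
        Map<String, Integer> counts = Generics.newHashMap();
        counts.put("token", 1);
        System.out.println(counts); // prints {token=1}
    }
}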

Usage

From source file: BuildBinarizedDataset.java

/**
 * Turns a text file into trees for use in an RNTN classifier such as
 * the treebank used in the Sentiment project.
 * <br>
 * The expected input is one block per sentence, with blocks separated
 * by blank lines. The first line of a block holds the main label of
 * the sentence together with the full sentence. Lines after the first
 * sentence line but before the blank line are treated as labeled
 * sub-phrases: each should start with a label and then list the
 * tokens that label applies to. Any phrase that does not have its own
 * label takes on the main sentence label. For example:
 * <br>
 * <code>
 * 1 Today is not a good day.<br>
 * 3 good<br>
 * 3 good day <br>
 * 3 a good day <br>
 * <br>
 * (next block starts here) <br>
 * </code>
 * By default the englishPCFG parser is used.  This can be changed
 * with the <code>-parserModel</code> flag.  Specify an input file
 * with <code>-input</code>.
 * <br>
 * If a sentiment model is provided with <code>-sentimentModel</code>,
 * that model is used to prelabel the sentences; labels given in the
 * input file then overwrite the predicted labels on their spans.
 */
public static void main(String[] arg) throws IOException {
    CollapseUnaryTransformer transformer = new CollapseUnaryTransformer();
    // FileWriter writer = new FileWriter("D:\\dataset\\train.txt", true);
    String parserModel = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
    // Hard-coded arguments (the command-line parsing loop below is commented out).
    String[] args = { "-input", "D:\\parse.txt", "-sentimentModel",
            "edu/stanford/nlp/models/sentiment/sentiment.ser.gz" };
    String inputPath = "D:\\dataset\\good.txt";

    String sentimentModelPath = "edu/stanford/nlp/models/sentiment/sentiment.ser.gz";
    SentimentModel sentimentModel = null;

    /* for (int argIndex = 0; argIndex < args.length; ) {
       if (args[argIndex].equalsIgnoreCase("-input")) {
         inputPath = args[argIndex + 1];
         argIndex += 2;
       } else if (args[argIndex].equalsIgnoreCase("-parserModel")) {
         parserModel = args[argIndex + 1];
         argIndex += 2;
       } else if (args[argIndex].equalsIgnoreCase("-sentimentModel")) {
         sentimentModelPath = args[argIndex + 1];
         argIndex += 2;
       } else {
         System.err.println("Unknown argument " + args[argIndex]);
         System.exit(2);
       }
     }*/

    if (inputPath == null) {
        throw new IllegalArgumentException("Must specify input file with -input");
    }

    LexicalizedParser parser = LexicalizedParser.loadModel(parserModel);
    TreeBinarizer binarizer = TreeBinarizer.simpleTreeBinarizer(parser.getTLPParams().headFinder(),
            parser.treebankLanguagePack());

    if (sentimentModelPath != null) {
        sentimentModel = SentimentModel.loadSerialized(sentimentModelPath);
    }

    String text = IOUtils.slurpFileNoExceptions(inputPath);
    String[] chunks = text.split("\\n\\s*\\n+"); // need blank line to make a new chunk

    for (String chunk : chunks) {
        if (chunk.trim().isEmpty()) {
            continue;
        }
        // The expected format is that line 0 will be the text of the
        // sentence, and each subsequent line, if any, will be a value
        // followed by the sequence of tokens that get that value.

        // Here we take the first line and tokenize it as one sentence.
        String[] lines = chunk.trim().split("\\n");
        String sentence = lines[0];
        StringReader sin = new StringReader(sentence);
        DocumentPreprocessor document = new DocumentPreprocessor(sin);
        document.setSentenceFinalPuncWords(new String[] { "\n" });
        List<HasWord> tokens = document.iterator().next();
        Integer mainLabel = Integer.valueOf(tokens.get(0).word());
        //System.out.print("Main Sentence Label: " + mainLabel.toString() + "; ");
        tokens = tokens.subList(1, tokens.size());
        //System.err.println(tokens);

        Map<Pair<Integer, Integer>, String> spanToLabels = Generics.newHashMap();
        for (int i = 1; i < lines.length; ++i) {
            extractLabels(spanToLabels, tokens, lines[i]);
        }

        // TODO: add an option which treats the spans as constraints when parsing

        Tree tree = parser.apply(tokens);
        Tree binarized = binarizer.transformTree(tree);
        Tree collapsedUnary = transformer.transformTree(binarized);

        // if there is a sentiment model for use in prelabeling, we
        // label here and then use the user given labels to adjust
        if (sentimentModel != null) {
            Trees.convertToCoreLabels(collapsedUnary);
            SentimentCostAndGradient scorer = new SentimentCostAndGradient(sentimentModel, null);
            scorer.forwardPropagateTree(collapsedUnary);
            setPredictedLabels(collapsedUnary);
        } else {
            setUnknownLabels(collapsedUnary, mainLabel);
            //collapsedUnary.label().setValue(mainLabel.toString());
            //System.out.println("Root"+collapsedUnary.getNodeNumber(1));
        }

        Trees.convertToCoreLabels(collapsedUnary);
        collapsedUnary.indexSpans();

        for (Map.Entry<Pair<Integer, Integer>, String> pairStringEntry : spanToLabels.entrySet()) {
            setSpanLabel(collapsedUnary, pairStringEntry.getKey(), pairStringEntry.getValue());
        }
        String x = collapsedUnary.toString();
        //x.replaceAll("\\s","");
        x = x.replace("(", "[");
        x = x.replace(")", "]");
        //writer.write(x);
        //writer.write("\r\n"); 
        System.out.println(x);
        //System.out.println();
    }
    //writer.close();
}
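
Note that, unlike the stock Stanford BuildBinarizedDataset, this variant rewrites parentheses as square brackets before printing each tree. On a small hypothetical fragment the conversion looks like:

(3 (2 good) (2 day))  becomes  [3 [2 good] [2 day]]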

From source file: com.panot.JavaCoref.MyMUCMentionExtractor.java

License: Open Source License

@Override
public Document nextDoc() throws Exception {
    List<List<CoreLabel>> allWords = new ArrayList<List<CoreLabel>>();
    List<Tree> allTrees = new ArrayList<Tree>();
    List<List<Mention>> allGoldMentions = new ArrayList<List<Mention>>();
    List<List<Mention>> allPredictedMentions;
    List<CoreMap> allSentences = new ArrayList<CoreMap>();
    Annotation docAnno = new Annotation("");

    Pattern docPattern = Pattern.compile("<DOC>(.*?)</DOC>", Pattern.DOTALL + Pattern.CASE_INSENSITIVE);
    Pattern sentencePattern = Pattern.compile("(<s>|<hl>|<dd>|<DATELINE>)(.*?)(</s>|</hl>|</dd>|</DATELINE>)",
            Pattern.DOTALL + Pattern.CASE_INSENSITIVE);
    Matcher docMatcher = docPattern.matcher(fileContents);
    if (!docMatcher.find(currentOffset))
        return null;

    currentOffset = docMatcher.end();
    String doc = docMatcher.group(1);
    Matcher sentenceMatcher = sentencePattern.matcher(doc);
    String ner = null;

    //Maintain current document ID.
    Pattern docIDPattern = Pattern.compile("<DOCNO>(.*?)</DOCNO>", Pattern.DOTALL + Pattern.CASE_INSENSITIVE);
    Matcher docIDMatcher = docIDPattern.matcher(doc);
    if (docIDMatcher.find())
        currentDocumentID = docIDMatcher.group(1);
    else
        currentDocumentID = "documentAfter " + currentDocumentID;

    while (sentenceMatcher.find()) {
        String sentenceString = sentenceMatcher.group(2);
        List<CoreLabel> words = tokenizerFactory.getTokenizer(new StringReader(sentenceString), "invertible")
                .tokenize();

        // FIXING TOKENIZATION PROBLEMS
        for (int i = 0; i < words.size(); i++) {
            CoreLabel w = words.get(i);
            if (i > 0 && w.word().equals("$")) {
                if (!words.get(i - 1).word().endsWith("PRP") && !words.get(i - 1).word().endsWith("WP"))
                    continue;
                words.get(i - 1).set(CoreAnnotations.TextAnnotation.class, words.get(i - 1).word() + "$");
                words.remove(i);
                i--;
            } else if (w.word().equals("\\/")) {
                if (words.get(i - 1).word().equals("</COREF>"))
                    continue;
                w.set(CoreAnnotations.TextAnnotation.class,
                        words.get(i - 1).word() + "\\/" + words.get(i + 1).word());
                words.remove(i + 1);
                words.remove(i - 1);
            }
        }
        // END FIXING TOKENIZATION PROBLEMS

        List<CoreLabel> sentence = new ArrayList<CoreLabel>();
        // MUC accepts embedded coref mentions, so we need to keep a stack for the mentions currently open
        Stack<Mention> stack = new Stack<Mention>();
        List<Mention> mentions = new ArrayList<Mention>();

        allWords.add(sentence);
        allGoldMentions.add(mentions);

        for (CoreLabel word : words) {
            String w = word.get(CoreAnnotations.TextAnnotation.class);
            // found regular token: WORD/POS
            if (!w.startsWith("<") && w.contains("\\/") && w.lastIndexOf("\\/") != w.length() - 2) {
                int i = w.lastIndexOf("\\/");
                String w1 = w.substring(0, i);
                // we do NOT set POS info here. We take the POS tags from the parser!
                word.set(CoreAnnotations.TextAnnotation.class, w1);
                word.remove(CoreAnnotations.OriginalTextAnnotation.class);
                if (Constants.USE_GOLD_NE) {
                    if (ner != null) {
                        word.set(CoreAnnotations.NamedEntityTagAnnotation.class, ner);
                    } else {
                        word.set(CoreAnnotations.NamedEntityTagAnnotation.class, "O");
                    }
                }
                sentence.add(word);
            }
            // found the start SGML tag for a NE, e.g., "<ORGANIZATION>"
            else if (w.startsWith("<") && !w.startsWith("<COREF") && !w.startsWith("</")) {
                Pattern nerPattern = Pattern.compile("<(.*?)>");
                Matcher m = nerPattern.matcher(w);
                m.find();
                ner = m.group(1);
            }
            // found the end SGML tag for a NE, e.g., "</ORGANIZATION>"
            else if (w.startsWith("</") && !w.startsWith("</COREF")) {
                Pattern nerPattern = Pattern.compile("</(.*?)>");
                Matcher m = nerPattern.matcher(w);
                m.find();
                String ner1 = m.group(1);
                if (ner != null && !ner.equals(ner1))
                    throw new RuntimeException("Unmatched NE labels in MUC file: " + ner + " v. " + ner1);
                ner = null;
            }
            // found the start SGML tag for a coref mention
            else if (w.startsWith("<COREF")) {
                Mention mention = new Mention();
                // position of this mention in the sentence
                mention.startIndex = sentence.size();

                // extract GOLD info about this coref chain. needed for eval
                Pattern idPattern = Pattern.compile("ID=\"(.*?)\"");
                Pattern refPattern = Pattern.compile("REF=\"(.*?)\"");

                Matcher m = idPattern.matcher(w);
                m.find();
                mention.mentionID = Integer.valueOf(m.group(1));

                m = refPattern.matcher(w);
                if (m.find()) {
                    mention.originalRef = Integer.valueOf(m.group(1));
                }

                // open mention. keep track of all open mentions using the stack
                stack.push(mention);
            }
            // found the end SGML tag for a coref mention
            else if (w.equals("</COREF>")) {
                Mention mention = stack.pop();
                mention.endIndex = sentence.size();

                // this is a closed mention. add it to the final list of mentions
                // System.err.printf("Found MENTION: ID=%d, REF=%d\n", mention.mentionID, mention.originalRef);
                mentions.add(mention);
            } else {
                word.remove(CoreAnnotations.OriginalTextAnnotation.class);
                if (Constants.USE_GOLD_NE) {
                    if (ner != null) {
                        word.set(CoreAnnotations.NamedEntityTagAnnotation.class, ner);
                    } else {
                        word.set(CoreAnnotations.NamedEntityTagAnnotation.class, "O");
                    }
                }
                sentence.add(word);
            }
        }
        StringBuilder textContent = new StringBuilder();
        for (int i = 0; i < sentence.size(); i++) {
            CoreLabel w = sentence.get(i);
            w.set(CoreAnnotations.IndexAnnotation.class, i + 1);
            w.set(CoreAnnotations.UtteranceAnnotation.class, 0);
            if (i > 0)
                textContent.append(" ");
            textContent.append(w.getString(CoreAnnotations.TextAnnotation.class));
        }
        CoreMap sentCoreMap = new Annotation(textContent.toString());
        allSentences.add(sentCoreMap);
        sentCoreMap.set(CoreAnnotations.TokensAnnotation.class, sentence);
    }

    // assign goldCorefClusterID
    Map<Integer, Mention> idMention = Generics.newHashMap(); // temporary use
    for (List<Mention> goldMentions : allGoldMentions) {
        for (Mention m : goldMentions) {
            idMention.put(m.mentionID, m);
        }
    }
    for (List<Mention> goldMentions : allGoldMentions) {
        for (Mention m : goldMentions) {
            if (m.goldCorefClusterID == -1) {
                if (m.originalRef == -1)
                    m.goldCorefClusterID = m.mentionID;
                else {
                    int ref = m.originalRef;
                    while (true) {
                        Mention m2 = idMention.get(ref);
                        if (m2.goldCorefClusterID != -1) {
                            m.goldCorefClusterID = m2.goldCorefClusterID;
                            break;
                        } else if (m2.originalRef == -1) {
                            m2.goldCorefClusterID = m2.mentionID;
                            m.goldCorefClusterID = m2.goldCorefClusterID;
                            break;
                        } else {
                            ref = m2.originalRef;
                        }
                    }
                }
            }
        }
    }

    docAnno.set(CoreAnnotations.SentencesAnnotation.class, allSentences);
    stanfordProcessor.annotate(docAnno);

    if (allSentences.size() != allWords.size())
        throw new IllegalStateException("allSentences != allWords");
    for (int i = 0; i < allSentences.size(); i++) {
        List<CoreLabel> annotatedSent = allSentences.get(i).get(CoreAnnotations.TokensAnnotation.class);
        List<CoreLabel> unannotatedSent = allWords.get(i);
        List<Mention> mentionInSent = allGoldMentions.get(i);
        for (Mention m : mentionInSent) {
            m.dependency = allSentences.get(i)
                    .get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);
        }
        if (annotatedSent.size() != unannotatedSent.size()) {
            throw new IllegalStateException("annotatedSent != unannotatedSent");
        }
        for (int j = 0, sz = annotatedSent.size(); j < sz; j++) {
            CoreLabel annotatedWord = annotatedSent.get(j);
            CoreLabel unannotatedWord = unannotatedSent.get(j);
            if (!annotatedWord.get(CoreAnnotations.TextAnnotation.class)
                    .equals(unannotatedWord.get(CoreAnnotations.TextAnnotation.class))) {
                throw new IllegalStateException("annotatedWord != unannotatedWord");
            }
        }
        allWords.set(i, annotatedSent);
        allTrees.add(allSentences.get(i).get(TreeCoreAnnotations.TreeAnnotation.class));
    }

    // term-mention extraction (optional, CRF-based)

    List<List<Mention>> termMentions = new ArrayList<List<Mention>>();

    if (use_term) {
        String dataCrf = "";
        System.err.print("FEAT TYPE: ");
        System.err
                .println(props.getProperty(MyConstants.TTE_FEATURE_GENERATOR, MyConstants.TTE_FEATURE_CORENLP));
        if (props.getProperty(MyConstants.TTE_FEATURE_GENERATOR, MyConstants.TTE_FEATURE_CORENLP)
                .equals(MyConstants.TTE_FEATURE_NLTK)) {
            dataCrf = NltkCrfFormatter.annotationToCrfString(docAnno);
        } else {
            dataCrf = CrfFormatter.annotationToCrfString(docAnno);
        }
        List<List<String>> tagResult = new ArrayList<List<String>>();

        try {
            tagResult = CrfsuiteCaller.tag(dataCrf, props.getProperty(MyConstants.TTE_MODEL));

            if (props.containsKey(MyConstants.TTE_SAVE_CRF_DATA)) {
                String crfDataFilename = props.getProperty(MyConstants.TTE_SAVE_CRF_DATA);

                File crfDataFile = new File(crfDataFilename);
                BufferedWriter bw = new BufferedWriter(new FileWriter(crfDataFile));
                bw.write(dataCrf);
                bw.close();
            }

        } catch (Exception e) {
            System.err.println("Crfsuite tag failed");
        }

        termAsMentionFinder.setTags(tagResult);
        termMentions = termAsMentionFinder.extractPredictedMentions(docAnno, maxID, dictionaries);

        maxID = termAsMentionFinder.getMaxID();
    }

    // extract predicted mentions

    allPredictedMentions = mentionFinder.extractPredictedMentions(docAnno, maxID, dictionaries);

    if (use_term && props.containsKey(MyConstants.TTE_KEEP_PRON)) {
        termMentions = injectPronoun(termMentions, allPredictedMentions);
    }

    if (experimentType != null) {
        if (experimentType.equals(MyConstants.EXP_TYPE_03_UNION)) {
            List<List<Mention>> usingMentions = unionMentions(allPredictedMentions, allGoldMentions);
            allPredictedMentions = usingMentions;
        } else if (experimentType.equals(MyConstants.EXP_TYPE_03_INTERSECT)) {
            List<List<Mention>> usingMentions = intersectMentions(allPredictedMentions, allGoldMentions);
            allPredictedMentions = usingMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_04_CHECK)) {
            allPredictedMentions = termMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_04_SUPER)) {
            List<List<Mention>> usingMentions = superstringMentions(termMentions, allPredictedMentions);
            allPredictedMentions = usingMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_04_OVERLAP)) {
            List<List<Mention>> usingMentions = overlapMentions(termMentions, allPredictedMentions);
            allPredictedMentions = usingMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_04_UNION)) {
            List<List<Mention>> usingMentions = unionMentions(termMentions, allPredictedMentions);
            allPredictedMentions = usingMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_05_SUPER)) {
            List<List<Mention>> usingMentions = superstringMentions(termMentions, allGoldMentions);
            allPredictedMentions = usingMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_05_OVERLAP)) {
            List<List<Mention>> usingMentions = overlapMentions(termMentions, allGoldMentions);
            allPredictedMentions = usingMentions;
        } else {
            System.err.println(experimentType);
            System.err.println("Unknown experiment type. Using mention detector.");
        }
    } else if (useGoldMention) {
        allPredictedMentions = allGoldMentions;
    }

    // add the relevant fields to mentions and order them for coref
    return arrange(docAnno, allWords, allTrees, allPredictedMentions, allGoldMentions, true);
}

From source file: de.iisys.ocr.pos.CustomNERFeatureFactory.java

License: Open Source License

private void initLexicon(SeqClassifierFlags flags) {
    if (flags.distSimLexicon == null) {
        return;
    }
    if (lexicon != null) {
        return;
    }
    Timing.startDoing("Loading distsim lexicon from " + flags.distSimLexicon);
    lexicon = Generics.newHashMap();
    boolean terryKoo = "terryKoo".equals(flags.distSimFileFormat);
    for (String line : ObjectBank.getLineIterator(flags.distSimLexicon, flags.inputEncoding)) {
        String word;
        String wordClass;
        if (terryKoo) {
            String[] bits = line.split("\\t");
            word = bits[1];
            wordClass = bits[0];
            if (flags.distSimMaxBits > 0 && wordClass.length() > flags.distSimMaxBits) {
                wordClass = wordClass.substring(0, flags.distSimMaxBits);
            }
        } else {
            // "alexClark"
            String[] bits = line.split("\\s+");
            word = bits[0];
            wordClass = bits[1];
        }
        if (!flags.casedDistSim) {
            word = word.toLowerCase();
        }
        if (flags.numberEquivalenceDistSim) {
            word = WordShapeClassifier.wordShape(word, WordShapeClassifier.WORDSHAPEDIGITS);
        }
        lexicon.put(word, wordClass);
    }
    Timing.endDoing();
}
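
The two branches above expect different line layouts, selected by flags.distSimFileFormat. A minimal sketch with hypothetical lexicon lines (not from any shipped distsim file), showing what each branch puts into the map:

import java.util.Map;

import edu.stanford.nlp.util.Generics;

public class DistsimFormatDemo {
    public static void main(String[] args) {
        Map<String, String> lexicon = Generics.newHashMap();

        // "terryKoo" layout: tab-separated, cluster bit-string first, then the word.
        String[] bits = "0110101\tbank".split("\\t");
        lexicon.put(bits[1], bits[0]);

        // Default ("alexClark") layout: whitespace-separated, word first, then its class.
        bits = "bank 0110101".split("\\s+");
        lexicon.put(bits[0], bits[1]);

        System.out.println(lexicon); // prints {bank=0110101}
    }
}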

From source file: de.iisys.ocr.pos.CustomNERFeatureFactory.java

License: Open Source License

public void clearMemory() {
    wordToSubstrings = Generics.newHashMap();
    lexicon = null;
}

From source file: knu.univ.lingvo.coref.ACEMentionExtractor.java

License: Open Source License

private static void printRawDoc(List<CoreMap> sentences, List<List<Mention>> allMentions, String filename,
        boolean gold) throws FileNotFoundException {
    StringBuilder doc = new StringBuilder();
    int previousOffset = 0;
    Counter<Integer> mentionCount = new ClassicCounter<Integer>();
    for (List<Mention> l : allMentions) {
        for (Mention m : l) {
            mentionCount.incrementCount(m.goldCorefClusterID);
        }
    }

    for (int i = 0; i < sentences.size(); i++) {
        CoreMap sentence = sentences.get(i);
        List<Mention> mentions = allMentions.get(i);

        String[] tokens = sentence.get(CoreAnnotations.TextAnnotation.class).split(" ");
        String sent = "";
        List<CoreLabel> t = sentence.get(CoreAnnotations.TokensAnnotation.class);
        if (previousOffset + 2 < t.get(0).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class))
            sent += "\n";
        previousOffset = t.get(t.size() - 1).get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
        Counter<Integer> startCounts = new ClassicCounter<Integer>();
        Counter<Integer> endCounts = new ClassicCounter<Integer>();
        Map<Integer, Set<Integer>> endID = Generics.newHashMap();
        for (Mention m : mentions) {
            startCounts.incrementCount(m.startIndex);
            endCounts.incrementCount(m.endIndex);
            if (!endID.containsKey(m.endIndex))
                endID.put(m.endIndex, Generics.<Integer>newHashSet());
            endID.get(m.endIndex).add(m.goldCorefClusterID);
        }
        for (int j = 0; j < tokens.length; j++) {
            if (endID.containsKey(j)) {
                for (Integer id : endID.get(j)) {
                    if (mentionCount.getCount(id) != 1 && gold)
                        sent += "]_" + id;
                    else
                        sent += "]";
                }
            }
            for (int k = 0; k < startCounts.getCount(j); k++) {
                if (!sent.endsWith("["))
                    sent += " ";
                sent += "[";
            }
            sent += " ";
            sent = sent + tokens[j];
        }
        for (int k = 0; k < endCounts.getCount(tokens.length); k++) {
            sent += "]";
        }
        sent += "\n";
        doc.append(sent);
    }
    if (gold)
        logger.fine("New DOC: (GOLD MENTIONS) ==================================================");
    else
        logger.fine("New DOC: (Predicted Mentions) ==================================================");
    logger.fine(doc.toString());
}

From source file: knu.univ.lingvo.coref.CorefChain.java

License: Open Source License

public CorefChain(CorefCluster c, Map<Mention, IntTuple> positions) {
    chainID = c.clusterID;
    // Collect mentions
    mentions = new ArrayList<CorefMention>();
    mentionMap = Generics.newHashMap();
    CorefMention represents = null;
    for (Mention m : c.getCorefMentions()) {
        CorefMention men = new CorefMention(m, positions.get(m));
        mentions.add(men);
    }
    Collections.sort(mentions, new CorefMentionComparator());
    // Find representative mention
    for (CorefMention men : mentions) {
        IntPair position = new IntPair(men.sentNum, men.headIndex);
        if (!mentionMap.containsKey(position))
            mentionMap.put(position, Generics.<CorefMention>newHashSet());
        mentionMap.get(position).add(men);
        if (men.moreRepresentativeThan(represents)) {
            represents = men;
        }
    }
    representative = represents;
}

From source file: knu.univ.lingvo.coref.Document.java

License: Open Source License

public Document() {
    positions = Generics.newHashMap();
    mentionheadPositions = Generics.newHashMap();
    roleSet = Generics.newHashSet();
    corefClusters = Generics.newHashMap();
    goldCorefClusters = null;
    allPredictedMentions = Generics.newHashMap();
    allGoldMentions = Generics.newHashMap();
    speakers = Generics.newHashMap();
    speakerPairs = Generics.newHashSet();
    incompatibles = TwoDimensionalSet.hashSet();
    incompatibleClusters = TwoDimensionalSet.hashSet();
    acronymCache = TwoDimensionalMap.hashMap();
}

From source file: knu.univ.lingvo.coref.Document.java

License: Open Source License

/** Mark twin mentions: heads of the mentions are matched */
private void findTwinMentionsRelaxed() {
    for (int sentNum = 0; sentNum < goldOrderedMentionsBySentence.size(); sentNum++) {
        List<Mention> golds = goldOrderedMentionsBySentence.get(sentNum);
        List<Mention> predicts = predictedOrderedMentionsBySentence.get(sentNum);

        Map<IntPair, Mention> goldMentionPositions = Generics.newHashMap();
        Map<Integer, LinkedList<Mention>> goldMentionHeadPositions = Generics.newHashMap();
        for (Mention g : golds) {
            goldMentionPositions.put(new IntPair(g.startIndex, g.endIndex), g);
            if (!goldMentionHeadPositions.containsKey(g.headIndex)) {
                goldMentionHeadPositions.put(g.headIndex, new LinkedList<Mention>());
            }
            goldMentionHeadPositions.get(g.headIndex).add(g);
        }

        List<Mention> remains = new ArrayList<Mention>();
        for (Mention p : predicts) {
            IntPair pos = new IntPair(p.startIndex, p.endIndex);
            if (goldMentionPositions.containsKey(pos)) {
                Mention g = goldMentionPositions.get(pos);
                p.mentionID = g.mentionID;
                p.twinless = false;
                g.twinless = false;
                goldMentionHeadPositions.get(g.headIndex).remove(g);
                if (goldMentionHeadPositions.get(g.headIndex).isEmpty()) {
                    goldMentionHeadPositions.remove(g.headIndex);
                }
            } else
                remains.add(p);
        }
        for (Mention r : remains) {
            if (goldMentionHeadPositions.containsKey(r.headIndex)) {
                Mention g = goldMentionHeadPositions.get(r.headIndex).poll();
                r.mentionID = g.mentionID;
                r.twinless = false;
                g.twinless = false;
                if (goldMentionHeadPositions.get(g.headIndex).isEmpty()) {
                    goldMentionHeadPositions.remove(g.headIndex);
                }
            }
        }
    }
}

From source file: knu.univ.lingvo.coref.Document.java

License: Open Source License

/** Extract gold coref cluster information. */
public void extractGoldCorefClusters() {
    goldCorefClusters = Generics.newHashMap();
    for (List<Mention> mentions : goldOrderedMentionsBySentence) {
        for (Mention m : mentions) {
            int id = m.goldCorefClusterID;
            if (id == -1) {
                throw new RuntimeException("No gold info");
            }
            CorefCluster c = goldCorefClusters.get(id);
            if (c == null) {
                c = new CorefCluster(id);
                goldCorefClusters.put(id, c);
            }
            c.corefMentions.add(m);
        }
    }
}

From source file: knu.univ.lingvo.coref.Document.java

License: Open Source License

/** Extract gold coref link information */
protected void extractGoldLinks() {
    //    List<List<Mention>> orderedMentionsBySentence = this.getOrderedMentions();
    List<Pair<IntTuple, IntTuple>> links = new ArrayList<Pair<IntTuple, IntTuple>>();

    // position of each mention in the input matrix, by id
    Map<Integer, IntTuple> positions = Generics.newHashMap();
    // positions of antecedents
    Map<Integer, List<IntTuple>> antecedents = Generics.newHashMap();
    for (int i = 0; i < goldOrderedMentionsBySentence.size(); i++) {
        for (int j = 0; j < goldOrderedMentionsBySentence.get(i).size(); j++) {
            Mention m = goldOrderedMentionsBySentence.get(i).get(j);
            int id = m.mentionID;
            IntTuple pos = new IntTuple(2);
            pos.set(0, i);
            pos.set(1, j);
            positions.put(id, pos);
            antecedents.put(id, new ArrayList<IntTuple>());
        }
    }

    //    SieveCoreferenceSystem.debugPrintMentions(System.err, "", goldOrderedMentionsBySentence);
    for (List<Mention> mentions : goldOrderedMentionsBySentence) {
        for (Mention m : mentions) {
            int id = m.mentionID;
            IntTuple src = positions.get(id);

            assert (src != null);
            if (m.originalRef >= 0) {
                IntTuple dst = positions.get(m.originalRef);
                if (dst == null) {
                    throw new RuntimeException("Cannot find gold mention with ID=" + m.originalRef);
                }

                // to deal with cataphoric annotation
                while (dst.get(0) > src.get(0) || (dst.get(0) == src.get(0) && dst.get(1) > src.get(1))) {
                    Mention dstMention = goldOrderedMentionsBySentence.get(dst.get(0)).get(dst.get(1));
                    m.originalRef = dstMention.originalRef;
                    dstMention.originalRef = id;

                    if (m.originalRef < 0)
                        break;
                    dst = positions.get(m.originalRef);
                }
                if (m.originalRef < 0)
                    continue;

                // A B C: if A<-B, A<-C => make a link B<-C
                for (int k = dst.get(0); k <= src.get(0); k++) {
                    for (int l = 0; l < goldOrderedMentionsBySentence.get(k).size(); l++) {
                        if (k == dst.get(0) && l < dst.get(1))
                            continue;
                        if (k == src.get(0) && l > src.get(1))
                            break;
                        IntTuple missed = new IntTuple(2);
                        missed.set(0, k);
                        missed.set(1, l);
                        if (links.contains(new Pair<IntTuple, IntTuple>(missed, dst))) {
                            antecedents.get(id).add(missed);
                            links.add(new Pair<IntTuple, IntTuple>(src, missed));
                        }
                    }
                }

                links.add(new Pair<IntTuple, IntTuple>(src, dst));

                assert (antecedents.get(id) != null);
                antecedents.get(id).add(dst);

                List<IntTuple> ants = antecedents.get(m.originalRef);
                assert (ants != null);
                for (IntTuple ant : ants) {
                    antecedents.get(id).add(ant);
                    links.add(new Pair<IntTuple, IntTuple>(src, ant));
                }
            }
        }
    }
    goldLinks = links;
}