List of usage examples for edu.stanford.nlp.ling CoreLabel getString
@Override public <KEY extends Key<String>> String getString(Class<KEY> key)
From source file: com.panot.JavaCoref.MyMUCMentionExtractor.java
License: Open Source License
@Override public Document nextDoc() throws Exception { List<List<CoreLabel>> allWords = new ArrayList<List<CoreLabel>>(); List<Tree> allTrees = new ArrayList<Tree>(); List<List<Mention>> allGoldMentions = new ArrayList<List<Mention>>(); List<List<Mention>> allPredictedMentions; List<CoreMap> allSentences = new ArrayList<CoreMap>(); Annotation docAnno = new Annotation(""); Pattern docPattern = Pattern.compile("<DOC>(.*?)</DOC>", Pattern.DOTALL + Pattern.CASE_INSENSITIVE); Pattern sentencePattern = Pattern.compile("(<s>|<hl>|<dd>|<DATELINE>)(.*?)(</s>|</hl>|</dd>|</DATELINE>)", Pattern.DOTALL + Pattern.CASE_INSENSITIVE); Matcher docMatcher = docPattern.matcher(fileContents); if (!docMatcher.find(currentOffset)) return null; currentOffset = docMatcher.end();/* www . j a v a 2 s. c om*/ String doc = docMatcher.group(1); Matcher sentenceMatcher = sentencePattern.matcher(doc); String ner = null; //Maintain current document ID. Pattern docIDPattern = Pattern.compile("<DOCNO>(.*?)</DOCNO>", Pattern.DOTALL + Pattern.CASE_INSENSITIVE); Matcher docIDMatcher = docIDPattern.matcher(doc); if (docIDMatcher.find()) currentDocumentID = docIDMatcher.group(1); else currentDocumentID = "documentAfter " + currentDocumentID; while (sentenceMatcher.find()) { String sentenceString = sentenceMatcher.group(2); List<CoreLabel> words = tokenizerFactory.getTokenizer(new StringReader(sentenceString), "invertible") .tokenize(); // FIXING TOKENIZATION PROBLEMS for (int i = 0; i < words.size(); i++) { CoreLabel w = words.get(i); if (i > 0 && w.word().equals("$")) { if (!words.get(i - 1).word().endsWith("PRP") && !words.get(i - 1).word().endsWith("WP")) continue; words.get(i - 1).set(CoreAnnotations.TextAnnotation.class, words.get(i - 1).word() + "$"); words.remove(i); i--; } else if (w.word().equals("\\/")) { if (words.get(i - 1).word().equals("</COREF>")) continue; w.set(CoreAnnotations.TextAnnotation.class, words.get(i - 1).word() + "\\/" + words.get(i + 1).word()); words.remove(i + 1); 
words.remove(i - 1); } } // END FIXING TOKENIZATION PROBLEMS List<CoreLabel> sentence = new ArrayList<CoreLabel>(); // MUC accepts embedded coref mentions, so we need to keep a stack for the mentions currently open Stack<Mention> stack = new Stack<Mention>(); List<Mention> mentions = new ArrayList<Mention>(); allWords.add(sentence); allGoldMentions.add(mentions); for (CoreLabel word : words) { String w = word.get(CoreAnnotations.TextAnnotation.class); // found regular token: WORD/POS if (!w.startsWith("<") && w.contains("\\/") && w.lastIndexOf("\\/") != w.length() - 2) { int i = w.lastIndexOf("\\/"); String w1 = w.substring(0, i); // we do NOT set POS info here. We take the POS tags from the parser! word.set(CoreAnnotations.TextAnnotation.class, w1); word.remove(CoreAnnotations.OriginalTextAnnotation.class); if (Constants.USE_GOLD_NE) { if (ner != null) { word.set(CoreAnnotations.NamedEntityTagAnnotation.class, ner); } else { word.set(CoreAnnotations.NamedEntityTagAnnotation.class, "O"); } } sentence.add(word); } // found the start SGML tag for a NE, e.g., "<ORGANIZATION>" else if (w.startsWith("<") && !w.startsWith("<COREF") && !w.startsWith("</")) { Pattern nerPattern = Pattern.compile("<(.*?)>"); Matcher m = nerPattern.matcher(w); m.find(); ner = m.group(1); } // found the end SGML tag for a NE, e.g., "</ORGANIZATION>" else if (w.startsWith("</") && !w.startsWith("</COREF")) { Pattern nerPattern = Pattern.compile("</(.*?)>"); Matcher m = nerPattern.matcher(w); m.find(); String ner1 = m.group(1); if (ner != null && !ner.equals(ner1)) throw new RuntimeException("Unmatched NE labels in MUC file: " + ner + " v. " + ner1); ner = null; } // found the start SGML tag for a coref mention else if (w.startsWith("<COREF")) { Mention mention = new Mention(); // position of this mention in the sentence mention.startIndex = sentence.size(); // extract GOLD info about this coref chain. 
needed for eval Pattern idPattern = Pattern.compile("ID=\"(.*?)\""); Pattern refPattern = Pattern.compile("REF=\"(.*?)\""); Matcher m = idPattern.matcher(w); m.find(); mention.mentionID = Integer.valueOf(m.group(1)); m = refPattern.matcher(w); if (m.find()) { mention.originalRef = Integer.valueOf(m.group(1)); } // open mention. keep track of all open mentions using the stack stack.push(mention); } // found the end SGML tag for a coref mention else if (w.equals("</COREF>")) { Mention mention = stack.pop(); mention.endIndex = sentence.size(); // this is a closed mention. add it to the final list of mentions // System.err.printf("Found MENTION: ID=%d, REF=%d\n", mention.mentionID, mention.originalRef); mentions.add(mention); } else { word.remove(CoreAnnotations.OriginalTextAnnotation.class); if (Constants.USE_GOLD_NE) { if (ner != null) { word.set(CoreAnnotations.NamedEntityTagAnnotation.class, ner); } else { word.set(CoreAnnotations.NamedEntityTagAnnotation.class, "O"); } } sentence.add(word); } } StringBuilder textContent = new StringBuilder(); for (int i = 0; i < sentence.size(); i++) { CoreLabel w = sentence.get(i); w.set(CoreAnnotations.IndexAnnotation.class, i + 1); w.set(CoreAnnotations.UtteranceAnnotation.class, 0); if (i > 0) textContent.append(" "); textContent.append(w.getString(CoreAnnotations.TextAnnotation.class)); } CoreMap sentCoreMap = new Annotation(textContent.toString()); allSentences.add(sentCoreMap); sentCoreMap.set(CoreAnnotations.TokensAnnotation.class, sentence); } // assign goldCorefClusterID Map<Integer, Mention> idMention = Generics.newHashMap(); // temporary use for (List<Mention> goldMentions : allGoldMentions) { for (Mention m : goldMentions) { idMention.put(m.mentionID, m); } } for (List<Mention> goldMentions : allGoldMentions) { for (Mention m : goldMentions) { if (m.goldCorefClusterID == -1) { if (m.originalRef == -1) m.goldCorefClusterID = m.mentionID; else { int ref = m.originalRef; while (true) { Mention m2 = idMention.get(ref); if 
(m2.goldCorefClusterID != -1) { m.goldCorefClusterID = m2.goldCorefClusterID; break; } else if (m2.originalRef == -1) { m2.goldCorefClusterID = m2.mentionID; m.goldCorefClusterID = m2.goldCorefClusterID; break; } else { ref = m2.originalRef; } } } } } } docAnno.set(CoreAnnotations.SentencesAnnotation.class, allSentences); stanfordProcessor.annotate(docAnno); if (allSentences.size() != allWords.size()) throw new IllegalStateException("allSentences != allWords"); for (int i = 0; i < allSentences.size(); i++) { List<CoreLabel> annotatedSent = allSentences.get(i).get(CoreAnnotations.TokensAnnotation.class); List<CoreLabel> unannotatedSent = allWords.get(i); List<Mention> mentionInSent = allGoldMentions.get(i); for (Mention m : mentionInSent) { m.dependency = allSentences.get(i) .get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class); } if (annotatedSent.size() != unannotatedSent.size()) { throw new IllegalStateException("annotatedSent != unannotatedSent"); } for (int j = 0, sz = annotatedSent.size(); j < sz; j++) { CoreLabel annotatedWord = annotatedSent.get(j); CoreLabel unannotatedWord = unannotatedSent.get(j); if (!annotatedWord.get(CoreAnnotations.TextAnnotation.class) .equals(unannotatedWord.get(CoreAnnotations.TextAnnotation.class))) { throw new IllegalStateException("annotatedWord != unannotatedWord"); } } allWords.set(i, annotatedSent); allTrees.add(allSentences.get(i).get(TreeCoreAnnotations.TreeAnnotation.class)); } // term things List<List<Mention>> termMentions = new ArrayList<List<Mention>>(); if (use_term) { String dataCrf = ""; System.err.print("FEAT TYPE: "); System.err .println(props.getProperty(MyConstants.TTE_FEATURE_GENERATOR, MyConstants.TTE_FEATURE_CORENLP)); if (props.getProperty(MyConstants.TTE_FEATURE_GENERATOR, MyConstants.TTE_FEATURE_CORENLP) .equals(MyConstants.TTE_FEATURE_NLTK)) { dataCrf = NltkCrfFormatter.annotationToCrfString(docAnno); } else { dataCrf = CrfFormatter.annotationToCrfString(docAnno); } List<List<String>> 
tagResult = new ArrayList<List<String>>(); try { tagResult = CrfsuiteCaller.tag(dataCrf, props.getProperty(MyConstants.TTE_MODEL)); if (props.containsKey(MyConstants.TTE_SAVE_CRF_DATA)) { String crfDataFilename = props.getProperty(MyConstants.TTE_SAVE_CRF_DATA); File crfDataFile = new File(crfDataFilename); BufferedWriter bw = new BufferedWriter(new FileWriter(crfDataFile)); bw.write(dataCrf); bw.close(); } } catch (Exception e) { System.err.println("Crfsuite tag failed"); } termAsMentionFinder.setTags(tagResult); termMentions = termAsMentionFinder.extractPredictedMentions(docAnno, maxID, dictionaries); maxID = termAsMentionFinder.getMaxID(); } // extract predicted mentions allPredictedMentions = mentionFinder.extractPredictedMentions(docAnno, maxID, dictionaries); if (use_term && props.containsKey(MyConstants.TTE_KEEP_PRON)) { termMentions = injectPronoun(termMentions, allPredictedMentions); } if (experimentType != null) { if (experimentType.equals(MyConstants.EXP_TYPE_03_UNION)) { List<List<Mention>> usingMentions = unionMentions(allPredictedMentions, allGoldMentions); allPredictedMentions = usingMentions; } else if (experimentType.equals(MyConstants.EXP_TYPE_03_INTERSECT)) { List<List<Mention>> usingMentions = intersectMentions(allPredictedMentions, allGoldMentions); allPredictedMentions = usingMentions; } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_04_CHECK)) { allPredictedMentions = termMentions; } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_04_SUPER)) { List<List<Mention>> usingMentions = superstringMentions(termMentions, allPredictedMentions); allPredictedMentions = usingMentions; } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_04_OVERLAP)) { List<List<Mention>> usingMentions = overlapMentions(termMentions, allPredictedMentions); allPredictedMentions = usingMentions; } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_04_UNION)) { List<List<Mention>> usingMentions = 
unionMentions(termMentions, allPredictedMentions); allPredictedMentions = usingMentions; } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_05_SUPER)) { List<List<Mention>> usingMentions = superstringMentions(termMentions, allGoldMentions); allPredictedMentions = usingMentions; } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_05_OVERLAP)) { List<List<Mention>> usingMentions = overlapMentions(termMentions, allGoldMentions); allPredictedMentions = usingMentions; } else { System.err.println(experimentType); System.err.println("Unknown experiment type. Using mention detector."); } } else if (useGoldMention) { allPredictedMentions = allGoldMentions; } // add the relevant fields to mentions and order them for coref return arrange(docAnno, allWords, allTrees, allPredictedMentions, allGoldMentions, true); }
From source file: de.iisys.ocr.pos.CustomNERFeatureFactory.java
License: Open Source License
protected Collection<String> featuresC(PaddedList<IN> cInfo, int loc) { CoreLabel p3 = cInfo.get(loc - 3);/*w ww . j a v a 2s . c om*/ CoreLabel p2 = cInfo.get(loc - 2); CoreLabel p = cInfo.get(loc - 1); CoreLabel c = cInfo.get(loc); CoreLabel n = cInfo.get(loc + 1); CoreLabel n2 = cInfo.get(loc + 2); String cWord = getWord(c); String pWord = getWord(p); String nWord = getWord(n); String cShape = c.getString(CoreAnnotations.ShapeAnnotation.class); String pShape = p.getString(CoreAnnotations.ShapeAnnotation.class); String nShape = n.getString(CoreAnnotations.ShapeAnnotation.class); Collection<String> featuresC = new ArrayList<String>(); if (flags.useDistSim) { distSimAnnotate(cInfo); } if (flags.useBagOfWords) { for (IN word : cInfo) { featuresC.add(getWord(word) + "-BAGOFWORDS"); } } if (flags.useDistSim && flags.useMoreTags) { featuresC.add(p.get(CoreAnnotations.DistSimAnnotation.class) + '-' + cWord + "-PDISTSIM-CWORD"); } if (flags.useDistSim) { featuresC.add(c.get(CoreAnnotations.DistSimAnnotation.class) + "-DISTSIM"); } if (flags.useTitle) { Matcher m = titlePattern.matcher(cWord); if (m.matches()) { featuresC.add("IS_TITLE"); } } if (flags.useInternal && flags.useExternal) { if (flags.useWord) { featuresC.add(cWord + "-WORD"); } if (flags.use2W) { featuresC.add(getWord(p2) + "-P2W"); featuresC.add(getWord(n2) + "-N2W"); } if (flags.useLC) { featuresC.add(cWord.toLowerCase() + "-CL"); featuresC.add(pWord.toLowerCase() + "-PL"); featuresC.add(nWord.toLowerCase() + "-NL"); } if (flags.useUnknown) { // for true casing featuresC.add(c.get(CoreAnnotations.UnknownAnnotation.class) + "-UNKNOWN"); featuresC.add(p.get(CoreAnnotations.UnknownAnnotation.class) + "-PUNKNOWN"); featuresC.add(n.get(CoreAnnotations.UnknownAnnotation.class) + "-NUNKNOWN"); } if (flags.useLemmas) { String lem = c.getString(CoreAnnotations.LemmaAnnotation.class); if (!"".equals(lem)) { featuresC.add(lem + "-LEM"); } } if (flags.usePrevNextLemmas) { String plem = 
p.getString(CoreAnnotations.LemmaAnnotation.class); String nlem = n.getString(CoreAnnotations.LemmaAnnotation.class); if (!"".equals(plem)) { featuresC.add(plem + "-PLEM"); } if (!"".equals(nlem)) { featuresC.add(nlem + "-NLEM"); } } if (flags.checkNameList) { try { if (lastNames == null) { lastNames = Generics.newHashSet(); for (String line : ObjectBank.getLineIterator(flags.lastNameList)) { String[] cols = line.split("\\s+"); lastNames.add(cols[0]); } } if (maleNames == null) { maleNames = Generics.newHashSet(); for (String line : ObjectBank.getLineIterator(flags.maleNameList)) { String[] cols = line.split("\\s+"); maleNames.add(cols[0]); } } if (femaleNames == null) { femaleNames = Generics.newHashSet(); for (String line : ObjectBank.getLineIterator(flags.femaleNameList)) { String[] cols = line.split("\\s+"); femaleNames.add(cols[0]); } } String name = cWord.toUpperCase(); if (lastNames.contains(name)) { featuresC.add("LAST_NAME"); } if (maleNames.contains(name)) { featuresC.add("MALE_NAME"); } if (femaleNames.contains(name)) { featuresC.add("FEMALE_NAME"); } } catch (Exception e) { throw new RuntimeException(e); } } if (flags.binnedLengths != null) { int len = cWord.length(); String featureName = null; for (int i = 0; i <= flags.binnedLengths.length; i++) { if (i == flags.binnedLengths.length) { featureName = "Len-" + flags.binnedLengths[flags.binnedLengths.length - 1] + "-Inf"; } else if (len <= flags.binnedLengths[i]) { featureName = "Len-" + ((i == 0) ? 
1 : flags.binnedLengths[i - 1]) + '-' + flags.binnedLengths[i]; break; } } featuresC.add(featureName); } if (flags.useABGENE) { featuresC.add(c.get(CoreAnnotations.AbgeneAnnotation.class) + "-ABGENE"); featuresC.add(p.get(CoreAnnotations.AbgeneAnnotation.class) + "-PABGENE"); featuresC.add(n.get(CoreAnnotations.AbgeneAnnotation.class) + "-NABGENE"); } if (flags.useABSTRFreqDict) { featuresC.add(c.get(CoreAnnotations.AbstrAnnotation.class) + "-ABSTRACT" + c.get(CoreAnnotations.FreqAnnotation.class) + "-FREQ" + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG"); featuresC.add(c.get(CoreAnnotations.AbstrAnnotation.class) + "-ABSTRACT" + c.get(CoreAnnotations.DictAnnotation.class) + "-DICT" + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG"); featuresC.add(c.get(CoreAnnotations.AbstrAnnotation.class) + "-ABSTRACT" + c.get(CoreAnnotations.DictAnnotation.class) + "-DICT" + c.get(CoreAnnotations.FreqAnnotation.class) + "-FREQ" + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG"); } if (flags.useABSTR) { featuresC.add(c.get(CoreAnnotations.AbstrAnnotation.class) + "-ABSTRACT"); featuresC.add(p.get(CoreAnnotations.AbstrAnnotation.class) + "-PABSTRACT"); featuresC.add(n.get(CoreAnnotations.AbstrAnnotation.class) + "-NABSTRACT"); } if (flags.useGENIA) { featuresC.add(c.get(CoreAnnotations.GeniaAnnotation.class) + "-GENIA"); featuresC.add(p.get(CoreAnnotations.GeniaAnnotation.class) + "-PGENIA"); featuresC.add(n.get(CoreAnnotations.GeniaAnnotation.class) + "-NGENIA"); } if (flags.useWEBFreqDict) { featuresC.add(c.get(CoreAnnotations.WebAnnotation.class) + "-WEB" + c.get(CoreAnnotations.FreqAnnotation.class) + "-FREQ" + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG"); featuresC.add(c.get(CoreAnnotations.WebAnnotation.class) + "-WEB" + c.get(CoreAnnotations.DictAnnotation.class) + "-DICT" + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG"); 
featuresC.add(c.get(CoreAnnotations.WebAnnotation.class) + "-WEB" + c.get(CoreAnnotations.DictAnnotation.class) + "-DICT" + c.get(CoreAnnotations.FreqAnnotation.class) + "-FREQ" + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG"); } if (flags.useWEB) { featuresC.add(c.get(CoreAnnotations.WebAnnotation.class) + "-WEB"); featuresC.add(p.get(CoreAnnotations.WebAnnotation.class) + "-PWEB"); featuresC.add(n.get(CoreAnnotations.WebAnnotation.class) + "-NWEB"); } if (flags.useIsURL) { featuresC.add(c.get(CoreAnnotations.IsURLAnnotation.class) + "-ISURL"); } if (flags.useEntityRule) { featuresC.add(c.get(CoreAnnotations.EntityRuleAnnotation.class) + "-ENTITYRULE"); } if (flags.useEntityTypes) { featuresC.add(c.get(CoreAnnotations.EntityTypeAnnotation.class) + "-ENTITYTYPE"); } if (flags.useIsDateRange) { featuresC.add(c.get(CoreAnnotations.IsDateRangeAnnotation.class) + "-ISDATERANGE"); } if (flags.useABSTRFreq) { featuresC.add(c.get(CoreAnnotations.AbstrAnnotation.class) + "-ABSTRACT" + c.get(CoreAnnotations.FreqAnnotation.class) + "-FREQ"); } if (flags.useFREQ) { featuresC.add(c.get(CoreAnnotations.FreqAnnotation.class) + "-FREQ"); } if (flags.useMoreTags) { featuresC.add( p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + cWord + "-PTAG-CWORD"); } if (flags.usePosition) { featuresC.add(c.get(CoreAnnotations.PositionAnnotation.class) + "-POSITION"); } if (flags.useBeginSent) { String pos = c.get(CoreAnnotations.PositionAnnotation.class); if ("0".equals(pos)) { featuresC.add("BEGIN-SENT"); featuresC.add(cShape + "-BEGIN-SENT"); } else if (Integer.toString(cInfo.size() - 1).equals(pos)) { featuresC.add("END-SENT"); featuresC.add(cShape + "-END-SENT"); } else { featuresC.add("IN-SENT"); featuresC.add(cShape + "-IN-SENT"); } } if (flags.useTags) { featuresC.add(c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG"); } if (flags.useOrdinal) { if (isOrdinal(cInfo, loc)) { featuresC.add("C_ORDINAL"); if (isOrdinal(cInfo, loc - 1)) { 
//System.err.print(getWord(p) + " "); featuresC.add("PC_ORDINAL"); } //System.err.println(cWord); } if (isOrdinal(cInfo, loc - 1)) { featuresC.add("P_ORDINAL"); } } if (flags.usePrev) { featuresC.add(pWord + "-PW"); if (flags.useTags) { featuresC.add(p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-PTAG"); } if (flags.useDistSim) { featuresC.add(p.get(CoreAnnotations.DistSimAnnotation.class) + "-PDISTSIM"); } if (flags.useIsURL) { featuresC.add(p.get(CoreAnnotations.IsURLAnnotation.class) + "-PISURL"); } if (flags.useEntityTypes) { featuresC.add(p.get(CoreAnnotations.EntityTypeAnnotation.class) + "-PENTITYTYPE"); } } if (flags.useNext) { featuresC.add(nWord + "-NW"); if (flags.useTags) { featuresC.add(n.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-NTAG"); } if (flags.useDistSim) { featuresC.add(n.get(CoreAnnotations.DistSimAnnotation.class) + "-NDISTSIM"); } if (flags.useIsURL) { featuresC.add(n.get(CoreAnnotations.IsURLAnnotation.class) + "-NISURL"); } if (flags.useEntityTypes) { featuresC.add(n.get(CoreAnnotations.EntityTypeAnnotation.class) + "-NENTITYTYPE"); } } /*here, entityTypes refers to the type in the PASCAL IE challenge: * i.e. 
certain words are tagged "Date" or "Location" */ if (flags.useEitherSideWord) { featuresC.add(pWord + "-EW"); featuresC.add(nWord + "-EW"); } if (flags.useWordPairs) { featuresC.add(cWord + '-' + pWord + "-W-PW"); featuresC.add(cWord + '-' + nWord + "-W-NW"); } if (flags.useSymTags) { if (flags.useTags) { featuresC.add(p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + n.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-PCNTAGS"); featuresC.add(c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + n.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-CNTAGS"); featuresC.add(p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-PCTAGS"); } if (flags.useDistSim) { featuresC.add(p.get(CoreAnnotations.DistSimAnnotation.class) + '-' + c.get(CoreAnnotations.DistSimAnnotation.class) + '-' + n.get(CoreAnnotations.DistSimAnnotation.class) + "-PCNDISTSIM"); featuresC.add(c.get(CoreAnnotations.DistSimAnnotation.class) + '-' + n.get(CoreAnnotations.DistSimAnnotation.class) + "-CNDISTSIM"); featuresC.add(p.get(CoreAnnotations.DistSimAnnotation.class) + '-' + c.get(CoreAnnotations.DistSimAnnotation.class) + "-PCDISTSIM"); } } if (flags.useSymWordPairs) { featuresC.add(pWord + '-' + nWord + "-SWORDS"); } String pGazAnnotation = (flags.useGazFeatures || flags.useMoreGazFeatures) ? p.get(CoreAnnotations.GazAnnotation.class) : null; String nGazAnnotation = (flags.useGazFeatures || flags.useMoreGazFeatures) ? n.get(CoreAnnotations.GazAnnotation.class) : null; String cGazAnnotation = (flags.useGazFeatures || flags.useMoreGazFeatures) ? 
c.get(CoreAnnotations.GazAnnotation.class) : null; if (flags.useGazFeatures) { if (cGazAnnotation != null && !cGazAnnotation.equals(flags.dropGaz)) { featuresC.add(cGazAnnotation + "-GAZ"); } // n if (nGazAnnotation != null && !nGazAnnotation.equals(flags.dropGaz)) { featuresC.add(nGazAnnotation + "-NGAZ"); } // p if (pGazAnnotation != null && !pGazAnnotation.equals(flags.dropGaz)) { featuresC.add(pGazAnnotation + "-PGAZ"); } } if (flags.useMoreGazFeatures) { if (cGazAnnotation != null && !cGazAnnotation.equals(flags.dropGaz)) { featuresC.add(cGazAnnotation + '-' + cWord + "-CG-CW-GAZ"); // c-n if (nGazAnnotation != null && !nGazAnnotation.equals(flags.dropGaz)) { featuresC.add(cGazAnnotation + '-' + nGazAnnotation + "-CNGAZ"); } // p-c if (pGazAnnotation != null && !pGazAnnotation.equals(flags.dropGaz)) { featuresC.add(pGazAnnotation + '-' + cGazAnnotation + "-PCGAZ"); } } } if (flags.useAbbr || flags.useMinimalAbbr) { featuresC.add(c.get(CoreAnnotations.AbbrAnnotation.class) + "-ABBR"); } if (flags.useAbbr1 || flags.useMinimalAbbr1) { if (!c.get(CoreAnnotations.AbbrAnnotation.class).equals("XX")) { featuresC.add(c.get(CoreAnnotations.AbbrAnnotation.class) + "-ABBR"); } } if (flags.useAbbr) { featuresC.add(p.get(CoreAnnotations.AbbrAnnotation.class) + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + "-PCABBR"); featuresC.add(c.get(CoreAnnotations.AbbrAnnotation.class) + '-' + n.get(CoreAnnotations.AbbrAnnotation.class) + "-CNABBR"); featuresC.add(p.get(CoreAnnotations.AbbrAnnotation.class) + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + '-' + n.get(CoreAnnotations.AbbrAnnotation.class) + "-PCNABBR"); } if (flags.useAbbr1) { if (!c.get(CoreAnnotations.AbbrAnnotation.class).equals("XX")) { featuresC.add(p.get(CoreAnnotations.AbbrAnnotation.class) + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + "-PCABBR"); featuresC.add(c.get(CoreAnnotations.AbbrAnnotation.class) + '-' + n.get(CoreAnnotations.AbbrAnnotation.class) + "-CNABBR"); 
featuresC.add(p.get(CoreAnnotations.AbbrAnnotation.class) + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + '-' + n.get(CoreAnnotations.AbbrAnnotation.class) + "-PCNABBR"); } } if (flags.useChunks) { featuresC.add(p.get(CoreAnnotations.ChunkAnnotation.class) + '-' + c.get(CoreAnnotations.ChunkAnnotation.class) + "-PCCHUNK"); featuresC.add(c.get(CoreAnnotations.ChunkAnnotation.class) + '-' + n.get(CoreAnnotations.ChunkAnnotation.class) + "-CNCHUNK"); featuresC.add(p.get(CoreAnnotations.ChunkAnnotation.class) + '-' + c.get(CoreAnnotations.ChunkAnnotation.class) + '-' + n.get(CoreAnnotations.ChunkAnnotation.class) + "-PCNCHUNK"); } if (flags.useMinimalAbbr) { featuresC.add(cWord + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + "-CWABB"); } if (flags.useMinimalAbbr1) { if (!c.get(CoreAnnotations.AbbrAnnotation.class).equals("XX")) { featuresC.add(cWord + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + "-CWABB"); } } String prevVB = "", nextVB = ""; if (flags.usePrevVB) { for (int j = loc - 1;; j--) { CoreLabel wi = cInfo.get(j); if (wi == cInfo.getPad()) { prevVB = "X"; featuresC.add("X-PVB"); break; } else if (wi.getString(CoreAnnotations.PartOfSpeechAnnotation.class).startsWith("VB")) { featuresC.add(getWord(wi) + "-PVB"); prevVB = getWord(wi); break; } } } if (flags.useNextVB) { for (int j = loc + 1;; j++) { CoreLabel wi = cInfo.get(j); if (wi == cInfo.getPad()) { featuresC.add("X-NVB"); nextVB = "X"; break; } else if (wi.getString(CoreAnnotations.PartOfSpeechAnnotation.class).startsWith("VB")) { featuresC.add(getWord(wi) + "-NVB"); nextVB = getWord(wi); break; } } } if (flags.useVB) { featuresC.add(prevVB + '-' + nextVB + "-PNVB"); } if (flags.useShapeConjunctions) { featuresC.add(c.get(CoreAnnotations.PositionAnnotation.class) + cShape + "-POS-SH"); if (flags.useTags) { featuresC.add(c.tag() + cShape + "-TAG-SH"); } if (flags.useDistSim) { featuresC.add(c.get(CoreAnnotations.DistSimAnnotation.class) + cShape + "-DISTSIM-SH"); } } if (flags.useWordTag) 
{ featuresC.add(cWord + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-W-T"); featuresC.add(cWord + '-' + p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-W-PT"); featuresC.add(cWord + '-' + n.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-W-NT"); } if (flags.useNPHead) { featuresC.add(c.get(TreeCoreAnnotations.HeadWordAnnotation.class) + "-HW"); if (flags.useTags) { featuresC.add(c.get(TreeCoreAnnotations.HeadWordAnnotation.class) + "-" + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-HW-T"); } if (flags.useDistSim) { featuresC.add(c.get(TreeCoreAnnotations.HeadWordAnnotation.class) + "-" + c.get(CoreAnnotations.DistSimAnnotation.class) + "-HW-DISTSIM"); } } if (flags.useNPGovernor) { featuresC.add(c.get(CoreAnnotations.GovernorAnnotation.class) + "-GW"); if (flags.useTags) { featuresC.add(c.get(CoreAnnotations.GovernorAnnotation.class) + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-GW-T"); } if (flags.useDistSim) { featuresC.add(c.get(CoreAnnotations.GovernorAnnotation.class) + '-' + c.get(CoreAnnotations.DistSimAnnotation.class) + "-DISTSIM-T1"); } } if (flags.useHeadGov) { featuresC.add(c.get(TreeCoreAnnotations.HeadWordAnnotation.class) + "-" + c.get(CoreAnnotations.GovernorAnnotation.class) + "-HW_GW"); } if (flags.useClassFeature) { featuresC.add("###"); } if (flags.useFirstWord) { String firstWord = getWord(cInfo.get(0)); featuresC.add(firstWord); } if (flags.useNGrams) { Collection<String> subs = null; if (flags.cacheNGrams) { subs = wordToSubstrings.get(cWord); } if (subs == null) { subs = new ArrayList<String>(); String word = '<' + cWord + '>'; if (flags.lowercaseNGrams) { word = word.toLowerCase(); } if (flags.dehyphenateNGrams) { word = dehyphenate(word); } if (flags.greekifyNGrams) { word = greekify(word); } // minimum length substring is 2 letters (hardwired) // hoist flags.noMidNGrams so only linear in word length for that case if (flags.noMidNGrams) { int max = 
flags.maxNGramLeng >= 0 ? Math.min(flags.maxNGramLeng, word.length()) : word.length(); for (int j = 2; j <= max; j++) { subs.add(intern('#' + word.substring(0, j) + '#')); } int start = flags.maxNGramLeng >= 0 ? Math.max(0, word.length() - flags.maxNGramLeng) : 0; int lenM1 = word.length() - 1; for (int i = start; i < lenM1; i++) { subs.add(intern('#' + word.substring(i) + '#')); } } else { for (int i = 0; i < word.length(); i++) { for (int j = i + 2, max = Math.min(word.length(), i + flags.maxNGramLeng); j <= max; j++) { if (flags.maxNGramLeng >= 0 && j - i > flags.maxNGramLeng) { continue; } subs.add(intern('#' + word.substring(i, j) + '#')); } } } if (flags.cacheNGrams) { wordToSubstrings.put(cWord, subs); } } featuresC.addAll(subs); if (flags.conjoinShapeNGrams) { for (String str : subs) { String feat = str + '-' + cShape + "-CNGram-CS"; featuresC.add(feat); } } } if (flags.useGazettes) { if (flags.sloppyGazette) { Collection<String> entries = wordToGazetteEntries.get(cWord); if (entries != null) { featuresC.addAll(entries); } } if (flags.cleanGazette) { Collection<GazetteInfo> infos = wordToGazetteInfos.get(cWord); if (infos != null) { for (GazetteInfo gInfo : infos) { boolean ok = true; for (int gLoc = 0; gLoc < gInfo.words.length; gLoc++) { ok &= gInfo.words[gLoc].equals(getWord(cInfo.get(loc + gLoc - gInfo.loc))); } if (ok) { featuresC.add(gInfo.feature); } } } } } if ((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || (flags.useShapeStrings)) { featuresC.add(cShape + "-TYPE"); if (flags.useTypeSeqs) { featuresC.add(pShape + "-PTYPE"); featuresC.add(nShape + "-NTYPE"); featuresC.add(pWord + "..." + cShape + "-PW_CTYPE"); featuresC.add(cShape + "..." + nWord + "-NW_CTYPE"); featuresC.add(pShape + "..." + cShape + "-PCTYPE"); featuresC.add(cShape + "..." + nShape + "-CNTYPE"); featuresC.add(pShape + "..." + cShape + "..." 
+ nShape + "-PCNTYPE"); } } if (flags.useLastRealWord) { if (pWord.length() <= 3) { // extending this to check for 2 short words doesn't seem to help.... featuresC.add(getWord(p2) + "..." + cShape + "-PPW_CTYPE"); } } if (flags.useNextRealWord) { if (nWord.length() <= 3) { // extending this to check for 2 short words doesn't seem to help.... featuresC.add(getWord(n2) + "..." + cShape + "-NNW_CTYPE"); } } if (flags.useOccurrencePatterns) { featuresC.addAll(occurrencePatterns(cInfo, loc)); } if (flags.useDisjunctive) { for (int i = 1; i <= flags.disjunctionWidth; i++) { CoreLabel dn = cInfo.get(loc + i); CoreLabel dp = cInfo.get(loc - i); featuresC.add(getWord(dn) + "-DISJN"); if (flags.useDisjunctiveShapeInteraction) { featuresC.add(getWord(dn) + '-' + cShape + "-DISJN-CS"); } featuresC.add(getWord(dp) + "-DISJP"); if (flags.useDisjunctiveShapeInteraction) { featuresC.add(getWord(dp) + '-' + cShape + "-DISJP-CS"); } } } if (flags.useWideDisjunctive) { for (int i = 1; i <= flags.wideDisjunctionWidth; i++) { featuresC.add(getWord(cInfo.get(loc + i)) + "-DISJWN"); featuresC.add(getWord(cInfo.get(loc - i)) + "-DISJWP"); } } if (flags.useEitherSideDisjunctive) { for (int i = 1; i <= flags.disjunctionWidth; i++) { featuresC.add(getWord(cInfo.get(loc + i)) + "-DISJWE"); featuresC.add(getWord(cInfo.get(loc - i)) + "-DISJWE"); } } if (flags.useDisjShape) { for (int i = 1; i <= flags.disjunctionWidth; i++) { featuresC.add(cInfo.get(loc + i).get(CoreAnnotations.ShapeAnnotation.class) + "-NDISJSHAPE"); // featuresC.add(cInfo.get(loc - i).get(CoreAnnotations.ShapeAnnotation.class) + "-PDISJSHAPE"); featuresC.add(cShape + '-' + cInfo.get(loc + i).get(CoreAnnotations.ShapeAnnotation.class) + "-CNDISJSHAPE"); // featuresC.add(c.get(CoreAnnotations.ShapeAnnotation.class) + "-" + cInfo.get(loc - i).get(CoreAnnotations.ShapeAnnotation.class) + "-CPDISJSHAPE"); } } if (flags.useExtraTaggySequences) { if (flags.useTags) { 
featuresC.add(p2.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TTS"); featuresC.add(p3.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + p2.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TTTS"); } if (flags.useDistSim) { featuresC.add(p2.get(CoreAnnotations.DistSimAnnotation.class) + '-' + p.get(CoreAnnotations.DistSimAnnotation.class) + '-' + c.get(CoreAnnotations.DistSimAnnotation.class) + "-DISTSIM_TTS1"); featuresC.add(p3.get(CoreAnnotations.DistSimAnnotation.class) + '-' + p2.get(CoreAnnotations.DistSimAnnotation.class) + '-' + p.get(CoreAnnotations.DistSimAnnotation.class) + '-' + c.get(CoreAnnotations.DistSimAnnotation.class) + "-DISTSIM_TTTS1"); } } if (flags.useMUCFeatures) { featuresC.add(c.get(CoreAnnotations.SectionAnnotation.class) + "-SECTION"); featuresC.add(c.get(CoreAnnotations.WordPositionAnnotation.class) + "-WORD_POSITION"); featuresC.add(c.get(CoreAnnotations.SentencePositionAnnotation.class) + "-SENT_POSITION"); featuresC.add(c.get(CoreAnnotations.ParaPositionAnnotation.class) + "-PARA_POSITION"); featuresC.add(c.get(CoreAnnotations.WordPositionAnnotation.class) + '-' + c.get(CoreAnnotations.ShapeAnnotation.class) + "-WORD_POSITION_SHAPE"); } } else if (flags.useInternal) { if (flags.useWord) { featuresC.add(cWord + "-WORD"); } if (flags.useNGrams) { Collection<String> subs = wordToSubstrings.get(cWord); if (subs == null) { subs = new ArrayList<String>(); String word = '<' + cWord + '>'; if (flags.lowercaseNGrams) { word = word.toLowerCase(); } if (flags.dehyphenateNGrams) { word = dehyphenate(word); } if (flags.greekifyNGrams) { word = greekify(word); } for (int i = 0; i < word.length(); i++) { for (int j = i + 2; j <= word.length(); j++) { if 
(flags.noMidNGrams && i != 0 && j != word.length()) { continue; } if (flags.maxNGramLeng >= 0 && j - i > flags.maxNGramLeng) { continue; } //subs.add(intern("#" + word.substring(i, j) + "#")); subs.add(intern('#' + word.substring(i, j) + '#')); } } if (flags.cacheNGrams) { wordToSubstrings.put(cWord, subs); } } featuresC.addAll(subs); if (flags.conjoinShapeNGrams) { String shape = c.get(CoreAnnotations.ShapeAnnotation.class); for (String str : subs) { String feat = str + '-' + shape + "-CNGram-CS"; featuresC.add(feat); } } } if ((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || (flags.useShapeStrings)) { featuresC.add(cShape + "-TYPE"); } if (flags.useOccurrencePatterns) { featuresC.addAll(occurrencePatterns(cInfo, loc)); } } else if (flags.useExternal) { if (flags.usePrev) { featuresC.add(pWord + "-PW"); } if (flags.useNext) { featuresC.add(nWord + "-NW"); } if (flags.useWordPairs) { featuresC.add(cWord + '-' + pWord + "-W-PW"); featuresC.add(cWord + '-' + nWord + "-W-NW"); } if (flags.useSymWordPairs) { featuresC.add(pWord + '-' + nWord + "-SWORDS"); } if ((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || (flags.useShapeStrings)) { if (flags.useTypeSeqs) { featuresC.add(pShape + "-PTYPE"); featuresC.add(nShape + "-NTYPE"); featuresC.add(pWord + "..." + cShape + "-PW_CTYPE"); featuresC.add(cShape + "..." + nWord + "-NW_CTYPE"); if (flags.maxLeft > 0) featuresC.add(pShape + "..." + cShape + "-PCTYPE"); // this one just isn't useful, at least given c,pc,s,ps. Might be useful 0th-order featuresC.add(cShape + "..." + nShape + "-CNTYPE"); featuresC.add(pShape + "..." + cShape + "..." + nShape + "-PCNTYPE"); } } if (flags.useLastRealWord) { if (pWord.length() <= 3) { featuresC.add(getWord(p2) + "..." + cShape + "-PPW_CTYPE"); } } if (flags.useNextRealWord) { if (nWord.length() <= 3) { featuresC.add(getWord(n2) + "..." 
+ cShape + "-NNW_CTYPE"); } } if (flags.useDisjunctive) { for (int i = 1; i <= flags.disjunctionWidth; i++) { CoreLabel dn = cInfo.get(loc + i); CoreLabel dp = cInfo.get(loc - i); featuresC.add(getWord(dn) + "-DISJN"); if (flags.useDisjunctiveShapeInteraction) { featuresC.add(getWord(dn) + '-' + cShape + "-DISJN-CS"); } featuresC.add(getWord(dp) + "-DISJP"); if (flags.useDisjunctiveShapeInteraction) { featuresC.add(getWord(dp) + '-' + cShape + "-DISJP-CS"); } } } if (flags.useWideDisjunctive) { for (int i = 1; i <= flags.wideDisjunctionWidth; i++) { featuresC.add(getWord(cInfo.get(loc + i)) + "-DISJWN"); featuresC.add(getWord(cInfo.get(loc - i)) + "-DISJWP"); } } if (flags.useDisjShape) { for (int i = 1; i <= flags.disjunctionWidth; i++) { featuresC.add(cInfo.get(loc + i).get(CoreAnnotations.ShapeAnnotation.class) + "-NDISJSHAPE"); // featuresC.add(cInfo.get(loc - i).get(CoreAnnotations.ShapeAnnotation.class) + "-PDISJSHAPE"); featuresC.add(c.get(CoreAnnotations.ShapeAnnotation.class) + '-' + cInfo.get(loc + i).get(CoreAnnotations.ShapeAnnotation.class) + "-CNDISJSHAPE"); // featuresC.add(c.get(CoreAnnotations.ShapeAnnotation.class) + "-" + cInfo.get(loc - i).get(CoreAnnotations.ShapeAnnotation.class) + "-CPDISJSHAPE"); } } } // Stuff to add binary features from the additional columns if (flags.twoStage) { featuresC.add(c.get(Bin1Annotation.class) + "-BIN1"); featuresC.add(c.get(Bin2Annotation.class) + "-BIN2"); featuresC.add(c.get(Bin3Annotation.class) + "-BIN3"); featuresC.add(c.get(Bin4Annotation.class) + "-BIN4"); featuresC.add(c.get(Bin5Annotation.class) + "-BIN5"); featuresC.add(c.get(Bin6Annotation.class) + "-BIN6"); } if (flags.useIfInteger) { try { int val = Integer.parseInt(cWord); if (val > 0) featuresC.add("POSITIVE_INTEGER"); else if (val < 0) featuresC.add("NEGATIVE_INTEGER"); // System.err.println("FOUND INTEGER"); } catch (NumberFormatException e) { // not an integer value, nothing to do } } //Stuff to add arbitrary features if 
(flags.useGenericFeatures) { //see if we need to cache the keys if (genericAnnotationKeys == null) { makeGenericKeyCache(c); } //now look through the cached keys for (Class key : genericAnnotationKeys) { //System.err.println("Adding feature: " + CoreLabel.genericValues.get(key) + " with value " + c.get(key)); if (c.get(key) != null && c.get(key) instanceof Collection) { for (Object ob : (Collection) c.get(key)) { featuresC.add(ob + "-" + CoreLabel.genericValues.get(key)); } } else { featuresC.add(c.get(key) + "-" + CoreLabel.genericValues.get(key)); } } } if (flags.useTopics) { //featuresC.add(p.get(CoreAnnotations.TopicAnnotation.class) + '-' + cWord + "--CWORD"); featuresC.add(c.get(CoreAnnotations.TopicAnnotation.class) + "-TopicID"); featuresC.add(p.get(CoreAnnotations.TopicAnnotation.class) + "-PTopicID"); featuresC.add(n.get(CoreAnnotations.TopicAnnotation.class) + "-NTopicID"); //featuresC.add(p.get(CoreAnnotations.TopicAnnotation.class) + '-' + c.get(CoreAnnotations.TopicAnnotation.class) + '-' + n.get(CoreAnnotations.TopicAnnotation.class) + "-PCNTopicID"); //featuresC.add(c.get(CoreAnnotations.TopicAnnotation.class) + '-' + n.get(CoreAnnotations.TopicAnnotation.class) + "-CNTopicID"); //featuresC.add(p.get(CoreAnnotations.TopicAnnotation.class) + '-' + c.get(CoreAnnotations.TopicAnnotation.class) + "-PCTopicID"); //featuresC.add(c.get(CoreAnnotations.TopicAnnotation.class) + cShape + "-TopicID-SH"); //asdasd } // NER tag annotations from a previous NER system if (c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) != null) { featuresC.add(c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + "-CStackedNERTag"); featuresC.add(cWord + "-" + c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + "-WCStackedNERTag"); if (flags.useNext) { featuresC.add(c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + '-' + n.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + "-CNStackedNERTag"); featuresC.add(cWord + "-" + 
c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + '-' + n.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + "-WCNStackedNERTag"); if (flags.usePrev) { featuresC.add(p.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + '-' + c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + '-' + n.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + "-PCNStackedNERTag"); featuresC.add(p.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + '-' + cWord + " -" + c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + '-' + n.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + "-PWCNStackedNERTag"); } } if (flags.usePrev) { featuresC.add(p.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + '-' + c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + "-PCStackedNERTag"); } } if (flags.useWordnetFeatures) featuresC.add(c.get(CoreAnnotations.WordnetSynAnnotation.class) + "-WordnetSyn"); if (flags.useProtoFeatures) featuresC.add(c.get(CoreAnnotations.ProtoAnnotation.class) + "-Proto"); if (flags.usePhraseWordTags) featuresC.add(c.get(CoreAnnotations.PhraseWordsTagAnnotation.class) + "-PhraseTag"); if (flags.usePhraseWords) { for (String w : c.get(CoreAnnotations.PhraseWordsAnnotation.class)) featuresC.add(w + "-PhraseWord"); } if (flags.useCommonWordsFeature) featuresC.add(c.get(CoreAnnotations.CommonWordsAnnotation.class)); if (flags.useRadical && cWord.length() > 0) { if (cWord.length() == 1) { featuresC.add(RadicalMap.getRadical(cWord.charAt(0)) + "-SINGLE-CHAR-RADICAL"); } else { featuresC.add(RadicalMap.getRadical(cWord.charAt(0)) + "-START-RADICAL"); featuresC.add(RadicalMap.getRadical(cWord.charAt(cWord.length() - 1)) + "-END-RADICAL"); } for (int i = 0; i < cWord.length(); ++i) { featuresC.add(RadicalMap.getRadical(cWord.charAt(i)) + "-RADICAL"); } } if (flags.splitWordRegex != null && !flags.splitWordRegex.isEmpty()) { String[] ws = c.word().split(flags.splitWordRegex); for (String s : 
ws) { featuresC.add(s + "-SPLITWORD"); } } return featuresC; }
From source file:de.iisys.ocr.pos.CustomNERFeatureFactory.java
License:Open Source License
protected Collection<String> featuresCpC(PaddedList<IN> cInfo, int loc) { CoreLabel p = cInfo.get(loc - 1);//www. j av a 2 s. co m CoreLabel c = cInfo.get(loc); CoreLabel n = cInfo.get(loc + 1); String cWord = getWord(c); String pWord = getWord(p); String cDS = c.getString(CoreAnnotations.DistSimAnnotation.class); String pDS = p.getString(CoreAnnotations.DistSimAnnotation.class); String cShape = c.getString(CoreAnnotations.ShapeAnnotation.class); String pShape = p.getString(CoreAnnotations.ShapeAnnotation.class); Collection<String> featuresCpC = new ArrayList<String>(); if (flags.noEdgeFeature) return featuresCpC; if (flags.transitionEdgeOnly) { featuresCpC.add("PSEQ"); return featuresCpC; } if (flags.useNeighborNGrams) { int maxLen = pWord.length(); if (flags.maxNGramLeng >= 0 && flags.maxNGramLeng < maxLen) { maxLen = flags.maxNGramLeng; } for (int len = 1; len <= maxLen; ++len) { featuresCpC.add(pWord.substring(0, len) + "-PREVIOUS-PREFIX"); } for (int pos = pWord.length() - maxLen; pos < pWord.length(); ++pos) { featuresCpC.add(pWord.substring(pos, pWord.length()) + "-PREVIOUS-SUFFIX"); } maxLen = cWord.length(); if (flags.maxNGramLeng >= 0 && flags.maxNGramLeng < maxLen) { maxLen = flags.maxNGramLeng; } for (int len = 1; len <= maxLen; ++len) { featuresCpC.add(cWord.substring(0, len) + "-CURRENT-PREFIX"); } for (int pos = cWord.length() - maxLen; pos < cWord.length(); ++pos) { featuresCpC.add(cWord.substring(pos, cWord.length()) + "-CURRENT-SUFFIX"); } } if (flags.useInternal && flags.useExternal) { if (flags.useOrdinal) { if (isOrdinal(cInfo, loc)) { featuresCpC.add("C_ORDINAL"); if (isOrdinal(cInfo, loc - 1)) { featuresCpC.add("PC_ORDINAL"); } } if (isOrdinal(cInfo, loc - 1)) { featuresCpC.add("P_ORDINAL"); } } if (flags.useAbbr || flags.useMinimalAbbr) { featuresCpC.add(p.get(CoreAnnotations.AbbrAnnotation.class) + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + "-PABBRANS"); } if (flags.useAbbr1 || flags.useMinimalAbbr1) { if 
(!c.get(CoreAnnotations.AbbrAnnotation.class).equals("XX")) { featuresCpC.add(p.get(CoreAnnotations.AbbrAnnotation.class) + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + "-PABBRANS"); } } if (flags.useChunkySequences) { featuresCpC.add(p.get(CoreAnnotations.ChunkAnnotation.class) + '-' + c.get(CoreAnnotations.ChunkAnnotation.class) + '-' + n.get(CoreAnnotations.ChunkAnnotation.class) + "-PCNCHUNK"); } if (flags.usePrev) { if (flags.useSequences && flags.usePrevSequences) { featuresCpC.add("PSEQ"); featuresCpC.add(cWord + "-PSEQW"); /*if ( ! flags.strictGoodCoNLL) { featuresCpC.add(pWord+ '-' +cWord + "-PSEQW2"); // added later after goodCoNLL featuresCpC.add(pWord + "-PSEQpW"); // added later after goodCoNLL } if (flags.useDistSim) { featuresCpC.add(pDS + "-PSEQpDS"); featuresCpC.add(cDS + "-PSEQcDS"); featuresCpC.add(pDS+ '-' +cDS + "-PSEQpcDS"); } if (((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || flags.useShapeStrings)) { if ( ! flags.strictGoodCoNLL) { // These ones were added later after goodCoNLL featuresCpC.add(pShape + "-PSEQpS"); featuresCpC.add(cShape + "-PSEQcS"); } if (flags.strictGoodCoNLL && ! 
flags.removeStrictGoodCoNLLDuplicates) { featuresCpC.add(pShape + '-' + cShape + "-PSEQpcS"); // Duplicate (in goodCoNLL orig, see -TYPES below) } }*/ } } if (((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || flags.useShapeStrings) && flags.useTypeSeqs && (flags.useTypeSeqs2 || flags.useTypeSeqs3)) { if (flags.useTypeSeqs3) { featuresCpC.add(pShape + '-' + cShape + '-' + n.get(CoreAnnotations.ShapeAnnotation.class) + "-PCNSHAPES"); } if (flags.useTypeSeqs2) { featuresCpC.add(pShape + '-' + cShape + "-TYPES"); // this duplicates PSEQpcS above } if (flags.useYetMoreCpCShapes) { String p2Shape = cInfo.get(loc - 2).getString(CoreAnnotations.ShapeAnnotation.class); featuresCpC.add(p2Shape + '-' + pShape + '-' + cShape + "-YMS"); featuresCpC.add(pShape + '-' + cShape + "-" + n.getString(CoreAnnotations.ShapeAnnotation.class) + "-YMSPCN"); } } if (flags.useTypeySequences) { featuresCpC.add(cShape + "-TPS2"); featuresCpC.add(n.get(CoreAnnotations.ShapeAnnotation.class) + "-TNS1"); // featuresCpC.add(pShape) + "-" + cShape) + "-TPS"); // duplicates -TYPES, so now omitted; you may need to slightly increase sigma to duplicate previous results, however. 
} if (flags.useTaggySequences) { if (flags.useTags) { featuresCpC.add(p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TS"); } if (flags.useDistSim) { featuresCpC.add(p.get(CoreAnnotations.DistSimAnnotation.class) + '-' + c.get(CoreAnnotations.DistSimAnnotation.class) + "-DISTSIM_TS1"); } } if (flags.useParenMatching) { if (flags.useReverse) { if (cWord.equals("(") || cWord.equals("[") || cWord.equals("-LRB-")) { if (pWord.equals(")") || pWord.equals("]") || pWord.equals("-RRB-")) { featuresCpC.add("PAREN-MATCH"); } } } else { if (cWord.equals(")") || cWord.equals("]") || cWord.equals("-RRB-")) { if (pWord.equals("(") || pWord.equals("[") || pWord.equals("-LRB-")) { featuresCpC.add("PAREN-MATCH"); } } } } if (flags.useEntityTypeSequences) { featuresCpC.add(p.get(CoreAnnotations.EntityTypeAnnotation.class) + '-' + c.get(CoreAnnotations.EntityTypeAnnotation.class) + "-ETSEQ"); } if (flags.useURLSequences) { featuresCpC.add(p.get(CoreAnnotations.IsURLAnnotation.class) + '-' + c.get(CoreAnnotations.IsURLAnnotation.class) + "-URLSEQ"); } } else if (flags.useInternal) { if (flags.useSequences && flags.usePrevSequences) { featuresCpC.add("PSEQ"); featuresCpC.add(cWord + "-PSEQW"); } if (flags.useTypeySequences) { featuresCpC.add(cShape + "-TPS2"); } } else if (flags.useExternal) { if (((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || flags.useShapeStrings) && flags.useTypeSeqs && (flags.useTypeSeqs2 || flags.useTypeSeqs3)) { if (flags.useTypeSeqs3) { featuresCpC.add(pShape + '-' + cShape + '-' + n.get(CoreAnnotations.ShapeAnnotation.class) + "-PCNSHAPES"); } if (flags.useTypeSeqs2) { featuresCpC.add(pShape + '-' + cShape + "-TYPES"); } } if (flags.useTypeySequences) { featuresCpC.add(n.get(CoreAnnotations.ShapeAnnotation.class) + "-TNS1"); featuresCpC.add(pShape + '-' + c.get(CoreAnnotations.ShapeAnnotation.class) + "-TPS"); } } return featuresCpC; }
From source file:de.iisys.ocr.pos.CustomNERFeatureFactory.java
License:Open Source License
protected Collection<String> featuresCpCp2C(PaddedList<IN> cInfo, int loc) { CoreLabel c = cInfo.get(loc);//from w w w . j a va 2 s.co m CoreLabel p = cInfo.get(loc - 1); CoreLabel p2 = cInfo.get(loc - 2); String pWord = getWord(p); // String p2Word = getWord(p2); Collection<String> featuresCpCp2C = new ArrayList<String>(); if (flags.useInternal && flags.useExternal) { /*if (flags.strictGoodCoNLL && ! flags.removeStrictGoodCoNLLDuplicates && flags.useTypeySequences && flags.maxLeft >= 2) { // this feature duplicates -TYPETYPES below, so probably don't include it, but it was in original tests of CMM goodCoNLL featuresCpCp2C.add(p2.get(CoreAnnotations.ShapeAnnotation.class) + '-' + p.get(CoreAnnotations.ShapeAnnotation.class) + '-' + c.get(CoreAnnotations.ShapeAnnotation.class) + "-TTPS"); }*/ if (flags.useAbbr) { featuresCpCp2C.add(p2.get(CoreAnnotations.AbbrAnnotation.class) + '-' + p.get(CoreAnnotations.AbbrAnnotation.class) + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + "-2PABBRANS"); } if (flags.useChunks) { featuresCpCp2C.add(p2.get(CoreAnnotations.ChunkAnnotation.class) + '-' + p.get(CoreAnnotations.ChunkAnnotation.class) + '-' + c.get(CoreAnnotations.ChunkAnnotation.class) + "-2PCHUNKS"); } if (flags.useLongSequences) { featuresCpCp2C.add("PPSEQ"); } if (flags.useBoundarySequences && pWord.equals(CoNLLDocumentReaderAndWriter.BOUNDARY)) { featuresCpCp2C.add("BNDRY-SPAN-PPSEQ"); } // This more complex consistency checker didn't help! 
// if (flags.useBoundarySequences) { // // try enforce consistency over "and" and "," as well as boundary // if (pWord.equals(CoNLLDocumentIteratorFactory.BOUNDARY) || // pWord.equalsIgnoreCase("and") || pWord.equalsIgnoreCase("or") || // pWord.equals(",")) { // } // } if (flags.useTaggySequences) { if (flags.useTags) { featuresCpCp2C.add(p2.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TTS"); if (flags.useTaggySequencesShapeInteraction) { featuresCpCp2C.add(p2.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + c.get(CoreAnnotations.ShapeAnnotation.class) + "-TTS-CS"); } } if (flags.useDistSim) { featuresCpCp2C.add(p2.get(CoreAnnotations.DistSimAnnotation.class) + '-' + p.get(CoreAnnotations.DistSimAnnotation.class) + '-' + c.get(CoreAnnotations.DistSimAnnotation.class) + "-DISTSIM_TTS1"); if (flags.useTaggySequencesShapeInteraction) { featuresCpCp2C.add(p2.get(CoreAnnotations.DistSimAnnotation.class) + '-' + p.get(CoreAnnotations.DistSimAnnotation.class) + '-' + c.get(CoreAnnotations.DistSimAnnotation.class) + '-' + c.get(CoreAnnotations.ShapeAnnotation.class) + "-DISTSIM_TTS1-CS"); } } } if (((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || flags.useShapeStrings) && flags.useTypeSeqs && flags.useTypeSeqs2 && flags.maxLeft >= 2) { String cShape = c.get(CoreAnnotations.ShapeAnnotation.class); String pShape = p.get(CoreAnnotations.ShapeAnnotation.class); String p2Shape = p2.get(CoreAnnotations.ShapeAnnotation.class); featuresCpCp2C.add(p2Shape + '-' + pShape + '-' + cShape + "-TYPETYPES"); } } else if (flags.useInternal) { if (flags.useLongSequences) { featuresCpCp2C.add("PPSEQ"); } } else if (flags.useExternal) { if (flags.useLongSequences) { featuresCpCp2C.add("PPSEQ"); } 
if (((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || flags.useShapeStrings) && flags.useTypeSeqs && flags.useTypeSeqs2 && flags.maxLeft >= 2) { String cShape = c.get(CoreAnnotations.ShapeAnnotation.class); String pShape = p.get(CoreAnnotations.ShapeAnnotation.class); String p2Shape = p2.get(CoreAnnotations.ShapeAnnotation.class); featuresCpCp2C.add(p2Shape + '-' + pShape + '-' + cShape + "-TYPETYPES"); } } return featuresCpCp2C; }
From source file:de.iisys.ocr.pos.CustomNERFeatureFactory.java
License:Open Source License
/**
 * Builds the third-order edge features, i.e. features over the clique formed by the
 * current token and the three tokens before it.
 *
 * <p>Everything here requires {@code flags.maxLeft >= 3}. The tag and distsim 4-gram
 * features additionally require {@code flags.useTaggySequences} and are suppressed by
 * {@code flags.dontExtendTaggy}.
 *
 * @param cInfo the padded token sequence; positions {@code loc - 3} to {@code loc} are read
 * @param loc   index of the current token in {@code cInfo}
 * @return the feature strings for this clique, in insertion order
 */
protected Collection<String> featuresCpCp2Cp3C(PaddedList<IN> cInfo, int loc) {
    final CoreLabel cur = cInfo.get(loc);
    final CoreLabel prev = cInfo.get(loc - 1);
    final CoreLabel prev2 = cInfo.get(loc - 2);
    final CoreLabel prev3 = cInfo.get(loc - 3);

    final Collection<String> feats = new ArrayList<String>();
    final boolean wideWindow = flags.maxLeft >= 3;

    if (flags.useTaggySequences && wideWindow && !flags.dontExtendTaggy) {
        final boolean withShape = flags.useTaggySequencesShapeInteraction;

        if (flags.useTags) {
            final String tagSeq = prev3.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-'
                    + prev2.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-'
                    + prev.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-'
                    + cur.getString(CoreAnnotations.PartOfSpeechAnnotation.class);
            feats.add(tagSeq + "-TTTS");
            if (withShape) {
                feats.add(tagSeq + '-' + cur.get(CoreAnnotations.ShapeAnnotation.class) + "-TTTS-CS");
            }
        }

        if (flags.useDistSim) {
            final String distSimSeq = prev3.get(CoreAnnotations.DistSimAnnotation.class) + '-'
                    + prev2.get(CoreAnnotations.DistSimAnnotation.class) + '-'
                    + prev.get(CoreAnnotations.DistSimAnnotation.class) + '-'
                    + cur.get(CoreAnnotations.DistSimAnnotation.class);
            feats.add(distSimSeq + "-DISTSIM_TTTS1");
            if (withShape) {
                feats.add(distSimSeq + '-' + cur.get(CoreAnnotations.ShapeAnnotation.class)
                        + "-DISTSIM_TTTS1-CS");
            }
        }
    }

    if (!wideWindow) {
        return feats;
    }

    if (flags.useLongSequences) {
        feats.add("PPPSEQ");
    }
    // getWord is only consulted when boundary sequences are enabled.
    if (flags.useBoundarySequences && getWord(prev).equals(CoNLLDocumentReaderAndWriter.BOUNDARY)) {
        feats.add("BNDRY-SPAN-PPPSEQ");
    }

    return feats;
}
From source file:knu.univ.lingvo.coref.MUCMentionExtractor.java
License:Open Source License
@Override public Document nextDoc() throws Exception { List<List<CoreLabel>> allWords = new ArrayList<List<CoreLabel>>(); List<Tree> allTrees = new ArrayList<Tree>(); List<List<Mention>> allGoldMentions = new ArrayList<List<Mention>>(); List<List<Mention>> allPredictedMentions; List<CoreMap> allSentences = new ArrayList<CoreMap>(); Annotation docAnno = new Annotation(""); Pattern docPattern = Pattern.compile("<DOC>(.*?)</DOC>", Pattern.DOTALL + Pattern.CASE_INSENSITIVE); Pattern sentencePattern = Pattern.compile("(<s>|<hl>|<dd>|<DATELINE>)(.*?)(</s>|</hl>|</dd>|</DATELINE>)", Pattern.DOTALL + Pattern.CASE_INSENSITIVE); Matcher docMatcher = docPattern.matcher(fileContents); if (!docMatcher.find(currentOffset)) return null; currentOffset = docMatcher.end();// ww w .j av a2 s. c o m String doc = docMatcher.group(1); Matcher sentenceMatcher = sentencePattern.matcher(doc); String ner = null; //Maintain current document ID. Pattern docIDPattern = Pattern.compile("<DOCNO>(.*?)</DOCNO>", Pattern.DOTALL + Pattern.CASE_INSENSITIVE); Matcher docIDMatcher = docIDPattern.matcher(doc); if (docIDMatcher.find()) currentDocumentID = docIDMatcher.group(1); else currentDocumentID = "documentAfter " + currentDocumentID; while (sentenceMatcher.find()) { String sentenceString = sentenceMatcher.group(2); List<CoreLabel> words = tokenizerFactory.getTokenizer(new StringReader(sentenceString)).tokenize(); // FIXING TOKENIZATION PROBLEMS for (int i = 0; i < words.size(); i++) { CoreLabel w = words.get(i); if (i > 0 && w.word().equals("$")) { if (!words.get(i - 1).word().endsWith("PRP") && !words.get(i - 1).word().endsWith("WP")) continue; words.get(i - 1).set(CoreAnnotations.TextAnnotation.class, words.get(i - 1).word() + "$"); words.remove(i); i--; } else if (w.word().equals("\\/")) { if (words.get(i - 1).word().equals("</COREF>")) continue; w.set(CoreAnnotations.TextAnnotation.class, words.get(i - 1).word() + "\\/" + words.get(i + 1).word()); words.remove(i + 1); words.remove(i - 1); } } 
// END FIXING TOKENIZATION PROBLEMS List<CoreLabel> sentence = new ArrayList<CoreLabel>(); // MUC accepts embedded coref mentions, so we need to keep a stack for the mentions currently open Stack<Mention> stack = new Stack<Mention>(); List<Mention> mentions = new ArrayList<Mention>(); allWords.add(sentence); allGoldMentions.add(mentions); for (CoreLabel word : words) { String w = word.get(CoreAnnotations.TextAnnotation.class); // found regular token: WORD/POS if (!w.startsWith("<") && w.contains("\\/") && w.lastIndexOf("\\/") != w.length() - 2) { int i = w.lastIndexOf("\\/"); String w1 = w.substring(0, i); // we do NOT set POS info here. We take the POS tags from the parser! word.set(CoreAnnotations.TextAnnotation.class, w1); word.remove(CoreAnnotations.OriginalTextAnnotation.class); if (Constants.USE_GOLD_NE) { if (ner != null) { word.set(CoreAnnotations.NamedEntityTagAnnotation.class, ner); } else { word.set(CoreAnnotations.NamedEntityTagAnnotation.class, "O"); } } sentence.add(word); } // found the start SGML tag for a NE, e.g., "<ORGANIZATION>" else if (w.startsWith("<") && !w.startsWith("<COREF") && !w.startsWith("</")) { Pattern nerPattern = Pattern.compile("<(.*?)>"); Matcher m = nerPattern.matcher(w); m.find(); ner = m.group(1); } // found the end SGML tag for a NE, e.g., "</ORGANIZATION>" else if (w.startsWith("</") && !w.startsWith("</COREF")) { Pattern nerPattern = Pattern.compile("</(.*?)>"); Matcher m = nerPattern.matcher(w); m.find(); String ner1 = m.group(1); if (ner != null && !ner.equals(ner1)) throw new RuntimeException("Unmatched NE labels in MUC file: " + ner + " v. " + ner1); ner = null; } // found the start SGML tag for a coref mention else if (w.startsWith("<COREF")) { Mention mention = new Mention(); // position of this mention in the sentence mention.startIndex = sentence.size(); // extract GOLD info about this coref chain. 
needed for eval Pattern idPattern = Pattern.compile("ID=\"(.*?)\""); Pattern refPattern = Pattern.compile("REF=\"(.*?)\""); Matcher m = idPattern.matcher(w); m.find(); mention.mentionID = Integer.parseInt(m.group(1)); m = refPattern.matcher(w); if (m.find()) { mention.originalRef = Integer.parseInt(m.group(1)); } // open mention. keep track of all open mentions using the stack stack.push(mention); } // found the end SGML tag for a coref mention else if (w.equals("</COREF>")) { Mention mention = stack.pop(); mention.endIndex = sentence.size(); // this is a closed mention. add it to the final list of mentions // System.err.printf("Found MENTION: ID=%d, REF=%d\n", mention.mentionID, mention.originalRef); mentions.add(mention); } else { word.remove(CoreAnnotations.OriginalTextAnnotation.class); if (Constants.USE_GOLD_NE) { if (ner != null) { word.set(CoreAnnotations.NamedEntityTagAnnotation.class, ner); } else { word.set(CoreAnnotations.NamedEntityTagAnnotation.class, "O"); } } sentence.add(word); } } StringBuilder textContent = new StringBuilder(); for (int i = 0; i < sentence.size(); i++) { CoreLabel w = sentence.get(i); w.set(CoreAnnotations.IndexAnnotation.class, i + 1); w.set(CoreAnnotations.UtteranceAnnotation.class, 0); if (i > 0) textContent.append(" "); textContent.append(w.getString(CoreAnnotations.TextAnnotation.class)); } CoreMap sentCoreMap = new Annotation(textContent.toString()); allSentences.add(sentCoreMap); sentCoreMap.set(CoreAnnotations.TokensAnnotation.class, sentence); } // assign goldCorefClusterID Map<Integer, Mention> idMention = Generics.newHashMap(); // temporary use for (List<Mention> goldMentions : allGoldMentions) { for (Mention m : goldMentions) { idMention.put(m.mentionID, m); } } for (List<Mention> goldMentions : allGoldMentions) { for (Mention m : goldMentions) { if (m.goldCorefClusterID == -1) { if (m.originalRef == -1) m.goldCorefClusterID = m.mentionID; else { int ref = m.originalRef; while (true) { Mention m2 = idMention.get(ref); 
if (m2.goldCorefClusterID != -1) { m.goldCorefClusterID = m2.goldCorefClusterID; break; } else if (m2.originalRef == -1) { m2.goldCorefClusterID = m2.mentionID; m.goldCorefClusterID = m2.goldCorefClusterID; break; } else { ref = m2.originalRef; } } } } } } docAnno.set(CoreAnnotations.SentencesAnnotation.class, allSentences); stanfordProcessor.annotate(docAnno); if (allSentences.size() != allWords.size()) throw new IllegalStateException("allSentences != allWords"); for (int i = 0; i < allSentences.size(); i++) { List<CoreLabel> annotatedSent = allSentences.get(i).get(CoreAnnotations.TokensAnnotation.class); List<CoreLabel> unannotatedSent = allWords.get(i); List<Mention> mentionInSent = allGoldMentions.get(i); for (Mention m : mentionInSent) { m.dependency = allSentences.get(i) .get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class); } if (annotatedSent.size() != unannotatedSent.size()) { throw new IllegalStateException("annotatedSent != unannotatedSent"); } for (int j = 0, sz = annotatedSent.size(); j < sz; j++) { CoreLabel annotatedWord = annotatedSent.get(j); CoreLabel unannotatedWord = unannotatedSent.get(j); if (!annotatedWord.get(CoreAnnotations.TextAnnotation.class) .equals(unannotatedWord.get(CoreAnnotations.TextAnnotation.class))) { throw new IllegalStateException("annotatedWord != unannotatedWord"); } } allWords.set(i, annotatedSent); allTrees.add(allSentences.get(i).get(TreeCoreAnnotations.TreeAnnotation.class)); } // extract predicted mentions if (Constants.USE_GOLD_MENTIONS) allPredictedMentions = allGoldMentions; else allPredictedMentions = mentionFinder.extractPredictedMentions(docAnno, maxID, dictionaries); // add the relevant fields to mentions and order them for coref return arrange(docAnno, allWords, allTrees, allPredictedMentions, allGoldMentions, true); }
From source file:lv.lumii.expressions.Expression.java
License:Open Source License
/**
 * Tokenizes {@code phrase}, prunes the morphological alternatives of each token using
 * what is known about the expression ({@code category}, {@code gender}), runs the
 * morphological tagger over the result and rebuilds {@code expWords} from the wordforms
 * the tagger selected.
 *
 * <p>Side effects visible in this method: {@code expWords} is replaced with a new list,
 * {@code gender} may be reassigned, and the {@code wordforms} lists of the tokenized
 * words are modified in place.
 *
 * @param phrase     the expression text to analyse
 * @param knownLemma forwarded to {@code addExtraPossibilities}; also relaxes the filtering of
 *                   guessed definite adjectives and enables the genitive trimming of the last word
 * @param debug      when {@code true}, intermediate analysis results are printed to {@code System.out}
 */
public void loadUsingTagger(String phrase, boolean knownLemma, boolean debug) {
    expWords = new LinkedList<ExpressionWord>();

    // Workaround for phrases of the form "biedrs-dibinatajs" (word-hyphen-word):
    // surround the hyphen with spaces before tokenizing, except for person names.
    // FIXME - maybe this should be moved into the general inflector?
    if (phrase.matches("\\p{IsLatin}+-\\p{IsLatin}+") && this.category != Category.hum)
        phrase = phrase.replace("-", " - ");

    List<Word> words = Splitting.tokenize(analyzer, phrase);
    for (Word word : words) {
        // Filter the variants, taking into account what we know about the phrase and its category.
        if (debug) {
            System.out.printf("%s normal analysis:\n", word.getToken());
            //word.describe(System.out);
            for (Wordform wf : word.wordforms)
                System.out.printf("\t%s\n", wf.getTag());
        }
        // Fine-tuned guessing that takes named-entity peculiarities into account.
        addExtraPossibilities(word, knownLemma, debug);
        if (debug) {
            System.out.printf("%s generated alternatives:\n", word.getToken());
            for (Wordform wf : word.wordforms)
                System.out.printf("\t%s\n", wf.getTag());
        }
    }

    if (category == Category.hum)
        gender = guessPersonGender(words);
    // NOTE(review): gender.toString() throws if gender is still null here - confirm it is always initialised.
    if (debug)
        System.out.printf("Detected gender : %s\n", gender.toString());

    for (Word word : words) {
        // If the phrase as a whole has a clear gender, discard the alternatives of the 'wrong' gender.
        if (category == Category.hum && gender != Gender.unknown) {
            LinkedList<Wordform> izmetamie = new LinkedList<Wordform>();
            for (Wordform wf : word.wordforms) {
                Gender tempgender = gender; // default option - same as the whole name
                // Exception for compound masculine words used as female surnames, e.g. 'Zaļaiskalns'
                if (gender == Gender.feminine && wf.getToken().endsWith("kalns"))
                    tempgender = Gender.masculine;
                if ((tempgender == Gender.masculine
                        && wf.isMatchingStrong(AttributeNames.i_Gender, AttributeNames.v_Feminine))
                        || (tempgender == Gender.feminine
                                && wf.isMatchingStrong(AttributeNames.i_Gender, AttributeNames.v_Masculine)))
                    izmetamie.add(wf);
            }
            // TODO - this does not check whether every variant has been discarded; in theory
            // guessPersonGender does not allow such cases.
            word.wordforms.removeAll(izmetamie);
        }

        if (category == Category.hum) {
            LinkedList<Wordform> izmetamie = new LinkedList<Wordform>();
            for (Wordform wf : word.wordforms) {
                // Problem: the tagger labels some surnames (e.g. Znaroks) as indefinite
                // adjectives - that reading is acceptable only for definite ones.
                if (wf.isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Adjective)
                        && wf.isMatchingStrong(AttributeNames.i_Definiteness, AttributeNames.v_Indefinite)
                        && wf.isMatchingStrong(AttributeNames.i_CapitalLetters, AttributeNames.v_FirstUpper))
                    izmetamie.add(wf);
                // inflexive -i surnames (Maija Kubli)
                if (wf.isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Adverb))
                    izmetamie.add(wf);
                // We assume that definite-adjective surnames (Platais, Lielais etc.) can only come
                // from in-vocabulary words, or when it is explicitly stated that this is the base
                // form; all others must be treated as noun forms.
                if (wf.isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Adjective)
                        && wf.isMatchingStrong(AttributeNames.i_Definiteness, AttributeNames.v_Definite)
                        && wf.isMatchingStrong(AttributeNames.i_Guess, AttributeNames.v_Ending) && !knownLemma)
                    izmetamie.add(wf);
            }
            word.wordforms.removeAll(izmetamie);
            if (izmetamie.size() > 0 && word.wordforms.size() == 0) {
                // If this process ended up discarding every variant, switch on guessing by ending
                // and accept the guessed forms whose gender does not contradict the phrase gender.
                Word extra_possibilities = analyzer.guessByEnding(word.getToken().toLowerCase(), word.getToken());
                for (Wordform new_wf : extra_possibilities.wordforms) {
                    if ((new_wf.isMatchingWeak(AttributeNames.i_Gender, AttributeNames.v_Masculine)
                            && gender != Gender.feminine)
                            || (new_wf.isMatchingWeak(AttributeNames.i_Gender, AttributeNames.v_Feminine)
                                    && gender != Gender.masculine)) {
                        word.addWordform(new_wf);
                    }
                }
            }
        } else { // category is not Category.hum
            if (category == Category.other && knownLemma && word == words.get(words.size() - 1)) {
                // For non-standard phrases - assuming it is a noun phrase - trim the alternatives
                // of the last word so that it is not tagged e.g. as a plural genitive.
                LinkedList<Wordform> izmetamie = new LinkedList<Wordform>();
                for (Wordform wf : word.wordforms) {
                    if (wf.isMatchingStrong(AttributeNames.i_Case, AttributeNames.v_Genitive))
                        izmetamie.add(wf);
                }
                if (izmetamie.size() < word.wordforms.size()) // only if at least one usable variant remains
                    word.wordforms.removeAll(izmetamie);
            }
        }

        // Blacklist of confusing but unlikely lemmas
        List<String> blacklist = Arrays.asList("vlan?s");
        LinkedList<Wordform> izmetamie = new LinkedList<Wordform>();
        for (Wordform wf : word.wordforms) {
            if (blacklist.contains(wf.getValue(AttributeNames.i_Lemma))) {
                izmetamie.add(wf);
            }
        }
        if (izmetamie.size() < word.wordforms.size()) // only if at least one usable variant remains
            word.wordforms.removeAll(izmetamie);
    }

    /*
    if (category == Category.hum && bothGendersPossible) {
        // FIXME - the "Andra Bērziņa" case, so that the tagger does not decide that one of the
        // words is feminine after all. While the tagger does not always resolve such cases, this is
        // a workaround - assume that if anything is possible, the gender is masculine, because in
        // real data the male:female proportion is 80:20-95:05.
        for (Word word: words) {
            LinkedList<Wordform> izmetamie = new LinkedList<Wordform>();
            for (Wordform wf : word.wordforms) {
                if (wf.isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Noun) &&
                    wf.isMatchingStrong(AttributeNames.i_Gender, AttributeNames.v_Feminine))
                    izmetamie.add(wf);
            }
            if (izmetamie.size() < word.wordforms.size()) // only if at least one usable variant remains
                word.wordforms.removeAll(izmetamie);
        }
    }
    */

    if (debug)
        for (Word word : words) {
            System.out.printf("%s alternatives given to tagger:\n", word.getToken());
            for (Wordform wf : word.wordforms)
                System.out.printf("\t%s\n", wf.getTag());
        }

    List<CoreLabel> sentence = LVMorphologyReaderAndWriter.analyzeSentence2(words);
    // TODO - the tagger is trained on full sentences, not on phrases like these. A specially
    // adapted tagger model could be much more accurate.
    sentence = morphoClassifier.classify(sentence);

    String token;
    Word analysis;
    Wordform maxwf;
    for (CoreLabel label : sentence) {
        token = label.getString(TextAnnotation.class);
        if (token.equals("<s>")) {
            // The tagger looks at a word's neighbourhood; sentence-initial/final words are special,
            // which is signalled by adding <s> at the start/end. Those markers are skipped here.
            continue;
        }
        analysis = label.get(LVMorphologyAnalysis.class);
        // NOTE(review): maxwf is dereferenced below without a null check - confirm that
        // getMatchingWordform cannot return null for tagger output.
        maxwf = analysis.getMatchingWordform(label.getString(AnswerAnnotation.class), false);
        if (maxwf.isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Verb)) {
            // We can assume that entities are noun phrases, so a verb (not a participle) at the end
            // is a tagger glitch (example: 'DPS saraksta').
            // ^^ FIXME - but then why is this checked for *all* words and not only the last one?
            for (Wordform wf : analysis.wordforms) {
                // The last noun wordform in iteration order wins.
                // TODO - this could be smarter: if several forms match, take the most probable one.
                if (wf.isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Noun))
                    maxwf = wf;
            }
        }
        if (debug)
            System.out.printf("%s chosen : %s\n", maxwf.getToken(), maxwf.getTag());
        ExpressionWord tmp = new ExpressionWord(analysis, maxwf);
        expWords.add(tmp);
    }

    // For person names whose gender is still unknown, derive it from the chosen wordforms.
    if (category == Category.hum && gender == Gender.unknown) {
        boolean allMale = true;
        boolean allFemale = true;
        for (ExpressionWord w : expWords) {
            if (w.correctWordform.isMatchingStrong(AttributeNames.i_Gender, AttributeNames.v_Masculine))
                allFemale = false;
            if (w.correctWordform.isMatchingStrong(AttributeNames.i_Gender, AttributeNames.v_Feminine))
                allMale = false;
        }
        // If neither flag was cleared, both assignments run and the feminine one is applied last.
        if (allMale)
            gender = Gender.masculine;
        if (allFemale)
            gender = Gender.feminine;
        if (debug)
            System.out.printf("Final gender : %s\n", gender.toString());
    }
}
From source file:lv.lumii.morphotagger.MorphoCRF.java
License:Open Source License
private static void testData(AbstractSequenceClassifier<CoreLabel> crf, String filename, DocumentReaderAndWriter<CoreLabel> reader) { try {//from w ww.j a va 2 s. c om PrintWriter izeja = new PrintWriter(new OutputStreamWriter(System.out, "UTF-8")); ObjectBank<List<CoreLabel>> documents = crf.makeObjectBankFromFile(filename, reader); int correct_tag = 0; int correct_lemma = 0; int correct_all = 0; int total = 0; Collection<AttributeValues> errors = new LinkedList<AttributeValues>(); for (List<CoreLabel> document : documents) { List<CoreLabel> out = crf.classify(document); System.out.println("-----"); for (CoreLabel word : out) { String token = word.word(); if (token.contains("<s>") || token.contains("</s>")) continue; String answer = word.get(AnswerAnnotation.class); Word analysis = word.get(LVMorphologyAnalysis.class); Wordform maxwf = analysis.getMatchingWordform(word.getString(AnswerAnnotation.class), false); //complain about potential lemma errors String lemma = maxwf.getValue(AttributeNames.i_Lemma); String gold_tag = word.get(GoldAnswerAnnotation.class); String gold_lemma = word.get(LemmaAnnotation.class); // The lemma that's written in the test data AttributeValues gold_tags = MarkupConverter.fromKamolsMarkup(gold_tag); AttributeValues found_tags = MarkupConverter.fromKamolsMarkup(answer); errors.add(compareAVs(gold_tags, found_tags)); total++; if (gold_lemma == null || gold_lemma.equalsIgnoreCase(lemma)) correct_lemma++; else { //System.out.println(String.format("word: %s, tag:%s, gold_lemma: '%s', lemma: '%s'", token, answer, gold_lemma, lemma)); } if (match(gold_tags, found_tags)) { correct_tag++; if (gold_lemma == null) System.out.println("Nav lemmas? 
" + token); if (gold_lemma != null && gold_lemma.equalsIgnoreCase(lemma)) correct_all++; } else { System.out.println( "v?rds: " + token + ", pareizais: " + gold_tag + ", autom?tiskais: " + answer); //compareAVs(pareizie, atrastie).describe(new PrintWriter(System.out)); } } } izeja.printf("\nEvaluation results:\n"); izeja.printf("\tCorrect tag:\t%4.1f%%\t%d\n", correct_tag * 100.0 / total, total - correct_tag); izeja.printf("\tCorrect lemma:\t%4.1f%%\t%d\n", correct_lemma * 100.0 / total, total - correct_lemma); izeja.printf("\tCorrect all:\t%4.1f%%\t%d\n", correct_all * 100.0 / total, total - correct_all); summarizeErrors(errors, izeja); izeja.flush(); } catch (IOException e) { e.printStackTrace(); } }
From source file:lv.lumii.morphotagger.MorphoPipe.java
License:Open Source License
/**
 * Serialises tagged tokens as JSON objects with {@code Word}, {@code Tag} and {@code Lemma}
 * fields and returns the result of {@code formatJSON} as a string.
 *
 * <p>Tokens containing the sentence marker {@code <s>} are skipped. When no wordform matches
 * the tagger's answer, the token is emitted with tag {@code "-"} and the token itself as lemma.
 *
 * @param tokens tagged tokens of one sentence
 * @return the formatted JSON text
 */
private static String output_JSON(List<CoreLabel> tokens) {
    LinkedList<String> tokenJSON = new LinkedList<String>();

    for (CoreLabel word : tokens) {
        String token = word.getString(TextAnnotation.class);
        if (token.contains("<s>"))
            continue;

        Word analysis = word.get(LVMorphologyAnalysis.class);
        Wordform maxwf = analysis.getMatchingWordform(word.getString(AnswerAnnotation.class), false);

        if (maxwf == null) {
            // Handle the missing wordform before anything dereferences it; previously the
            // mini_tag branch ran first and threw before this fallback could be reached.
            tokenJSON.add(String.format("{\"Word\":\"%s\",\"Tag\":\"-\",\"Lemma\":\"%s\"}",
                    JSONValue.escape(token), JSONValue.escape(token)));
            continue;
        }

        if (mini_tag)
            maxwf.removeNonlexicalAttributes();
        tokenJSON.add(String.format("{\"Word\":\"%s\",\"Tag\":\"%s\",\"Lemma\":\"%s\"}",
                JSONValue.escape(token), JSONValue.escape(maxwf.getTag()),
                JSONValue.escape(maxwf.getValue(AttributeNames.i_Lemma))));
    }

    return formatJSON(tokenJSON).toString();
}
From source file:lv.lumii.morphotagger.MorphoPipe.java
License:Open Source License
private static void output_XML(List<CoreLabel> tokens, PrintStream straume) throws IOException { PrintWriter w = new PrintWriter(straume); for (CoreLabel word : tokens) { String token = word.getString(TextAnnotation.class); if (token.contains("<s>")) continue; Word analysis = word.get(LVMorphologyAnalysis.class); Wordform maxwf = analysis.getMatchingWordform(word.getString(AnswerAnnotation.class), false); if (mini_tag) maxwf.removeNonlexicalAttributes(); maxwf.addAttribute("Tag", maxwf.getTag()); maxwf.toXML(w);//from w ww. ja v a 2 s. c om // if (maxwf != null) // tokenJSON.add(String.format("{\"Word\":\"%s\",\"Tag\":\"%s\",\"Lemma\":\"%s\"}", JSONValue.escape(token), JSONValue.escape(maxwf.getTag()), JSONValue.escape(maxwf.getValue(AttributeNames.i_Lemma)))); // else // tokenJSON.add(String.format("{\"Word\":\"%s\",\"Tag\":\"-\",\"Lemma\":\"%s\"}", JSONValue.escape(token), JSONValue.escape(token))); } w.flush(); }