List of usage examples for edu.stanford.nlp.process WordShapeClassifier NOWORDSHAPE
int NOWORDSHAPE
To view the source code for edu.stanford.nlp.process WordShapeClassifier NOWORDSHAPE,
click the Source Link.
From source file:de.iisys.ocr.pos.CustomNERFeatureFactory.java
License:Open Source License
protected Collection<String> featuresC(PaddedList<IN> cInfo, int loc) { CoreLabel p3 = cInfo.get(loc - 3);//from w w w. j a va 2 s .c om CoreLabel p2 = cInfo.get(loc - 2); CoreLabel p = cInfo.get(loc - 1); CoreLabel c = cInfo.get(loc); CoreLabel n = cInfo.get(loc + 1); CoreLabel n2 = cInfo.get(loc + 2); String cWord = getWord(c); String pWord = getWord(p); String nWord = getWord(n); String cShape = c.getString(CoreAnnotations.ShapeAnnotation.class); String pShape = p.getString(CoreAnnotations.ShapeAnnotation.class); String nShape = n.getString(CoreAnnotations.ShapeAnnotation.class); Collection<String> featuresC = new ArrayList<String>(); if (flags.useDistSim) { distSimAnnotate(cInfo); } if (flags.useBagOfWords) { for (IN word : cInfo) { featuresC.add(getWord(word) + "-BAGOFWORDS"); } } if (flags.useDistSim && flags.useMoreTags) { featuresC.add(p.get(CoreAnnotations.DistSimAnnotation.class) + '-' + cWord + "-PDISTSIM-CWORD"); } if (flags.useDistSim) { featuresC.add(c.get(CoreAnnotations.DistSimAnnotation.class) + "-DISTSIM"); } if (flags.useTitle) { Matcher m = titlePattern.matcher(cWord); if (m.matches()) { featuresC.add("IS_TITLE"); } } if (flags.useInternal && flags.useExternal) { if (flags.useWord) { featuresC.add(cWord + "-WORD"); } if (flags.use2W) { featuresC.add(getWord(p2) + "-P2W"); featuresC.add(getWord(n2) + "-N2W"); } if (flags.useLC) { featuresC.add(cWord.toLowerCase() + "-CL"); featuresC.add(pWord.toLowerCase() + "-PL"); featuresC.add(nWord.toLowerCase() + "-NL"); } if (flags.useUnknown) { // for true casing featuresC.add(c.get(CoreAnnotations.UnknownAnnotation.class) + "-UNKNOWN"); featuresC.add(p.get(CoreAnnotations.UnknownAnnotation.class) + "-PUNKNOWN"); featuresC.add(n.get(CoreAnnotations.UnknownAnnotation.class) + "-NUNKNOWN"); } if (flags.useLemmas) { String lem = c.getString(CoreAnnotations.LemmaAnnotation.class); if (!"".equals(lem)) { featuresC.add(lem + "-LEM"); } } if (flags.usePrevNextLemmas) { String plem = 
p.getString(CoreAnnotations.LemmaAnnotation.class); String nlem = n.getString(CoreAnnotations.LemmaAnnotation.class); if (!"".equals(plem)) { featuresC.add(plem + "-PLEM"); } if (!"".equals(nlem)) { featuresC.add(nlem + "-NLEM"); } } if (flags.checkNameList) { try { if (lastNames == null) { lastNames = Generics.newHashSet(); for (String line : ObjectBank.getLineIterator(flags.lastNameList)) { String[] cols = line.split("\\s+"); lastNames.add(cols[0]); } } if (maleNames == null) { maleNames = Generics.newHashSet(); for (String line : ObjectBank.getLineIterator(flags.maleNameList)) { String[] cols = line.split("\\s+"); maleNames.add(cols[0]); } } if (femaleNames == null) { femaleNames = Generics.newHashSet(); for (String line : ObjectBank.getLineIterator(flags.femaleNameList)) { String[] cols = line.split("\\s+"); femaleNames.add(cols[0]); } } String name = cWord.toUpperCase(); if (lastNames.contains(name)) { featuresC.add("LAST_NAME"); } if (maleNames.contains(name)) { featuresC.add("MALE_NAME"); } if (femaleNames.contains(name)) { featuresC.add("FEMALE_NAME"); } } catch (Exception e) { throw new RuntimeException(e); } } if (flags.binnedLengths != null) { int len = cWord.length(); String featureName = null; for (int i = 0; i <= flags.binnedLengths.length; i++) { if (i == flags.binnedLengths.length) { featureName = "Len-" + flags.binnedLengths[flags.binnedLengths.length - 1] + "-Inf"; } else if (len <= flags.binnedLengths[i]) { featureName = "Len-" + ((i == 0) ? 
1 : flags.binnedLengths[i - 1]) + '-' + flags.binnedLengths[i]; break; } } featuresC.add(featureName); } if (flags.useABGENE) { featuresC.add(c.get(CoreAnnotations.AbgeneAnnotation.class) + "-ABGENE"); featuresC.add(p.get(CoreAnnotations.AbgeneAnnotation.class) + "-PABGENE"); featuresC.add(n.get(CoreAnnotations.AbgeneAnnotation.class) + "-NABGENE"); } if (flags.useABSTRFreqDict) { featuresC.add(c.get(CoreAnnotations.AbstrAnnotation.class) + "-ABSTRACT" + c.get(CoreAnnotations.FreqAnnotation.class) + "-FREQ" + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG"); featuresC.add(c.get(CoreAnnotations.AbstrAnnotation.class) + "-ABSTRACT" + c.get(CoreAnnotations.DictAnnotation.class) + "-DICT" + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG"); featuresC.add(c.get(CoreAnnotations.AbstrAnnotation.class) + "-ABSTRACT" + c.get(CoreAnnotations.DictAnnotation.class) + "-DICT" + c.get(CoreAnnotations.FreqAnnotation.class) + "-FREQ" + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG"); } if (flags.useABSTR) { featuresC.add(c.get(CoreAnnotations.AbstrAnnotation.class) + "-ABSTRACT"); featuresC.add(p.get(CoreAnnotations.AbstrAnnotation.class) + "-PABSTRACT"); featuresC.add(n.get(CoreAnnotations.AbstrAnnotation.class) + "-NABSTRACT"); } if (flags.useGENIA) { featuresC.add(c.get(CoreAnnotations.GeniaAnnotation.class) + "-GENIA"); featuresC.add(p.get(CoreAnnotations.GeniaAnnotation.class) + "-PGENIA"); featuresC.add(n.get(CoreAnnotations.GeniaAnnotation.class) + "-NGENIA"); } if (flags.useWEBFreqDict) { featuresC.add(c.get(CoreAnnotations.WebAnnotation.class) + "-WEB" + c.get(CoreAnnotations.FreqAnnotation.class) + "-FREQ" + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG"); featuresC.add(c.get(CoreAnnotations.WebAnnotation.class) + "-WEB" + c.get(CoreAnnotations.DictAnnotation.class) + "-DICT" + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG"); 
featuresC.add(c.get(CoreAnnotations.WebAnnotation.class) + "-WEB" + c.get(CoreAnnotations.DictAnnotation.class) + "-DICT" + c.get(CoreAnnotations.FreqAnnotation.class) + "-FREQ" + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG"); } if (flags.useWEB) { featuresC.add(c.get(CoreAnnotations.WebAnnotation.class) + "-WEB"); featuresC.add(p.get(CoreAnnotations.WebAnnotation.class) + "-PWEB"); featuresC.add(n.get(CoreAnnotations.WebAnnotation.class) + "-NWEB"); } if (flags.useIsURL) { featuresC.add(c.get(CoreAnnotations.IsURLAnnotation.class) + "-ISURL"); } if (flags.useEntityRule) { featuresC.add(c.get(CoreAnnotations.EntityRuleAnnotation.class) + "-ENTITYRULE"); } if (flags.useEntityTypes) { featuresC.add(c.get(CoreAnnotations.EntityTypeAnnotation.class) + "-ENTITYTYPE"); } if (flags.useIsDateRange) { featuresC.add(c.get(CoreAnnotations.IsDateRangeAnnotation.class) + "-ISDATERANGE"); } if (flags.useABSTRFreq) { featuresC.add(c.get(CoreAnnotations.AbstrAnnotation.class) + "-ABSTRACT" + c.get(CoreAnnotations.FreqAnnotation.class) + "-FREQ"); } if (flags.useFREQ) { featuresC.add(c.get(CoreAnnotations.FreqAnnotation.class) + "-FREQ"); } if (flags.useMoreTags) { featuresC.add( p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + cWord + "-PTAG-CWORD"); } if (flags.usePosition) { featuresC.add(c.get(CoreAnnotations.PositionAnnotation.class) + "-POSITION"); } if (flags.useBeginSent) { String pos = c.get(CoreAnnotations.PositionAnnotation.class); if ("0".equals(pos)) { featuresC.add("BEGIN-SENT"); featuresC.add(cShape + "-BEGIN-SENT"); } else if (Integer.toString(cInfo.size() - 1).equals(pos)) { featuresC.add("END-SENT"); featuresC.add(cShape + "-END-SENT"); } else { featuresC.add("IN-SENT"); featuresC.add(cShape + "-IN-SENT"); } } if (flags.useTags) { featuresC.add(c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG"); } if (flags.useOrdinal) { if (isOrdinal(cInfo, loc)) { featuresC.add("C_ORDINAL"); if (isOrdinal(cInfo, loc - 1)) { 
//System.err.print(getWord(p) + " "); featuresC.add("PC_ORDINAL"); } //System.err.println(cWord); } if (isOrdinal(cInfo, loc - 1)) { featuresC.add("P_ORDINAL"); } } if (flags.usePrev) { featuresC.add(pWord + "-PW"); if (flags.useTags) { featuresC.add(p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-PTAG"); } if (flags.useDistSim) { featuresC.add(p.get(CoreAnnotations.DistSimAnnotation.class) + "-PDISTSIM"); } if (flags.useIsURL) { featuresC.add(p.get(CoreAnnotations.IsURLAnnotation.class) + "-PISURL"); } if (flags.useEntityTypes) { featuresC.add(p.get(CoreAnnotations.EntityTypeAnnotation.class) + "-PENTITYTYPE"); } } if (flags.useNext) { featuresC.add(nWord + "-NW"); if (flags.useTags) { featuresC.add(n.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-NTAG"); } if (flags.useDistSim) { featuresC.add(n.get(CoreAnnotations.DistSimAnnotation.class) + "-NDISTSIM"); } if (flags.useIsURL) { featuresC.add(n.get(CoreAnnotations.IsURLAnnotation.class) + "-NISURL"); } if (flags.useEntityTypes) { featuresC.add(n.get(CoreAnnotations.EntityTypeAnnotation.class) + "-NENTITYTYPE"); } } /*here, entityTypes refers to the type in the PASCAL IE challenge: * i.e. 
certain words are tagged "Date" or "Location" */ if (flags.useEitherSideWord) { featuresC.add(pWord + "-EW"); featuresC.add(nWord + "-EW"); } if (flags.useWordPairs) { featuresC.add(cWord + '-' + pWord + "-W-PW"); featuresC.add(cWord + '-' + nWord + "-W-NW"); } if (flags.useSymTags) { if (flags.useTags) { featuresC.add(p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + n.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-PCNTAGS"); featuresC.add(c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + n.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-CNTAGS"); featuresC.add(p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-PCTAGS"); } if (flags.useDistSim) { featuresC.add(p.get(CoreAnnotations.DistSimAnnotation.class) + '-' + c.get(CoreAnnotations.DistSimAnnotation.class) + '-' + n.get(CoreAnnotations.DistSimAnnotation.class) + "-PCNDISTSIM"); featuresC.add(c.get(CoreAnnotations.DistSimAnnotation.class) + '-' + n.get(CoreAnnotations.DistSimAnnotation.class) + "-CNDISTSIM"); featuresC.add(p.get(CoreAnnotations.DistSimAnnotation.class) + '-' + c.get(CoreAnnotations.DistSimAnnotation.class) + "-PCDISTSIM"); } } if (flags.useSymWordPairs) { featuresC.add(pWord + '-' + nWord + "-SWORDS"); } String pGazAnnotation = (flags.useGazFeatures || flags.useMoreGazFeatures) ? p.get(CoreAnnotations.GazAnnotation.class) : null; String nGazAnnotation = (flags.useGazFeatures || flags.useMoreGazFeatures) ? n.get(CoreAnnotations.GazAnnotation.class) : null; String cGazAnnotation = (flags.useGazFeatures || flags.useMoreGazFeatures) ? 
c.get(CoreAnnotations.GazAnnotation.class) : null; if (flags.useGazFeatures) { if (cGazAnnotation != null && !cGazAnnotation.equals(flags.dropGaz)) { featuresC.add(cGazAnnotation + "-GAZ"); } // n if (nGazAnnotation != null && !nGazAnnotation.equals(flags.dropGaz)) { featuresC.add(nGazAnnotation + "-NGAZ"); } // p if (pGazAnnotation != null && !pGazAnnotation.equals(flags.dropGaz)) { featuresC.add(pGazAnnotation + "-PGAZ"); } } if (flags.useMoreGazFeatures) { if (cGazAnnotation != null && !cGazAnnotation.equals(flags.dropGaz)) { featuresC.add(cGazAnnotation + '-' + cWord + "-CG-CW-GAZ"); // c-n if (nGazAnnotation != null && !nGazAnnotation.equals(flags.dropGaz)) { featuresC.add(cGazAnnotation + '-' + nGazAnnotation + "-CNGAZ"); } // p-c if (pGazAnnotation != null && !pGazAnnotation.equals(flags.dropGaz)) { featuresC.add(pGazAnnotation + '-' + cGazAnnotation + "-PCGAZ"); } } } if (flags.useAbbr || flags.useMinimalAbbr) { featuresC.add(c.get(CoreAnnotations.AbbrAnnotation.class) + "-ABBR"); } if (flags.useAbbr1 || flags.useMinimalAbbr1) { if (!c.get(CoreAnnotations.AbbrAnnotation.class).equals("XX")) { featuresC.add(c.get(CoreAnnotations.AbbrAnnotation.class) + "-ABBR"); } } if (flags.useAbbr) { featuresC.add(p.get(CoreAnnotations.AbbrAnnotation.class) + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + "-PCABBR"); featuresC.add(c.get(CoreAnnotations.AbbrAnnotation.class) + '-' + n.get(CoreAnnotations.AbbrAnnotation.class) + "-CNABBR"); featuresC.add(p.get(CoreAnnotations.AbbrAnnotation.class) + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + '-' + n.get(CoreAnnotations.AbbrAnnotation.class) + "-PCNABBR"); } if (flags.useAbbr1) { if (!c.get(CoreAnnotations.AbbrAnnotation.class).equals("XX")) { featuresC.add(p.get(CoreAnnotations.AbbrAnnotation.class) + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + "-PCABBR"); featuresC.add(c.get(CoreAnnotations.AbbrAnnotation.class) + '-' + n.get(CoreAnnotations.AbbrAnnotation.class) + "-CNABBR"); 
featuresC.add(p.get(CoreAnnotations.AbbrAnnotation.class) + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + '-' + n.get(CoreAnnotations.AbbrAnnotation.class) + "-PCNABBR"); } } if (flags.useChunks) { featuresC.add(p.get(CoreAnnotations.ChunkAnnotation.class) + '-' + c.get(CoreAnnotations.ChunkAnnotation.class) + "-PCCHUNK"); featuresC.add(c.get(CoreAnnotations.ChunkAnnotation.class) + '-' + n.get(CoreAnnotations.ChunkAnnotation.class) + "-CNCHUNK"); featuresC.add(p.get(CoreAnnotations.ChunkAnnotation.class) + '-' + c.get(CoreAnnotations.ChunkAnnotation.class) + '-' + n.get(CoreAnnotations.ChunkAnnotation.class) + "-PCNCHUNK"); } if (flags.useMinimalAbbr) { featuresC.add(cWord + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + "-CWABB"); } if (flags.useMinimalAbbr1) { if (!c.get(CoreAnnotations.AbbrAnnotation.class).equals("XX")) { featuresC.add(cWord + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + "-CWABB"); } } String prevVB = "", nextVB = ""; if (flags.usePrevVB) { for (int j = loc - 1;; j--) { CoreLabel wi = cInfo.get(j); if (wi == cInfo.getPad()) { prevVB = "X"; featuresC.add("X-PVB"); break; } else if (wi.getString(CoreAnnotations.PartOfSpeechAnnotation.class).startsWith("VB")) { featuresC.add(getWord(wi) + "-PVB"); prevVB = getWord(wi); break; } } } if (flags.useNextVB) { for (int j = loc + 1;; j++) { CoreLabel wi = cInfo.get(j); if (wi == cInfo.getPad()) { featuresC.add("X-NVB"); nextVB = "X"; break; } else if (wi.getString(CoreAnnotations.PartOfSpeechAnnotation.class).startsWith("VB")) { featuresC.add(getWord(wi) + "-NVB"); nextVB = getWord(wi); break; } } } if (flags.useVB) { featuresC.add(prevVB + '-' + nextVB + "-PNVB"); } if (flags.useShapeConjunctions) { featuresC.add(c.get(CoreAnnotations.PositionAnnotation.class) + cShape + "-POS-SH"); if (flags.useTags) { featuresC.add(c.tag() + cShape + "-TAG-SH"); } if (flags.useDistSim) { featuresC.add(c.get(CoreAnnotations.DistSimAnnotation.class) + cShape + "-DISTSIM-SH"); } } if (flags.useWordTag) 
{ featuresC.add(cWord + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-W-T"); featuresC.add(cWord + '-' + p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-W-PT"); featuresC.add(cWord + '-' + n.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-W-NT"); } if (flags.useNPHead) { featuresC.add(c.get(TreeCoreAnnotations.HeadWordAnnotation.class) + "-HW"); if (flags.useTags) { featuresC.add(c.get(TreeCoreAnnotations.HeadWordAnnotation.class) + "-" + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-HW-T"); } if (flags.useDistSim) { featuresC.add(c.get(TreeCoreAnnotations.HeadWordAnnotation.class) + "-" + c.get(CoreAnnotations.DistSimAnnotation.class) + "-HW-DISTSIM"); } } if (flags.useNPGovernor) { featuresC.add(c.get(CoreAnnotations.GovernorAnnotation.class) + "-GW"); if (flags.useTags) { featuresC.add(c.get(CoreAnnotations.GovernorAnnotation.class) + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-GW-T"); } if (flags.useDistSim) { featuresC.add(c.get(CoreAnnotations.GovernorAnnotation.class) + '-' + c.get(CoreAnnotations.DistSimAnnotation.class) + "-DISTSIM-T1"); } } if (flags.useHeadGov) { featuresC.add(c.get(TreeCoreAnnotations.HeadWordAnnotation.class) + "-" + c.get(CoreAnnotations.GovernorAnnotation.class) + "-HW_GW"); } if (flags.useClassFeature) { featuresC.add("###"); } if (flags.useFirstWord) { String firstWord = getWord(cInfo.get(0)); featuresC.add(firstWord); } if (flags.useNGrams) { Collection<String> subs = null; if (flags.cacheNGrams) { subs = wordToSubstrings.get(cWord); } if (subs == null) { subs = new ArrayList<String>(); String word = '<' + cWord + '>'; if (flags.lowercaseNGrams) { word = word.toLowerCase(); } if (flags.dehyphenateNGrams) { word = dehyphenate(word); } if (flags.greekifyNGrams) { word = greekify(word); } // minimum length substring is 2 letters (hardwired) // hoist flags.noMidNGrams so only linear in word length for that case if (flags.noMidNGrams) { int max = 
flags.maxNGramLeng >= 0 ? Math.min(flags.maxNGramLeng, word.length()) : word.length(); for (int j = 2; j <= max; j++) { subs.add(intern('#' + word.substring(0, j) + '#')); } int start = flags.maxNGramLeng >= 0 ? Math.max(0, word.length() - flags.maxNGramLeng) : 0; int lenM1 = word.length() - 1; for (int i = start; i < lenM1; i++) { subs.add(intern('#' + word.substring(i) + '#')); } } else { for (int i = 0; i < word.length(); i++) { for (int j = i + 2, max = Math.min(word.length(), i + flags.maxNGramLeng); j <= max; j++) { if (flags.maxNGramLeng >= 0 && j - i > flags.maxNGramLeng) { continue; } subs.add(intern('#' + word.substring(i, j) + '#')); } } } if (flags.cacheNGrams) { wordToSubstrings.put(cWord, subs); } } featuresC.addAll(subs); if (flags.conjoinShapeNGrams) { for (String str : subs) { String feat = str + '-' + cShape + "-CNGram-CS"; featuresC.add(feat); } } } if (flags.useGazettes) { if (flags.sloppyGazette) { Collection<String> entries = wordToGazetteEntries.get(cWord); if (entries != null) { featuresC.addAll(entries); } } if (flags.cleanGazette) { Collection<GazetteInfo> infos = wordToGazetteInfos.get(cWord); if (infos != null) { for (GazetteInfo gInfo : infos) { boolean ok = true; for (int gLoc = 0; gLoc < gInfo.words.length; gLoc++) { ok &= gInfo.words[gLoc].equals(getWord(cInfo.get(loc + gLoc - gInfo.loc))); } if (ok) { featuresC.add(gInfo.feature); } } } } } if ((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || (flags.useShapeStrings)) { featuresC.add(cShape + "-TYPE"); if (flags.useTypeSeqs) { featuresC.add(pShape + "-PTYPE"); featuresC.add(nShape + "-NTYPE"); featuresC.add(pWord + "..." + cShape + "-PW_CTYPE"); featuresC.add(cShape + "..." + nWord + "-NW_CTYPE"); featuresC.add(pShape + "..." + cShape + "-PCTYPE"); featuresC.add(cShape + "..." + nShape + "-CNTYPE"); featuresC.add(pShape + "..." + cShape + "..." 
+ nShape + "-PCNTYPE"); } } if (flags.useLastRealWord) { if (pWord.length() <= 3) { // extending this to check for 2 short words doesn't seem to help.... featuresC.add(getWord(p2) + "..." + cShape + "-PPW_CTYPE"); } } if (flags.useNextRealWord) { if (nWord.length() <= 3) { // extending this to check for 2 short words doesn't seem to help.... featuresC.add(getWord(n2) + "..." + cShape + "-NNW_CTYPE"); } } if (flags.useOccurrencePatterns) { featuresC.addAll(occurrencePatterns(cInfo, loc)); } if (flags.useDisjunctive) { for (int i = 1; i <= flags.disjunctionWidth; i++) { CoreLabel dn = cInfo.get(loc + i); CoreLabel dp = cInfo.get(loc - i); featuresC.add(getWord(dn) + "-DISJN"); if (flags.useDisjunctiveShapeInteraction) { featuresC.add(getWord(dn) + '-' + cShape + "-DISJN-CS"); } featuresC.add(getWord(dp) + "-DISJP"); if (flags.useDisjunctiveShapeInteraction) { featuresC.add(getWord(dp) + '-' + cShape + "-DISJP-CS"); } } } if (flags.useWideDisjunctive) { for (int i = 1; i <= flags.wideDisjunctionWidth; i++) { featuresC.add(getWord(cInfo.get(loc + i)) + "-DISJWN"); featuresC.add(getWord(cInfo.get(loc - i)) + "-DISJWP"); } } if (flags.useEitherSideDisjunctive) { for (int i = 1; i <= flags.disjunctionWidth; i++) { featuresC.add(getWord(cInfo.get(loc + i)) + "-DISJWE"); featuresC.add(getWord(cInfo.get(loc - i)) + "-DISJWE"); } } if (flags.useDisjShape) { for (int i = 1; i <= flags.disjunctionWidth; i++) { featuresC.add(cInfo.get(loc + i).get(CoreAnnotations.ShapeAnnotation.class) + "-NDISJSHAPE"); // featuresC.add(cInfo.get(loc - i).get(CoreAnnotations.ShapeAnnotation.class) + "-PDISJSHAPE"); featuresC.add(cShape + '-' + cInfo.get(loc + i).get(CoreAnnotations.ShapeAnnotation.class) + "-CNDISJSHAPE"); // featuresC.add(c.get(CoreAnnotations.ShapeAnnotation.class) + "-" + cInfo.get(loc - i).get(CoreAnnotations.ShapeAnnotation.class) + "-CPDISJSHAPE"); } } if (flags.useExtraTaggySequences) { if (flags.useTags) { 
featuresC.add(p2.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TTS"); featuresC.add(p3.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + p2.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TTTS"); } if (flags.useDistSim) { featuresC.add(p2.get(CoreAnnotations.DistSimAnnotation.class) + '-' + p.get(CoreAnnotations.DistSimAnnotation.class) + '-' + c.get(CoreAnnotations.DistSimAnnotation.class) + "-DISTSIM_TTS1"); featuresC.add(p3.get(CoreAnnotations.DistSimAnnotation.class) + '-' + p2.get(CoreAnnotations.DistSimAnnotation.class) + '-' + p.get(CoreAnnotations.DistSimAnnotation.class) + '-' + c.get(CoreAnnotations.DistSimAnnotation.class) + "-DISTSIM_TTTS1"); } } if (flags.useMUCFeatures) { featuresC.add(c.get(CoreAnnotations.SectionAnnotation.class) + "-SECTION"); featuresC.add(c.get(CoreAnnotations.WordPositionAnnotation.class) + "-WORD_POSITION"); featuresC.add(c.get(CoreAnnotations.SentencePositionAnnotation.class) + "-SENT_POSITION"); featuresC.add(c.get(CoreAnnotations.ParaPositionAnnotation.class) + "-PARA_POSITION"); featuresC.add(c.get(CoreAnnotations.WordPositionAnnotation.class) + '-' + c.get(CoreAnnotations.ShapeAnnotation.class) + "-WORD_POSITION_SHAPE"); } } else if (flags.useInternal) { if (flags.useWord) { featuresC.add(cWord + "-WORD"); } if (flags.useNGrams) { Collection<String> subs = wordToSubstrings.get(cWord); if (subs == null) { subs = new ArrayList<String>(); String word = '<' + cWord + '>'; if (flags.lowercaseNGrams) { word = word.toLowerCase(); } if (flags.dehyphenateNGrams) { word = dehyphenate(word); } if (flags.greekifyNGrams) { word = greekify(word); } for (int i = 0; i < word.length(); i++) { for (int j = i + 2; j <= word.length(); j++) { if 
(flags.noMidNGrams && i != 0 && j != word.length()) { continue; } if (flags.maxNGramLeng >= 0 && j - i > flags.maxNGramLeng) { continue; } //subs.add(intern("#" + word.substring(i, j) + "#")); subs.add(intern('#' + word.substring(i, j) + '#')); } } if (flags.cacheNGrams) { wordToSubstrings.put(cWord, subs); } } featuresC.addAll(subs); if (flags.conjoinShapeNGrams) { String shape = c.get(CoreAnnotations.ShapeAnnotation.class); for (String str : subs) { String feat = str + '-' + shape + "-CNGram-CS"; featuresC.add(feat); } } } if ((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || (flags.useShapeStrings)) { featuresC.add(cShape + "-TYPE"); } if (flags.useOccurrencePatterns) { featuresC.addAll(occurrencePatterns(cInfo, loc)); } } else if (flags.useExternal) { if (flags.usePrev) { featuresC.add(pWord + "-PW"); } if (flags.useNext) { featuresC.add(nWord + "-NW"); } if (flags.useWordPairs) { featuresC.add(cWord + '-' + pWord + "-W-PW"); featuresC.add(cWord + '-' + nWord + "-W-NW"); } if (flags.useSymWordPairs) { featuresC.add(pWord + '-' + nWord + "-SWORDS"); } if ((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || (flags.useShapeStrings)) { if (flags.useTypeSeqs) { featuresC.add(pShape + "-PTYPE"); featuresC.add(nShape + "-NTYPE"); featuresC.add(pWord + "..." + cShape + "-PW_CTYPE"); featuresC.add(cShape + "..." + nWord + "-NW_CTYPE"); if (flags.maxLeft > 0) featuresC.add(pShape + "..." + cShape + "-PCTYPE"); // this one just isn't useful, at least given c,pc,s,ps. Might be useful 0th-order featuresC.add(cShape + "..." + nShape + "-CNTYPE"); featuresC.add(pShape + "..." + cShape + "..." + nShape + "-PCNTYPE"); } } if (flags.useLastRealWord) { if (pWord.length() <= 3) { featuresC.add(getWord(p2) + "..." + cShape + "-PPW_CTYPE"); } } if (flags.useNextRealWord) { if (nWord.length() <= 3) { featuresC.add(getWord(n2) + "..." 
+ cShape + "-NNW_CTYPE"); } } if (flags.useDisjunctive) { for (int i = 1; i <= flags.disjunctionWidth; i++) { CoreLabel dn = cInfo.get(loc + i); CoreLabel dp = cInfo.get(loc - i); featuresC.add(getWord(dn) + "-DISJN"); if (flags.useDisjunctiveShapeInteraction) { featuresC.add(getWord(dn) + '-' + cShape + "-DISJN-CS"); } featuresC.add(getWord(dp) + "-DISJP"); if (flags.useDisjunctiveShapeInteraction) { featuresC.add(getWord(dp) + '-' + cShape + "-DISJP-CS"); } } } if (flags.useWideDisjunctive) { for (int i = 1; i <= flags.wideDisjunctionWidth; i++) { featuresC.add(getWord(cInfo.get(loc + i)) + "-DISJWN"); featuresC.add(getWord(cInfo.get(loc - i)) + "-DISJWP"); } } if (flags.useDisjShape) { for (int i = 1; i <= flags.disjunctionWidth; i++) { featuresC.add(cInfo.get(loc + i).get(CoreAnnotations.ShapeAnnotation.class) + "-NDISJSHAPE"); // featuresC.add(cInfo.get(loc - i).get(CoreAnnotations.ShapeAnnotation.class) + "-PDISJSHAPE"); featuresC.add(c.get(CoreAnnotations.ShapeAnnotation.class) + '-' + cInfo.get(loc + i).get(CoreAnnotations.ShapeAnnotation.class) + "-CNDISJSHAPE"); // featuresC.add(c.get(CoreAnnotations.ShapeAnnotation.class) + "-" + cInfo.get(loc - i).get(CoreAnnotations.ShapeAnnotation.class) + "-CPDISJSHAPE"); } } } // Stuff to add binary features from the additional columns if (flags.twoStage) { featuresC.add(c.get(Bin1Annotation.class) + "-BIN1"); featuresC.add(c.get(Bin2Annotation.class) + "-BIN2"); featuresC.add(c.get(Bin3Annotation.class) + "-BIN3"); featuresC.add(c.get(Bin4Annotation.class) + "-BIN4"); featuresC.add(c.get(Bin5Annotation.class) + "-BIN5"); featuresC.add(c.get(Bin6Annotation.class) + "-BIN6"); } if (flags.useIfInteger) { try { int val = Integer.parseInt(cWord); if (val > 0) featuresC.add("POSITIVE_INTEGER"); else if (val < 0) featuresC.add("NEGATIVE_INTEGER"); // System.err.println("FOUND INTEGER"); } catch (NumberFormatException e) { // not an integer value, nothing to do } } //Stuff to add arbitrary features if 
(flags.useGenericFeatures) { //see if we need to cache the keys if (genericAnnotationKeys == null) { makeGenericKeyCache(c); } //now look through the cached keys for (Class key : genericAnnotationKeys) { //System.err.println("Adding feature: " + CoreLabel.genericValues.get(key) + " with value " + c.get(key)); if (c.get(key) != null && c.get(key) instanceof Collection) { for (Object ob : (Collection) c.get(key)) { featuresC.add(ob + "-" + CoreLabel.genericValues.get(key)); } } else { featuresC.add(c.get(key) + "-" + CoreLabel.genericValues.get(key)); } } } if (flags.useTopics) { //featuresC.add(p.get(CoreAnnotations.TopicAnnotation.class) + '-' + cWord + "--CWORD"); featuresC.add(c.get(CoreAnnotations.TopicAnnotation.class) + "-TopicID"); featuresC.add(p.get(CoreAnnotations.TopicAnnotation.class) + "-PTopicID"); featuresC.add(n.get(CoreAnnotations.TopicAnnotation.class) + "-NTopicID"); //featuresC.add(p.get(CoreAnnotations.TopicAnnotation.class) + '-' + c.get(CoreAnnotations.TopicAnnotation.class) + '-' + n.get(CoreAnnotations.TopicAnnotation.class) + "-PCNTopicID"); //featuresC.add(c.get(CoreAnnotations.TopicAnnotation.class) + '-' + n.get(CoreAnnotations.TopicAnnotation.class) + "-CNTopicID"); //featuresC.add(p.get(CoreAnnotations.TopicAnnotation.class) + '-' + c.get(CoreAnnotations.TopicAnnotation.class) + "-PCTopicID"); //featuresC.add(c.get(CoreAnnotations.TopicAnnotation.class) + cShape + "-TopicID-SH"); //asdasd } // NER tag annotations from a previous NER system if (c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) != null) { featuresC.add(c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + "-CStackedNERTag"); featuresC.add(cWord + "-" + c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + "-WCStackedNERTag"); if (flags.useNext) { featuresC.add(c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + '-' + n.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + "-CNStackedNERTag"); featuresC.add(cWord + "-" + 
c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + '-' + n.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + "-WCNStackedNERTag"); if (flags.usePrev) { featuresC.add(p.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + '-' + c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + '-' + n.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + "-PCNStackedNERTag"); featuresC.add(p.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + '-' + cWord + " -" + c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + '-' + n.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + "-PWCNStackedNERTag"); } } if (flags.usePrev) { featuresC.add(p.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + '-' + c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + "-PCStackedNERTag"); } } if (flags.useWordnetFeatures) featuresC.add(c.get(CoreAnnotations.WordnetSynAnnotation.class) + "-WordnetSyn"); if (flags.useProtoFeatures) featuresC.add(c.get(CoreAnnotations.ProtoAnnotation.class) + "-Proto"); if (flags.usePhraseWordTags) featuresC.add(c.get(CoreAnnotations.PhraseWordsTagAnnotation.class) + "-PhraseTag"); if (flags.usePhraseWords) { for (String w : c.get(CoreAnnotations.PhraseWordsAnnotation.class)) featuresC.add(w + "-PhraseWord"); } if (flags.useCommonWordsFeature) featuresC.add(c.get(CoreAnnotations.CommonWordsAnnotation.class)); if (flags.useRadical && cWord.length() > 0) { if (cWord.length() == 1) { featuresC.add(RadicalMap.getRadical(cWord.charAt(0)) + "-SINGLE-CHAR-RADICAL"); } else { featuresC.add(RadicalMap.getRadical(cWord.charAt(0)) + "-START-RADICAL"); featuresC.add(RadicalMap.getRadical(cWord.charAt(cWord.length() - 1)) + "-END-RADICAL"); } for (int i = 0; i < cWord.length(); ++i) { featuresC.add(RadicalMap.getRadical(cWord.charAt(i)) + "-RADICAL"); } } if (flags.splitWordRegex != null && !flags.splitWordRegex.isEmpty()) { String[] ws = c.word().split(flags.splitWordRegex); for (String s : 
ws) { featuresC.add(s + "-SPLITWORD"); } } return featuresC; }
From source file:de.iisys.ocr.pos.CustomNERFeatureFactory.java
License:Open Source License
protected Collection<String> featuresCpC(PaddedList<IN> cInfo, int loc) { CoreLabel p = cInfo.get(loc - 1);//from w w w . ja v a 2 s. com CoreLabel c = cInfo.get(loc); CoreLabel n = cInfo.get(loc + 1); String cWord = getWord(c); String pWord = getWord(p); String cDS = c.getString(CoreAnnotations.DistSimAnnotation.class); String pDS = p.getString(CoreAnnotations.DistSimAnnotation.class); String cShape = c.getString(CoreAnnotations.ShapeAnnotation.class); String pShape = p.getString(CoreAnnotations.ShapeAnnotation.class); Collection<String> featuresCpC = new ArrayList<String>(); if (flags.noEdgeFeature) return featuresCpC; if (flags.transitionEdgeOnly) { featuresCpC.add("PSEQ"); return featuresCpC; } if (flags.useNeighborNGrams) { int maxLen = pWord.length(); if (flags.maxNGramLeng >= 0 && flags.maxNGramLeng < maxLen) { maxLen = flags.maxNGramLeng; } for (int len = 1; len <= maxLen; ++len) { featuresCpC.add(pWord.substring(0, len) + "-PREVIOUS-PREFIX"); } for (int pos = pWord.length() - maxLen; pos < pWord.length(); ++pos) { featuresCpC.add(pWord.substring(pos, pWord.length()) + "-PREVIOUS-SUFFIX"); } maxLen = cWord.length(); if (flags.maxNGramLeng >= 0 && flags.maxNGramLeng < maxLen) { maxLen = flags.maxNGramLeng; } for (int len = 1; len <= maxLen; ++len) { featuresCpC.add(cWord.substring(0, len) + "-CURRENT-PREFIX"); } for (int pos = cWord.length() - maxLen; pos < cWord.length(); ++pos) { featuresCpC.add(cWord.substring(pos, cWord.length()) + "-CURRENT-SUFFIX"); } } if (flags.useInternal && flags.useExternal) { if (flags.useOrdinal) { if (isOrdinal(cInfo, loc)) { featuresCpC.add("C_ORDINAL"); if (isOrdinal(cInfo, loc - 1)) { featuresCpC.add("PC_ORDINAL"); } } if (isOrdinal(cInfo, loc - 1)) { featuresCpC.add("P_ORDINAL"); } } if (flags.useAbbr || flags.useMinimalAbbr) { featuresCpC.add(p.get(CoreAnnotations.AbbrAnnotation.class) + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + "-PABBRANS"); } if (flags.useAbbr1 || flags.useMinimalAbbr1) { if 
(!c.get(CoreAnnotations.AbbrAnnotation.class).equals("XX")) { featuresCpC.add(p.get(CoreAnnotations.AbbrAnnotation.class) + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + "-PABBRANS"); } } if (flags.useChunkySequences) { featuresCpC.add(p.get(CoreAnnotations.ChunkAnnotation.class) + '-' + c.get(CoreAnnotations.ChunkAnnotation.class) + '-' + n.get(CoreAnnotations.ChunkAnnotation.class) + "-PCNCHUNK"); } if (flags.usePrev) { if (flags.useSequences && flags.usePrevSequences) { featuresCpC.add("PSEQ"); featuresCpC.add(cWord + "-PSEQW"); /*if ( ! flags.strictGoodCoNLL) { featuresCpC.add(pWord+ '-' +cWord + "-PSEQW2"); // added later after goodCoNLL featuresCpC.add(pWord + "-PSEQpW"); // added later after goodCoNLL } if (flags.useDistSim) { featuresCpC.add(pDS + "-PSEQpDS"); featuresCpC.add(cDS + "-PSEQcDS"); featuresCpC.add(pDS+ '-' +cDS + "-PSEQpcDS"); } if (((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || flags.useShapeStrings)) { if ( ! flags.strictGoodCoNLL) { // These ones were added later after goodCoNLL featuresCpC.add(pShape + "-PSEQpS"); featuresCpC.add(cShape + "-PSEQcS"); } if (flags.strictGoodCoNLL && ! 
flags.removeStrictGoodCoNLLDuplicates) { featuresCpC.add(pShape + '-' + cShape + "-PSEQpcS"); // Duplicate (in goodCoNLL orig, see -TYPES below) } }*/ } } if (((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || flags.useShapeStrings) && flags.useTypeSeqs && (flags.useTypeSeqs2 || flags.useTypeSeqs3)) { if (flags.useTypeSeqs3) { featuresCpC.add(pShape + '-' + cShape + '-' + n.get(CoreAnnotations.ShapeAnnotation.class) + "-PCNSHAPES"); } if (flags.useTypeSeqs2) { featuresCpC.add(pShape + '-' + cShape + "-TYPES"); // this duplicates PSEQpcS above } if (flags.useYetMoreCpCShapes) { String p2Shape = cInfo.get(loc - 2).getString(CoreAnnotations.ShapeAnnotation.class); featuresCpC.add(p2Shape + '-' + pShape + '-' + cShape + "-YMS"); featuresCpC.add(pShape + '-' + cShape + "-" + n.getString(CoreAnnotations.ShapeAnnotation.class) + "-YMSPCN"); } } if (flags.useTypeySequences) { featuresCpC.add(cShape + "-TPS2"); featuresCpC.add(n.get(CoreAnnotations.ShapeAnnotation.class) + "-TNS1"); // featuresCpC.add(pShape) + "-" + cShape) + "-TPS"); // duplicates -TYPES, so now omitted; you may need to slightly increase sigma to duplicate previous results, however. 
} if (flags.useTaggySequences) { if (flags.useTags) { featuresCpC.add(p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TS"); } if (flags.useDistSim) { featuresCpC.add(p.get(CoreAnnotations.DistSimAnnotation.class) + '-' + c.get(CoreAnnotations.DistSimAnnotation.class) + "-DISTSIM_TS1"); } } if (flags.useParenMatching) { if (flags.useReverse) { if (cWord.equals("(") || cWord.equals("[") || cWord.equals("-LRB-")) { if (pWord.equals(")") || pWord.equals("]") || pWord.equals("-RRB-")) { featuresCpC.add("PAREN-MATCH"); } } } else { if (cWord.equals(")") || cWord.equals("]") || cWord.equals("-RRB-")) { if (pWord.equals("(") || pWord.equals("[") || pWord.equals("-LRB-")) { featuresCpC.add("PAREN-MATCH"); } } } } if (flags.useEntityTypeSequences) { featuresCpC.add(p.get(CoreAnnotations.EntityTypeAnnotation.class) + '-' + c.get(CoreAnnotations.EntityTypeAnnotation.class) + "-ETSEQ"); } if (flags.useURLSequences) { featuresCpC.add(p.get(CoreAnnotations.IsURLAnnotation.class) + '-' + c.get(CoreAnnotations.IsURLAnnotation.class) + "-URLSEQ"); } } else if (flags.useInternal) { if (flags.useSequences && flags.usePrevSequences) { featuresCpC.add("PSEQ"); featuresCpC.add(cWord + "-PSEQW"); } if (flags.useTypeySequences) { featuresCpC.add(cShape + "-TPS2"); } } else if (flags.useExternal) { if (((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || flags.useShapeStrings) && flags.useTypeSeqs && (flags.useTypeSeqs2 || flags.useTypeSeqs3)) { if (flags.useTypeSeqs3) { featuresCpC.add(pShape + '-' + cShape + '-' + n.get(CoreAnnotations.ShapeAnnotation.class) + "-PCNSHAPES"); } if (flags.useTypeSeqs2) { featuresCpC.add(pShape + '-' + cShape + "-TYPES"); } } if (flags.useTypeySequences) { featuresCpC.add(n.get(CoreAnnotations.ShapeAnnotation.class) + "-TNS1"); featuresCpC.add(pShape + '-' + c.get(CoreAnnotations.ShapeAnnotation.class) + "-TPS"); } } return featuresCpC; }
From source file:de.iisys.ocr.pos.CustomNERFeatureFactory.java
License:Open Source License
protected Collection<String> featuresCpCp2C(PaddedList<IN> cInfo, int loc) { CoreLabel c = cInfo.get(loc);/*from ww w . ja v a 2 s .c o m*/ CoreLabel p = cInfo.get(loc - 1); CoreLabel p2 = cInfo.get(loc - 2); String pWord = getWord(p); // String p2Word = getWord(p2); Collection<String> featuresCpCp2C = new ArrayList<String>(); if (flags.useInternal && flags.useExternal) { /*if (flags.strictGoodCoNLL && ! flags.removeStrictGoodCoNLLDuplicates && flags.useTypeySequences && flags.maxLeft >= 2) { // this feature duplicates -TYPETYPES below, so probably don't include it, but it was in original tests of CMM goodCoNLL featuresCpCp2C.add(p2.get(CoreAnnotations.ShapeAnnotation.class) + '-' + p.get(CoreAnnotations.ShapeAnnotation.class) + '-' + c.get(CoreAnnotations.ShapeAnnotation.class) + "-TTPS"); }*/ if (flags.useAbbr) { featuresCpCp2C.add(p2.get(CoreAnnotations.AbbrAnnotation.class) + '-' + p.get(CoreAnnotations.AbbrAnnotation.class) + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + "-2PABBRANS"); } if (flags.useChunks) { featuresCpCp2C.add(p2.get(CoreAnnotations.ChunkAnnotation.class) + '-' + p.get(CoreAnnotations.ChunkAnnotation.class) + '-' + c.get(CoreAnnotations.ChunkAnnotation.class) + "-2PCHUNKS"); } if (flags.useLongSequences) { featuresCpCp2C.add("PPSEQ"); } if (flags.useBoundarySequences && pWord.equals(CoNLLDocumentReaderAndWriter.BOUNDARY)) { featuresCpCp2C.add("BNDRY-SPAN-PPSEQ"); } // This more complex consistency checker didn't help! 
// if (flags.useBoundarySequences) { // // try enforce consistency over "and" and "," as well as boundary // if (pWord.equals(CoNLLDocumentIteratorFactory.BOUNDARY) || // pWord.equalsIgnoreCase("and") || pWord.equalsIgnoreCase("or") || // pWord.equals(",")) { // } // } if (flags.useTaggySequences) { if (flags.useTags) { featuresCpCp2C.add(p2.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TTS"); if (flags.useTaggySequencesShapeInteraction) { featuresCpCp2C.add(p2.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + c.get(CoreAnnotations.ShapeAnnotation.class) + "-TTS-CS"); } } if (flags.useDistSim) { featuresCpCp2C.add(p2.get(CoreAnnotations.DistSimAnnotation.class) + '-' + p.get(CoreAnnotations.DistSimAnnotation.class) + '-' + c.get(CoreAnnotations.DistSimAnnotation.class) + "-DISTSIM_TTS1"); if (flags.useTaggySequencesShapeInteraction) { featuresCpCp2C.add(p2.get(CoreAnnotations.DistSimAnnotation.class) + '-' + p.get(CoreAnnotations.DistSimAnnotation.class) + '-' + c.get(CoreAnnotations.DistSimAnnotation.class) + '-' + c.get(CoreAnnotations.ShapeAnnotation.class) + "-DISTSIM_TTS1-CS"); } } } if (((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || flags.useShapeStrings) && flags.useTypeSeqs && flags.useTypeSeqs2 && flags.maxLeft >= 2) { String cShape = c.get(CoreAnnotations.ShapeAnnotation.class); String pShape = p.get(CoreAnnotations.ShapeAnnotation.class); String p2Shape = p2.get(CoreAnnotations.ShapeAnnotation.class); featuresCpCp2C.add(p2Shape + '-' + pShape + '-' + cShape + "-TYPETYPES"); } } else if (flags.useInternal) { if (flags.useLongSequences) { featuresCpCp2C.add("PPSEQ"); } } else if (flags.useExternal) { if (flags.useLongSequences) { featuresCpCp2C.add("PPSEQ"); } 
if (((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || flags.useShapeStrings) && flags.useTypeSeqs && flags.useTypeSeqs2 && flags.maxLeft >= 2) { String cShape = c.get(CoreAnnotations.ShapeAnnotation.class); String pShape = p.get(CoreAnnotations.ShapeAnnotation.class); String p2Shape = p2.get(CoreAnnotations.ShapeAnnotation.class); featuresCpCp2C.add(p2Shape + '-' + pShape + '-' + cShape + "-TYPETYPES"); } } return featuresCpCp2C; }