Usage examples for the String field edu.stanford.nlp.sequences.CoNLLDocumentReaderAndWriter.BOUNDARY
String BOUNDARY
To view the source code for edu.stanford.nlp.sequences.CoNLLDocumentReaderAndWriter.BOUNDARY, click the Source Link.
From source file:de.iisys.ocr.pos.CustomNERFeatureFactory.java
License:Open Source License
protected Collection<String> featuresCpCp2C(PaddedList<IN> cInfo, int loc) { CoreLabel c = cInfo.get(loc);// w w w . j av a 2 s . co m CoreLabel p = cInfo.get(loc - 1); CoreLabel p2 = cInfo.get(loc - 2); String pWord = getWord(p); // String p2Word = getWord(p2); Collection<String> featuresCpCp2C = new ArrayList<String>(); if (flags.useInternal && flags.useExternal) { /*if (flags.strictGoodCoNLL && ! flags.removeStrictGoodCoNLLDuplicates && flags.useTypeySequences && flags.maxLeft >= 2) { // this feature duplicates -TYPETYPES below, so probably don't include it, but it was in original tests of CMM goodCoNLL featuresCpCp2C.add(p2.get(CoreAnnotations.ShapeAnnotation.class) + '-' + p.get(CoreAnnotations.ShapeAnnotation.class) + '-' + c.get(CoreAnnotations.ShapeAnnotation.class) + "-TTPS"); }*/ if (flags.useAbbr) { featuresCpCp2C.add(p2.get(CoreAnnotations.AbbrAnnotation.class) + '-' + p.get(CoreAnnotations.AbbrAnnotation.class) + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + "-2PABBRANS"); } if (flags.useChunks) { featuresCpCp2C.add(p2.get(CoreAnnotations.ChunkAnnotation.class) + '-' + p.get(CoreAnnotations.ChunkAnnotation.class) + '-' + c.get(CoreAnnotations.ChunkAnnotation.class) + "-2PCHUNKS"); } if (flags.useLongSequences) { featuresCpCp2C.add("PPSEQ"); } if (flags.useBoundarySequences && pWord.equals(CoNLLDocumentReaderAndWriter.BOUNDARY)) { featuresCpCp2C.add("BNDRY-SPAN-PPSEQ"); } // This more complex consistency checker didn't help! 
// if (flags.useBoundarySequences) { // // try enforce consistency over "and" and "," as well as boundary // if (pWord.equals(CoNLLDocumentIteratorFactory.BOUNDARY) || // pWord.equalsIgnoreCase("and") || pWord.equalsIgnoreCase("or") || // pWord.equals(",")) { // } // } if (flags.useTaggySequences) { if (flags.useTags) { featuresCpCp2C.add(p2.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TTS"); if (flags.useTaggySequencesShapeInteraction) { featuresCpCp2C.add(p2.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + c.get(CoreAnnotations.ShapeAnnotation.class) + "-TTS-CS"); } } if (flags.useDistSim) { featuresCpCp2C.add(p2.get(CoreAnnotations.DistSimAnnotation.class) + '-' + p.get(CoreAnnotations.DistSimAnnotation.class) + '-' + c.get(CoreAnnotations.DistSimAnnotation.class) + "-DISTSIM_TTS1"); if (flags.useTaggySequencesShapeInteraction) { featuresCpCp2C.add(p2.get(CoreAnnotations.DistSimAnnotation.class) + '-' + p.get(CoreAnnotations.DistSimAnnotation.class) + '-' + c.get(CoreAnnotations.DistSimAnnotation.class) + '-' + c.get(CoreAnnotations.ShapeAnnotation.class) + "-DISTSIM_TTS1-CS"); } } } if (((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || flags.useShapeStrings) && flags.useTypeSeqs && flags.useTypeSeqs2 && flags.maxLeft >= 2) { String cShape = c.get(CoreAnnotations.ShapeAnnotation.class); String pShape = p.get(CoreAnnotations.ShapeAnnotation.class); String p2Shape = p2.get(CoreAnnotations.ShapeAnnotation.class); featuresCpCp2C.add(p2Shape + '-' + pShape + '-' + cShape + "-TYPETYPES"); } } else if (flags.useInternal) { if (flags.useLongSequences) { featuresCpCp2C.add("PPSEQ"); } } else if (flags.useExternal) { if (flags.useLongSequences) { featuresCpCp2C.add("PPSEQ"); } 
if (((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || flags.useShapeStrings) && flags.useTypeSeqs && flags.useTypeSeqs2 && flags.maxLeft >= 2) { String cShape = c.get(CoreAnnotations.ShapeAnnotation.class); String pShape = p.get(CoreAnnotations.ShapeAnnotation.class); String p2Shape = p2.get(CoreAnnotations.ShapeAnnotation.class); featuresCpCp2C.add(p2Shape + '-' + pShape + '-' + cShape + "-TYPETYPES"); } } return featuresCpCp2C; }
From source file:de.iisys.ocr.pos.CustomNERFeatureFactory.java
License:Open Source License
/**
 * Collects the features that combine the current token with the three tokens
 * immediately before it.
 *
 * <p>Every feature produced here requires {@code flags.maxLeft >= 3}; with a
 * smaller value the returned collection is empty.
 *
 * @param cInfo padded token sequence being featurized
 * @param loc   index of the current token in {@code cInfo}
 * @return the feature strings generated for this position (possibly empty)
 */
protected Collection<String> featuresCpCp2Cp3C(PaddedList<IN> cInfo, int loc) {
    final CoreLabel cur = cInfo.get(loc);
    final CoreLabel prev = cInfo.get(loc - 1);
    final CoreLabel prev2 = cInfo.get(loc - 2);
    final CoreLabel prev3 = cInfo.get(loc - 3);

    final Collection<String> feats = new ArrayList<String>();

    if (flags.maxLeft < 3) {
        return feats;
    }

    // POS and DistSim 4-grams share the same gating, apart from their own enable flag.
    if (flags.useTaggySequences && !flags.dontExtendTaggy) {
        final boolean withShape = flags.useTaggySequencesShapeInteraction;

        if (flags.useTags) {
            final String posFourGram =
                    prev3.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-'
                    + prev2.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-'
                    + prev.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-'
                    + cur.getString(CoreAnnotations.PartOfSpeechAnnotation.class);
            feats.add(posFourGram + "-TTTS");
            if (withShape) {
                feats.add(posFourGram + '-'
                        + cur.get(CoreAnnotations.ShapeAnnotation.class) + "-TTTS-CS");
            }
        }

        if (flags.useDistSim) {
            final String distSimFourGram =
                    prev3.get(CoreAnnotations.DistSimAnnotation.class) + '-'
                    + prev2.get(CoreAnnotations.DistSimAnnotation.class) + '-'
                    + prev.get(CoreAnnotations.DistSimAnnotation.class) + '-'
                    + cur.get(CoreAnnotations.DistSimAnnotation.class);
            feats.add(distSimFourGram + "-DISTSIM_TTTS1");
            if (withShape) {
                feats.add(distSimFourGram + '-'
                        + cur.get(CoreAnnotations.ShapeAnnotation.class) + "-DISTSIM_TTTS1-CS");
            }
        }
    }

    if (flags.useLongSequences) {
        feats.add("PPPSEQ");
    }

    // Fires when the previous token's word equals the CoNLL BOUNDARY constant.
    if (flags.useBoundarySequences
            && getWord(prev).equals(CoNLLDocumentReaderAndWriter.BOUNDARY)) {
        feats.add("BNDRY-SPAN-PPPSEQ");
    }

    return feats;
}
From source file:de.iisys.ocr.pos.CustomNERFeatureFactory.java
License:Open Source License
protected Collection<String> featuresCpCp2Cp3Cp4C(PaddedList<IN> cInfo, int loc) { Collection<String> featuresCpCp2Cp3Cp4C = new ArrayList<String>(); CoreLabel p = cInfo.get(loc - 1);//w ww . j a va 2 s. co m if (flags.maxLeft >= 4) { if (flags.useLongSequences) { featuresCpCp2Cp3Cp4C.add("PPPPSEQ"); } if (flags.useBoundarySequences && getWord(p).equals(CoNLLDocumentReaderAndWriter.BOUNDARY)) { featuresCpCp2Cp3Cp4C.add("BNDRY-SPAN-PPPPSEQ"); } } return featuresCpCp2Cp3Cp4C; }