Example usage of the tag() method of edu.stanford.nlp.ling.CoreLabel

Introduction

This page collects example usages of the tag() method of edu.stanford.nlp.ling.CoreLabel.

Prototype

@Override
public String tag()

Source Link

Usage

From source file:SentencePair.java

License:Open Source License

/**
 * Runs the {@code nlp} pipeline over {@code text} and appends one
 * {@link POSTaggedToken} per token to {@code sentence}, in document order.
 *
 * @param text     raw text to annotate
 * @param sentence output list that receives the tagged tokens; it is appended to, not cleared
 */
private void createSentence(String text, List<POSTaggedToken> sentence) {
    Annotation annotated = new Annotation(text);
    nlp.annotate(annotated);

    List<CoreMap> parsedSentences = annotated.get(CoreAnnotations.SentencesAnnotation.class);
    for (CoreMap parsed : parsedSentences) {
        List<CoreLabel> tokens = parsed.get(CoreAnnotations.TokensAnnotation.class);
        for (CoreLabel token : tokens) {
            // The token's toString() form is stored as the surface text; its tag goes through translateTag.
            String surface = token.toString();
            sentence.add(new POSTaggedToken(surface, translateTag(token.tag())));
        }
    }
}

From source file:conditionalCFG.ConditionalCFGParser.java

License:Open Source License

/**
 * Reconstructs a tree for state {@code goal} over the span {@code [start, end)} by
 * searching for a rule application whose score reproduces the stored inside score
 * {@code iScore[start][end][goal]}.
 *
 * <p>Candidates are tried in this order: a tag (pre-terminal) node over the span,
 * then binary rules at every split point, then unary rules. The first candidate whose
 * score passes {@code matches(...)} against the stored score is returned; its children
 * are built by recursive calls.
 *
 * @param goal  index of the state to build (resolved to a name through {@code stateIndex})
 * @param start start of the span, inclusive
 * @param end   end of the span, exclusive
 * @return the reconstructed tree, or {@code null} (after a warning on stderr) when no
 *         candidate reproduces the stored score
 * @throws RuntimeException if a multi-word tag node is required but both {@code sentence}
 *         and {@code lr} are null, or no lattice edge reproduces the stored score
 */
private Tree extractBestParse(int goal, int start, int end) {
    // find source of inside score
    // no backtraces so we can speed up the parsing for its primary use
    double bestScore = iScore[start][end][goal];
    // With length normalization enabled, comparisons below use the score divided by wordsInSpan.
    double normBestScore = op.testOptions.lengthNormalization ? (bestScore / wordsInSpan[start][end][goal])
            : bestScore;
    String goalStr = stateIndex.get(goal);

    // check tags
    if (end - start <= op.testOptions.maxSpanForTags && tagIndex.contains(goalStr)) {
        if (op.testOptions.maxSpanForTags > 1) {
            // Tags may cover more than one word: the leaf comes either from the sentence
            // (span words concatenated with no separator) or from a lattice edge.
            Tree wordNode = null;
            if (sentence != null) {
                StringBuilder word = new StringBuilder();
                for (int i = start; i < end; i++) {
                    if (sentence.get(i) instanceof HasWord) {
                        HasWord cl = (HasWord) sentence.get(i);
                        word.append(cl.word());
                    } else {
                        word.append(sentence.get(i).toString());
                    }
                }
                wordNode = tf.newLeaf(word.toString());

            } else if (lr != null) {
                // Use the first lattice edge over the span whose tag score plus edge weight
                // reproduces bestScore.
                List<LatticeEdge> latticeEdges = lr.getEdgesOverSpan(start, end);
                for (LatticeEdge edge : latticeEdges) {
                    IntTaggedWord itw = new IntTaggedWord(edge.word, stateIndex.get(goal), wordIndex, tagIndex);

                    // When floodTags is set, a fixed score of -1000 replaces the lexicon lookup.
                    float tagScore = (floodTags) ? -1000.0f : lex.score(itw, start, edge.word, null);
                    if (matches(bestScore, tagScore + (float) edge.weight)) {
                        wordNode = tf.newLeaf(edge.word);
                        if (wordNode.label() instanceof CoreLabel) {
                            CoreLabel cl = (CoreLabel) wordNode.label();
                            cl.setBeginPosition(start);
                            cl.setEndPosition(end);
                        }
                        break;
                    }
                }
                if (wordNode == null) {
                    throw new RuntimeException(
                            "could not find matching word from lattice in parse reconstruction");
                }

            } else {
                throw new RuntimeException("attempt to get word when sentence and lattice are null!");
            }
            Tree tagNode = tf.newTreeNode(goalStr, Collections.singletonList(wordNode));
            tagNode.setScore(bestScore);
            // A non-null entry in originalTags replaces the node's label value with that entry's tag.
            if (originalTags[start] != null) {
                tagNode.label().setValue(originalTags[start].tag());
            }
            return tagNode;
        } else { // normal lexicon is single words case
            IntTaggedWord tagging = new IntTaggedWord(words[start], tagIndex.indexOf(goalStr));
            String contextStr = getCoreLabel(start).originalText();
            float tagScore = lex.score(tagging, start, wordIndex.get(words[start]), contextStr);
            // If this test fails, nothing is returned here and the search falls through
            // to the binary and unary rule loops below.
            if (tagScore > Float.NEGATIVE_INFINITY || floodTags) {
                // return a pre-terminal tree
                CoreLabel terminalLabel = getCoreLabel(start);

                Tree wordNode = tf.newLeaf(terminalLabel);
                Tree tagNode = tf.newTreeNode(goalStr, Collections.singletonList(wordNode));
                tagNode.setScore(bestScore);
                // A tag already present on the terminal label takes precedence over goalStr.
                if (terminalLabel.tag() != null) {
                    tagNode.label().setValue(terminalLabel.tag());
                }
                // Copy the label's value into its tag so the two agree.
                if (tagNode.label() instanceof HasTag) {
                    ((HasTag) tagNode.label()).setTag(tagNode.label().value());
                }
                return tagNode;
            }
        }
    }
    // check binaries first
    for (int split = start + 1; split < end; split++) {
        for (Iterator<BinaryRule> binaryI = bg.ruleIteratorByParent(goal); binaryI.hasNext();) {
            BinaryRule br = binaryI.next();
            // Candidate score: rule score + inside scores of both children + lexicon contribution.
            double score = br.score + iScore[start][split][br.leftChild] + iScore[split][end][br.rightChild]
                    + lex.score(br, start, end, split);
            boolean matches;
            if (op.testOptions.lengthNormalization) {
                // Normalize by the combined word counts of the two child spans.
                double normScore = score
                        / (wordsInSpan[start][split][br.leftChild] + wordsInSpan[split][end][br.rightChild]);
                matches = matches(normScore, normBestScore);
            } else {
                matches = matches(score, bestScore);
            }
            if (matches) {
                // build binary split
                Tree leftChildTree = extractBestParse(br.leftChild, start, split);
                Tree rightChildTree = extractBestParse(br.rightChild, split, end);
                List<Tree> children = new ArrayList<Tree>();
                children.add(leftChildTree);
                children.add(rightChildTree);
                Tree result = tf.newTreeNode(goalStr, children);
                result.setScore(score);
                // System.err.println("    Found Binary node: "+result);
                return result;
            }
        }
    }
    // check unaries
    // note that even though we parse with the unary-closed grammar, we can
    // extract the best parse with the non-unary-closed grammar, since all
    // the intermediate states in the chain must have been built, and hence
    // we can exploit the sparser space and reconstruct the full tree as we go.
    // for (Iterator<UnaryRule> unaryI = ug.closedRuleIteratorByParent(goal); unaryI.hasNext(); ) {
    for (Iterator<UnaryRule> unaryI = ug.ruleIteratorByParent(goal); unaryI.hasNext();) {
        UnaryRule ur = unaryI.next();
        // System.err.println("  Trying " + ur + " dtr score: " + iScore[start][end][ur.child]);
        double score = ur.score + iScore[start][end][ur.child] + lex.score(ur, start, end);
        boolean matches;
        if (op.testOptions.lengthNormalization) {
            double normScore = score / wordsInSpan[start][end][ur.child];
            matches = matches(normScore, normBestScore);
        } else {
            matches = matches(score, bestScore);
        }
        // Rules whose child equals their parent are skipped; recursing on one would
        // repeat this same call with identical arguments.
        if (ur.child != ur.parent && matches) {
            // build unary
            Tree childTree = extractBestParse(ur.child, start, end);
            Tree result = tf.newTreeNode(goalStr, Collections.singletonList(childTree));
            // System.err.println("    Matched!  Unary node: "+result);
            result.setScore(score);
            return result;
        }
    }
    // NOTE(review): the message names ExhaustivePCFGParser although the page header attributes
    // this method to ConditionalCFGParser — presumably carried over from the original; confirm.
    System.err.println("Warning: no parse found in ExhaustivePCFGParser.extractBestParse: failing on: [" + start
            + ", " + end + "] looking for " + goalStr);
    return null;
}

From source file:coreferenceresolver.util.StanfordUtil.java

/**
 * Builds the CoreNLP pipeline, annotates {@code documentFile} line by line, and fills
 * {@code reviews} with one {@code Review} per input line.
 *
 * <p>As a side effect, every token is written as {@code word/tag} to
 * {@code ./input.txt.pos}: one output line per sentence, followed by a {@code ./.} line
 * after each review.
 *
 * <p>Both the input reader and the output writer are managed by try-with-resources, so
 * they are closed even if annotation or I/O fails part-way through.
 *
 * @param simpleInit when {@code true}, only tokenization, sentence splitting, POS tagging
 *                   and parsing are run; when {@code false}, the sentiment annotator is
 *                   added and sentiment, dependency, relative-pronoun and comparative
 *                   information is also extracted
 * @throws FileNotFoundException if the input or output file cannot be opened
 * @throws IOException           if reading the input or writing the POS file fails
 */
public void init(boolean simpleInit) throws FileNotFoundException, IOException {
    String outPosFilePath = "./input.txt.pos";
    // The writer is opened first, as before, so the output file is created/truncated up front.
    try (BufferedWriter bw = new BufferedWriter(new FileWriter(new File(outPosFilePath)))) {
        props = new Properties();
        if (simpleInit) {
            props.put("annotators", "tokenize, ssplit, pos, parse");
        } else {
            props.put("annotators", "tokenize, ssplit, pos, parse, sentiment");
        }
        pipeline = new StanfordCoreNLP(props);

        reviews = new ArrayList<>();

        try (BufferedReader bufferedReader = new BufferedReader(new FileReader(documentFile))) {
            String reviewLine;
            int reviewId = 0;
            // Each input line is treated as one review.
            while ((reviewLine = bufferedReader.readLine()) != null) {
                Review newReview = new Review();
                newReview.setRawContent(reviewLine);

                // create an empty Annotation just with the given text
                document = new Annotation(reviewLine);

                // run all Annotators on this text
                pipeline.annotate(document);
                List<CoreMap> sentences = document.get(SentencesAnnotation.class);

                for (CoreMap sentence : sentences) {
                    int sentenceOffsetBegin = sentence.get(CharacterOffsetBeginAnnotation.class);
                    int sentenceOffsetEnd = sentence.get(CharacterOffsetEndAnnotation.class);
                    Sentence newSentence = new Sentence();
                    newSentence.setReviewId(reviewId);
                    newSentence.setRawContent(sentence.toString());
                    newSentence.setOffsetBegin(sentenceOffsetBegin);
                    newSentence.setOffsetEnd(sentenceOffsetEnd);

                    if (!simpleInit) {
                        int sentimentLevel = RNNCoreAnnotations.getPredictedClass(
                                sentence.get(SentimentCoreAnnotations.SentimentAnnotatedTree.class));
                        newSentence.setSentimentLevel(sentimentLevel);

                        // Dependency parsing
                        SemanticGraph collCCDeps = sentence.get(CollapsedCCProcessedDependenciesAnnotation.class);
                        Collection<TypedDependency> typedDeps = collCCDeps.typedDependencies();
                        newSentence.setDependencies(typedDeps);
                    }

                    // Looked up once per sentence; used for the leaves and for ancestor queries below.
                    Tree sentenceTree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class);
                    List<Tree> sentenceTreeLeaves = sentenceTree.getLeaves();

                    // i pairs each token with the parse-tree leaf at the same position.
                    int i = 0;
                    for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
                        Token newToken = new Token();

                        Tree tokenTree = sentenceTreeLeaves.get(i);
                        newToken.setTokenTree(tokenTree);

                        String word = token.get(TextAnnotation.class);
                        newToken.setWord(word);

                        String pos = token.get(PartOfSpeechAnnotation.class);
                        newToken.setPOS(pos);

                        int offsetBegin = token.get(CharacterOffsetBeginAnnotation.class);
                        newToken.setOffsetBegin(offsetBegin);

                        int offsetEnd = token.get(CharacterOffsetEndAnnotation.class);
                        newToken.setOffsetEnd(offsetEnd);

                        if (!simpleInit) {
                            // A token two levels below a WHNP node, other than "who"/"what",
                            // is flagged as a relative pronoun.
                            Tree twoLevelsAncestor = tokenTree.ancestor(2, sentenceTree);
                            if (twoLevelsAncestor.value().equals("WHNP") && !word.toLowerCase().equals("who")
                                    && !word.toLowerCase().equals("what")) {
                                newToken.setRelativePronoun(true);
                            }

                            // Calculate sentiment for this token
                            int newTokenSentiment = Util.retrieveOriginalSentiment(newToken.getWord());
                            newToken.setSentimentOrientation(newTokenSentiment, newSentence.getDependencies());
                        }

                        newSentence.addToken(newToken);
                        bw.write(token.word() + "/" + token.tag() + " ");
                        ++i;
                    }
                    bw.newLine();

                    if (!simpleInit) {
                        // A sentence containing a comparative indicator is a comparative sentence;
                        // identify which NP is superior or inferior in it.
                        List<Token> comparativeTokens = FeatureExtractor.findComparativeIndicator(newSentence, null,
                                null);
                        //TODO
                        //Check special comparative samples
                        if (!comparativeTokens.isEmpty()) {
                            newSentence.initComparatives(comparativeTokens);
                        }
                    }

                    newReview.addSentence(newSentence);
                }

                // Marks the end of a review in the POS file.
                bw.write("./.");
                bw.newLine();

                reviews.add(newReview);
                ++reviewId;
            }
        }
    }
}

From source file:count_dep.Count_dep.java

/**
 * Annotates the span of every event in {@code events}, tallies the coarse POS class of
 * each token equal to the event trigger, and counts how many gold events are reproduced
 * by {@link #GetEvents}. Progress (every 50th event) and the final matched fraction are
 * printed to standard output, along with a dump of every extracted event that shares the
 * trigger but misses at least one gold argument.
 */
private void Countdependencies() {
    // Pipeline with POS tagging, lemmatization, NER, parsing, and coreference resolution.
    Properties pipelineProps = new Properties();
    pipelineProps.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
    StanfordCoreNLP nlpPipeline = new StanfordCoreNLP(pipelineProps);

    int matchedEvents = 0;
    // Slots: 0 = NN*, 1 = VB*, 2 = JJ*, 3 = PR*, 4 = DT*. Tallied here but not reported.
    int[] tagCounts = new int[10];

    for (int i = 0; i < events.size(); i++) {
        Event gold = events.get(i);
        String text = gold.span;

        Annotation annotated = new Annotation(text);
        nlpPipeline.annotate(annotated);

        // Each event span is expected to be exactly one sentence.
        List<CoreMap> sentences = annotated.get(SentencesAnnotation.class);
        assert sentences.size() == 1;
        CoreMap sentence = sentences.get(0);

        List<CoreLabel> tokens = sentence.get(TokensAnnotation.class);
        for (CoreLabel token : tokens) {
            if (!token.word().equals(gold.trigger)) {
                continue;
            }
            String pos = token.tag();
            if (pos.startsWith("NN")) {
                System.out.println("NN    " + token.word() + " " + pos + " " + gold.span + " " + gold.filename);
                tagCounts[0]++;
            } else if (pos.startsWith("VB")) {
                System.out.println("VB    " + token.word() + " " + pos + " " + gold.span + " " + gold.filename);
                tagCounts[1]++;
            } else if (pos.startsWith("JJ")) {
                tagCounts[2]++;
            } else if (pos.startsWith("PR")) {
                tagCounts[3]++;
            } else if (pos.startsWith("DT")) {
                tagCounts[4]++;
            }
        }

        // Stanford dependency graph of the current sentence.
        SemanticGraph graph = sentence.get(CollapsedCCProcessedDependenciesAnnotation.class);
        LinkedList<Event> extracted = GetEvents(graph, sentence);

        for (Event candidate : extracted) {
            if (!candidate.trigger.equals(gold.trigger)) {
                continue;
            }

            // A gold argument counts as found when it contains some extracted argument's data.
            boolean everyGoldArgFound = true;
            for (EventArgument goldArg : gold.arguments) {
                boolean found = false;
                for (EventArgument candidateArg : candidate.arguments) {
                    if (goldArg.data.contains(candidateArg.data)) {
                        found = true;
                        break;
                    }
                }
                if (!found) {
                    everyGoldArgFound = false;
                    break;
                }
            }

            if (everyGoldArgFound) {
                matchedEvents++;
                break;
            }

            // Mismatch: dump gold and extracted slots for inspection.
            System.out.println(gold.trigger);
            System.out.println(gold.span + "  " + gold.filename);
            System.out.println("gold slots");
            for (int slot = 0; slot < gold.arguments.size(); slot++) {
                System.out.print(slot + " " + gold.arguments.get(slot) + "   ");
            }
            System.out.println();
            System.out.println("extracted slots");
            for (EventArgument candidateArg : candidate.arguments) {
                System.out.print(candidateArg + "   ");
            }
            System.out.println();
            System.out.println();
        }

        if (i % 50 == 0) {
            System.out.println(1.0 * i / events.size());
        }
    }
    System.out.println(1.0 * matchedEvents / events.size());
}

From source file:count_dep.Count_dep.java

/**
 * Builds one candidate {@code Event} per verb-tagged or noun-tagged token (tag starting
 * with {@code VB} or {@code NN}) in {@code sentence}.
 *
 * <p>The event's arguments are the token's children in {@code dependencies}. If the token
 * has a parent, then either the parent's {@code nsubj}/{@code dobj} children other than
 * the token itself are added (when the parent's tag starts with {@code VB}), or the parent
 * word is added. A token for which a graph query throws
 * {@link IllegalArgumentException} yields no event.
 *
 * @param dependencies dependency graph of the sentence
 * @param sentence     annotated sentence supplying the tokens
 * @return the extracted events, in token order
 */
public LinkedList<Event> GetEvents(SemanticGraph dependencies, CoreMap sentence) {
    List<CoreLabel> tokens = sentence.get(TokensAnnotation.class);

    // Relations whose dependents are taken as arguments from a verbal parent.
    List<GrammaticalRelation> coreRelations = new LinkedList<>();
    coreRelations.add(GrammaticalRelation.valueOf("nsubj"));
    coreRelations.add(GrammaticalRelation.valueOf("dobj"));

    LinkedList<IndexedWord> triggers = new LinkedList<>();
    for (CoreLabel token : tokens) {
        String pos = token.tag();
        if (pos.startsWith("VB") || pos.startsWith("NN")) {
            triggers.add(new IndexedWord(token));
        }
    }

    LinkedList<Event> extracted = new LinkedList<>();
    for (IndexedWord trigger : triggers) {
        Event event = new Event();
        event.trigger = trigger.word();
        try {
            for (IndexedWord child : dependencies.getChildren(trigger)) {
                event.arguments.add(new EventArgument(child.word(), ""));
            }
            if (dependencies.inDegree(trigger) > 0) {
                IndexedWord parent = dependencies.getParent(trigger);
                if (parent.tag().startsWith("VB")) {
                    Set<IndexedWord> siblings = dependencies.getChildrenWithRelns(parent, coreRelations);
                    siblings.remove(trigger);
                    for (IndexedWord sibling : siblings) {
                        event.arguments.add(new EventArgument(sibling.word(), ""));
                    }
                } else {
                    event.arguments.add(new EventArgument(parent.word(), ""));
                }
            }
        } catch (IllegalArgumentException graphLookupFailed) {
            // Drop this trigger when a graph query rejects it.
            continue;
        }
        extracted.add(event);
    }
    return extracted;
}

From source file:de.iisys.ocr.pos.CustomNERFeatureFactory.java

License:Open Source License

protected Collection<String> featuresC(PaddedList<IN> cInfo, int loc) {
    CoreLabel p3 = cInfo.get(loc - 3);/*from w  w w.  ja  va 2 s. com*/
    CoreLabel p2 = cInfo.get(loc - 2);
    CoreLabel p = cInfo.get(loc - 1);
    CoreLabel c = cInfo.get(loc);
    CoreLabel n = cInfo.get(loc + 1);
    CoreLabel n2 = cInfo.get(loc + 2);

    String cWord = getWord(c);
    String pWord = getWord(p);
    String nWord = getWord(n);
    String cShape = c.getString(CoreAnnotations.ShapeAnnotation.class);
    String pShape = p.getString(CoreAnnotations.ShapeAnnotation.class);
    String nShape = n.getString(CoreAnnotations.ShapeAnnotation.class);

    Collection<String> featuresC = new ArrayList<String>();

    if (flags.useDistSim) {
        distSimAnnotate(cInfo);
    }

    if (flags.useBagOfWords) {
        for (IN word : cInfo) {
            featuresC.add(getWord(word) + "-BAGOFWORDS");
        }
    }

    if (flags.useDistSim && flags.useMoreTags) {
        featuresC.add(p.get(CoreAnnotations.DistSimAnnotation.class) + '-' + cWord + "-PDISTSIM-CWORD");
    }

    if (flags.useDistSim) {
        featuresC.add(c.get(CoreAnnotations.DistSimAnnotation.class) + "-DISTSIM");
    }

    if (flags.useTitle) {
        Matcher m = titlePattern.matcher(cWord);
        if (m.matches()) {
            featuresC.add("IS_TITLE");
        }
    }

    if (flags.useInternal && flags.useExternal) {

        if (flags.useWord) {
            featuresC.add(cWord + "-WORD");
        }

        if (flags.use2W) {
            featuresC.add(getWord(p2) + "-P2W");
            featuresC.add(getWord(n2) + "-N2W");
        }

        if (flags.useLC) {
            featuresC.add(cWord.toLowerCase() + "-CL");
            featuresC.add(pWord.toLowerCase() + "-PL");
            featuresC.add(nWord.toLowerCase() + "-NL");
        }

        if (flags.useUnknown) { // for true casing
            featuresC.add(c.get(CoreAnnotations.UnknownAnnotation.class) + "-UNKNOWN");
            featuresC.add(p.get(CoreAnnotations.UnknownAnnotation.class) + "-PUNKNOWN");
            featuresC.add(n.get(CoreAnnotations.UnknownAnnotation.class) + "-NUNKNOWN");
        }

        if (flags.useLemmas) {
            String lem = c.getString(CoreAnnotations.LemmaAnnotation.class);
            if (!"".equals(lem)) {
                featuresC.add(lem + "-LEM");
            }
        }
        if (flags.usePrevNextLemmas) {
            String plem = p.getString(CoreAnnotations.LemmaAnnotation.class);
            String nlem = n.getString(CoreAnnotations.LemmaAnnotation.class);
            if (!"".equals(plem)) {
                featuresC.add(plem + "-PLEM");
            }
            if (!"".equals(nlem)) {
                featuresC.add(nlem + "-NLEM");
            }
        }

        if (flags.checkNameList) {
            try {
                if (lastNames == null) {
                    lastNames = Generics.newHashSet();

                    for (String line : ObjectBank.getLineIterator(flags.lastNameList)) {
                        String[] cols = line.split("\\s+");
                        lastNames.add(cols[0]);
                    }
                }
                if (maleNames == null) {
                    maleNames = Generics.newHashSet();
                    for (String line : ObjectBank.getLineIterator(flags.maleNameList)) {
                        String[] cols = line.split("\\s+");
                        maleNames.add(cols[0]);
                    }
                }
                if (femaleNames == null) {
                    femaleNames = Generics.newHashSet();
                    for (String line : ObjectBank.getLineIterator(flags.femaleNameList)) {
                        String[] cols = line.split("\\s+");
                        femaleNames.add(cols[0]);
                    }
                }

                String name = cWord.toUpperCase();
                if (lastNames.contains(name)) {
                    featuresC.add("LAST_NAME");
                }

                if (maleNames.contains(name)) {
                    featuresC.add("MALE_NAME");
                }

                if (femaleNames.contains(name)) {
                    featuresC.add("FEMALE_NAME");
                }

            } catch (Exception e) {
                throw new RuntimeException(e);
            }
        }

        if (flags.binnedLengths != null) {
            int len = cWord.length();
            String featureName = null;
            for (int i = 0; i <= flags.binnedLengths.length; i++) {
                if (i == flags.binnedLengths.length) {
                    featureName = "Len-" + flags.binnedLengths[flags.binnedLengths.length - 1] + "-Inf";
                } else if (len <= flags.binnedLengths[i]) {
                    featureName = "Len-" + ((i == 0) ? 1 : flags.binnedLengths[i - 1]) + '-'
                            + flags.binnedLengths[i];
                    break;
                }
            }
            featuresC.add(featureName);
        }

        if (flags.useABGENE) {
            featuresC.add(c.get(CoreAnnotations.AbgeneAnnotation.class) + "-ABGENE");
            featuresC.add(p.get(CoreAnnotations.AbgeneAnnotation.class) + "-PABGENE");
            featuresC.add(n.get(CoreAnnotations.AbgeneAnnotation.class) + "-NABGENE");
        }

        if (flags.useABSTRFreqDict) {
            featuresC.add(c.get(CoreAnnotations.AbstrAnnotation.class) + "-ABSTRACT"
                    + c.get(CoreAnnotations.FreqAnnotation.class) + "-FREQ"
                    + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG");
            featuresC.add(c.get(CoreAnnotations.AbstrAnnotation.class) + "-ABSTRACT"
                    + c.get(CoreAnnotations.DictAnnotation.class) + "-DICT"
                    + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG");
            featuresC.add(c.get(CoreAnnotations.AbstrAnnotation.class) + "-ABSTRACT"
                    + c.get(CoreAnnotations.DictAnnotation.class) + "-DICT"
                    + c.get(CoreAnnotations.FreqAnnotation.class) + "-FREQ"
                    + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG");
        }

        if (flags.useABSTR) {
            featuresC.add(c.get(CoreAnnotations.AbstrAnnotation.class) + "-ABSTRACT");
            featuresC.add(p.get(CoreAnnotations.AbstrAnnotation.class) + "-PABSTRACT");
            featuresC.add(n.get(CoreAnnotations.AbstrAnnotation.class) + "-NABSTRACT");
        }

        if (flags.useGENIA) {
            featuresC.add(c.get(CoreAnnotations.GeniaAnnotation.class) + "-GENIA");
            featuresC.add(p.get(CoreAnnotations.GeniaAnnotation.class) + "-PGENIA");
            featuresC.add(n.get(CoreAnnotations.GeniaAnnotation.class) + "-NGENIA");
        }
        if (flags.useWEBFreqDict) {
            featuresC.add(c.get(CoreAnnotations.WebAnnotation.class) + "-WEB"
                    + c.get(CoreAnnotations.FreqAnnotation.class) + "-FREQ"
                    + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG");
            featuresC.add(c.get(CoreAnnotations.WebAnnotation.class) + "-WEB"
                    + c.get(CoreAnnotations.DictAnnotation.class) + "-DICT"
                    + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG");
            featuresC.add(c.get(CoreAnnotations.WebAnnotation.class) + "-WEB"
                    + c.get(CoreAnnotations.DictAnnotation.class) + "-DICT"
                    + c.get(CoreAnnotations.FreqAnnotation.class) + "-FREQ"
                    + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG");
        }

        if (flags.useWEB) {
            featuresC.add(c.get(CoreAnnotations.WebAnnotation.class) + "-WEB");
            featuresC.add(p.get(CoreAnnotations.WebAnnotation.class) + "-PWEB");
            featuresC.add(n.get(CoreAnnotations.WebAnnotation.class) + "-NWEB");
        }

        if (flags.useIsURL) {
            featuresC.add(c.get(CoreAnnotations.IsURLAnnotation.class) + "-ISURL");
        }
        if (flags.useEntityRule) {
            featuresC.add(c.get(CoreAnnotations.EntityRuleAnnotation.class) + "-ENTITYRULE");
        }
        if (flags.useEntityTypes) {
            featuresC.add(c.get(CoreAnnotations.EntityTypeAnnotation.class) + "-ENTITYTYPE");
        }
        if (flags.useIsDateRange) {
            featuresC.add(c.get(CoreAnnotations.IsDateRangeAnnotation.class) + "-ISDATERANGE");
        }

        if (flags.useABSTRFreq) {
            featuresC.add(c.get(CoreAnnotations.AbstrAnnotation.class) + "-ABSTRACT"
                    + c.get(CoreAnnotations.FreqAnnotation.class) + "-FREQ");
        }

        if (flags.useFREQ) {
            featuresC.add(c.get(CoreAnnotations.FreqAnnotation.class) + "-FREQ");
        }

        if (flags.useMoreTags) {
            featuresC.add(
                    p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + cWord + "-PTAG-CWORD");
        }

        if (flags.usePosition) {
            featuresC.add(c.get(CoreAnnotations.PositionAnnotation.class) + "-POSITION");
        }
        if (flags.useBeginSent) {
            String pos = c.get(CoreAnnotations.PositionAnnotation.class);
            if ("0".equals(pos)) {
                featuresC.add("BEGIN-SENT");
                featuresC.add(cShape + "-BEGIN-SENT");
            } else if (Integer.toString(cInfo.size() - 1).equals(pos)) {
                featuresC.add("END-SENT");
                featuresC.add(cShape + "-END-SENT");
            } else {
                featuresC.add("IN-SENT");
                featuresC.add(cShape + "-IN-SENT");
            }
        }
        if (flags.useTags) {
            featuresC.add(c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG");
        }

        if (flags.useOrdinal) {
            if (isOrdinal(cInfo, loc)) {
                featuresC.add("C_ORDINAL");
                if (isOrdinal(cInfo, loc - 1)) {
                    //System.err.print(getWord(p) + " ");
                    featuresC.add("PC_ORDINAL");
                }
                //System.err.println(cWord);
            }
            if (isOrdinal(cInfo, loc - 1)) {
                featuresC.add("P_ORDINAL");
            }
        }

        if (flags.usePrev) {
            featuresC.add(pWord + "-PW");
            if (flags.useTags) {
                featuresC.add(p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-PTAG");
            }
            if (flags.useDistSim) {
                featuresC.add(p.get(CoreAnnotations.DistSimAnnotation.class) + "-PDISTSIM");
            }
            if (flags.useIsURL) {
                featuresC.add(p.get(CoreAnnotations.IsURLAnnotation.class) + "-PISURL");
            }
            if (flags.useEntityTypes) {
                featuresC.add(p.get(CoreAnnotations.EntityTypeAnnotation.class) + "-PENTITYTYPE");
            }
        }

        if (flags.useNext) {
            featuresC.add(nWord + "-NW");
            if (flags.useTags) {
                featuresC.add(n.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-NTAG");
            }
            if (flags.useDistSim) {
                featuresC.add(n.get(CoreAnnotations.DistSimAnnotation.class) + "-NDISTSIM");
            }
            if (flags.useIsURL) {
                featuresC.add(n.get(CoreAnnotations.IsURLAnnotation.class) + "-NISURL");
            }
            if (flags.useEntityTypes) {
                featuresC.add(n.get(CoreAnnotations.EntityTypeAnnotation.class) + "-NENTITYTYPE");
            }
        }
        /*here, entityTypes refers to the type in the PASCAL IE challenge:
         * i.e. certain words are tagged "Date" or "Location" */

        if (flags.useEitherSideWord) {
            featuresC.add(pWord + "-EW");
            featuresC.add(nWord + "-EW");
        }

        if (flags.useWordPairs) {
            featuresC.add(cWord + '-' + pWord + "-W-PW");
            featuresC.add(cWord + '-' + nWord + "-W-NW");
        }

        if (flags.useSymTags) {
            if (flags.useTags) {
                featuresC.add(p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-'
                        + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-'
                        + n.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-PCNTAGS");
                featuresC.add(c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-'
                        + n.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-CNTAGS");
                featuresC.add(p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-'
                        + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-PCTAGS");
            }
            if (flags.useDistSim) {
                featuresC.add(p.get(CoreAnnotations.DistSimAnnotation.class) + '-'
                        + c.get(CoreAnnotations.DistSimAnnotation.class) + '-'
                        + n.get(CoreAnnotations.DistSimAnnotation.class) + "-PCNDISTSIM");
                featuresC.add(c.get(CoreAnnotations.DistSimAnnotation.class) + '-'
                        + n.get(CoreAnnotations.DistSimAnnotation.class) + "-CNDISTSIM");
                featuresC.add(p.get(CoreAnnotations.DistSimAnnotation.class) + '-'
                        + c.get(CoreAnnotations.DistSimAnnotation.class) + "-PCDISTSIM");
            }

        }

        if (flags.useSymWordPairs) {
            featuresC.add(pWord + '-' + nWord + "-SWORDS");
        }

        String pGazAnnotation = (flags.useGazFeatures || flags.useMoreGazFeatures)
                ? p.get(CoreAnnotations.GazAnnotation.class)
                : null;
        String nGazAnnotation = (flags.useGazFeatures || flags.useMoreGazFeatures)
                ? n.get(CoreAnnotations.GazAnnotation.class)
                : null;
        String cGazAnnotation = (flags.useGazFeatures || flags.useMoreGazFeatures)
                ? c.get(CoreAnnotations.GazAnnotation.class)
                : null;
        if (flags.useGazFeatures) {

            if (cGazAnnotation != null && !cGazAnnotation.equals(flags.dropGaz)) {
                featuresC.add(cGazAnnotation + "-GAZ");
            }
            // n
            if (nGazAnnotation != null && !nGazAnnotation.equals(flags.dropGaz)) {
                featuresC.add(nGazAnnotation + "-NGAZ");
            }
            // p
            if (pGazAnnotation != null && !pGazAnnotation.equals(flags.dropGaz)) {
                featuresC.add(pGazAnnotation + "-PGAZ");
            }
        }

        if (flags.useMoreGazFeatures) {
            if (cGazAnnotation != null && !cGazAnnotation.equals(flags.dropGaz)) {
                featuresC.add(cGazAnnotation + '-' + cWord + "-CG-CW-GAZ");

                // c-n
                if (nGazAnnotation != null && !nGazAnnotation.equals(flags.dropGaz)) {
                    featuresC.add(cGazAnnotation + '-' + nGazAnnotation + "-CNGAZ");
                }

                // p-c
                if (pGazAnnotation != null && !pGazAnnotation.equals(flags.dropGaz)) {
                    featuresC.add(pGazAnnotation + '-' + cGazAnnotation + "-PCGAZ");
                }
            }
        }

        if (flags.useAbbr || flags.useMinimalAbbr) {
            featuresC.add(c.get(CoreAnnotations.AbbrAnnotation.class) + "-ABBR");
        }

        if (flags.useAbbr1 || flags.useMinimalAbbr1) {
            if (!c.get(CoreAnnotations.AbbrAnnotation.class).equals("XX")) {
                featuresC.add(c.get(CoreAnnotations.AbbrAnnotation.class) + "-ABBR");
            }
        }

        if (flags.useAbbr) {
            featuresC.add(p.get(CoreAnnotations.AbbrAnnotation.class) + '-'
                    + c.get(CoreAnnotations.AbbrAnnotation.class) + "-PCABBR");
            featuresC.add(c.get(CoreAnnotations.AbbrAnnotation.class) + '-'
                    + n.get(CoreAnnotations.AbbrAnnotation.class) + "-CNABBR");
            featuresC.add(p.get(CoreAnnotations.AbbrAnnotation.class) + '-'
                    + c.get(CoreAnnotations.AbbrAnnotation.class) + '-'
                    + n.get(CoreAnnotations.AbbrAnnotation.class) + "-PCNABBR");
        }

        if (flags.useAbbr1) {
            if (!c.get(CoreAnnotations.AbbrAnnotation.class).equals("XX")) {
                featuresC.add(p.get(CoreAnnotations.AbbrAnnotation.class) + '-'
                        + c.get(CoreAnnotations.AbbrAnnotation.class) + "-PCABBR");
                featuresC.add(c.get(CoreAnnotations.AbbrAnnotation.class) + '-'
                        + n.get(CoreAnnotations.AbbrAnnotation.class) + "-CNABBR");
                featuresC.add(p.get(CoreAnnotations.AbbrAnnotation.class) + '-'
                        + c.get(CoreAnnotations.AbbrAnnotation.class) + '-'
                        + n.get(CoreAnnotations.AbbrAnnotation.class) + "-PCNABBR");
            }
        }

        if (flags.useChunks) {
            featuresC.add(p.get(CoreAnnotations.ChunkAnnotation.class) + '-'
                    + c.get(CoreAnnotations.ChunkAnnotation.class) + "-PCCHUNK");
            featuresC.add(c.get(CoreAnnotations.ChunkAnnotation.class) + '-'
                    + n.get(CoreAnnotations.ChunkAnnotation.class) + "-CNCHUNK");
            featuresC.add(p.get(CoreAnnotations.ChunkAnnotation.class) + '-'
                    + c.get(CoreAnnotations.ChunkAnnotation.class) + '-'
                    + n.get(CoreAnnotations.ChunkAnnotation.class) + "-PCNCHUNK");
        }

        if (flags.useMinimalAbbr) {
            featuresC.add(cWord + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + "-CWABB");
        }

        if (flags.useMinimalAbbr1) {
            if (!c.get(CoreAnnotations.AbbrAnnotation.class).equals("XX")) {
                featuresC.add(cWord + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + "-CWABB");
            }
        }

        String prevVB = "", nextVB = "";
        if (flags.usePrevVB) {
            for (int j = loc - 1;; j--) {
                CoreLabel wi = cInfo.get(j);
                if (wi == cInfo.getPad()) {
                    prevVB = "X";
                    featuresC.add("X-PVB");
                    break;
                } else if (wi.getString(CoreAnnotations.PartOfSpeechAnnotation.class).startsWith("VB")) {
                    featuresC.add(getWord(wi) + "-PVB");
                    prevVB = getWord(wi);
                    break;
                }
            }
        }

        if (flags.useNextVB) {
            for (int j = loc + 1;; j++) {
                CoreLabel wi = cInfo.get(j);
                if (wi == cInfo.getPad()) {
                    featuresC.add("X-NVB");
                    nextVB = "X";
                    break;
                } else if (wi.getString(CoreAnnotations.PartOfSpeechAnnotation.class).startsWith("VB")) {
                    featuresC.add(getWord(wi) + "-NVB");
                    nextVB = getWord(wi);
                    break;
                }
            }
        }

        if (flags.useVB) {
            featuresC.add(prevVB + '-' + nextVB + "-PNVB");
        }

        if (flags.useShapeConjunctions) {
            featuresC.add(c.get(CoreAnnotations.PositionAnnotation.class) + cShape + "-POS-SH");
            if (flags.useTags) {
                featuresC.add(c.tag() + cShape + "-TAG-SH");
            }
            if (flags.useDistSim) {
                featuresC.add(c.get(CoreAnnotations.DistSimAnnotation.class) + cShape + "-DISTSIM-SH");
            }

        }

        if (flags.useWordTag) {
            featuresC.add(cWord + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-W-T");
            featuresC.add(cWord + '-' + p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-W-PT");
            featuresC.add(cWord + '-' + n.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-W-NT");
        }

        if (flags.useNPHead) {
            featuresC.add(c.get(TreeCoreAnnotations.HeadWordAnnotation.class) + "-HW");
            if (flags.useTags) {
                featuresC.add(c.get(TreeCoreAnnotations.HeadWordAnnotation.class) + "-"
                        + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-HW-T");
            }
            if (flags.useDistSim) {
                featuresC.add(c.get(TreeCoreAnnotations.HeadWordAnnotation.class) + "-"
                        + c.get(CoreAnnotations.DistSimAnnotation.class) + "-HW-DISTSIM");
            }
        }

        if (flags.useNPGovernor) {
            featuresC.add(c.get(CoreAnnotations.GovernorAnnotation.class) + "-GW");
            if (flags.useTags) {
                featuresC.add(c.get(CoreAnnotations.GovernorAnnotation.class) + '-'
                        + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-GW-T");
            }
            if (flags.useDistSim) {
                featuresC.add(c.get(CoreAnnotations.GovernorAnnotation.class) + '-'
                        + c.get(CoreAnnotations.DistSimAnnotation.class) + "-DISTSIM-T1");
            }
        }

        if (flags.useHeadGov) {
            featuresC.add(c.get(TreeCoreAnnotations.HeadWordAnnotation.class) + "-"
                    + c.get(CoreAnnotations.GovernorAnnotation.class) + "-HW_GW");
        }

        if (flags.useClassFeature) {
            featuresC.add("###");
        }

        if (flags.useFirstWord) {
            String firstWord = getWord(cInfo.get(0));
            featuresC.add(firstWord);
        }

        if (flags.useNGrams) {
            Collection<String> subs = null;
            if (flags.cacheNGrams) {
                subs = wordToSubstrings.get(cWord);
            }
            if (subs == null) {
                subs = new ArrayList<String>();
                String word = '<' + cWord + '>';
                if (flags.lowercaseNGrams) {
                    word = word.toLowerCase();
                }
                if (flags.dehyphenateNGrams) {
                    word = dehyphenate(word);
                }
                if (flags.greekifyNGrams) {
                    word = greekify(word);
                }
                // minimum length substring is 2 letters (hardwired)
                // hoist flags.noMidNGrams so only linear in word length for that case
                if (flags.noMidNGrams) {
                    int max = flags.maxNGramLeng >= 0 ? Math.min(flags.maxNGramLeng, word.length())
                            : word.length();
                    for (int j = 2; j <= max; j++) {
                        subs.add(intern('#' + word.substring(0, j) + '#'));
                    }
                    int start = flags.maxNGramLeng >= 0 ? Math.max(0, word.length() - flags.maxNGramLeng) : 0;
                    int lenM1 = word.length() - 1;
                    for (int i = start; i < lenM1; i++) {
                        subs.add(intern('#' + word.substring(i) + '#'));
                    }
                } else {
                    for (int i = 0; i < word.length(); i++) {
                        for (int j = i + 2, max = Math.min(word.length(),
                                i + flags.maxNGramLeng); j <= max; j++) {
                            if (flags.maxNGramLeng >= 0 && j - i > flags.maxNGramLeng) {
                                continue;
                            }
                            subs.add(intern('#' + word.substring(i, j) + '#'));
                        }
                    }
                }
                if (flags.cacheNGrams) {
                    wordToSubstrings.put(cWord, subs);
                }
            }
            featuresC.addAll(subs);
            if (flags.conjoinShapeNGrams) {
                for (String str : subs) {
                    String feat = str + '-' + cShape + "-CNGram-CS";
                    featuresC.add(feat);
                }
            }
        }

        if (flags.useGazettes) {
            if (flags.sloppyGazette) {
                Collection<String> entries = wordToGazetteEntries.get(cWord);
                if (entries != null) {
                    featuresC.addAll(entries);
                }
            }
            if (flags.cleanGazette) {
                Collection<GazetteInfo> infos = wordToGazetteInfos.get(cWord);
                if (infos != null) {
                    for (GazetteInfo gInfo : infos) {
                        boolean ok = true;
                        for (int gLoc = 0; gLoc < gInfo.words.length; gLoc++) {
                            ok &= gInfo.words[gLoc].equals(getWord(cInfo.get(loc + gLoc - gInfo.loc)));
                        }
                        if (ok) {
                            featuresC.add(gInfo.feature);
                        }
                    }
                }
            }
        }

        if ((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || (flags.useShapeStrings)) {
            featuresC.add(cShape + "-TYPE");
            if (flags.useTypeSeqs) {
                featuresC.add(pShape + "-PTYPE");
                featuresC.add(nShape + "-NTYPE");
                featuresC.add(pWord + "..." + cShape + "-PW_CTYPE");
                featuresC.add(cShape + "..." + nWord + "-NW_CTYPE");
                featuresC.add(pShape + "..." + cShape + "-PCTYPE");
                featuresC.add(cShape + "..." + nShape + "-CNTYPE");
                featuresC.add(pShape + "..." + cShape + "..." + nShape + "-PCNTYPE");
            }
        }

        if (flags.useLastRealWord) {
            if (pWord.length() <= 3) {
                // extending this to check for 2 short words doesn't seem to help....
                featuresC.add(getWord(p2) + "..." + cShape + "-PPW_CTYPE");
            }
        }

        if (flags.useNextRealWord) {
            if (nWord.length() <= 3) {
                // extending this to check for 2 short words doesn't seem to help....
                featuresC.add(getWord(n2) + "..." + cShape + "-NNW_CTYPE");
            }
        }

        if (flags.useOccurrencePatterns) {
            featuresC.addAll(occurrencePatterns(cInfo, loc));
        }

        if (flags.useDisjunctive) {
            for (int i = 1; i <= flags.disjunctionWidth; i++) {
                CoreLabel dn = cInfo.get(loc + i);
                CoreLabel dp = cInfo.get(loc - i);
                featuresC.add(getWord(dn) + "-DISJN");
                if (flags.useDisjunctiveShapeInteraction) {
                    featuresC.add(getWord(dn) + '-' + cShape + "-DISJN-CS");
                }
                featuresC.add(getWord(dp) + "-DISJP");
                if (flags.useDisjunctiveShapeInteraction) {
                    featuresC.add(getWord(dp) + '-' + cShape + "-DISJP-CS");
                }
            }
        }

        if (flags.useWideDisjunctive) {
            for (int i = 1; i <= flags.wideDisjunctionWidth; i++) {
                featuresC.add(getWord(cInfo.get(loc + i)) + "-DISJWN");
                featuresC.add(getWord(cInfo.get(loc - i)) + "-DISJWP");
            }
        }

        if (flags.useEitherSideDisjunctive) {
            for (int i = 1; i <= flags.disjunctionWidth; i++) {
                featuresC.add(getWord(cInfo.get(loc + i)) + "-DISJWE");
                featuresC.add(getWord(cInfo.get(loc - i)) + "-DISJWE");
            }
        }

        if (flags.useDisjShape) {
            for (int i = 1; i <= flags.disjunctionWidth; i++) {
                featuresC.add(cInfo.get(loc + i).get(CoreAnnotations.ShapeAnnotation.class) + "-NDISJSHAPE");
                // featuresC.add(cInfo.get(loc - i).get(CoreAnnotations.ShapeAnnotation.class) + "-PDISJSHAPE");
                featuresC.add(cShape + '-' + cInfo.get(loc + i).get(CoreAnnotations.ShapeAnnotation.class)
                        + "-CNDISJSHAPE");
                // featuresC.add(c.get(CoreAnnotations.ShapeAnnotation.class) + "-" + cInfo.get(loc - i).get(CoreAnnotations.ShapeAnnotation.class) + "-CPDISJSHAPE");
            }
        }

        if (flags.useExtraTaggySequences) {
            if (flags.useTags) {
                featuresC.add(p2.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-'
                        + p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-'
                        + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TTS");
                featuresC.add(p3.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-'
                        + p2.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-'
                        + p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-'
                        + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TTTS");
            }
            if (flags.useDistSim) {
                featuresC.add(p2.get(CoreAnnotations.DistSimAnnotation.class) + '-'
                        + p.get(CoreAnnotations.DistSimAnnotation.class) + '-'
                        + c.get(CoreAnnotations.DistSimAnnotation.class) + "-DISTSIM_TTS1");
                featuresC.add(p3.get(CoreAnnotations.DistSimAnnotation.class) + '-'
                        + p2.get(CoreAnnotations.DistSimAnnotation.class) + '-'
                        + p.get(CoreAnnotations.DistSimAnnotation.class) + '-'
                        + c.get(CoreAnnotations.DistSimAnnotation.class) + "-DISTSIM_TTTS1");
            }
        }

        if (flags.useMUCFeatures) {
            featuresC.add(c.get(CoreAnnotations.SectionAnnotation.class) + "-SECTION");
            featuresC.add(c.get(CoreAnnotations.WordPositionAnnotation.class) + "-WORD_POSITION");
            featuresC.add(c.get(CoreAnnotations.SentencePositionAnnotation.class) + "-SENT_POSITION");
            featuresC.add(c.get(CoreAnnotations.ParaPositionAnnotation.class) + "-PARA_POSITION");
            featuresC.add(c.get(CoreAnnotations.WordPositionAnnotation.class) + '-'
                    + c.get(CoreAnnotations.ShapeAnnotation.class) + "-WORD_POSITION_SHAPE");
        }
    } else if (flags.useInternal) {

        if (flags.useWord) {
            featuresC.add(cWord + "-WORD");
        }

        if (flags.useNGrams) {
            Collection<String> subs = wordToSubstrings.get(cWord);
            if (subs == null) {
                subs = new ArrayList<String>();
                String word = '<' + cWord + '>';
                if (flags.lowercaseNGrams) {
                    word = word.toLowerCase();
                }
                if (flags.dehyphenateNGrams) {
                    word = dehyphenate(word);
                }
                if (flags.greekifyNGrams) {
                    word = greekify(word);
                }
                for (int i = 0; i < word.length(); i++) {
                    for (int j = i + 2; j <= word.length(); j++) {
                        if (flags.noMidNGrams && i != 0 && j != word.length()) {
                            continue;
                        }
                        if (flags.maxNGramLeng >= 0 && j - i > flags.maxNGramLeng) {
                            continue;
                        }
                        //subs.add(intern("#" + word.substring(i, j) + "#"));
                        subs.add(intern('#' + word.substring(i, j) + '#'));
                    }
                }
                if (flags.cacheNGrams) {
                    wordToSubstrings.put(cWord, subs);
                }
            }
            featuresC.addAll(subs);
            if (flags.conjoinShapeNGrams) {
                String shape = c.get(CoreAnnotations.ShapeAnnotation.class);
                for (String str : subs) {
                    String feat = str + '-' + shape + "-CNGram-CS";
                    featuresC.add(feat);
                }
            }
        }

        if ((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || (flags.useShapeStrings)) {
            featuresC.add(cShape + "-TYPE");
        }

        if (flags.useOccurrencePatterns) {
            featuresC.addAll(occurrencePatterns(cInfo, loc));
        }

    } else if (flags.useExternal) {

        if (flags.usePrev) {
            featuresC.add(pWord + "-PW");
        }

        if (flags.useNext) {
            featuresC.add(nWord + "-NW");
        }

        if (flags.useWordPairs) {
            featuresC.add(cWord + '-' + pWord + "-W-PW");
            featuresC.add(cWord + '-' + nWord + "-W-NW");
        }

        if (flags.useSymWordPairs) {
            featuresC.add(pWord + '-' + nWord + "-SWORDS");
        }

        if ((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || (flags.useShapeStrings)) {
            if (flags.useTypeSeqs) {
                featuresC.add(pShape + "-PTYPE");
                featuresC.add(nShape + "-NTYPE");
                featuresC.add(pWord + "..." + cShape + "-PW_CTYPE");
                featuresC.add(cShape + "..." + nWord + "-NW_CTYPE");
                if (flags.maxLeft > 0)
                    featuresC.add(pShape + "..." + cShape + "-PCTYPE"); // this one just isn't useful, at least given c,pc,s,ps.  Might be useful 0th-order
                featuresC.add(cShape + "..." + nShape + "-CNTYPE");
                featuresC.add(pShape + "..." + cShape + "..." + nShape + "-PCNTYPE");
            }
        }

        if (flags.useLastRealWord) {
            if (pWord.length() <= 3) {
                featuresC.add(getWord(p2) + "..." + cShape + "-PPW_CTYPE");
            }
        }

        if (flags.useNextRealWord) {
            if (nWord.length() <= 3) {
                featuresC.add(getWord(n2) + "..." + cShape + "-NNW_CTYPE");
            }
        }

        if (flags.useDisjunctive) {
            for (int i = 1; i <= flags.disjunctionWidth; i++) {
                CoreLabel dn = cInfo.get(loc + i);
                CoreLabel dp = cInfo.get(loc - i);
                featuresC.add(getWord(dn) + "-DISJN");
                if (flags.useDisjunctiveShapeInteraction) {
                    featuresC.add(getWord(dn) + '-' + cShape + "-DISJN-CS");
                }
                featuresC.add(getWord(dp) + "-DISJP");
                if (flags.useDisjunctiveShapeInteraction) {
                    featuresC.add(getWord(dp) + '-' + cShape + "-DISJP-CS");
                }
            }
        }

        if (flags.useWideDisjunctive) {
            for (int i = 1; i <= flags.wideDisjunctionWidth; i++) {
                featuresC.add(getWord(cInfo.get(loc + i)) + "-DISJWN");
                featuresC.add(getWord(cInfo.get(loc - i)) + "-DISJWP");
            }
        }

        if (flags.useDisjShape) {
            for (int i = 1; i <= flags.disjunctionWidth; i++) {
                featuresC.add(cInfo.get(loc + i).get(CoreAnnotations.ShapeAnnotation.class) + "-NDISJSHAPE");
                // featuresC.add(cInfo.get(loc - i).get(CoreAnnotations.ShapeAnnotation.class) + "-PDISJSHAPE");
                featuresC.add(c.get(CoreAnnotations.ShapeAnnotation.class) + '-'
                        + cInfo.get(loc + i).get(CoreAnnotations.ShapeAnnotation.class) + "-CNDISJSHAPE");
                // featuresC.add(c.get(CoreAnnotations.ShapeAnnotation.class) + "-" + cInfo.get(loc - i).get(CoreAnnotations.ShapeAnnotation.class) + "-CPDISJSHAPE");
            }
        }

    }

    // Stuff to add binary features from the additional columns
    if (flags.twoStage) {
        featuresC.add(c.get(Bin1Annotation.class) + "-BIN1");
        featuresC.add(c.get(Bin2Annotation.class) + "-BIN2");
        featuresC.add(c.get(Bin3Annotation.class) + "-BIN3");
        featuresC.add(c.get(Bin4Annotation.class) + "-BIN4");
        featuresC.add(c.get(Bin5Annotation.class) + "-BIN5");
        featuresC.add(c.get(Bin6Annotation.class) + "-BIN6");
    }

    if (flags.useIfInteger) {
        try {
            int val = Integer.parseInt(cWord);
            if (val > 0)
                featuresC.add("POSITIVE_INTEGER");
            else if (val < 0)
                featuresC.add("NEGATIVE_INTEGER");
            // System.err.println("FOUND INTEGER");
        } catch (NumberFormatException e) {
            // not an integer value, nothing to do
        }
    }

    //Stuff to add arbitrary features
    if (flags.useGenericFeatures) {
        //see if we need to cache the keys
        if (genericAnnotationKeys == null) {
            makeGenericKeyCache(c);
        }
        //now look through the cached keys
        for (Class key : genericAnnotationKeys) {
            //System.err.println("Adding feature: " + CoreLabel.genericValues.get(key) + " with value " + c.get(key));
            if (c.get(key) != null && c.get(key) instanceof Collection) {
                for (Object ob : (Collection) c.get(key)) {
                    featuresC.add(ob + "-" + CoreLabel.genericValues.get(key));
                }
            } else {
                featuresC.add(c.get(key) + "-" + CoreLabel.genericValues.get(key));
            }
        }
    }

    if (flags.useTopics) {
        //featuresC.add(p.get(CoreAnnotations.TopicAnnotation.class) + '-' + cWord + "--CWORD");
        featuresC.add(c.get(CoreAnnotations.TopicAnnotation.class) + "-TopicID");
        featuresC.add(p.get(CoreAnnotations.TopicAnnotation.class) + "-PTopicID");
        featuresC.add(n.get(CoreAnnotations.TopicAnnotation.class) + "-NTopicID");
        //featuresC.add(p.get(CoreAnnotations.TopicAnnotation.class) + '-' + c.get(CoreAnnotations.TopicAnnotation.class) + '-' + n.get(CoreAnnotations.TopicAnnotation.class) + "-PCNTopicID");
        //featuresC.add(c.get(CoreAnnotations.TopicAnnotation.class) + '-' + n.get(CoreAnnotations.TopicAnnotation.class) + "-CNTopicID");
        //featuresC.add(p.get(CoreAnnotations.TopicAnnotation.class) + '-' + c.get(CoreAnnotations.TopicAnnotation.class) + "-PCTopicID");
        //featuresC.add(c.get(CoreAnnotations.TopicAnnotation.class) + cShape + "-TopicID-SH");
        //asdasd
    }

    // NER tag annotations from a previous NER system
    if (c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) != null) {
        featuresC.add(c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + "-CStackedNERTag");
        featuresC.add(cWord + "-" + c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class)
                + "-WCStackedNERTag");

        if (flags.useNext) {
            featuresC.add(c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + '-'
                    + n.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + "-CNStackedNERTag");
            featuresC.add(cWord + "-" + c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + '-'
                    + n.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + "-WCNStackedNERTag");

            if (flags.usePrev) {
                featuresC.add(p.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + '-'
                        + c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + '-'
                        + n.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + "-PCNStackedNERTag");
                featuresC.add(p.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + '-' + cWord + " -"
                        + c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + '-'
                        + n.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + "-PWCNStackedNERTag");
            }
        }
        if (flags.usePrev) {
            featuresC.add(p.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + '-'
                    + c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + "-PCStackedNERTag");
        }
    }
    if (flags.useWordnetFeatures)
        featuresC.add(c.get(CoreAnnotations.WordnetSynAnnotation.class) + "-WordnetSyn");
    if (flags.useProtoFeatures)
        featuresC.add(c.get(CoreAnnotations.ProtoAnnotation.class) + "-Proto");
    if (flags.usePhraseWordTags)
        featuresC.add(c.get(CoreAnnotations.PhraseWordsTagAnnotation.class) + "-PhraseTag");
    if (flags.usePhraseWords) {
        for (String w : c.get(CoreAnnotations.PhraseWordsAnnotation.class))
            featuresC.add(w + "-PhraseWord");
    }
    if (flags.useCommonWordsFeature)
        featuresC.add(c.get(CoreAnnotations.CommonWordsAnnotation.class));

    if (flags.useRadical && cWord.length() > 0) {
        if (cWord.length() == 1) {
            featuresC.add(RadicalMap.getRadical(cWord.charAt(0)) + "-SINGLE-CHAR-RADICAL");
        } else {
            featuresC.add(RadicalMap.getRadical(cWord.charAt(0)) + "-START-RADICAL");
            featuresC.add(RadicalMap.getRadical(cWord.charAt(cWord.length() - 1)) + "-END-RADICAL");
        }
        for (int i = 0; i < cWord.length(); ++i) {
            featuresC.add(RadicalMap.getRadical(cWord.charAt(i)) + "-RADICAL");
        }
    }

    if (flags.splitWordRegex != null && !flags.splitWordRegex.isEmpty()) {
        String[] ws = c.word().split(flags.splitWordRegex);
        for (String s : ws) {
            featuresC.add(s + "-SPLITWORD");
        }
    }
    return featuresC;
}

From source file:edu.cuhk.hccl.util.NLPUtil.java

License:Apache License

/**
 * Extracts adjective/noun lemma pairs from {@code text}.
 *
 * <p>The text is annotated with {@code pipeline}. For every token whose POS tag is contained in
 * {@code NN_TAGS}, the tokens at distance 1, 2, ... {@code searchRange} are inspected (left
 * neighbour first, then right neighbour); the first one whose tag is contained in
 * {@code JJ_TAGS} is handed to {@code addPair} together with the noun, both as lemmas. If any
 * token of the sentence has a lemma contained in {@code NEGATIONS}, the adjective lemma is
 * prefixed with {@code NOT_PREFIX} for every pair found in that sentence.
 *
 * @param pipeline    pipeline used to annotate {@code text}; POS tags and lemmas are read from
 *                    its tokens, so it presumably has to run the pos and lemma annotators
 * @param text        raw text to analyse
 * @param searchRange maximum token distance from a noun at which an adjective is looked for;
 *                    the value is also stored in the shared {@code MAX_STEPS} field
 * @return the list that {@code addPair} was called with for every match
 */
public static ArrayList<String[]> extractNounPhrases(StanfordCoreNLP pipeline, String text, int searchRange) {
    ArrayList<String[]> wordPairs = new ArrayList<String[]>();
    Annotation document = new Annotation(text);
    pipeline.annotate(document);
    List<CoreMap> sentences = document.get(SentencesAnnotation.class);

    // Still published for any other code that reads the field; the scan below uses the
    // parameter itself so that a concurrent call cannot change the bound mid-loop.
    MAX_STEPS = searchRange;

    for (CoreMap sentence : sentences) {
        List<CoreLabel> labels = sentence.get(TokensAnnotation.class);

        // Check negation: one negation word anywhere in the sentence is enough.
        boolean hasNegation = false;
        for (CoreLabel label : labels) {
            // Locale.ROOT keeps the lower-casing independent of the JVM default locale.
            if (NEGATIONS.contains(label.lemma().toLowerCase(java.util.Locale.ROOT))) {
                hasNegation = true;
                break;
            }
        }

        int lastIdx = labels.size() - 1;
        for (int idx = 0; idx <= lastIdx; idx++) {
            CoreLabel noun = labels.get(idx);
            if (!NN_TAGS.contains(noun.tag())) {
                continue;
            }
            for (int step = 1; step <= searchRange; step++) {
                // Positions beyond either end of the sentence are clamped to the first/last token.
                CoreLabel adjective = labels.get(Math.max(0, idx - step));
                if (!JJ_TAGS.contains(adjective.tag())) {
                    adjective = labels.get(Math.min(idx + step, lastIdx));
                    if (!JJ_TAGS.contains(adjective.tag())) {
                        continue; // neither neighbour at this distance is an adjective
                    }
                }
                String adjectiveLemma = adjective.get(LemmaAnnotation.class);
                addPair(wordPairs, hasNegation ? NOT_PREFIX + adjectiveLemma : adjectiveLemma,
                        noun.get(LemmaAnnotation.class));
                break; // only the nearest adjective is paired with this noun
            }
        }
    }
    return wordPairs;
}

From source file:edu.ucla.cs.scai.aztec.ir.tokenization.WordTokenizer.java

License:Apache License

/**
 * Tokenizes {@code text} with a Stanford CoreNLP pipeline and collects the tokens sentence by
 * sentence.
 *
 * <p>Each kept token is built from the CoreNLP word, lemma and POS tag. One
 * {@code WordTokenizedSentence} is appended per CoreNLP sentence, even if every token of that
 * sentence was filtered out.
 *
 * @param text            text to tokenize
 * @param lemmatize       when {@code true}, {@code useLemma()} is called on every kept token
 * @param removeStopWords when {@code true}, tokens whose lemma is contained in
 *                        {@code stopwords} are skipped
 * @param toLowerCase     when {@code true}, {@code useLowerCase()} is called on every kept token
 * @return the document holding the tokenized sentences
 */
public WordTokenizedDocument tokenize(String text, boolean lemmatize, boolean removeStopWords,
        boolean toLowerCase) {
    WordTokenizedDocument res = new WordTokenizedDocument();

    // NOTE(review): a fresh pipeline is built on every call, and only word/lemma/tag are read
    // below, so "ner, regexner" look unnecessary here - confirm before caching or trimming.
    Properties propsTokens = new Properties();
    propsTokens.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, regexner");
    StanfordCoreNLP pipelineTokens = new StanfordCoreNLP(propsTokens);

    Annotation qaTokens = new Annotation(text);
    pipelineTokens.annotate(qaTokens);

    List<CoreMap> sentences = qaTokens.get(CoreAnnotations.SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
        WordTokenizedSentence s = new WordTokenizedSentence();
        // Iterate through the List interface; no downcast to a concrete list class is needed.
        for (CoreLabel cl : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
            if (removeStopWords && stopwords.contains(cl.lemma())) {
                continue;
            }
            WordToken t = new WordToken(cl.word(), cl.lemma(), cl.tag());
            if (lemmatize) {
                t.useLemma();
            }
            if (toLowerCase) {
                t.useLowerCase();
            }
            s.appendToken(t);
        }
        res.appendSentence(s);
    }
    return res;
}

From source file:ims.cs.corenlp.Helper.java

License:Open Source License

/**
 * Checks whether a CoreNLP token is tagged as a double quotation mark.
 *
 * <p>Only the POS tags {@code "}, {@code ``}, {@code ''} and {@code QUOT} are accepted. The
 * single-quote tags {@code '} and {@code `} are deliberately not treated as quotes.
 *
 * @param token the CoreNLP token whose POS tag is inspected
 * @return {@code true} if the token's tag is one of the four accepted quote tags;
 *         {@code false} otherwise, including when the token has no tag
 */
public static boolean isQuote(CoreLabel token) {
    // single quotes are mostly wrong and unhelpful, so ignore!!!
    String tag = token.tag();
    // Constant-first equals keeps an untagged (null) token from throwing.
    return "\"".equals(tag) || "``".equals(tag) || "''".equals(tag) || "QUOT".equals(tag);
}

From source file:ims.cs.corenlp.TokenAligner.java

License:Open Source License

/**
 * Creates a new token from {@code tok} and fills its predicted fields from a CoreNLP token.
 *
 * @param tok                         the token the result is constructed from
 * @param cl                          the CoreNLP token supplying word, lemma, POS tag, NER label
 *                                    and begin/end positions
 * @param currentCoreNlpSentenceIndex value stored as the predicted sentence position
 * @return the newly constructed token with its {@code pred*} fields set
 */
public static Token combineTokensPred(Token tok, CoreLabel cl, int currentCoreNlpSentenceIndex) {
    final Token merged = new Token(tok);

    // Predictions read directly off the CoreNLP token.
    merged.predText = cl.word();
    merged.predLemma = cl.lemma();
    merged.predPosTag = cl.tag();
    merged.predNer = Helper.translateNer(cl.ner());

    final ByteCount span = new ByteCount(cl.beginPosition(), cl.endPosition());
    merged.predByteCount = span;

    // Positional information.
    merged.predSentencePosition = currentCoreNlpSentenceIndex;
    merged.predPosition = -1; /* will be determined by document aligner */

    return merged;
}