Example usage for edu.stanford.nlp.ling CoreLabel getString

Introduction

On this page you can find example usages of edu.stanford.nlp.ling.CoreLabel.getString.

Prototype

@Override
public <KEY extends Key<String>> String getString(Class<KEY> key)

Source Link

Usage

From source file:com.panot.JavaCoref.MyMUCMentionExtractor.java

License:Open Source License

@Override
public Document nextDoc() throws Exception {
    List<List<CoreLabel>> allWords = new ArrayList<List<CoreLabel>>();
    List<Tree> allTrees = new ArrayList<Tree>();
    List<List<Mention>> allGoldMentions = new ArrayList<List<Mention>>();
    List<List<Mention>> allPredictedMentions;
    List<CoreMap> allSentences = new ArrayList<CoreMap>();
    Annotation docAnno = new Annotation("");

    Pattern docPattern = Pattern.compile("<DOC>(.*?)</DOC>", Pattern.DOTALL + Pattern.CASE_INSENSITIVE);
    Pattern sentencePattern = Pattern.compile("(<s>|<hl>|<dd>|<DATELINE>)(.*?)(</s>|</hl>|</dd>|</DATELINE>)",
            Pattern.DOTALL + Pattern.CASE_INSENSITIVE);
    Matcher docMatcher = docPattern.matcher(fileContents);
    if (!docMatcher.find(currentOffset))
        return null;

    currentOffset = docMatcher.end();/* www  .  j a  v  a  2  s. c om*/
    String doc = docMatcher.group(1);
    Matcher sentenceMatcher = sentencePattern.matcher(doc);
    String ner = null;

    //Maintain current document ID.
    Pattern docIDPattern = Pattern.compile("<DOCNO>(.*?)</DOCNO>", Pattern.DOTALL + Pattern.CASE_INSENSITIVE);
    Matcher docIDMatcher = docIDPattern.matcher(doc);
    if (docIDMatcher.find())
        currentDocumentID = docIDMatcher.group(1);
    else
        currentDocumentID = "documentAfter " + currentDocumentID;

    while (sentenceMatcher.find()) {
        String sentenceString = sentenceMatcher.group(2);
        List<CoreLabel> words = tokenizerFactory.getTokenizer(new StringReader(sentenceString), "invertible")
                .tokenize();

        // FIXING TOKENIZATION PROBLEMS
        for (int i = 0; i < words.size(); i++) {
            CoreLabel w = words.get(i);
            if (i > 0 && w.word().equals("$")) {
                if (!words.get(i - 1).word().endsWith("PRP") && !words.get(i - 1).word().endsWith("WP"))
                    continue;
                words.get(i - 1).set(CoreAnnotations.TextAnnotation.class, words.get(i - 1).word() + "$");
                words.remove(i);
                i--;
            } else if (w.word().equals("\\/")) {
                if (words.get(i - 1).word().equals("</COREF>"))
                    continue;
                w.set(CoreAnnotations.TextAnnotation.class,
                        words.get(i - 1).word() + "\\/" + words.get(i + 1).word());
                words.remove(i + 1);
                words.remove(i - 1);
            }
        }
        // END FIXING TOKENIZATION PROBLEMS

        List<CoreLabel> sentence = new ArrayList<CoreLabel>();
        // MUC accepts embedded coref mentions, so we need to keep a stack for the mentions currently open
        Stack<Mention> stack = new Stack<Mention>();
        List<Mention> mentions = new ArrayList<Mention>();

        allWords.add(sentence);
        allGoldMentions.add(mentions);

        for (CoreLabel word : words) {
            String w = word.get(CoreAnnotations.TextAnnotation.class);
            // found regular token: WORD/POS
            if (!w.startsWith("<") && w.contains("\\/") && w.lastIndexOf("\\/") != w.length() - 2) {
                int i = w.lastIndexOf("\\/");
                String w1 = w.substring(0, i);
                // we do NOT set POS info here. We take the POS tags from the parser!
                word.set(CoreAnnotations.TextAnnotation.class, w1);
                word.remove(CoreAnnotations.OriginalTextAnnotation.class);
                if (Constants.USE_GOLD_NE) {
                    if (ner != null) {
                        word.set(CoreAnnotations.NamedEntityTagAnnotation.class, ner);
                    } else {
                        word.set(CoreAnnotations.NamedEntityTagAnnotation.class, "O");
                    }
                }
                sentence.add(word);
            }
            // found the start SGML tag for a NE, e.g., "<ORGANIZATION>"
            else if (w.startsWith("<") && !w.startsWith("<COREF") && !w.startsWith("</")) {
                Pattern nerPattern = Pattern.compile("<(.*?)>");
                Matcher m = nerPattern.matcher(w);
                m.find();
                ner = m.group(1);
            }
            // found the end SGML tag for a NE, e.g., "</ORGANIZATION>"
            else if (w.startsWith("</") && !w.startsWith("</COREF")) {
                Pattern nerPattern = Pattern.compile("</(.*?)>");
                Matcher m = nerPattern.matcher(w);
                m.find();
                String ner1 = m.group(1);
                if (ner != null && !ner.equals(ner1))
                    throw new RuntimeException("Unmatched NE labels in MUC file: " + ner + " v. " + ner1);
                ner = null;
            }
            // found the start SGML tag for a coref mention
            else if (w.startsWith("<COREF")) {
                Mention mention = new Mention();
                // position of this mention in the sentence
                mention.startIndex = sentence.size();

                // extract GOLD info about this coref chain. needed for eval
                Pattern idPattern = Pattern.compile("ID=\"(.*?)\"");
                Pattern refPattern = Pattern.compile("REF=\"(.*?)\"");

                Matcher m = idPattern.matcher(w);
                m.find();
                mention.mentionID = Integer.valueOf(m.group(1));

                m = refPattern.matcher(w);
                if (m.find()) {
                    mention.originalRef = Integer.valueOf(m.group(1));
                }

                // open mention. keep track of all open mentions using the stack
                stack.push(mention);
            }
            // found the end SGML tag for a coref mention
            else if (w.equals("</COREF>")) {
                Mention mention = stack.pop();
                mention.endIndex = sentence.size();

                // this is a closed mention. add it to the final list of mentions
                // System.err.printf("Found MENTION: ID=%d, REF=%d\n", mention.mentionID, mention.originalRef);
                mentions.add(mention);
            } else {
                word.remove(CoreAnnotations.OriginalTextAnnotation.class);
                if (Constants.USE_GOLD_NE) {
                    if (ner != null) {
                        word.set(CoreAnnotations.NamedEntityTagAnnotation.class, ner);
                    } else {
                        word.set(CoreAnnotations.NamedEntityTagAnnotation.class, "O");
                    }
                }
                sentence.add(word);
            }
        }
        StringBuilder textContent = new StringBuilder();
        for (int i = 0; i < sentence.size(); i++) {
            CoreLabel w = sentence.get(i);
            w.set(CoreAnnotations.IndexAnnotation.class, i + 1);
            w.set(CoreAnnotations.UtteranceAnnotation.class, 0);
            if (i > 0)
                textContent.append(" ");
            textContent.append(w.getString(CoreAnnotations.TextAnnotation.class));
        }
        CoreMap sentCoreMap = new Annotation(textContent.toString());
        allSentences.add(sentCoreMap);
        sentCoreMap.set(CoreAnnotations.TokensAnnotation.class, sentence);
    }

    // assign goldCorefClusterID
    Map<Integer, Mention> idMention = Generics.newHashMap(); // temporary use
    for (List<Mention> goldMentions : allGoldMentions) {
        for (Mention m : goldMentions) {
            idMention.put(m.mentionID, m);
        }
    }
    for (List<Mention> goldMentions : allGoldMentions) {
        for (Mention m : goldMentions) {
            if (m.goldCorefClusterID == -1) {
                if (m.originalRef == -1)
                    m.goldCorefClusterID = m.mentionID;
                else {
                    int ref = m.originalRef;
                    while (true) {
                        Mention m2 = idMention.get(ref);
                        if (m2.goldCorefClusterID != -1) {
                            m.goldCorefClusterID = m2.goldCorefClusterID;
                            break;
                        } else if (m2.originalRef == -1) {
                            m2.goldCorefClusterID = m2.mentionID;
                            m.goldCorefClusterID = m2.goldCorefClusterID;
                            break;
                        } else {
                            ref = m2.originalRef;
                        }
                    }
                }
            }
        }
    }

    docAnno.set(CoreAnnotations.SentencesAnnotation.class, allSentences);
    stanfordProcessor.annotate(docAnno);

    if (allSentences.size() != allWords.size())
        throw new IllegalStateException("allSentences != allWords");
    for (int i = 0; i < allSentences.size(); i++) {
        List<CoreLabel> annotatedSent = allSentences.get(i).get(CoreAnnotations.TokensAnnotation.class);
        List<CoreLabel> unannotatedSent = allWords.get(i);
        List<Mention> mentionInSent = allGoldMentions.get(i);
        for (Mention m : mentionInSent) {
            m.dependency = allSentences.get(i)
                    .get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);
        }
        if (annotatedSent.size() != unannotatedSent.size()) {
            throw new IllegalStateException("annotatedSent != unannotatedSent");
        }
        for (int j = 0, sz = annotatedSent.size(); j < sz; j++) {
            CoreLabel annotatedWord = annotatedSent.get(j);
            CoreLabel unannotatedWord = unannotatedSent.get(j);
            if (!annotatedWord.get(CoreAnnotations.TextAnnotation.class)
                    .equals(unannotatedWord.get(CoreAnnotations.TextAnnotation.class))) {
                throw new IllegalStateException("annotatedWord != unannotatedWord");
            }
        }
        allWords.set(i, annotatedSent);
        allTrees.add(allSentences.get(i).get(TreeCoreAnnotations.TreeAnnotation.class));
    }

    // term things

    List<List<Mention>> termMentions = new ArrayList<List<Mention>>();

    if (use_term) {
        String dataCrf = "";
        System.err.print("FEAT TYPE: ");
        System.err
                .println(props.getProperty(MyConstants.TTE_FEATURE_GENERATOR, MyConstants.TTE_FEATURE_CORENLP));
        if (props.getProperty(MyConstants.TTE_FEATURE_GENERATOR, MyConstants.TTE_FEATURE_CORENLP)
                .equals(MyConstants.TTE_FEATURE_NLTK)) {
            dataCrf = NltkCrfFormatter.annotationToCrfString(docAnno);
        } else {
            dataCrf = CrfFormatter.annotationToCrfString(docAnno);
        }
        List<List<String>> tagResult = new ArrayList<List<String>>();

        try {
            tagResult = CrfsuiteCaller.tag(dataCrf, props.getProperty(MyConstants.TTE_MODEL));

            if (props.containsKey(MyConstants.TTE_SAVE_CRF_DATA)) {
                String crfDataFilename = props.getProperty(MyConstants.TTE_SAVE_CRF_DATA);

                File crfDataFile = new File(crfDataFilename);
                BufferedWriter bw = new BufferedWriter(new FileWriter(crfDataFile));
                bw.write(dataCrf);
                bw.close();
            }

        } catch (Exception e) {
            System.err.println("Crfsuite tag failed");
        }

        termAsMentionFinder.setTags(tagResult);
        termMentions = termAsMentionFinder.extractPredictedMentions(docAnno, maxID, dictionaries);

        maxID = termAsMentionFinder.getMaxID();
    }

    // extract predicted mentions

    allPredictedMentions = mentionFinder.extractPredictedMentions(docAnno, maxID, dictionaries);

    if (use_term && props.containsKey(MyConstants.TTE_KEEP_PRON)) {
        termMentions = injectPronoun(termMentions, allPredictedMentions);
    }

    if (experimentType != null) {
        if (experimentType.equals(MyConstants.EXP_TYPE_03_UNION)) {
            List<List<Mention>> usingMentions = unionMentions(allPredictedMentions, allGoldMentions);
            allPredictedMentions = usingMentions;
        } else if (experimentType.equals(MyConstants.EXP_TYPE_03_INTERSECT)) {
            List<List<Mention>> usingMentions = intersectMentions(allPredictedMentions, allGoldMentions);
            allPredictedMentions = usingMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_04_CHECK)) {
            allPredictedMentions = termMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_04_SUPER)) {
            List<List<Mention>> usingMentions = superstringMentions(termMentions, allPredictedMentions);
            allPredictedMentions = usingMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_04_OVERLAP)) {
            List<List<Mention>> usingMentions = overlapMentions(termMentions, allPredictedMentions);
            allPredictedMentions = usingMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_04_UNION)) {
            List<List<Mention>> usingMentions = unionMentions(termMentions, allPredictedMentions);
            allPredictedMentions = usingMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_05_SUPER)) {
            List<List<Mention>> usingMentions = superstringMentions(termMentions, allGoldMentions);
            allPredictedMentions = usingMentions;
        } else if (use_term && experimentType.equals(MyConstants.EXP_TYPE_05_OVERLAP)) {
            List<List<Mention>> usingMentions = overlapMentions(termMentions, allGoldMentions);
            allPredictedMentions = usingMentions;
        } else {
            System.err.println(experimentType);
            System.err.println("Unknown experiment type. Using mention detector.");
        }
    } else if (useGoldMention) {
        allPredictedMentions = allGoldMentions;
    }

    // add the relevant fields to mentions and order them for coref
    return arrange(docAnno, allWords, allTrees, allPredictedMentions, allGoldMentions, true);
}

From source file:de.iisys.ocr.pos.CustomNERFeatureFactory.java

License:Open Source License

protected Collection<String> featuresC(PaddedList<IN> cInfo, int loc) {
    CoreLabel p3 = cInfo.get(loc - 3);/*w ww . j  a v  a 2s  . c om*/
    CoreLabel p2 = cInfo.get(loc - 2);
    CoreLabel p = cInfo.get(loc - 1);
    CoreLabel c = cInfo.get(loc);
    CoreLabel n = cInfo.get(loc + 1);
    CoreLabel n2 = cInfo.get(loc + 2);

    String cWord = getWord(c);
    String pWord = getWord(p);
    String nWord = getWord(n);
    String cShape = c.getString(CoreAnnotations.ShapeAnnotation.class);
    String pShape = p.getString(CoreAnnotations.ShapeAnnotation.class);
    String nShape = n.getString(CoreAnnotations.ShapeAnnotation.class);

    Collection<String> featuresC = new ArrayList<String>();

    if (flags.useDistSim) {
        distSimAnnotate(cInfo);
    }

    if (flags.useBagOfWords) {
        for (IN word : cInfo) {
            featuresC.add(getWord(word) + "-BAGOFWORDS");
        }
    }

    if (flags.useDistSim && flags.useMoreTags) {
        featuresC.add(p.get(CoreAnnotations.DistSimAnnotation.class) + '-' + cWord + "-PDISTSIM-CWORD");
    }

    if (flags.useDistSim) {
        featuresC.add(c.get(CoreAnnotations.DistSimAnnotation.class) + "-DISTSIM");
    }

    if (flags.useTitle) {
        Matcher m = titlePattern.matcher(cWord);
        if (m.matches()) {
            featuresC.add("IS_TITLE");
        }
    }

    if (flags.useInternal && flags.useExternal) {

        if (flags.useWord) {
            featuresC.add(cWord + "-WORD");
        }

        if (flags.use2W) {
            featuresC.add(getWord(p2) + "-P2W");
            featuresC.add(getWord(n2) + "-N2W");
        }

        if (flags.useLC) {
            featuresC.add(cWord.toLowerCase() + "-CL");
            featuresC.add(pWord.toLowerCase() + "-PL");
            featuresC.add(nWord.toLowerCase() + "-NL");
        }

        if (flags.useUnknown) { // for true casing
            featuresC.add(c.get(CoreAnnotations.UnknownAnnotation.class) + "-UNKNOWN");
            featuresC.add(p.get(CoreAnnotations.UnknownAnnotation.class) + "-PUNKNOWN");
            featuresC.add(n.get(CoreAnnotations.UnknownAnnotation.class) + "-NUNKNOWN");
        }

        if (flags.useLemmas) {
            String lem = c.getString(CoreAnnotations.LemmaAnnotation.class);
            if (!"".equals(lem)) {
                featuresC.add(lem + "-LEM");
            }
        }
        if (flags.usePrevNextLemmas) {
            String plem = p.getString(CoreAnnotations.LemmaAnnotation.class);
            String nlem = n.getString(CoreAnnotations.LemmaAnnotation.class);
            if (!"".equals(plem)) {
                featuresC.add(plem + "-PLEM");
            }
            if (!"".equals(nlem)) {
                featuresC.add(nlem + "-NLEM");
            }
        }

        if (flags.checkNameList) {
            try {
                if (lastNames == null) {
                    lastNames = Generics.newHashSet();

                    for (String line : ObjectBank.getLineIterator(flags.lastNameList)) {
                        String[] cols = line.split("\\s+");
                        lastNames.add(cols[0]);
                    }
                }
                if (maleNames == null) {
                    maleNames = Generics.newHashSet();
                    for (String line : ObjectBank.getLineIterator(flags.maleNameList)) {
                        String[] cols = line.split("\\s+");
                        maleNames.add(cols[0]);
                    }
                }
                if (femaleNames == null) {
                    femaleNames = Generics.newHashSet();
                    for (String line : ObjectBank.getLineIterator(flags.femaleNameList)) {
                        String[] cols = line.split("\\s+");
                        femaleNames.add(cols[0]);
                    }
                }

                String name = cWord.toUpperCase();
                if (lastNames.contains(name)) {
                    featuresC.add("LAST_NAME");
                }

                if (maleNames.contains(name)) {
                    featuresC.add("MALE_NAME");
                }

                if (femaleNames.contains(name)) {
                    featuresC.add("FEMALE_NAME");
                }

            } catch (Exception e) {
                throw new RuntimeException(e);
            }
        }

        if (flags.binnedLengths != null) {
            int len = cWord.length();
            String featureName = null;
            for (int i = 0; i <= flags.binnedLengths.length; i++) {
                if (i == flags.binnedLengths.length) {
                    featureName = "Len-" + flags.binnedLengths[flags.binnedLengths.length - 1] + "-Inf";
                } else if (len <= flags.binnedLengths[i]) {
                    featureName = "Len-" + ((i == 0) ? 1 : flags.binnedLengths[i - 1]) + '-'
                            + flags.binnedLengths[i];
                    break;
                }
            }
            featuresC.add(featureName);
        }

        if (flags.useABGENE) {
            featuresC.add(c.get(CoreAnnotations.AbgeneAnnotation.class) + "-ABGENE");
            featuresC.add(p.get(CoreAnnotations.AbgeneAnnotation.class) + "-PABGENE");
            featuresC.add(n.get(CoreAnnotations.AbgeneAnnotation.class) + "-NABGENE");
        }

        if (flags.useABSTRFreqDict) {
            featuresC.add(c.get(CoreAnnotations.AbstrAnnotation.class) + "-ABSTRACT"
                    + c.get(CoreAnnotations.FreqAnnotation.class) + "-FREQ"
                    + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG");
            featuresC.add(c.get(CoreAnnotations.AbstrAnnotation.class) + "-ABSTRACT"
                    + c.get(CoreAnnotations.DictAnnotation.class) + "-DICT"
                    + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG");
            featuresC.add(c.get(CoreAnnotations.AbstrAnnotation.class) + "-ABSTRACT"
                    + c.get(CoreAnnotations.DictAnnotation.class) + "-DICT"
                    + c.get(CoreAnnotations.FreqAnnotation.class) + "-FREQ"
                    + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG");
        }

        if (flags.useABSTR) {
            featuresC.add(c.get(CoreAnnotations.AbstrAnnotation.class) + "-ABSTRACT");
            featuresC.add(p.get(CoreAnnotations.AbstrAnnotation.class) + "-PABSTRACT");
            featuresC.add(n.get(CoreAnnotations.AbstrAnnotation.class) + "-NABSTRACT");
        }

        if (flags.useGENIA) {
            featuresC.add(c.get(CoreAnnotations.GeniaAnnotation.class) + "-GENIA");
            featuresC.add(p.get(CoreAnnotations.GeniaAnnotation.class) + "-PGENIA");
            featuresC.add(n.get(CoreAnnotations.GeniaAnnotation.class) + "-NGENIA");
        }
        if (flags.useWEBFreqDict) {
            featuresC.add(c.get(CoreAnnotations.WebAnnotation.class) + "-WEB"
                    + c.get(CoreAnnotations.FreqAnnotation.class) + "-FREQ"
                    + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG");
            featuresC.add(c.get(CoreAnnotations.WebAnnotation.class) + "-WEB"
                    + c.get(CoreAnnotations.DictAnnotation.class) + "-DICT"
                    + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG");
            featuresC.add(c.get(CoreAnnotations.WebAnnotation.class) + "-WEB"
                    + c.get(CoreAnnotations.DictAnnotation.class) + "-DICT"
                    + c.get(CoreAnnotations.FreqAnnotation.class) + "-FREQ"
                    + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG");
        }

        if (flags.useWEB) {
            featuresC.add(c.get(CoreAnnotations.WebAnnotation.class) + "-WEB");
            featuresC.add(p.get(CoreAnnotations.WebAnnotation.class) + "-PWEB");
            featuresC.add(n.get(CoreAnnotations.WebAnnotation.class) + "-NWEB");
        }

        if (flags.useIsURL) {
            featuresC.add(c.get(CoreAnnotations.IsURLAnnotation.class) + "-ISURL");
        }
        if (flags.useEntityRule) {
            featuresC.add(c.get(CoreAnnotations.EntityRuleAnnotation.class) + "-ENTITYRULE");
        }
        if (flags.useEntityTypes) {
            featuresC.add(c.get(CoreAnnotations.EntityTypeAnnotation.class) + "-ENTITYTYPE");
        }
        if (flags.useIsDateRange) {
            featuresC.add(c.get(CoreAnnotations.IsDateRangeAnnotation.class) + "-ISDATERANGE");
        }

        if (flags.useABSTRFreq) {
            featuresC.add(c.get(CoreAnnotations.AbstrAnnotation.class) + "-ABSTRACT"
                    + c.get(CoreAnnotations.FreqAnnotation.class) + "-FREQ");
        }

        if (flags.useFREQ) {
            featuresC.add(c.get(CoreAnnotations.FreqAnnotation.class) + "-FREQ");
        }

        if (flags.useMoreTags) {
            featuresC.add(
                    p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-' + cWord + "-PTAG-CWORD");
        }

        if (flags.usePosition) {
            featuresC.add(c.get(CoreAnnotations.PositionAnnotation.class) + "-POSITION");
        }
        if (flags.useBeginSent) {
            String pos = c.get(CoreAnnotations.PositionAnnotation.class);
            if ("0".equals(pos)) {
                featuresC.add("BEGIN-SENT");
                featuresC.add(cShape + "-BEGIN-SENT");
            } else if (Integer.toString(cInfo.size() - 1).equals(pos)) {
                featuresC.add("END-SENT");
                featuresC.add(cShape + "-END-SENT");
            } else {
                featuresC.add("IN-SENT");
                featuresC.add(cShape + "-IN-SENT");
            }
        }
        if (flags.useTags) {
            featuresC.add(c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TAG");
        }

        if (flags.useOrdinal) {
            if (isOrdinal(cInfo, loc)) {
                featuresC.add("C_ORDINAL");
                if (isOrdinal(cInfo, loc - 1)) {
                    //System.err.print(getWord(p) + " ");
                    featuresC.add("PC_ORDINAL");
                }
                //System.err.println(cWord);
            }
            if (isOrdinal(cInfo, loc - 1)) {
                featuresC.add("P_ORDINAL");
            }
        }

        if (flags.usePrev) {
            featuresC.add(pWord + "-PW");
            if (flags.useTags) {
                featuresC.add(p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-PTAG");
            }
            if (flags.useDistSim) {
                featuresC.add(p.get(CoreAnnotations.DistSimAnnotation.class) + "-PDISTSIM");
            }
            if (flags.useIsURL) {
                featuresC.add(p.get(CoreAnnotations.IsURLAnnotation.class) + "-PISURL");
            }
            if (flags.useEntityTypes) {
                featuresC.add(p.get(CoreAnnotations.EntityTypeAnnotation.class) + "-PENTITYTYPE");
            }
        }

        if (flags.useNext) {
            featuresC.add(nWord + "-NW");
            if (flags.useTags) {
                featuresC.add(n.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-NTAG");
            }
            if (flags.useDistSim) {
                featuresC.add(n.get(CoreAnnotations.DistSimAnnotation.class) + "-NDISTSIM");
            }
            if (flags.useIsURL) {
                featuresC.add(n.get(CoreAnnotations.IsURLAnnotation.class) + "-NISURL");
            }
            if (flags.useEntityTypes) {
                featuresC.add(n.get(CoreAnnotations.EntityTypeAnnotation.class) + "-NENTITYTYPE");
            }
        }
        /*here, entityTypes refers to the type in the PASCAL IE challenge:
         * i.e. certain words are tagged "Date" or "Location" */

        if (flags.useEitherSideWord) {
            featuresC.add(pWord + "-EW");
            featuresC.add(nWord + "-EW");
        }

        if (flags.useWordPairs) {
            featuresC.add(cWord + '-' + pWord + "-W-PW");
            featuresC.add(cWord + '-' + nWord + "-W-NW");
        }

        if (flags.useSymTags) {
            if (flags.useTags) {
                featuresC.add(p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-'
                        + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-'
                        + n.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-PCNTAGS");
                featuresC.add(c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-'
                        + n.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-CNTAGS");
                featuresC.add(p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-'
                        + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-PCTAGS");
            }
            if (flags.useDistSim) {
                featuresC.add(p.get(CoreAnnotations.DistSimAnnotation.class) + '-'
                        + c.get(CoreAnnotations.DistSimAnnotation.class) + '-'
                        + n.get(CoreAnnotations.DistSimAnnotation.class) + "-PCNDISTSIM");
                featuresC.add(c.get(CoreAnnotations.DistSimAnnotation.class) + '-'
                        + n.get(CoreAnnotations.DistSimAnnotation.class) + "-CNDISTSIM");
                featuresC.add(p.get(CoreAnnotations.DistSimAnnotation.class) + '-'
                        + c.get(CoreAnnotations.DistSimAnnotation.class) + "-PCDISTSIM");
            }

        }

        if (flags.useSymWordPairs) {
            featuresC.add(pWord + '-' + nWord + "-SWORDS");
        }

        String pGazAnnotation = (flags.useGazFeatures || flags.useMoreGazFeatures)
                ? p.get(CoreAnnotations.GazAnnotation.class)
                : null;
        String nGazAnnotation = (flags.useGazFeatures || flags.useMoreGazFeatures)
                ? n.get(CoreAnnotations.GazAnnotation.class)
                : null;
        String cGazAnnotation = (flags.useGazFeatures || flags.useMoreGazFeatures)
                ? c.get(CoreAnnotations.GazAnnotation.class)
                : null;
        if (flags.useGazFeatures) {

            if (cGazAnnotation != null && !cGazAnnotation.equals(flags.dropGaz)) {
                featuresC.add(cGazAnnotation + "-GAZ");
            }
            // n
            if (nGazAnnotation != null && !nGazAnnotation.equals(flags.dropGaz)) {
                featuresC.add(nGazAnnotation + "-NGAZ");
            }
            // p
            if (pGazAnnotation != null && !pGazAnnotation.equals(flags.dropGaz)) {
                featuresC.add(pGazAnnotation + "-PGAZ");
            }
        }

        if (flags.useMoreGazFeatures) {
            if (cGazAnnotation != null && !cGazAnnotation.equals(flags.dropGaz)) {
                featuresC.add(cGazAnnotation + '-' + cWord + "-CG-CW-GAZ");

                // c-n
                if (nGazAnnotation != null && !nGazAnnotation.equals(flags.dropGaz)) {
                    featuresC.add(cGazAnnotation + '-' + nGazAnnotation + "-CNGAZ");
                }

                // p-c
                if (pGazAnnotation != null && !pGazAnnotation.equals(flags.dropGaz)) {
                    featuresC.add(pGazAnnotation + '-' + cGazAnnotation + "-PCGAZ");
                }
            }
        }

        if (flags.useAbbr || flags.useMinimalAbbr) {
            featuresC.add(c.get(CoreAnnotations.AbbrAnnotation.class) + "-ABBR");
        }

        if (flags.useAbbr1 || flags.useMinimalAbbr1) {
            if (!c.get(CoreAnnotations.AbbrAnnotation.class).equals("XX")) {
                featuresC.add(c.get(CoreAnnotations.AbbrAnnotation.class) + "-ABBR");
            }
        }

        if (flags.useAbbr) {
            featuresC.add(p.get(CoreAnnotations.AbbrAnnotation.class) + '-'
                    + c.get(CoreAnnotations.AbbrAnnotation.class) + "-PCABBR");
            featuresC.add(c.get(CoreAnnotations.AbbrAnnotation.class) + '-'
                    + n.get(CoreAnnotations.AbbrAnnotation.class) + "-CNABBR");
            featuresC.add(p.get(CoreAnnotations.AbbrAnnotation.class) + '-'
                    + c.get(CoreAnnotations.AbbrAnnotation.class) + '-'
                    + n.get(CoreAnnotations.AbbrAnnotation.class) + "-PCNABBR");
        }

        if (flags.useAbbr1) {
            if (!c.get(CoreAnnotations.AbbrAnnotation.class).equals("XX")) {
                featuresC.add(p.get(CoreAnnotations.AbbrAnnotation.class) + '-'
                        + c.get(CoreAnnotations.AbbrAnnotation.class) + "-PCABBR");
                featuresC.add(c.get(CoreAnnotations.AbbrAnnotation.class) + '-'
                        + n.get(CoreAnnotations.AbbrAnnotation.class) + "-CNABBR");
                featuresC.add(p.get(CoreAnnotations.AbbrAnnotation.class) + '-'
                        + c.get(CoreAnnotations.AbbrAnnotation.class) + '-'
                        + n.get(CoreAnnotations.AbbrAnnotation.class) + "-PCNABBR");
            }
        }

        if (flags.useChunks) {
            featuresC.add(p.get(CoreAnnotations.ChunkAnnotation.class) + '-'
                    + c.get(CoreAnnotations.ChunkAnnotation.class) + "-PCCHUNK");
            featuresC.add(c.get(CoreAnnotations.ChunkAnnotation.class) + '-'
                    + n.get(CoreAnnotations.ChunkAnnotation.class) + "-CNCHUNK");
            featuresC.add(p.get(CoreAnnotations.ChunkAnnotation.class) + '-'
                    + c.get(CoreAnnotations.ChunkAnnotation.class) + '-'
                    + n.get(CoreAnnotations.ChunkAnnotation.class) + "-PCNCHUNK");
        }

        if (flags.useMinimalAbbr) {
            featuresC.add(cWord + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + "-CWABB");
        }

        if (flags.useMinimalAbbr1) {
            if (!c.get(CoreAnnotations.AbbrAnnotation.class).equals("XX")) {
                featuresC.add(cWord + '-' + c.get(CoreAnnotations.AbbrAnnotation.class) + "-CWABB");
            }
        }

        String prevVB = "", nextVB = "";
        if (flags.usePrevVB) {
            for (int j = loc - 1;; j--) {
                CoreLabel wi = cInfo.get(j);
                if (wi == cInfo.getPad()) {
                    prevVB = "X";
                    featuresC.add("X-PVB");
                    break;
                } else if (wi.getString(CoreAnnotations.PartOfSpeechAnnotation.class).startsWith("VB")) {
                    featuresC.add(getWord(wi) + "-PVB");
                    prevVB = getWord(wi);
                    break;
                }
            }
        }

        if (flags.useNextVB) {
            for (int j = loc + 1;; j++) {
                CoreLabel wi = cInfo.get(j);
                if (wi == cInfo.getPad()) {
                    featuresC.add("X-NVB");
                    nextVB = "X";
                    break;
                } else if (wi.getString(CoreAnnotations.PartOfSpeechAnnotation.class).startsWith("VB")) {
                    featuresC.add(getWord(wi) + "-NVB");
                    nextVB = getWord(wi);
                    break;
                }
            }
        }

        if (flags.useVB) {
            featuresC.add(prevVB + '-' + nextVB + "-PNVB");
        }

        if (flags.useShapeConjunctions) {
            featuresC.add(c.get(CoreAnnotations.PositionAnnotation.class) + cShape + "-POS-SH");
            if (flags.useTags) {
                featuresC.add(c.tag() + cShape + "-TAG-SH");
            }
            if (flags.useDistSim) {
                featuresC.add(c.get(CoreAnnotations.DistSimAnnotation.class) + cShape + "-DISTSIM-SH");
            }

        }

        if (flags.useWordTag) {
            featuresC.add(cWord + '-' + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-W-T");
            featuresC.add(cWord + '-' + p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-W-PT");
            featuresC.add(cWord + '-' + n.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-W-NT");
        }

        if (flags.useNPHead) {
            featuresC.add(c.get(TreeCoreAnnotations.HeadWordAnnotation.class) + "-HW");
            if (flags.useTags) {
                featuresC.add(c.get(TreeCoreAnnotations.HeadWordAnnotation.class) + "-"
                        + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-HW-T");
            }
            if (flags.useDistSim) {
                featuresC.add(c.get(TreeCoreAnnotations.HeadWordAnnotation.class) + "-"
                        + c.get(CoreAnnotations.DistSimAnnotation.class) + "-HW-DISTSIM");
            }
        }

        if (flags.useNPGovernor) {
            featuresC.add(c.get(CoreAnnotations.GovernorAnnotation.class) + "-GW");
            if (flags.useTags) {
                featuresC.add(c.get(CoreAnnotations.GovernorAnnotation.class) + '-'
                        + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-GW-T");
            }
            if (flags.useDistSim) {
                featuresC.add(c.get(CoreAnnotations.GovernorAnnotation.class) + '-'
                        + c.get(CoreAnnotations.DistSimAnnotation.class) + "-DISTSIM-T1");
            }
        }

        if (flags.useHeadGov) {
            featuresC.add(c.get(TreeCoreAnnotations.HeadWordAnnotation.class) + "-"
                    + c.get(CoreAnnotations.GovernorAnnotation.class) + "-HW_GW");
        }

        if (flags.useClassFeature) {
            featuresC.add("###");
        }

        if (flags.useFirstWord) {
            String firstWord = getWord(cInfo.get(0));
            featuresC.add(firstWord);
        }

        if (flags.useNGrams) {
            Collection<String> subs = null;
            if (flags.cacheNGrams) {
                subs = wordToSubstrings.get(cWord);
            }
            if (subs == null) {
                subs = new ArrayList<String>();
                String word = '<' + cWord + '>';
                if (flags.lowercaseNGrams) {
                    word = word.toLowerCase();
                }
                if (flags.dehyphenateNGrams) {
                    word = dehyphenate(word);
                }
                if (flags.greekifyNGrams) {
                    word = greekify(word);
                }
                // minimum length substring is 2 letters (hardwired)
                // hoist flags.noMidNGrams so only linear in word length for that case
                if (flags.noMidNGrams) {
                    int max = flags.maxNGramLeng >= 0 ? Math.min(flags.maxNGramLeng, word.length())
                            : word.length();
                    for (int j = 2; j <= max; j++) {
                        subs.add(intern('#' + word.substring(0, j) + '#'));
                    }
                    int start = flags.maxNGramLeng >= 0 ? Math.max(0, word.length() - flags.maxNGramLeng) : 0;
                    int lenM1 = word.length() - 1;
                    for (int i = start; i < lenM1; i++) {
                        subs.add(intern('#' + word.substring(i) + '#'));
                    }
                } else {
                    for (int i = 0; i < word.length(); i++) {
                        for (int j = i + 2, max = Math.min(word.length(),
                                i + flags.maxNGramLeng); j <= max; j++) {
                            if (flags.maxNGramLeng >= 0 && j - i > flags.maxNGramLeng) {
                                continue;
                            }
                            subs.add(intern('#' + word.substring(i, j) + '#'));
                        }
                    }
                }
                if (flags.cacheNGrams) {
                    wordToSubstrings.put(cWord, subs);
                }
            }
            featuresC.addAll(subs);
            if (flags.conjoinShapeNGrams) {
                for (String str : subs) {
                    String feat = str + '-' + cShape + "-CNGram-CS";
                    featuresC.add(feat);
                }
            }
        }

        if (flags.useGazettes) {
            if (flags.sloppyGazette) {
                Collection<String> entries = wordToGazetteEntries.get(cWord);
                if (entries != null) {
                    featuresC.addAll(entries);
                }
            }
            if (flags.cleanGazette) {
                Collection<GazetteInfo> infos = wordToGazetteInfos.get(cWord);
                if (infos != null) {
                    for (GazetteInfo gInfo : infos) {
                        boolean ok = true;
                        for (int gLoc = 0; gLoc < gInfo.words.length; gLoc++) {
                            ok &= gInfo.words[gLoc].equals(getWord(cInfo.get(loc + gLoc - gInfo.loc)));
                        }
                        if (ok) {
                            featuresC.add(gInfo.feature);
                        }
                    }
                }
            }
        }

        if ((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || (flags.useShapeStrings)) {
            featuresC.add(cShape + "-TYPE");
            if (flags.useTypeSeqs) {
                featuresC.add(pShape + "-PTYPE");
                featuresC.add(nShape + "-NTYPE");
                featuresC.add(pWord + "..." + cShape + "-PW_CTYPE");
                featuresC.add(cShape + "..." + nWord + "-NW_CTYPE");
                featuresC.add(pShape + "..." + cShape + "-PCTYPE");
                featuresC.add(cShape + "..." + nShape + "-CNTYPE");
                featuresC.add(pShape + "..." + cShape + "..." + nShape + "-PCNTYPE");
            }
        }

        if (flags.useLastRealWord) {
            if (pWord.length() <= 3) {
                // extending this to check for 2 short words doesn't seem to help....
                featuresC.add(getWord(p2) + "..." + cShape + "-PPW_CTYPE");
            }
        }

        if (flags.useNextRealWord) {
            if (nWord.length() <= 3) {
                // extending this to check for 2 short words doesn't seem to help....
                featuresC.add(getWord(n2) + "..." + cShape + "-NNW_CTYPE");
            }
        }

        if (flags.useOccurrencePatterns) {
            featuresC.addAll(occurrencePatterns(cInfo, loc));
        }

        if (flags.useDisjunctive) {
            for (int i = 1; i <= flags.disjunctionWidth; i++) {
                CoreLabel dn = cInfo.get(loc + i);
                CoreLabel dp = cInfo.get(loc - i);
                featuresC.add(getWord(dn) + "-DISJN");
                if (flags.useDisjunctiveShapeInteraction) {
                    featuresC.add(getWord(dn) + '-' + cShape + "-DISJN-CS");
                }
                featuresC.add(getWord(dp) + "-DISJP");
                if (flags.useDisjunctiveShapeInteraction) {
                    featuresC.add(getWord(dp) + '-' + cShape + "-DISJP-CS");
                }
            }
        }

        if (flags.useWideDisjunctive) {
            for (int i = 1; i <= flags.wideDisjunctionWidth; i++) {
                featuresC.add(getWord(cInfo.get(loc + i)) + "-DISJWN");
                featuresC.add(getWord(cInfo.get(loc - i)) + "-DISJWP");
            }
        }

        if (flags.useEitherSideDisjunctive) {
            for (int i = 1; i <= flags.disjunctionWidth; i++) {
                featuresC.add(getWord(cInfo.get(loc + i)) + "-DISJWE");
                featuresC.add(getWord(cInfo.get(loc - i)) + "-DISJWE");
            }
        }

        if (flags.useDisjShape) {
            for (int i = 1; i <= flags.disjunctionWidth; i++) {
                featuresC.add(cInfo.get(loc + i).get(CoreAnnotations.ShapeAnnotation.class) + "-NDISJSHAPE");
                // featuresC.add(cInfo.get(loc - i).get(CoreAnnotations.ShapeAnnotation.class) + "-PDISJSHAPE");
                featuresC.add(cShape + '-' + cInfo.get(loc + i).get(CoreAnnotations.ShapeAnnotation.class)
                        + "-CNDISJSHAPE");
                // featuresC.add(c.get(CoreAnnotations.ShapeAnnotation.class) + "-" + cInfo.get(loc - i).get(CoreAnnotations.ShapeAnnotation.class) + "-CPDISJSHAPE");
            }
        }

        if (flags.useExtraTaggySequences) {
            if (flags.useTags) {
                featuresC.add(p2.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-'
                        + p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-'
                        + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TTS");
                featuresC.add(p3.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-'
                        + p2.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-'
                        + p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-'
                        + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TTTS");
            }
            if (flags.useDistSim) {
                featuresC.add(p2.get(CoreAnnotations.DistSimAnnotation.class) + '-'
                        + p.get(CoreAnnotations.DistSimAnnotation.class) + '-'
                        + c.get(CoreAnnotations.DistSimAnnotation.class) + "-DISTSIM_TTS1");
                featuresC.add(p3.get(CoreAnnotations.DistSimAnnotation.class) + '-'
                        + p2.get(CoreAnnotations.DistSimAnnotation.class) + '-'
                        + p.get(CoreAnnotations.DistSimAnnotation.class) + '-'
                        + c.get(CoreAnnotations.DistSimAnnotation.class) + "-DISTSIM_TTTS1");
            }
        }

        if (flags.useMUCFeatures) {
            featuresC.add(c.get(CoreAnnotations.SectionAnnotation.class) + "-SECTION");
            featuresC.add(c.get(CoreAnnotations.WordPositionAnnotation.class) + "-WORD_POSITION");
            featuresC.add(c.get(CoreAnnotations.SentencePositionAnnotation.class) + "-SENT_POSITION");
            featuresC.add(c.get(CoreAnnotations.ParaPositionAnnotation.class) + "-PARA_POSITION");
            featuresC.add(c.get(CoreAnnotations.WordPositionAnnotation.class) + '-'
                    + c.get(CoreAnnotations.ShapeAnnotation.class) + "-WORD_POSITION_SHAPE");
        }
    } else if (flags.useInternal) {

        if (flags.useWord) {
            featuresC.add(cWord + "-WORD");
        }

        if (flags.useNGrams) {
            Collection<String> subs = wordToSubstrings.get(cWord);
            if (subs == null) {
                subs = new ArrayList<String>();
                String word = '<' + cWord + '>';
                if (flags.lowercaseNGrams) {
                    word = word.toLowerCase();
                }
                if (flags.dehyphenateNGrams) {
                    word = dehyphenate(word);
                }
                if (flags.greekifyNGrams) {
                    word = greekify(word);
                }
                for (int i = 0; i < word.length(); i++) {
                    for (int j = i + 2; j <= word.length(); j++) {
                        if (flags.noMidNGrams && i != 0 && j != word.length()) {
                            continue;
                        }
                        if (flags.maxNGramLeng >= 0 && j - i > flags.maxNGramLeng) {
                            continue;
                        }
                        //subs.add(intern("#" + word.substring(i, j) + "#"));
                        subs.add(intern('#' + word.substring(i, j) + '#'));
                    }
                }
                if (flags.cacheNGrams) {
                    wordToSubstrings.put(cWord, subs);
                }
            }
            featuresC.addAll(subs);
            if (flags.conjoinShapeNGrams) {
                String shape = c.get(CoreAnnotations.ShapeAnnotation.class);
                for (String str : subs) {
                    String feat = str + '-' + shape + "-CNGram-CS";
                    featuresC.add(feat);
                }
            }
        }

        if ((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || (flags.useShapeStrings)) {
            featuresC.add(cShape + "-TYPE");
        }

        if (flags.useOccurrencePatterns) {
            featuresC.addAll(occurrencePatterns(cInfo, loc));
        }

    } else if (flags.useExternal) {

        if (flags.usePrev) {
            featuresC.add(pWord + "-PW");
        }

        if (flags.useNext) {
            featuresC.add(nWord + "-NW");
        }

        if (flags.useWordPairs) {
            featuresC.add(cWord + '-' + pWord + "-W-PW");
            featuresC.add(cWord + '-' + nWord + "-W-NW");
        }

        if (flags.useSymWordPairs) {
            featuresC.add(pWord + '-' + nWord + "-SWORDS");
        }

        if ((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || (flags.useShapeStrings)) {
            if (flags.useTypeSeqs) {
                featuresC.add(pShape + "-PTYPE");
                featuresC.add(nShape + "-NTYPE");
                featuresC.add(pWord + "..." + cShape + "-PW_CTYPE");
                featuresC.add(cShape + "..." + nWord + "-NW_CTYPE");
                if (flags.maxLeft > 0)
                    featuresC.add(pShape + "..." + cShape + "-PCTYPE"); // this one just isn't useful, at least given c,pc,s,ps.  Might be useful 0th-order
                featuresC.add(cShape + "..." + nShape + "-CNTYPE");
                featuresC.add(pShape + "..." + cShape + "..." + nShape + "-PCNTYPE");
            }
        }

        if (flags.useLastRealWord) {
            if (pWord.length() <= 3) {
                featuresC.add(getWord(p2) + "..." + cShape + "-PPW_CTYPE");
            }
        }

        if (flags.useNextRealWord) {
            if (nWord.length() <= 3) {
                featuresC.add(getWord(n2) + "..." + cShape + "-NNW_CTYPE");
            }
        }

        if (flags.useDisjunctive) {
            for (int i = 1; i <= flags.disjunctionWidth; i++) {
                CoreLabel dn = cInfo.get(loc + i);
                CoreLabel dp = cInfo.get(loc - i);
                featuresC.add(getWord(dn) + "-DISJN");
                if (flags.useDisjunctiveShapeInteraction) {
                    featuresC.add(getWord(dn) + '-' + cShape + "-DISJN-CS");
                }
                featuresC.add(getWord(dp) + "-DISJP");
                if (flags.useDisjunctiveShapeInteraction) {
                    featuresC.add(getWord(dp) + '-' + cShape + "-DISJP-CS");
                }
            }
        }

        if (flags.useWideDisjunctive) {
            for (int i = 1; i <= flags.wideDisjunctionWidth; i++) {
                featuresC.add(getWord(cInfo.get(loc + i)) + "-DISJWN");
                featuresC.add(getWord(cInfo.get(loc - i)) + "-DISJWP");
            }
        }

        if (flags.useDisjShape) {
            for (int i = 1; i <= flags.disjunctionWidth; i++) {
                featuresC.add(cInfo.get(loc + i).get(CoreAnnotations.ShapeAnnotation.class) + "-NDISJSHAPE");
                // featuresC.add(cInfo.get(loc - i).get(CoreAnnotations.ShapeAnnotation.class) + "-PDISJSHAPE");
                featuresC.add(c.get(CoreAnnotations.ShapeAnnotation.class) + '-'
                        + cInfo.get(loc + i).get(CoreAnnotations.ShapeAnnotation.class) + "-CNDISJSHAPE");
                // featuresC.add(c.get(CoreAnnotations.ShapeAnnotation.class) + "-" + cInfo.get(loc - i).get(CoreAnnotations.ShapeAnnotation.class) + "-CPDISJSHAPE");
            }
        }

    }

    // Stuff to add binary features from the additional columns
    if (flags.twoStage) {
        featuresC.add(c.get(Bin1Annotation.class) + "-BIN1");
        featuresC.add(c.get(Bin2Annotation.class) + "-BIN2");
        featuresC.add(c.get(Bin3Annotation.class) + "-BIN3");
        featuresC.add(c.get(Bin4Annotation.class) + "-BIN4");
        featuresC.add(c.get(Bin5Annotation.class) + "-BIN5");
        featuresC.add(c.get(Bin6Annotation.class) + "-BIN6");
    }

    if (flags.useIfInteger) {
        try {
            int val = Integer.parseInt(cWord);
            if (val > 0)
                featuresC.add("POSITIVE_INTEGER");
            else if (val < 0)
                featuresC.add("NEGATIVE_INTEGER");
            // System.err.println("FOUND INTEGER");
        } catch (NumberFormatException e) {
            // not an integer value, nothing to do
        }
    }

    //Stuff to add arbitrary features
    if (flags.useGenericFeatures) {
        //see if we need to cache the keys
        if (genericAnnotationKeys == null) {
            makeGenericKeyCache(c);
        }
        //now look through the cached keys
        for (Class key : genericAnnotationKeys) {
            //System.err.println("Adding feature: " + CoreLabel.genericValues.get(key) + " with value " + c.get(key));
            if (c.get(key) != null && c.get(key) instanceof Collection) {
                for (Object ob : (Collection) c.get(key)) {
                    featuresC.add(ob + "-" + CoreLabel.genericValues.get(key));
                }
            } else {
                featuresC.add(c.get(key) + "-" + CoreLabel.genericValues.get(key));
            }
        }
    }

    if (flags.useTopics) {
        //featuresC.add(p.get(CoreAnnotations.TopicAnnotation.class) + '-' + cWord + "--CWORD");
        featuresC.add(c.get(CoreAnnotations.TopicAnnotation.class) + "-TopicID");
        featuresC.add(p.get(CoreAnnotations.TopicAnnotation.class) + "-PTopicID");
        featuresC.add(n.get(CoreAnnotations.TopicAnnotation.class) + "-NTopicID");
        //featuresC.add(p.get(CoreAnnotations.TopicAnnotation.class) + '-' + c.get(CoreAnnotations.TopicAnnotation.class) + '-' + n.get(CoreAnnotations.TopicAnnotation.class) + "-PCNTopicID");
        //featuresC.add(c.get(CoreAnnotations.TopicAnnotation.class) + '-' + n.get(CoreAnnotations.TopicAnnotation.class) + "-CNTopicID");
        //featuresC.add(p.get(CoreAnnotations.TopicAnnotation.class) + '-' + c.get(CoreAnnotations.TopicAnnotation.class) + "-PCTopicID");
        //featuresC.add(c.get(CoreAnnotations.TopicAnnotation.class) + cShape + "-TopicID-SH");
        //asdasd
    }

    // NER tag annotations from a previous NER system
    if (c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) != null) {
        featuresC.add(c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + "-CStackedNERTag");
        featuresC.add(cWord + "-" + c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class)
                + "-WCStackedNERTag");

        if (flags.useNext) {
            featuresC.add(c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + '-'
                    + n.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + "-CNStackedNERTag");
            featuresC.add(cWord + "-" + c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + '-'
                    + n.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + "-WCNStackedNERTag");

            if (flags.usePrev) {
                featuresC.add(p.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + '-'
                        + c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + '-'
                        + n.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + "-PCNStackedNERTag");
                featuresC.add(p.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + '-' + cWord + " -"
                        + c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + '-'
                        + n.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + "-PWCNStackedNERTag");
            }
        }
        if (flags.usePrev) {
            featuresC.add(p.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + '-'
                    + c.get(CoreAnnotations.StackedNamedEntityTagAnnotation.class) + "-PCStackedNERTag");
        }
    }
    if (flags.useWordnetFeatures)
        featuresC.add(c.get(CoreAnnotations.WordnetSynAnnotation.class) + "-WordnetSyn");
    if (flags.useProtoFeatures)
        featuresC.add(c.get(CoreAnnotations.ProtoAnnotation.class) + "-Proto");
    if (flags.usePhraseWordTags)
        featuresC.add(c.get(CoreAnnotations.PhraseWordsTagAnnotation.class) + "-PhraseTag");
    if (flags.usePhraseWords) {
        for (String w : c.get(CoreAnnotations.PhraseWordsAnnotation.class))
            featuresC.add(w + "-PhraseWord");
    }
    if (flags.useCommonWordsFeature)
        featuresC.add(c.get(CoreAnnotations.CommonWordsAnnotation.class));

    if (flags.useRadical && cWord.length() > 0) {
        if (cWord.length() == 1) {
            featuresC.add(RadicalMap.getRadical(cWord.charAt(0)) + "-SINGLE-CHAR-RADICAL");
        } else {
            featuresC.add(RadicalMap.getRadical(cWord.charAt(0)) + "-START-RADICAL");
            featuresC.add(RadicalMap.getRadical(cWord.charAt(cWord.length() - 1)) + "-END-RADICAL");
        }
        for (int i = 0; i < cWord.length(); ++i) {
            featuresC.add(RadicalMap.getRadical(cWord.charAt(i)) + "-RADICAL");
        }
    }

    if (flags.splitWordRegex != null && !flags.splitWordRegex.isEmpty()) {
        String[] ws = c.word().split(flags.splitWordRegex);
        for (String s : ws) {
            featuresC.add(s + "-SPLITWORD");
        }
    }
    return featuresC;
}

From source file:de.iisys.ocr.pos.CustomNERFeatureFactory.java

License:Open Source License

/**
 * Builds the feature strings for position {@code loc} that combine the current token with
 * its predecessor (and, for some flags, its successor or the token two positions back).
 *
 * <p>The set of features is selected entirely by {@code flags}:
 * <ul>
 *   <li>{@code noEdgeFeature} returns an empty collection immediately;</li>
 *   <li>{@code transitionEdgeOnly} returns only {@code "PSEQ"};</li>
 *   <li>otherwise the neighbor n-gram features are added first, followed by the features of
 *       exactly one of the three modes: internal+external, internal only, external only.</li>
 * </ul>
 *
 * @param cInfo token list; positions {@code loc - 2} through {@code loc + 1} may be read
 * @param loc   index of the current token within {@code cInfo}
 * @return a newly created, possibly empty, collection of feature strings
 */
protected Collection<String> featuresCpC(PaddedList<IN> cInfo, int loc) {
    CoreLabel p = cInfo.get(loc - 1);
    CoreLabel c = cInfo.get(loc);
    CoreLabel n = cInfo.get(loc + 1);

    String cWord = getWord(c);
    String pWord = getWord(p);
    String cShape = c.getString(CoreAnnotations.ShapeAnnotation.class);
    String pShape = p.getString(CoreAnnotations.ShapeAnnotation.class);
    Collection<String> featuresCpC = new ArrayList<String>();

    if (flags.noEdgeFeature) {
        return featuresCpC;
    }

    if (flags.transitionEdgeOnly) {
        featuresCpC.add("PSEQ");
        return featuresCpC;
    }

    if (flags.useNeighborNGrams) {
        addAffixFeatures(featuresCpC, pWord, "-PREVIOUS-PREFIX", "-PREVIOUS-SUFFIX");
        addAffixFeatures(featuresCpC, cWord, "-CURRENT-PREFIX", "-CURRENT-SUFFIX");
    }

    if (flags.useInternal && flags.useExternal) {

        if (flags.useOrdinal) {
            if (isOrdinal(cInfo, loc)) {
                featuresCpC.add("C_ORDINAL");
                if (isOrdinal(cInfo, loc - 1)) {
                    featuresCpC.add("PC_ORDINAL");
                }
            }
            if (isOrdinal(cInfo, loc - 1)) {
                featuresCpC.add("P_ORDINAL");
            }
        }

        if (flags.useAbbr || flags.useMinimalAbbr) {
            featuresCpC.add(p.get(CoreAnnotations.AbbrAnnotation.class) + '-'
                    + c.get(CoreAnnotations.AbbrAnnotation.class) + "-PABBRANS");
        }

        if (flags.useAbbr1 || flags.useMinimalAbbr1) {
            // Constant-first equals: a token without an Abbr annotation must not raise a
            // NullPointerException here; it is treated like any other non-"XX" value, just
            // as the useAbbr branch above already tolerates a missing annotation.
            if (!"XX".equals(c.get(CoreAnnotations.AbbrAnnotation.class))) {
                featuresCpC.add(p.get(CoreAnnotations.AbbrAnnotation.class) + '-'
                        + c.get(CoreAnnotations.AbbrAnnotation.class) + "-PABBRANS");
            }
        }

        if (flags.useChunkySequences) {
            featuresCpC.add(p.get(CoreAnnotations.ChunkAnnotation.class) + '-'
                    + c.get(CoreAnnotations.ChunkAnnotation.class) + '-'
                    + n.get(CoreAnnotations.ChunkAnnotation.class) + "-PCNCHUNK");
        }

        // Only the bare transition marker and the current-word variant are emitted; the
        // word-pair, DistSim and shape based PSEQ variants are not generated by this factory.
        if (flags.usePrev && flags.useSequences && flags.usePrevSequences) {
            featuresCpC.add("PSEQ");
            featuresCpC.add(cWord + "-PSEQW");
        }

        if (((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || flags.useShapeStrings) && flags.useTypeSeqs
                && (flags.useTypeSeqs2 || flags.useTypeSeqs3)) {
            if (flags.useTypeSeqs3) {
                featuresCpC.add(pShape + '-' + cShape + '-' + n.get(CoreAnnotations.ShapeAnnotation.class)
                        + "-PCNSHAPES");
            }
            if (flags.useTypeSeqs2) {
                featuresCpC.add(pShape + '-' + cShape + "-TYPES");
            }

            if (flags.useYetMoreCpCShapes) {
                String p2Shape = cInfo.get(loc - 2).getString(CoreAnnotations.ShapeAnnotation.class);
                featuresCpC.add(p2Shape + '-' + pShape + '-' + cShape + "-YMS");
                featuresCpC.add(pShape + '-' + cShape + "-" + n.getString(CoreAnnotations.ShapeAnnotation.class)
                        + "-YMSPCN");
            }
        }

        if (flags.useTypeySequences) {
            featuresCpC.add(cShape + "-TPS2");
            featuresCpC.add(n.get(CoreAnnotations.ShapeAnnotation.class) + "-TNS1");
            // A pShape-cShape "-TPS" feature is deliberately not added in this mode because it
            // would duplicate "-TYPES"; sigma may need a slight increase to reproduce results
            // obtained when it was still present.
        }

        if (flags.useTaggySequences) {
            if (flags.useTags) {
                featuresCpC.add(p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-'
                        + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + "-TS");
            }
            if (flags.useDistSim) {
                featuresCpC.add(p.get(CoreAnnotations.DistSimAnnotation.class) + '-'
                        + c.get(CoreAnnotations.DistSimAnnotation.class) + "-DISTSIM_TS1");
            }
        }

        if (flags.useParenMatching) {
            if (flags.useReverse) {
                // With useReverse the opening bracket is expected at the current position
                // and the closing bracket at the previous one.
                if (cWord.equals("(") || cWord.equals("[") || cWord.equals("-LRB-")) {
                    if (pWord.equals(")") || pWord.equals("]") || pWord.equals("-RRB-")) {
                        featuresCpC.add("PAREN-MATCH");
                    }
                }
            } else {
                if (cWord.equals(")") || cWord.equals("]") || cWord.equals("-RRB-")) {
                    if (pWord.equals("(") || pWord.equals("[") || pWord.equals("-LRB-")) {
                        featuresCpC.add("PAREN-MATCH");
                    }
                }
            }
        }
        if (flags.useEntityTypeSequences) {
            featuresCpC.add(p.get(CoreAnnotations.EntityTypeAnnotation.class) + '-'
                    + c.get(CoreAnnotations.EntityTypeAnnotation.class) + "-ETSEQ");
        }
        if (flags.useURLSequences) {
            featuresCpC.add(p.get(CoreAnnotations.IsURLAnnotation.class) + '-'
                    + c.get(CoreAnnotations.IsURLAnnotation.class) + "-URLSEQ");
        }
    } else if (flags.useInternal) {

        if (flags.useSequences && flags.usePrevSequences) {
            featuresCpC.add("PSEQ");
            featuresCpC.add(cWord + "-PSEQW");
        }

        if (flags.useTypeySequences) {
            featuresCpC.add(cShape + "-TPS2");
        }

    } else if (flags.useExternal) {

        if (((flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || flags.useShapeStrings) && flags.useTypeSeqs
                && (flags.useTypeSeqs2 || flags.useTypeSeqs3)) {
            if (flags.useTypeSeqs3) {
                featuresCpC.add(pShape + '-' + cShape + '-' + n.get(CoreAnnotations.ShapeAnnotation.class)
                        + "-PCNSHAPES");
            }
            if (flags.useTypeSeqs2) {
                featuresCpC.add(pShape + '-' + cShape + "-TYPES");
            }
        }

        if (flags.useTypeySequences) {
            featuresCpC.add(n.get(CoreAnnotations.ShapeAnnotation.class) + "-TNS1");
            featuresCpC.add(pShape + '-' + c.get(CoreAnnotations.ShapeAnnotation.class) + "-TPS");
        }
    }

    return featuresCpC;
}

/**
 * Appends the leading and trailing substrings of {@code word} to {@code features}: first all
 * prefixes of length 1..max, then all suffixes from longest to shortest. The maximum length
 * is the word length, capped by {@code flags.maxNGramLeng} when that value is non-negative.
 *
 * @param features  collection the generated feature strings are appended to
 * @param word      word whose prefixes and suffixes are generated
 * @param prefixTag literal appended to every prefix feature
 * @param suffixTag literal appended to every suffix feature
 */
private void addAffixFeatures(Collection<String> features, String word, String prefixTag, String suffixTag) {
    int maxLen = word.length();
    if (flags.maxNGramLeng >= 0 && flags.maxNGramLeng < maxLen) {
        maxLen = flags.maxNGramLeng;
    }
    for (int len = 1; len <= maxLen; ++len) {
        features.add(word.substring(0, len) + prefixTag);
    }
    for (int pos = word.length() - maxLen; pos < word.length(); ++pos) {
        features.add(word.substring(pos) + suffixTag);
    }
}

From source file:de.iisys.ocr.pos.CustomNERFeatureFactory.java

License:Open Source License

/**
 * Collects the features defined over the current token and the two tokens preceding it.
 *
 * <p>Which features are produced is controlled entirely by {@code flags}: nothing is emitted unless
 * {@code useInternal} or {@code useExternal} is set, and most features require both.
 *
 * @param cInfo token list to read from; positions {@code loc}, {@code loc - 1} and {@code loc - 2} are used
 * @param loc   index of the current token
 * @return the feature strings for this clique (possibly empty)
 */
protected Collection<String> featuresCpCp2C(PaddedList<IN> cInfo, int loc) {
    final CoreLabel c = cInfo.get(loc);
    final CoreLabel p = cInfo.get(loc - 1);
    final CoreLabel p2 = cInfo.get(loc - 2);

    final String pWord = getWord(p);

    final Collection<String> features = new ArrayList<String>();

    // With neither feature group enabled there is nothing to emit.
    if (!flags.useInternal && !flags.useExternal) {
        return features;
    }
    final boolean internalAndExternal = flags.useInternal && flags.useExternal;

    if (internalAndExternal) {
        // A "-TTPS" shape-triple feature from the original CMM goodCoNLL tests used to be emitted here;
        // it duplicated -TYPETYPES below and stays disabled.
        if (flags.useAbbr) {
            features.add(p2.get(CoreAnnotations.AbbrAnnotation.class) + '-'
                    + p.get(CoreAnnotations.AbbrAnnotation.class) + '-'
                    + c.get(CoreAnnotations.AbbrAnnotation.class) + "-2PABBRANS");
        }
        if (flags.useChunks) {
            features.add(p2.get(CoreAnnotations.ChunkAnnotation.class) + '-'
                    + p.get(CoreAnnotations.ChunkAnnotation.class) + '-'
                    + c.get(CoreAnnotations.ChunkAnnotation.class) + "-2PCHUNKS");
        }
    }

    // The long-sequence marker is common to all three flag combinations.
    if (flags.useLongSequences) {
        features.add("PPSEQ");
    }

    if (internalAndExternal) {
        // A broader consistency check (also over "and", "or" and ",") was tried here and did not help.
        if (flags.useBoundarySequences && pWord.equals(CoNLLDocumentReaderAndWriter.BOUNDARY)) {
            features.add("BNDRY-SPAN-PPSEQ");
        }

        if (flags.useTaggySequences) {
            if (flags.useTags) {
                final String posTriple = p2.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-'
                        + p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-'
                        + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class);
                features.add(posTriple + "-TTS");
                if (flags.useTaggySequencesShapeInteraction) {
                    features.add(posTriple + '-' + c.get(CoreAnnotations.ShapeAnnotation.class) + "-TTS-CS");
                }
            }
            if (flags.useDistSim) {
                final String distSimTriple = p2.get(CoreAnnotations.DistSimAnnotation.class) + '-'
                        + p.get(CoreAnnotations.DistSimAnnotation.class) + '-'
                        + c.get(CoreAnnotations.DistSimAnnotation.class);
                features.add(distSimTriple + "-DISTSIM_TTS1");
                if (flags.useTaggySequencesShapeInteraction) {
                    features.add(distSimTriple + '-' + c.get(CoreAnnotations.ShapeAnnotation.class)
                            + "-DISTSIM_TTS1-CS");
                }
            }
        }
    }

    // Shape triple: emitted whenever external features are on (alone or combined with internal ones).
    final boolean shapesAvailable = (flags.wordShape > WordShapeClassifier.NOWORDSHAPE) || flags.useShapeStrings;
    if (flags.useExternal && shapesAvailable && flags.useTypeSeqs && flags.useTypeSeqs2 && flags.maxLeft >= 2) {
        features.add(p2.get(CoreAnnotations.ShapeAnnotation.class) + '-'
                + p.get(CoreAnnotations.ShapeAnnotation.class) + '-'
                + c.get(CoreAnnotations.ShapeAnnotation.class) + "-TYPETYPES");
    }

    return features;
}

From source file:de.iisys.ocr.pos.CustomNERFeatureFactory.java

License:Open Source License

/**
 * Collects the features defined over the current token and the three tokens preceding it.
 *
 * <p>All features here require {@code flags.maxLeft >= 3}; the tag-sequence features additionally
 * require {@code useTaggySequences} and are suppressed by {@code dontExtendTaggy}.
 *
 * @param cInfo token list to read from; positions {@code loc} down to {@code loc - 3} are used
 * @param loc   index of the current token
 * @return the feature strings for this clique (possibly empty)
 */
protected Collection<String> featuresCpCp2Cp3C(PaddedList<IN> cInfo, int loc) {
    final CoreLabel c = cInfo.get(loc);
    final CoreLabel p = cInfo.get(loc - 1);
    final CoreLabel p2 = cInfo.get(loc - 2);
    final CoreLabel p3 = cInfo.get(loc - 3);

    final Collection<String> features = new ArrayList<String>();
    final boolean wideContext = flags.maxLeft >= 3;

    if (flags.useTaggySequences && wideContext && !flags.dontExtendTaggy) {
        if (flags.useTags) {
            final String posChain = p3.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-'
                    + p2.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-'
                    + p.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '-'
                    + c.getString(CoreAnnotations.PartOfSpeechAnnotation.class);
            features.add(posChain + "-TTTS");
            if (flags.useTaggySequencesShapeInteraction) {
                features.add(posChain + '-' + c.get(CoreAnnotations.ShapeAnnotation.class) + "-TTTS-CS");
            }
        }
        if (flags.useDistSim) {
            final String distSimChain = p3.get(CoreAnnotations.DistSimAnnotation.class) + '-'
                    + p2.get(CoreAnnotations.DistSimAnnotation.class) + '-'
                    + p.get(CoreAnnotations.DistSimAnnotation.class) + '-'
                    + c.get(CoreAnnotations.DistSimAnnotation.class);
            features.add(distSimChain + "-DISTSIM_TTTS1");
            if (flags.useTaggySequencesShapeInteraction) {
                features.add(distSimChain + '-' + c.get(CoreAnnotations.ShapeAnnotation.class)
                        + "-DISTSIM_TTTS1-CS");
            }
        }
    }

    if (wideContext) {
        if (flags.useLongSequences) {
            features.add("PPPSEQ");
        }
        // The previous word is only looked up when boundary features are actually requested.
        if (flags.useBoundarySequences && getWord(p).equals(CoNLLDocumentReaderAndWriter.BOUNDARY)) {
            features.add("BNDRY-SPAN-PPPSEQ");
        }
    }

    return features;
}

From source file:knu.univ.lingvo.coref.MUCMentionExtractor.java

License:Open Source License

/**
 * Reads the next {@code <DOC>...</DOC>} element from {@code fileContents}, starting the search at
 * {@code currentOffset}, and converts it into a {@link Document}.
 *
 * <p>Steps, in order: split the document into sentence elements, tokenize each one, strip the inline
 * SGML markup (NE tags and {@code <COREF>} tags) while recording gold mentions, resolve the gold
 * coreference cluster of every mention, run {@code stanfordProcessor} over the sentences, and finally
 * hand everything to {@code arrange}.
 *
 * <p>Side effects: advances {@code currentOffset} past the consumed document and updates
 * {@code currentDocumentID}.
 *
 * @return the next document, or {@code null} when no further {@code <DOC>} element is found
 * @throws IllegalStateException if the annotated sentences/tokens do not line up with the parsed ones
 * @throws RuntimeException if an NE closing tag does not match the currently open NE tag
 * @throws Exception declared by the signature; see also the review notes below for unchecked failures
 */
@Override
public Document nextDoc() throws Exception {
    List<List<CoreLabel>> allWords = new ArrayList<List<CoreLabel>>();
    List<Tree> allTrees = new ArrayList<Tree>();
    List<List<Mention>> allGoldMentions = new ArrayList<List<Mention>>();
    List<List<Mention>> allPredictedMentions;
    List<CoreMap> allSentences = new ArrayList<CoreMap>();
    Annotation docAnno = new Annotation("");

    Pattern docPattern = Pattern.compile("<DOC>(.*?)</DOC>", Pattern.DOTALL + Pattern.CASE_INSENSITIVE);
    Pattern sentencePattern = Pattern.compile("(<s>|<hl>|<dd>|<DATELINE>)(.*?)(</s>|</hl>|</dd>|</DATELINE>)",
            Pattern.DOTALL + Pattern.CASE_INSENSITIVE);
    Matcher docMatcher = docPattern.matcher(fileContents);
    // No more documents after the current offset: signal end of input.
    if (!docMatcher.find(currentOffset))
        return null;

    // Remember where this document ended so the next call continues from there.
    currentOffset = docMatcher.end();
    String doc = docMatcher.group(1);
    Matcher sentenceMatcher = sentencePattern.matcher(doc);
    // Name of the NE tag currently open, or null when outside any NE tag. Carried across sentences.
    String ner = null;

    //Maintain current document ID.
    Pattern docIDPattern = Pattern.compile("<DOCNO>(.*?)</DOCNO>", Pattern.DOTALL + Pattern.CASE_INSENSITIVE);
    Matcher docIDMatcher = docIDPattern.matcher(doc);
    if (docIDMatcher.find())
        currentDocumentID = docIDMatcher.group(1);
    else
        // No <DOCNO> element: derive an ID from the previous document's ID.
        currentDocumentID = "documentAfter " + currentDocumentID;

    while (sentenceMatcher.find()) {
        // Group 2 is the text between the opening and closing sentence-level tag.
        String sentenceString = sentenceMatcher.group(2);
        List<CoreLabel> words = tokenizerFactory.getTokenizer(new StringReader(sentenceString)).tokenize();

        // FIXING TOKENIZATION PROBLEMS
        for (int i = 0; i < words.size(); i++) {
            CoreLabel w = words.get(i);
            // A stand-alone "$" following a token ending in PRP or WP is glued back onto that token.
            if (i > 0 && w.word().equals("$")) {
                if (!words.get(i - 1).word().endsWith("PRP") && !words.get(i - 1).word().endsWith("WP"))
                    continue;
                words.get(i - 1).set(CoreAnnotations.TextAnnotation.class, words.get(i - 1).word() + "$");
                words.remove(i);
                // Step back so the element that shifted into position i is examined next.
                i--;
            } else if (w.word().equals("\\/")) {
                // A stand-alone "\/" token is merged with its two neighbours into a single token.
                // NOTE(review): this branch has no i > 0 guard and no upper-bound check, so a "\/" at the
                // first or last position makes get(i - 1) / get(i + 1) throw - confirm the input excludes it.
                if (words.get(i - 1).word().equals("</COREF>"))
                    continue;
                w.set(CoreAnnotations.TextAnnotation.class,
                        words.get(i - 1).word() + "\\/" + words.get(i + 1).word());
                // Remove the higher index first so the lower index is still valid.
                // NOTE(review): i is not adjusted after these removals, so the token that followed the
                // merged triple is not examined by this loop - verify that this is intended.
                words.remove(i + 1);
                words.remove(i - 1);
            }
        }
        // END FIXING TOKENIZATION PROBLEMS

        // Tokens of this sentence with all markup tokens removed.
        List<CoreLabel> sentence = new ArrayList<CoreLabel>();
        // MUC accepts embedded coref mentions, so we need to keep a stack for the mentions currently open
        Stack<Mention> stack = new Stack<Mention>();
        List<Mention> mentions = new ArrayList<Mention>();

        // Registered up front; both lists are filled in place by the loop below.
        allWords.add(sentence);
        allGoldMentions.add(mentions);

        for (CoreLabel word : words) {
            String w = word.get(CoreAnnotations.TextAnnotation.class);
            // found regular token: WORD/POS
            // (the separator must not be the final two characters, i.e. something must follow it)
            if (!w.startsWith("<") && w.contains("\\/") && w.lastIndexOf("\\/") != w.length() - 2) {
                int i = w.lastIndexOf("\\/");
                String w1 = w.substring(0, i);
                // we do NOT set POS info here. We take the POS tags from the parser!
                word.set(CoreAnnotations.TextAnnotation.class, w1);
                word.remove(CoreAnnotations.OriginalTextAnnotation.class);
                if (Constants.USE_GOLD_NE) {
                    if (ner != null) {
                        word.set(CoreAnnotations.NamedEntityTagAnnotation.class, ner);
                    } else {
                        word.set(CoreAnnotations.NamedEntityTagAnnotation.class, "O");
                    }
                }
                sentence.add(word);
            }
            // found the start SGML tag for a NE, e.g., "<ORGANIZATION>"
            else if (w.startsWith("<") && !w.startsWith("<COREF") && !w.startsWith("</")) {
                Pattern nerPattern = Pattern.compile("<(.*?)>");
                Matcher m = nerPattern.matcher(w);
                // NOTE(review): the result of find() is ignored; group(1) throws if the token has no '>'.
                m.find();
                ner = m.group(1);
            }
            // found the end SGML tag for a NE, e.g., "</ORGANIZATION>"
            else if (w.startsWith("</") && !w.startsWith("</COREF")) {
                Pattern nerPattern = Pattern.compile("</(.*?)>");
                Matcher m = nerPattern.matcher(w);
                m.find();
                String ner1 = m.group(1);
                // A closing tag with no open NE (ner == null) is tolerated; only a mismatch is an error.
                if (ner != null && !ner.equals(ner1))
                    throw new RuntimeException("Unmatched NE labels in MUC file: " + ner + " v. " + ner1);
                ner = null;
            }
            // found the start SGML tag for a coref mention
            else if (w.startsWith("<COREF")) {
                Mention mention = new Mention();
                // position of this mention in the sentence
                mention.startIndex = sentence.size();

                // extract GOLD info about this coref chain. needed for eval
                Pattern idPattern = Pattern.compile("ID=\"(.*?)\"");
                Pattern refPattern = Pattern.compile("REF=\"(.*?)\"");

                // ID is treated as mandatory (find() result unchecked); REF is optional.
                Matcher m = idPattern.matcher(w);
                m.find();
                mention.mentionID = Integer.parseInt(m.group(1));

                m = refPattern.matcher(w);
                if (m.find()) {
                    mention.originalRef = Integer.parseInt(m.group(1));
                }

                // open mention. keep track of all open mentions using the stack
                stack.push(mention);
            }
            // found the end SGML tag for a coref mention
            else if (w.equals("</COREF>")) {
                // Closes the most recently opened mention; pop() throws if no mention is open.
                Mention mention = stack.pop();
                mention.endIndex = sentence.size();

                // this is a closed mention. add it to the final list of mentions
                // System.err.printf("Found MENTION: ID=%d, REF=%d\n", mention.mentionID, mention.originalRef);
                mentions.add(mention);
            } else {
                // Any other token is kept as-is (its text is not rewritten).
                word.remove(CoreAnnotations.OriginalTextAnnotation.class);
                if (Constants.USE_GOLD_NE) {
                    if (ner != null) {
                        word.set(CoreAnnotations.NamedEntityTagAnnotation.class, ner);
                    } else {
                        word.set(CoreAnnotations.NamedEntityTagAnnotation.class, "O");
                    }
                }
                sentence.add(word);
            }
        }
        // Number the kept tokens from 1 and rebuild the sentence text as space-separated token texts.
        StringBuilder textContent = new StringBuilder();
        for (int i = 0; i < sentence.size(); i++) {
            CoreLabel w = sentence.get(i);
            w.set(CoreAnnotations.IndexAnnotation.class, i + 1);
            w.set(CoreAnnotations.UtteranceAnnotation.class, 0);
            if (i > 0)
                textContent.append(" ");
            textContent.append(w.getString(CoreAnnotations.TextAnnotation.class));
        }
        CoreMap sentCoreMap = new Annotation(textContent.toString());
        allSentences.add(sentCoreMap);
        sentCoreMap.set(CoreAnnotations.TokensAnnotation.class, sentence);
    }

    // assign goldCorefClusterID
    Map<Integer, Mention> idMention = Generics.newHashMap(); // temporary use
    for (List<Mention> goldMentions : allGoldMentions) {
        for (Mention m : goldMentions) {
            idMention.put(m.mentionID, m);
        }
    }
    for (List<Mention> goldMentions : allGoldMentions) {
        for (Mention m : goldMentions) {
            // -1 is treated as "not assigned yet" for both goldCorefClusterID and originalRef.
            if (m.goldCorefClusterID == -1) {
                if (m.originalRef == -1)
                    // No REF: the mention uses its own ID as cluster ID.
                    m.goldCorefClusterID = m.mentionID;
                else {
                    // Follow the REF chain until a mention with a known cluster, or one without REF, is reached.
                    int ref = m.originalRef;
                    while (true) {
                        // NOTE(review): a REF pointing to an ID absent from this document yields null here
                        // and the next line throws; a cyclic REF chain would never terminate.
                        Mention m2 = idMention.get(ref);
                        if (m2.goldCorefClusterID != -1) {
                            m.goldCorefClusterID = m2.goldCorefClusterID;
                            break;
                        } else if (m2.originalRef == -1) {
                            m2.goldCorefClusterID = m2.mentionID;
                            m.goldCorefClusterID = m2.goldCorefClusterID;
                            break;
                        } else {
                            ref = m2.originalRef;
                        }
                    }
                }
            }
        }
    }

    docAnno.set(CoreAnnotations.SentencesAnnotation.class, allSentences);
    stanfordProcessor.annotate(docAnno);

    // Sanity checks: annotation must not have changed the number of sentences or the token texts.
    if (allSentences.size() != allWords.size())
        throw new IllegalStateException("allSentences != allWords");
    for (int i = 0; i < allSentences.size(); i++) {
        List<CoreLabel> annotatedSent = allSentences.get(i).get(CoreAnnotations.TokensAnnotation.class);
        List<CoreLabel> unannotatedSent = allWords.get(i);
        List<Mention> mentionInSent = allGoldMentions.get(i);
        // Every gold mention gets the dependency graph of the sentence it occurs in.
        for (Mention m : mentionInSent) {
            m.dependency = allSentences.get(i)
                    .get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);
        }
        if (annotatedSent.size() != unannotatedSent.size()) {
            throw new IllegalStateException("annotatedSent != unannotatedSent");
        }
        for (int j = 0, sz = annotatedSent.size(); j < sz; j++) {
            CoreLabel annotatedWord = annotatedSent.get(j);
            CoreLabel unannotatedWord = unannotatedSent.get(j);
            if (!annotatedWord.get(CoreAnnotations.TextAnnotation.class)
                    .equals(unannotatedWord.get(CoreAnnotations.TextAnnotation.class))) {
                throw new IllegalStateException("annotatedWord != unannotatedWord");
            }
        }
        // From here on the token list read back from the annotated sentence is used.
        allWords.set(i, annotatedSent);
        allTrees.add(allSentences.get(i).get(TreeCoreAnnotations.TreeAnnotation.class));
    }

    // extract predicted mentions
    if (Constants.USE_GOLD_MENTIONS)
        allPredictedMentions = allGoldMentions;
    else
        allPredictedMentions = mentionFinder.extractPredictedMentions(docAnno, maxID, dictionaries);

    // add the relevant fields to mentions and order them for coref
    return arrange(docAnno, allWords, allTrees, allPredictedMentions, allGoldMentions, true);
}

From source file:lv.lumii.expressions.Expression.java

License:Open Source License

/**
 * Analyses {@code phrase} word by word, prunes implausible morphological readings, runs the
 * statistical tagger over what is left and stores one {@link ExpressionWord} per token in
 * {@code expWords}.
 *
 * <p>For expressions of category {@code hum} the method also sets {@code gender} (first from
 * {@code guessPersonGender}, and at the end from the chosen wordforms if it is still unknown).
 *
 * @param phrase     the phrase to analyse
 * @param knownLemma passed on to {@code addExtraPossibilities}; when true, guessed definite-adjective
 *                   readings are kept and, for category {@code other}, genitive readings of the last
 *                   word are dropped
 * @param debug      when true, intermediate analyses are printed to {@code System.out}
 */
public void loadUsingTagger(String phrase, boolean knownLemma, boolean debug) {
        expWords = new LinkedList<ExpressionWord>();

        // Workaround for phrases of the form "biedrs-dibinatajs" (word-hyphen-word): split them at the
        // hyphen so that both parts are analysed separately. Not applied to category hum.
        // FIXME - maybe this belongs in the general inflector?
        if (phrase.matches("\\p{IsLatin}+-\\p{IsLatin}+") && this.category != Category.hum)
            phrase = phrase.replace("-", " - ");

        List<Word> words = Splitting.tokenize(analyzer, phrase);
        for (Word word : words) { // filter the variants, using what we know about the phrase and its category
            if (debug) {
                System.out.printf("%s normal analysis:\n", word.getToken());
                //word.describe(System.out);
                for (Wordform wf : word.wordforms)
                    System.out.printf("\t%s\n", wf.getTag());
            }

            addExtraPossibilities(word, knownLemma, debug); // tuned guessing that takes named-entity peculiarities into account

            if (debug) {
                System.out.printf("%s generated alternatives:\n", word.getToken());
                for (Wordform wf : word.wordforms)
                    System.out.printf("\t%s\n", wf.getTag());
            }
        }

        if (category == Category.hum)
            gender = guessPersonGender(words);

        // NOTE(review): gender.toString() throws if gender is null here - confirm it is always initialised.
        if (debug)
            System.out.printf("Detected gender : %s\n", gender.toString());

        for (Word word : words) {
            // if the phrase as a whole has a clear gender, discard the alternatives of the 'wrong' gender
            if (category == Category.hum && gender != Gender.unknown) {
                // "izmetamie" = wordforms to be discarded
                LinkedList<Wordform> izmetamie = new LinkedList<Wordform>();
                for (Wordform wf : word.wordforms) {
                    Gender tempgender = gender; // default option - same as the whole name
                    if (gender == Gender.feminine && wf.getToken().endsWith("kalns")) // Exception for compound masculine words used as female surnames e.g. 'Zaaiskalns'
                        tempgender = Gender.masculine;

                    if ((tempgender == Gender.masculine
                            && wf.isMatchingStrong(AttributeNames.i_Gender, AttributeNames.v_Feminine))
                            || (tempgender == Gender.feminine
                                    && wf.isMatchingStrong(AttributeNames.i_Gender, AttributeNames.v_Masculine)))
                        izmetamie.add(wf);
                }
                word.wordforms.removeAll(izmetamie); // TODO - this does not check whether every variant got discarded - in theory guessPersonGender should not allow such cases
            }

            if (category == Category.hum) {
                LinkedList<Wordform> izmetamie = new LinkedList<Wordform>();
                for (Wordform wf : word.wordforms) {
                    if (wf.isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Adjective)
                            && wf.isMatchingStrong(AttributeNames.i_Definiteness, AttributeNames.v_Indefinite)
                            && wf.isMatchingStrong(AttributeNames.i_CapitalLetters, AttributeNames.v_FirstUpper))
                        izmetamie.add(wf); // Problem: the tagger labels some surnames (e.g. Znaroks) as indefinite adjectives - that reading is only acceptable for definite ones!

                    if (wf.isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Adverb))
                        izmetamie.add(wf); // inflexive -i surnames (Maija Kubli)

                    // We assume that definite-adjective surnames (Platais, Lielais etc.) can only come from
                    // in-vocabulary words, or when it is explicitly stated that this is the base form;
                    // everything else should be taken as a noun form.
                    if (wf.isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Adjective)
                            && wf.isMatchingStrong(AttributeNames.i_Definiteness, AttributeNames.v_Definite)
                            && wf.isMatchingStrong(AttributeNames.i_Guess, AttributeNames.v_Ending) && !knownLemma)
                        izmetamie.add(wf);
                }
                word.wordforms.removeAll(izmetamie);
                if (izmetamie.size() > 0 && word.wordforms.size() == 0) {
                    // If this process ended up discarding every variant, switch guessing on and guess
                    // readings by ending, keeping only those compatible with the known gender.
                    Word extra_possibilities = analyzer.guessByEnding(word.getToken().toLowerCase(),
                            word.getToken());
                    for (Wordform new_wf : extra_possibilities.wordforms) {
                        if ((new_wf.isMatchingWeak(AttributeNames.i_Gender, AttributeNames.v_Masculine)
                                && gender != Gender.feminine)
                                || (new_wf.isMatchingWeak(AttributeNames.i_Gender, AttributeNames.v_Feminine)
                                        && gender != Gender.masculine)) {
                            word.addWordform(new_wf);
                        }
                    }
                }
            } else { // category is not Category.hum
                // Reference comparison on purpose: true only for the last element of the list.
                if (category == Category.other && knownLemma && word == words.get(words.size() - 1)) {
                    // for non-standard phrases - assuming it is a noun phrase - trim the alternatives of the
                    // last word so that it does not get tagged e.g. as a plural genitive
                    LinkedList<Wordform> izmetamie = new LinkedList<Wordform>();
                    for (Wordform wf : word.wordforms) {
                        if (wf.isMatchingStrong(AttributeNames.i_Case, AttributeNames.v_Genitive))
                            izmetamie.add(wf); // genitive reading - candidate for removal
                    }
                    if (izmetamie.size() < word.wordforms.size()) // only if at least one usable reading remains
                        word.wordforms.removeAll(izmetamie);
                }
            }

            // Blacklist of confusing but unlikely lemmas
            List<String> blacklist = Arrays.asList("vlan?s");
            LinkedList<Wordform> izmetamie = new LinkedList<Wordform>();
            for (Wordform wf : word.wordforms) {
                if (blacklist.contains(wf.getValue(AttributeNames.i_Lemma))) {
                    izmetamie.add(wf);
                }
            }
            if (izmetamie.size() < word.wordforms.size()) // only if at least one usable reading remains
                word.wordforms.removeAll(izmetamie);
        }

        /*
        if (category == Category.hum && bothGendersPossible) {
           // FIXME - the "Andra Berzina" case, so that it does not decide that one of the words is feminine after all.
           // While the tagger does not always resolve these, this is a workaround - assume that if anything is possible, it is masculine; in real data the male:female ratio is 80:20-95:05.
           for (Word word: words) {
         LinkedList<Wordform> izmetamie = new LinkedList<Wordform>();
         for (Wordform wf : word.wordforms) {
            if (wf.isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Noun) &&
                   wf.isMatchingStrong(AttributeNames.i_Gender, AttributeNames.v_Feminine))
               izmetamie.add(wf);
         }
         if (izmetamie.size() < word.wordforms.size()) // only if at least one usable reading remains
            word.wordforms.removeAll(izmetamie);
           }
        } */

        if (debug)
            for (Word word : words) {
                System.out.printf("%s alternatives given to tagger:\n", word.getToken());
                for (Wordform wf : word.wordforms)
                    System.out.printf("\t%s\n", wf.getTag());
            }

        List<CoreLabel> sentence = LVMorphologyReaderAndWriter.analyzeSentence2(words);
        sentence = morphoClassifier.classify(sentence); //TODO - the tagger is trained on full sentences, not on phrases like these. A specially adapted tagger model could be much more accurate.

        String token;
        Word analysis;
        Wordform maxwf;
        for (CoreLabel label : sentence) {
            token = label.getString(TextAnnotation.class);

            if (token.equals("<s>")) { // The tagger looks at a word's neighbourhood; sentence-initial/final words are special, which is signalled by adding <s> at the start/end
                continue;
            }

            analysis = label.get(LVMorphologyAnalysis.class);

            // NOTE(review): maxwf is dereferenced right away; confirm getMatchingWordform cannot return null here.
            maxwf = analysis.getMatchingWordform(label.getString(AnswerAnnotation.class), false);
            if (maxwf.isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Verb)) {
                // We can assume that entities are noun phrases, so a verb (as opposed to a participle) at the end is a tagger glitch (example: 'DPS saraksta')
                //                                                      ^^ FIXME - then why is this checked for *all* words rather than only the last one?
                // The loop keeps the last noun reading found, if any.
                for (Wordform wf : analysis.wordforms) {
                    if (wf.isMatchingStrong(AttributeNames.i_PartOfSpeech, AttributeNames.v_Noun))
                        maxwf = wf; // TODO - could be smarter: if several readings match the tagger data, take the most plausible one
                }
            }

            if (debug)
                System.out.printf("%s chosen : %s\n", maxwf.getToken(), maxwf.getTag());

            ExpressionWord tmp = new ExpressionWord(analysis, maxwf);
            expWords.add(tmp);
        }

        // Gender still unknown for a person name: derive it from the chosen wordforms.
        if (category == Category.hum && gender == Gender.unknown) {
            boolean allMale = true;
            boolean allFemale = true;
            for (ExpressionWord w : expWords) {
                if (w.correctWordform.isMatchingStrong(AttributeNames.i_Gender, AttributeNames.v_Masculine))
                    allFemale = false;
                if (w.correctWordform.isMatchingStrong(AttributeNames.i_Gender, AttributeNames.v_Feminine))
                    allMale = false;
            }
            // NOTE(review): when no word is strongly masculine or feminine both flags stay true and the
            // second assignment wins, i.e. gender becomes feminine - verify that this is intended.
            if (allMale)
                gender = Gender.masculine;
            if (allFemale)
                gender = Gender.feminine;
            if (debug)
                System.out.printf("Final gender : %s\n", gender.toString());
        }
    }

From source file:lv.lumii.morphotagger.MorphoCRF.java

License:Open Source License

/**
 * Tags every document of a test file with {@code crf} and reports how often the predicted tag and
 * lemma agree with the gold values stored in the file.
 *
 * <p>Per-token mismatches are printed to {@code System.out} while processing; the summary and the
 * error breakdown are written through a UTF-8 writer on {@code System.out} at the end.
 *
 * @param crf      classifier under test
 * @param filename test data file
 * @param reader   reader used to load {@code filename}
 */
private static void testData(AbstractSequenceClassifier<CoreLabel> crf, String filename,
        DocumentReaderAndWriter<CoreLabel> reader) {
    try {
        final PrintWriter report = new PrintWriter(new OutputStreamWriter(System.out, "UTF-8"));

        final ObjectBank<List<CoreLabel>> documents = crf.makeObjectBankFromFile(filename, reader);

        int tagHits = 0;
        int lemmaHits = 0;
        int fullHits = 0;
        int tokenCount = 0;
        final Collection<AttributeValues> errors = new LinkedList<AttributeValues>();

        for (List<CoreLabel> document : documents) {
            final List<CoreLabel> tagged = crf.classify(document);

            System.out.println("-----");
            for (CoreLabel word : tagged) {
                final String token = word.word();
                // Sentence boundary markers are not scored.
                if (token.contains("<s>") || token.contains("</s>")) {
                    continue;
                }

                final String answer = word.get(AnswerAnnotation.class);
                final Word analysis = word.get(LVMorphologyAnalysis.class);
                // getString (not get) is used for the lookup, as in the rest of the tagger code;
                // the 'false' argument was annotated "complain about potential lemma errors".
                final Wordform chosen = analysis.getMatchingWordform(word.getString(AnswerAnnotation.class), false);
                final String lemma = chosen.getValue(AttributeNames.i_Lemma);

                final String goldTag = word.get(GoldAnswerAnnotation.class);
                final String goldLemma = word.get(LemmaAnnotation.class); // the lemma written in the test data

                final AttributeValues goldTags = MarkupConverter.fromKamolsMarkup(goldTag);
                final AttributeValues foundTags = MarkupConverter.fromKamolsMarkup(answer);
                errors.add(compareAVs(goldTags, foundTags));

                tokenCount++;

                final boolean lemmaMissing = goldLemma == null;
                final boolean lemmaMatches = !lemmaMissing && goldLemma.equalsIgnoreCase(lemma);

                // A token without a gold lemma counts as a lemma hit.
                if (lemmaMissing || lemmaMatches) {
                    lemmaHits++;
                }

                if (!match(goldTags, foundTags)) {
                    System.out.println(
                            "v?rds: " + token + ", pareizais: " + goldTag + ", autom?tiskais: " + answer);
                    continue;
                }

                tagHits++;
                if (lemmaMissing) {
                    System.out.println("Nav lemmas? " + token);
                } else if (lemmaMatches) {
                    // Tag and lemma both right.
                    fullHits++;
                }
            }
        }

        report.printf("\nEvaluation results:\n");
        report.printf("\tCorrect tag:\t%4.1f%%\t%d\n", tagHits * 100.0 / tokenCount, tokenCount - tagHits);
        report.printf("\tCorrect lemma:\t%4.1f%%\t%d\n", lemmaHits * 100.0 / tokenCount, tokenCount - lemmaHits);
        report.printf("\tCorrect all:\t%4.1f%%\t%d\n", fullHits * 100.0 / tokenCount, tokenCount - fullHits);
        summarizeErrors(errors, report);
        // Flush only: closing the writer would also close System.out.
        report.flush();
    } catch (IOException e) {
        e.printStackTrace();
    }
}

From source file:lv.lumii.morphotagger.MorphoPipe.java

License:Open Source License

/**
 * Builds one JSON object string per token (fields {@code Word}, {@code Tag},
 * {@code Lemma}) and returns the string form of {@code formatJSON} applied to them.
 *
 * <p>Tokens whose text contains {@code <s>} are skipped. When no wordform matches
 * the token's {@code AnswerAnnotation}, the tag is emitted as {@code "-"} and the
 * token text itself is used as the lemma.
 *
 * @param tokens the tagged tokens to serialize
 * @return the string form of the result of {@code formatJSON} for the collected entries
 */
private static String output_JSON(List<CoreLabel> tokens) {
    LinkedList<String> tokenJSON = new LinkedList<String>();

    for (CoreLabel word : tokens) {
        String token = word.getString(TextAnnotation.class);
        if (token.contains("<s>"))
            continue;
        Word analysis = word.get(LVMorphologyAnalysis.class);
        Wordform maxwf = analysis.getMatchingWordform(word.getString(AnswerAnnotation.class), false);

        // Handle the "no matching wordform" case before touching maxwf; previously
        // the mini_tag branch dereferenced it ahead of the null check.
        if (maxwf == null) {
            tokenJSON.add(String.format("{\"Word\":\"%s\",\"Tag\":\"-\",\"Lemma\":\"%s\"}",
                    JSONValue.escape(token), JSONValue.escape(token)));
            continue;
        }

        if (mini_tag)
            maxwf.removeNonlexicalAttributes();
        tokenJSON.add(String.format("{\"Word\":\"%s\",\"Tag\":\"%s\",\"Lemma\":\"%s\"}",
                JSONValue.escape(token), JSONValue.escape(maxwf.getTag()),
                JSONValue.escape(maxwf.getValue(AttributeNames.i_Lemma))));
    }

    return formatJSON(tokenJSON).toString();
}

From source file:lv.lumii.morphotagger.MorphoPipe.java

License:Open Source License

/**
 * Writes the selected wordform of every token to the given stream as XML.
 *
 * <p>Tokens whose text contains {@code <s>} are skipped. For each remaining token
 * the wordform matching its {@code AnswerAnnotation} is looked up, optionally
 * stripped of non-lexical attributes (when {@code mini_tag} is set), given a
 * {@code "Tag"} attribute holding its tag, and serialized via {@code toXML}.
 * The writer is flushed once after all tokens have been processed.
 *
 * <p>NOTE(review): unlike {@code output_JSON}, the looked-up wordform is not
 * null-checked here - confirm that {@code getMatchingWordform} cannot return
 * {@code null} on this path.
 *
 * @param tokens  the tagged tokens to serialize
 * @param straume the stream that receives the XML output
 * @throws IOException declared by the signature
 */
private static void output_XML(List<CoreLabel> tokens, PrintStream straume) throws IOException {
    PrintWriter xmlWriter = new PrintWriter(straume);
    for (CoreLabel label : tokens) {
        String surface = label.getString(TextAnnotation.class);
        if (!surface.contains("<s>")) {
            Word morphology = label.get(LVMorphologyAnalysis.class);
            String answerTag = label.getString(AnswerAnnotation.class);
            Wordform selected = morphology.getMatchingWordform(answerTag, false);
            if (mini_tag) {
                selected.removeNonlexicalAttributes();
            }
            // Expose the tag as an ordinary attribute so it is part of the XML output.
            String tagValue = selected.getTag();
            selected.addAttribute("Tag", tagValue);
            selected.toXML(xmlWriter);
        }
    }
    xmlWriter.flush();
}