Example usage for edu.stanford.nlp.ling CoreLabel CoreLabel

List of usage examples for edu.stanford.nlp.ling CoreLabel CoreLabel

Introduction

On this page you can find example usages of the edu.stanford.nlp.ling CoreLabel default constructor, CoreLabel().

Prototype

public CoreLabel() 

Source Link

Document

Default constructor, calls super()

Usage

From source file:conditionalCFG.ConditionalCFGParser.java

License:Open Source License

/**
 * Returns the CoreLabel for the terminal at the given index. A previously
 * recorded original label is reused (back-filling its value from the word if
 * needed); otherwise a fresh label is built from the word index and the
 * begin/end offset arrays.
 */
private CoreLabel getCoreLabel(int labelIndex) {
    CoreLabel cached = originalCoreLabels[labelIndex];
    if (cached != null) {
        // Ensure the label carries a value; fall back to its word.
        if (cached.value() == null && cached.word() != null) {
            cached.setValue(cached.word());
        }
        return cached;
    }

    String word = wordIndex.get(words[labelIndex]);
    CoreLabel label = new CoreLabel();
    label.setValue(word);
    label.setWord(word);
    label.setBeginPosition(beginOffsets[labelIndex]);
    label.setEndPosition(endOffsets[labelIndex]);
    if (originalTags[labelIndex] != null) {
        label.setTag(originalTags[labelIndex].tag());
    }
    return label;
}

From source file:de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordCoreferenceResolver.java

License:Open Source License

/**
 * Runs Stanford coreference resolution over the sentences of the CAS:
 * rebuilds Stanford trees and token lists from the UIMA annotations, extracts
 * candidate mentions, runs the coref system, and writes the resulting chains
 * back as CoreferenceChain/CoreferenceLink annotations.
 *
 * @param aJCas the CAS to process
 * @throws AnalysisEngineProcessException if the coreference system fails
 */
@Override
public void process(JCas aJCas) throws AnalysisEngineProcessException {
    modelProvider.configure(aJCas.getCas());

    // Per-sentence structures handed to the Stanford coref machinery.
    List<Tree> trees = new ArrayList<Tree>();
    List<CoreMap> sentences = new ArrayList<CoreMap>();
    List<List<CoreLabel>> sentenceTokens = new ArrayList<List<CoreLabel>>();
    for (ROOT root : select(aJCas, ROOT.class)) {
        // Copy all relevant information from the tokens
        List<CoreLabel> tokens = new ArrayList<CoreLabel>();
        for (Token token : selectCovered(Token.class, root)) {
            tokens.add(tokenToWord(token));
        }
        sentenceTokens.add(tokens);

        // SemanticHeadFinder (nonTerminalInfo) does not know about PRN0, so we have to replace
        // it with PRN to avoid NPEs.
        TreeFactory tFact = new LabeledScoredTreeFactory(CoreLabel.factory()) {
            @Override
            public Tree newTreeNode(String aParent, List<Tree> aChildren) {
                String parent = aParent;
                if ("PRN0".equals(parent)) {
                    parent = "PRN";
                }
                Tree node = super.newTreeNode(parent, aChildren);
                return node;
            }
        };

        // deep copy of the tree. These are modified inside coref!
        Tree treeCopy = TreeUtils.createStanfordTree(root, tFact).treeSkeletonCopy();
        treeCopy.indexSpans();
        trees.add(treeCopy);

        // Build the sentence
        CoreMap sentence = new CoreLabel();
        sentence.set(TreeAnnotation.class, treeCopy);
        sentence.set(TokensAnnotation.class, tokens);
        sentence.set(RootKey.class, root);
        sentences.add(sentence);

        // https://code.google.com/p/dkpro-core-asl/issues/detail?id=590
        // We currently do not copy over dependencies from the CAS. This is supposed to fill
        // in the dependencies so we do not get NPEs.
        TreebankLanguagePack tlp = new PennTreebankLanguagePack();
        GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory(tlp.punctuationWordRejectFilter(),
                tlp.typedDependencyHeadFinder());
        ParserAnnotatorUtils.fillInParseAnnotations(false, true, gsf, sentence, treeCopy,
                GrammaticalStructure.Extras.NONE);

        // https://code.google.com/p/dkpro-core-asl/issues/detail?id=582
        // Give every dependency-graph vertex a word (copied from its value).
        SemanticGraph deps = sentence.get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);
        for (IndexedWord vertex : deps.vertexSet()) {
            vertex.setWord(vertex.value());
        }

        // merge the new CoreLabels with the tree leaves
        MentionExtractor.mergeLabels(treeCopy, tokens);
        MentionExtractor.initializeUtterance(tokens);
    }

    Annotation document = new Annotation(aJCas.getDocumentText());
    document.set(SentencesAnnotation.class, sentences);

    Coreferencer coref = modelProvider.getResource();

    // extract all possible mentions
    // Reparsing only works when the full CoreNLP pipeline system is set up! Passing false here
    // disables reparsing.
    RuleBasedCorefMentionFinder finder = new RuleBasedCorefMentionFinder(false);
    List<List<Mention>> allUnprocessedMentions = finder.extractPredictedMentions(document, 0,
            coref.corefSystem.dictionaries());

    // add the relevant info to mentions and order them for coref
    Map<Integer, CorefChain> result;
    try {
        Document doc = coref.mentionExtractor.arrange(document, sentenceTokens, trees, allUnprocessedMentions);
        result = coref.corefSystem.coref(doc);
    } catch (Exception e) {
        throw new AnalysisEngineProcessException(e);
    }

    // Convert each coref chain into a linked list of CoreferenceLink
    // annotations, anchored by a CoreferenceChain pointing at the first link.
    for (CorefChain chain : result.values()) {
        CoreferenceLink last = null;
        for (CorefMention mention : chain.getMentionsInTextualOrder()) {
            // The -1/-2 adjustments suggest sentNum/startIndex are 1-based and
            // endIndex is exclusive — NOTE(review): confirm against CorefMention docs.
            CoreLabel beginLabel = sentences.get(mention.sentNum - 1).get(TokensAnnotation.class)
                    .get(mention.startIndex - 1);
            CoreLabel endLabel = sentences.get(mention.sentNum - 1).get(TokensAnnotation.class)
                    .get(mention.endIndex - 2);
            // TokenKey carries the original CAS token, providing CAS offsets.
            CoreferenceLink link = new CoreferenceLink(aJCas, beginLabel.get(TokenKey.class).getBegin(),
                    endLabel.get(TokenKey.class).getEnd());

            if (mention.mentionType != null) {
                link.setReferenceType(mention.mentionType.toString());
            }

            if (last == null) {
                // This is the first mention. Here we'll initialize the chain
                CoreferenceChain corefChain = new CoreferenceChain(aJCas);
                corefChain.setFirst(link);
                corefChain.addToIndexes();
            } else {
                // For the other mentions, we'll add them to the chain.
                last.setNext(link);
            }
            last = link;

            link.addToIndexes();
        }
    }
}

From source file:de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordDependencyConverter.java

License:Open Source License

/**
 * Converts constituency trees in the CAS to dependency annotations using the
 * treebank language pack registered for the effective language.
 *
 * @param aJCas the CAS to process
 * @throws AnalysisEngineProcessException if the language is unsupported or
 *             the language pack cannot be instantiated
 */
@Override
public void process(JCas aJCas) throws AnalysisEngineProcessException {
    // Effective language: an explicit configuration parameter overrides the
    // language recorded on the CAS document.
    String lang = language != null ? language : aJCas.getDocumentLanguage();

    if (!languagePacks.containsKey(lang)) {
        throw new AnalysisEngineProcessException(
                new IllegalStateException("Unsupported language [" + lang + "]"));
    }

    TreebankLanguagePack lp;
    try {
        // BUG FIX: the pack was previously looked up with
        // aJCas.getDocumentLanguage(), bypassing the language override that was
        // validated above — which could yield null (NPE) or the wrong pack.
        lp = languagePacks.get(lang).newInstance();
    } catch (InstantiationException | IllegalAccessException e) {
        throw new AnalysisEngineProcessException(e);
    }

    List<CoreMap> sentences = new ArrayList<CoreMap>();
    for (ROOT root : select(aJCas, ROOT.class)) {
        // Copy all relevant information from the tokens
        List<Token> tokens = selectCovered(Token.class, root);
        List<CoreLabel> coreTokens = new ArrayList<CoreLabel>();
        for (Token token : tokens) {
            coreTokens.add(tokenToWord(token));
        }

        // SemanticHeadFinder (nonTerminalInfo) does not know about PRN0, so we have to replace
        // it with PRN to avoid NPEs.
        TreeFactory tFact = new LabeledScoredTreeFactory(CoreLabel.factory()) {
            @Override
            public Tree newTreeNode(String aParent, List<Tree> aChildren) {
                String parent = aParent;
                if ("PRN0".equals(parent)) {
                    parent = "PRN";
                }
                return super.newTreeNode(parent, aChildren);
            }
        };

        Tree tree = TreeUtils.createStanfordTree(root, tFact);
        Trees.convertToCoreLabels(tree);
        tree.indexSpans();

        // Build the sentence
        CoreMap sentence = new CoreLabel();
        sentence.set(TreeAnnotation.class, tree);
        sentence.set(TokensAnnotation.class, coreTokens);
        sentence.set(RootKey.class, root);
        sentences.add(sentence);

        doCreateDependencyTags(aJCas, lp, tree, tokens);
    }
}

From source file:de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordSegmenter.java

License:Open Source License

/**
 * Tokenizes the given text span and/or detects sentence boundaries, creating
 * Token and Sentence annotations in the CAS.
 *
 * @param aJCas the CAS being populated
 * @param aText the text of the current zone
 * @param aZoneBegin offset of the zone within the document
 * @throws AnalysisEngineProcessException on unknown token types or when the
 *             tokenizer output cannot be aligned with the text
 */
@Override
protected void process(JCas aJCas, String aText, int aZoneBegin) throws AnalysisEngineProcessException {
    List<Token> casTokens = null;

    // Use value from language parameter, document language or fallback language - whatever
    // is available
    String language = getLanguage(aJCas);

    if (isWriteToken()) {
        casTokens = new ArrayList<Token>();
        final String text = aText;
        final Tokenizer<?> tokenizer = getTokenizer(language, aText);
        int offsetInSentence = 0;

        List<?> tokens = tokenizer.tokenize();
        outer: for (int i = 0; i < tokens.size(); i++) {
            final Object token = tokens.get(i);
            String t = null;
            if (token instanceof String) {
                t = (String) token;
            }
            if (token instanceof CoreLabel) {
                // CoreLabels carry their own character offsets — use them directly.
                CoreLabel l = (CoreLabel) token;
                t = l.word();
                int begin = l.get(CharacterOffsetBeginAnnotation.class);
                int end = l.get(CharacterOffsetEndAnnotation.class);

                casTokens.add(createToken(aJCas, aZoneBegin + begin, aZoneBegin + end, i));
                offsetInSentence = end;
                continue;
            }
            if (token instanceof Word) {
                Word w = (Word) token;
                t = w.word();
            }

            if (t == null) {
                throw new AnalysisEngineProcessException(
                        new IllegalStateException("Unknown token type: " + token.getClass()));
            }

            // Skip whitespace. BUG FIX: check the bound BEFORE calling charAt() —
            // the previous order threw StringIndexOutOfBoundsException when the
            // offset had already reached the end of the text.
            while (offsetInSentence < text.length() && isWhitespace(text.charAt(offsetInSentence))) {
                offsetInSentence++;
            }
            if (offsetInSentence >= text.length()) {
                break outer;
            }

            // Match the tokenizer output against the CAS text at the current offset.
            if (text.startsWith(t, offsetInSentence)) {
                casTokens.add(createToken(aJCas, aZoneBegin + offsetInSentence,
                        aZoneBegin + offsetInSentence + t.length(), i));
                offsetInSentence = offsetInSentence + t.length();
            } else {
                // BUG FIX: the message previously lacked the closing bracket.
                throw new AnalysisEngineProcessException(new IllegalStateException("Text mismatch. Tokenizer: ["
                        + t + "] CAS: ["
                        + text.substring(offsetInSentence, min(offsetInSentence + t.length(), text.length()))
                        + "]"));
            }
        }
    }

    if (isWriteSentence()) {
        if (casTokens == null) {
            casTokens = selectCovered(aJCas, Token.class, aZoneBegin, aZoneBegin + aText.length());
        }

        // Prepare the tokens for processing by WordToSentenceProcessor
        List<CoreLabel> tokensInDocument = new ArrayList<CoreLabel>();
        for (Token token : casTokens) {
            CoreLabel l = new CoreLabel();
            l.set(CharacterOffsetBeginAnnotation.class, token.getBegin());
            l.set(CharacterOffsetEndAnnotation.class, token.getEnd());
            l.setWord(token.getCoveredText());
            tokensInDocument.add(l);
        }

        // The sentence splitter (probably) requires the escaped text, so we prepare it here
        PTBEscapingProcessor escaper = new PTBEscapingProcessor();
        escaper.apply(tokensInDocument);

        // Apply the WordToSentenceProcessor to find the sentence boundaries
        WordToSentenceProcessor<CoreLabel> proc = new WordToSentenceProcessor<CoreLabel>(boundaryTokenRegex,
                boundaryFollowers, boundariesToDiscard, xmlBreakElementsToDiscard, regionElementRegex,
                newlineIsSentenceBreak, null, tokenRegexesToDiscard, isOneSentence, allowEmptySentences);

        List<List<CoreLabel>> sentencesInDocument = proc.process(tokensInDocument);
        for (List<CoreLabel> sentence : sentencesInDocument) {
            // Sentence span = first token's begin to last token's end.
            int begin = sentence.get(0).get(CharacterOffsetBeginAnnotation.class);
            int end = sentence.get(sentence.size() - 1).get(CharacterOffsetEndAnnotation.class);

            createSentence(aJCas, begin, end);
        }
    }
}

From source file:de.tudarmstadt.ukp.dkpro.core.stanfordnlp.util.CoreNlpUtils.java

License:Open Source License

/**
 * Converts a UIMA {@code Token} into a Stanford {@code CoreLabel}, copying
 * the covered text, character offsets, and — when present — lemma and POS tag.
 */
public static CoreLabel tokenToWord(Token aToken) {
    CoreLabel label = new CoreLabel();

    label.setOriginalText(aToken.getCoveredText());
    label.setWord(aToken.getCoveredText());
    label.setBeginPosition(aToken.getBegin());
    label.setEndPosition(aToken.getEnd());

    // Lemma and POS are optional on the CAS token.
    if (aToken.getLemma() != null) {
        label.setLemma(aToken.getLemma().getValue());
    }
    if (aToken.getPos() != null) {
        label.setTag(aToken.getPos().getPosValue());
    }

    return label;
}

From source file:edu.cmu.ml.rtw.users.ssrivastava.RegexExtractor.java

/**
 * Rebuilds a Stanford sentence (CoreMap) for the given sentence index of the
 * document: one CoreLabel per token with word, POS tag, offsets, doc id and
 * sentence index, NER labels applied afterwards from the document's NER spans.
 */
public static CoreMap getStanfordSentence(DocumentNLP document, int sentIdx) {
    List<String> words = document.getSentenceTokenStrs(sentIdx);
    List<PoSTag> posTags = document.getSentencePoSTags(sentIdx);

    /*Re-create Stanford tokens*/
    List<CoreLabel> tokenList = new ArrayList<CoreLabel>();
    for (int tokIdx = 0; tokIdx < words.size(); tokIdx++) {
        CoreLabel stanfordToken = new CoreLabel();
        stanfordToken.setWord(words.get(tokIdx));
        stanfordToken.setTag(posTags.get(tokIdx).toString());
        stanfordToken.setNER("O"); // default: outside any entity
        stanfordToken.setDocID(document.getName());
        stanfordToken.setSentIndex(sentIdx);
        stanfordToken.setBeginPosition(document.getToken(sentIdx, tokIdx).getCharSpanStart());
        stanfordToken.setEndPosition(document.getToken(sentIdx, tokIdx).getCharSpanEnd());
        tokenList.add(stanfordToken);
    }

    //Add NER labels for sentence
    for (Pair<TokenSpan, String> nerSpan : document.getNer(sentIdx)) {
        int from = nerSpan.getFirst().getStartTokenIndex();
        int to = nerSpan.getFirst().getEndTokenIndex();
        for (int tokIdx = from; tokIdx < to; tokIdx++) {
            tokenList.get(tokIdx).setNER(nerSpan.getSecond());
        }
    }

    //Convert to Stanford Sentence
    CoreMap sentence = new ArrayCoreMap();
    sentence.set(TokensAnnotation.class, tokenList);
    sentence.set(CharacterOffsetBeginAnnotation.class, tokenList.get(0).beginPosition());
    sentence.set(CharacterOffsetEndAnnotation.class, tokenList.get(words.size() - 1).endPosition());
    return sentence;
}

From source file:gate.stanford.NER.java

License:Open Source License

/**
 * Runs the Stanford NER classifier over the document's sentence and token
 * annotations, and adds one output annotation per contiguous run of tokens
 * sharing the same (non-outside) NER label.
 *
 * @throws ExecutionException if required inputs are missing or the labeller
 *             output does not line up with the input tokens
 */
@Override
public void execute() throws ExecutionException {
    // check the parameters
    if (document == null)
        throw new ExecutionException("No document to process!");

    AnnotationSet inputAS = document.getAnnotations(inputASName);
    AnnotationSet outputAS = document.getAnnotations(outputASName);

    if (baseTokenAnnotationType == null || baseTokenAnnotationType.trim().length() == 0) {
        throw new ExecutionException("No base Token Annotation Type provided!");
    }

    if (baseSentenceAnnotationType == null || baseSentenceAnnotationType.trim().length() == 0) {
        throw new ExecutionException("No base Sentence Annotation Type provided!");
    }

    AnnotationSet sentencesAS = inputAS.get(baseSentenceAnnotationType);
    AnnotationSet tokensAS = inputAS.get(baseTokenAnnotationType);
    if (sentencesAS != null && sentencesAS.size() > 0 && tokensAS != null && tokensAS.size() > 0) {
        long startTime = System.currentTimeMillis();
        fireStatusChanged("NER searching " + document.getName());
        fireProgressChanged(0);

        // prepare the input for CRFClassifier (reused across sentences)
        List<CoreLabel> sentenceForTagger = new ArrayList<CoreLabel>();

        // define a comparator for annotations by start offset
        OffsetComparator offsetComparator = new OffsetComparator();

        // read all the tokens and all the sentences, in offset order
        List<Annotation> sentencesList = new ArrayList<Annotation>(sentencesAS);
        Collections.sort(sentencesList, offsetComparator);
        List<Annotation> tokensList = new ArrayList<Annotation>(tokensAS);
        Collections.sort(tokensList, offsetComparator);

        Iterator<Annotation> sentencesIter = sentencesList.iterator();
        ListIterator<Annotation> tokensIter = tokensList.listIterator();

        List<Annotation> tokensInCurrentSentence = new ArrayList<Annotation>();
        Annotation currentToken = tokensIter.next();
        int sentIndex = 0;
        int sentCnt = sentencesAS.size();

        // go through sentence annotations in the document
        while (sentencesIter.hasNext()) {
            Annotation currentSentence = sentencesIter.next();

            // reset sentence-level processing variables
            tokensInCurrentSentence.clear();
            sentenceForTagger.clear();

            // while we have sane tokens: consume tokens (in offset order) until
            // one ends past the end of the current sentence
            while (currentToken != null && currentToken.getEndNode().getOffset()
                    .compareTo(currentSentence.getEndNode().getOffset()) <= 0) {

                // If we're only labelling Tokens within baseSentenceAnnotationType,
                // don't add the sentence if the Tokens aren't within the span of
                // baseSentenceAnnotationType
                if (currentToken.withinSpanOf(currentSentence)) {
                    tokensInCurrentSentence.add(currentToken);

                    // build a stanford nlp representation of the token and add it to the sequence
                    CoreLabel currentLabel = new CoreLabel();
                    currentLabel.setWord((String) currentToken.getFeatures().get(TOKEN_STRING_FEATURE_NAME));

                    sentenceForTagger.add(currentLabel);
                }
                currentToken = (tokensIter.hasNext() ? tokensIter.next() : null);
            }

            // if the sentence doesn't contain any tokens (which is a bit weird but
            // is possible) then don't try running the labeller
            if (sentenceForTagger.isEmpty())
                continue;

            // run the labeller
            List<CoreLabel> taggerResults = tagger.classifySentence(sentenceForTagger);

            // add the results
            // make sure no malfunction occurred
            if (taggerResults.size() != tokensInCurrentSentence.size())
                throw new ExecutionException("NER labeller malfunction: the output size ("
                        + taggerResults.size() + ") is different from the input size ("
                        + tokensInCurrentSentence.size() + ")!");

            // proceed through the annotated sequence, detecting label-change
            // edges to delimit entities
            Iterator<CoreLabel> resIter = taggerResults.iterator();
            Iterator<Annotation> tokIter = tokensInCurrentSentence.iterator();

            // -1 acts as the "no entity open yet" sentinel for entityStart
            String previousLabel = outsideLabel;
            Long previousEnd = new Long(-1);
            Long entityStart = new Long(-1);
            Long entityEnd = new Long(-1);

            Annotation annot;
            String nerLabel = "";

            while (resIter.hasNext()) {

                // for each labelled token..
                annot = tokIter.next();
                CoreLabel word = resIter.next();
                nerLabel = word.get(CoreAnnotations.AnswerAnnotation.class);

                // falling edge transition: entity ends
                // guard against this triggering at document start
                if (!nerLabel.equals(previousLabel) && !previousLabel.equals(outsideLabel)
                        && entityStart != -1) {

                    // get final bound; add new annotation in output AS
                    try {
                        outputAS.add(entityStart, previousEnd, previousLabel, new SimpleFeatureMapImpl());
                    } catch (InvalidOffsetException e) {
                        System.out.println("Token alignment problem:" + e);
                    }

                }

                // rising edge transition: entity starts
                if (!nerLabel.equals(previousLabel) && !nerLabel.equals(outsideLabel)) {
                    entityStart = annot.getStartNode().getOffset();
                }

                previousLabel = nerLabel;
                previousEnd = annot.getEndNode().getOffset();

            }

            // clean up, in case last token in sentence was in an entity
            if (!nerLabel.equals(outsideLabel)) {
                try {
                    outputAS.add(entityStart, previousEnd, previousLabel, new SimpleFeatureMapImpl());
                } catch (InvalidOffsetException e) {
                    System.out.println("Token alignment problem:" + e);
                }
            }

            fireProgressChanged(sentIndex++ * 100 / sentCnt);

        }

        fireProcessFinished();
        fireStatusChanged(document.getName() + " tagged in "
                + NumberFormat.getInstance().format((double) (System.currentTimeMillis() - startTime) / 1000)
                + " seconds!");
    } else {
        // Missing prerequisite annotations: fail hard or log, per configuration.
        if (failOnMissingInputAnnotations) {
            throw new ExecutionException("No sentences or tokens to process in document " + document.getName()
                    + "\n" + "Please run a sentence splitter " + "and tokeniser first!");
        } else {
            Utils.logOnce(logger, Level.INFO,
                    "NE labeller: no sentence or token annotations in input document - see debug log for details.");
            logger.debug("No input annotations in document " + document.getName());
        }
    }

}

From source file:lv.lnb.ner.tagFolders.java

License:Open Source License

/**
 * Reads a tab-separated (word, lemma, POS) token file, runs the NER
 * classifier over it, and writes the aggregated named-entity counts for the
 * given document via the counter.
 *
 * @return the number of lines read from the input file
 */
private static int processFile(AbstractSequenceClassifier<CoreLabel> classifier, String filename, String doc_id,
        Writer writer) {

    NECounterSingleDoc counter = new NECounterSingleDoc(doc_id);

    List<CoreLabel> document = new ArrayList<CoreLabel>();
    int i = 0;
    // BUG FIX: try-with-resources ensures the reader is closed even when
    // reading fails; previously close() was skipped on exception (leak).
    try (BufferedReader ieeja = new BufferedReader(
            new InputStreamReader(new FileInputStream(filename), "UTF-8"))) {
        String line;
        while ((line = ieeja.readLine()) != null) {
            i++;
            // Skip structural markup lines entirely.
            if (line.contains("<doc") || line.contains("</doc>") || line.contains("<page")
                    || line.contains("</page>") || line.contains("<p>"))
                continue;

            String[] info = line.split("\t");
            CoreLabel word = new CoreLabel();

            if (line.contains("</p>")) {
                // NOTE(review): these annotations are set but the label is never
                // added to 'document' (the continue skips the add below), so the
                // stores are dead. Preserved as-is to keep behavior unchanged.
                word.set(TextAnnotation.class, "<p/>");
                word.set(LemmaAnnotation.class, "<p/>");
                word.set(PartOfSpeechAnnotation.class, "-");
                continue;
            } else if (line.contains("<g />")) {
                // NOTE(review): same dead-store situation as the </p> branch.
                word.set(TextAnnotation.class, "<g/>");
                word.set(LemmaAnnotation.class, "<g/>");
                word.set(PartOfSpeechAnnotation.class, "-");
                continue;
            } else if (info.length < 3) {
                // Malformed line: report it but keep going.
                System.err.printf("%d @ %s:%s", i, filename, line);
            } else {
                word.set(TextAnnotation.class, info[0]);
                word.set(LemmaAnnotation.class, info[1]);
                // Only the first character of the POS tag is kept.
                word.set(PartOfSpeechAnnotation.class, info[2].substring(0, 1));
            }
            document.add(word);
        }
    } catch (IOException e1) {
        e1.printStackTrace();
    }

    // Classify and re-assemble multi-token entities: consecutive tokens with
    // the same answer tag are joined into one name/lemma string.
    List<CoreLabel> out = classifier.classify(document);
    String prevtag = "";
    String name_part = "";
    String lemma_part = "";
    for (CoreLabel word : out) {
        String tag = word.get(AnswerAnnotation.class);
        if (tag.length() < 2)
            tag = "";
        if (!tag.equalsIgnoreCase(prevtag)) {
            // Tag changed: flush the previous entity (if any), start a new one.
            if (!prevtag.equalsIgnoreCase(""))
                counter.add(doc_id, name_part, lemma_part, prevtag);
            if (!tag.equalsIgnoreCase("")) {
                name_part = word.word();
                lemma_part = word.get(LemmaAnnotation.class);
            }
        } else if (!tag.equalsIgnoreCase("")) {
            // Same tag continues: extend the current entity.
            name_part = name_part + " " + word.word();
            lemma_part = lemma_part + " " + word.get(LemmaAnnotation.class);
        }

        prevtag = tag;
    }
    try {
        counter.db_insert(writer);
    } catch (Exception e) {
        e.printStackTrace();
    }

    return i;
}

From source file:lv.lumii.morphotagger.MorphoPipe.java

License:Open Source License

/**
 * Reads CoNLL-format input (tab-separated columns, blank line between
 * sentences) and returns the morphologically analyzed sentences. Each
 * sentence is wrapped with "&lt;s&gt;" boundary-marker tokens.
 *
 * @param in reader positioned at the start of the CoNLL data
 * @return one analyzed token list per sentence
 * @throws IOException if reading fails
 */
public static List<List<CoreLabel>> readCONLL(BufferedReader in) throws IOException {
    String s;
    List<CoreLabel> sentence = new LinkedList<CoreLabel>();
    List<List<CoreLabel>> result = new LinkedList<List<CoreLabel>>();

    // Opening sentence-boundary marker.
    CoreLabel stag = new CoreLabel();
    stag.set(TextAnnotation.class, "<s>");
    sentence.add(stag);

    while ((s = in.readLine()) != null) {
        if (s.trim().length() > 0) {
            String[] fields = s.split("\t");
            // Column 1 is the token; underscores encode spaces (except the
            // literal "_" placeholder itself).
            String token = fields[1];
            if (!token.equalsIgnoreCase("_"))
                token = token.replace('_', ' ');
            String extraColumns = "";
            if (saveColumns) {
                for (int field_i = 6; field_i < fields.length; field_i++)
                    extraColumns += fields[field_i] + "\t";
                // BUG FIX: trim() returns a new string — the result was
                // discarded before, leaving a trailing tab on the stored value.
                extraColumns = extraColumns.trim();
            }
            String syntax = "";
            if (fields.length >= 10)
                syntax = fields[6] + "\t" + fields[7] + "\t" + fields[8] + "\t" + fields[9];

            CoreLabel word = new CoreLabel();
            word.set(TextAnnotation.class, token);
            word.set(ParentAnnotation.class, syntax);
            word.set(ExtraColumnAnnotation.class, extraColumns);
            sentence.add(word);
        } else {
            // Blank line = sentence boundary: close the sentence with a marker,
            // analyze it, and start a new one (also opened with a marker).
            stag = new CoreLabel();
            stag.set(TextAnnotation.class, "<s>");
            sentence.add(stag);

            result.add(LVMorphologyReaderAndWriter.analyzeLabels(sentence));

            sentence = new LinkedList<CoreLabel>();
            stag = new CoreLabel();
            stag.set(TextAnnotation.class, "<s>");
            sentence.add(stag);
        }
    }
    // Flush the final sentence if the input did not end with a blank line.
    if (sentence.size() > 0) {
        stag = new CoreLabel();
        stag.set(TextAnnotation.class, "<s>");
        sentence.add(stag);
        result.add(LVMorphologyReaderAndWriter.analyzeLabels(sentence));
    }

    return result;
}

From source file:lv.pipe.MorphoTagger.java

License:Open Source License

/**
 * Runs Latvian morphological tagging over the sentence's tokens and replaces
 * the sentence's token list with enriched annotations carrying lemma, simple
 * POS, full POS tag, and a pipe-delimited morphological feature string.
 *
 * @param sentence annotation expected to carry a LabelTokens list
 * @return the same sentence annotation, with its tokens replaced
 */
public Annotation processSentence(Annotation sentence) {
    if (sentence.has(LabelTokens.class)) {
        List<Annotation> tokens = sentence.get(LabelTokens.class);
        // This is not working returns all "xf":
        // List<Word> sent = new ArrayList<Word>(tokens.size());
        // for (Annotation token : tokens) {
        // String word = token.get(TextLabel.class);
        // sent.add(new Word(word));
        // }
        // List<CoreLabel> coreLabels =
        // LVMorphologyReaderAndWriter.analyzeSentence2(sent);
        List<CoreLabel> sent = new ArrayList<CoreLabel>(tokens.size());
        for (Annotation token : tokens) {
            String word = token.get(LabelText.class);
            CoreLabel wi = new CoreLabel();
            wi.setWord(word);
            sent.add(wi);
        }
        // Terminate the token sequence with an "<s>" marker before analysis.
        CoreLabel sEnd = new CoreLabel();
        sEnd.setWord("<s>");
        sent.add(sEnd);
        List<CoreLabel> coreLabels = LVMorphologyReaderAndWriter.analyzeLabels(sent);

        morphoClassifier.classify(coreLabels);
        sentence.remove(LabelTokens.class);
        List<Annotation> tLabels = new ArrayList<Annotation>(coreLabels.size());
        int counter = 1;
        for (CoreLabel w : coreLabels) {
            Annotation tLabel = new Annotation();
            String token = w.getString(TextAnnotation.class);
            // token = token.replace(' ', '_');
            // Skip the artificial sentence-boundary markers.
            if (token.contains("<s>"))
                continue;
            tLabel.setText(token);
            tLabel.set(LabelIndex.class, counter++);

            // Pick the wordform analysis matching the classifier's answer tag.
            Word analysis = w.get(LVMorphologyAnalysis.class);
            Wordform mainwf = analysis.getMatchingWordform(w.getString(AnswerAnnotation.class), false);

            if (mainwf != null) {
                // Fall back to "_" for any empty value, logging each case.
                String lemma = mainwf.getValue(AttributeNames.i_Lemma);
                // lemma = lemma.replace(' ', '_');
                if (lemma == null || lemma.trim().isEmpty()) {
                    lemma = "_";
                    log.log(Level.SEVERE, "Empty lemma for {0}", token);
                }
                tLabel.setLemma(lemma);

                String answer = w.getString(AnswerAnnotation.class);
                if (answer == null || answer.trim().isEmpty()) {
                    answer = "_"; // no empty tag
                    log.log(Level.SEVERE, "Empty simple pos tag for {0}", token);
                }
                tLabel.set(LabelPosTagSimple.class, answer);

                String posTag = mainwf.getTag();
                if (posTag == null || posTag.trim().isEmpty()) {
                    posTag = "_";
                    log.log(Level.SEVERE, "Empty pos tag for {0}", token);
                }
                tLabel.set(LabelPosTag.class, posTag);

                // Feature attribute filters
                if (MINI_TAG)
                    mainwf.removeNonlexicalAttributes();
                if (LETA_FEATURES) {
                    addLETAfeatures(mainwf);
                    // mainwf.removeAttribute(AttributeNames.i_SourceLemma);
                    // FIXME - for derived words this may be important
                    // (de-prefixed lemmas, etc.)
                    mainwf.removeTechnicalAttributes();
                }

                // word features
                StringBuilder s = mainwf.pipeDelimitedEntries();
                if (FEATURES) {
                    // all the features that were used during training
                    Datum<String, String> d = morphoClassifier.makeDatum(coreLabels, counter,
                            morphoClassifier.featureFactory);

                    for (String feature : d.asFeatures()) {
                        // strip the trailing |C these feature strings carry
                        s.append(feature.substring(0, feature.length() - 2).replace(' ', '_'));
                        s.append('|');
                    }
                }
                // remove the final | separator, which is redundant
                s.deleteCharAt(s.length() - 1);
                s.append('\t');
                String morphoFeatures = s.toString();
                tLabel.set(LabelMorphoFeatures.class, morphoFeatures);

            } else {
                log.log(Level.SEVERE, "Empty main word form for {0}", token);
            }
            tLabels.add(tLabel);
        }
        sentence.set(LabelTokens.class, tLabels);
    }
    return sentence;
}