Example usage for edu.stanford.nlp.ling CoreLabel setWord

List of usage examples for edu.stanford.nlp.ling CoreLabel setWord

Introduction

In this page you can find the example usage for edu.stanford.nlp.ling CoreLabel setWord.

Prototype

@Override
public void setWord(String word) 

Source Link

Document

Set the word value for the label.

Usage

From source file:conditionalCFG.ConditionalCFGParser.java

License:Open Source License

/**
 * Returns the {@link CoreLabel} for the terminal at {@code labelIndex}, preferring the
 * original label captured from the input when one was recorded, and otherwise rebuilding
 * a minimal label from the parser's word index and character offsets.
 */
private CoreLabel getCoreLabel(int labelIndex) {
    // Prefer the original label kept from the input sentence, if any.
    if (originalCoreLabels[labelIndex] != null) {
        CoreLabel terminalLabel = originalCoreLabels[labelIndex];
        // Some inputs populate word() but not value(); mirror word into value so
        // downstream code reading value() sees the token text.
        if (terminalLabel.value() == null && terminalLabel.word() != null) {
            terminalLabel.setValue(terminalLabel.word());
        }
        return terminalLabel;
    }

    // No original label: synthesize one from the indexed word and recorded offsets.
    String wordStr = wordIndex.get(words[labelIndex]);
    CoreLabel terminalLabel = new CoreLabel();
    terminalLabel.setValue(wordStr);
    terminalLabel.setWord(wordStr);
    terminalLabel.setBeginPosition(beginOffsets[labelIndex]);
    terminalLabel.setEndPosition(endOffsets[labelIndex]);
    // Carry over the original POS tag when one was recorded for this position.
    if (originalTags[labelIndex] != null) {
        terminalLabel.setTag(originalTags[labelIndex].tag());
    }
    return terminalLabel;
}

From source file:de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordSegmenter.java

License:Open Source License

@Override
protected void process(JCas aJCas, String aText, int aZoneBegin) throws AnalysisEngineProcessException {
    List<Token> casTokens = null;

    // Use value from language parameter, document language or fallback language - whatever
    // is available
    String language = getLanguage(aJCas);

    if (isWriteToken()) {
        casTokens = new ArrayList<Token>();
        final String text = aText;
        final Tokenizer<?> tokenizer = getTokenizer(language, aText);
        int offsetInSentence = 0;

        List<?> tokens = tokenizer.tokenize();
        outer: for (int i = 0; i < tokens.size(); i++) {
            final Object token = tokens.get(i);
            // System.out.println("Token class: "+token.getClass());
            String t = null;//from   w w w. ja v  a2s.com
            if (token instanceof String) {
                t = (String) token;
            }
            if (token instanceof CoreLabel) {
                CoreLabel l = (CoreLabel) token;
                t = l.word();
                int begin = l.get(CharacterOffsetBeginAnnotation.class);
                int end = l.get(CharacterOffsetEndAnnotation.class);

                casTokens.add(createToken(aJCas, aZoneBegin + begin, aZoneBegin + end, i));
                offsetInSentence = end;
                continue;
            }
            if (token instanceof Word) {
                Word w = (Word) token;
                t = w.word();
            }

            if (t == null) {
                throw new AnalysisEngineProcessException(
                        new IllegalStateException("Unknown token type: " + token.getClass()));
            }

            // Skip whitespace
            while (isWhitespace(text.charAt(offsetInSentence))) {
                offsetInSentence++;
                if (offsetInSentence >= text.length()) {
                    break outer;
                }
            }

            // Match
            if (text.startsWith(t, offsetInSentence)) {
                casTokens.add(createToken(aJCas, aZoneBegin + offsetInSentence,
                        aZoneBegin + offsetInSentence + t.length(), i));
                offsetInSentence = offsetInSentence + t.length();
            } else {
                //                    System.out.println(aText);
                throw new AnalysisEngineProcessException(new IllegalStateException("Text mismatch. Tokenizer: ["
                        + t + "] CAS: ["
                        + text.substring(offsetInSentence, min(offsetInSentence + t.length(), text.length()))));
            }
        }
    }

    if (isWriteSentence()) {
        if (casTokens == null) {
            casTokens = selectCovered(aJCas, Token.class, aZoneBegin, aZoneBegin + aText.length());
        }

        // Prepare the tokens for processing by WordToSentenceProcessor
        List<CoreLabel> tokensInDocument = new ArrayList<CoreLabel>();
        for (Token token : casTokens) {
            CoreLabel l = new CoreLabel();
            l.set(CharacterOffsetBeginAnnotation.class, token.getBegin());
            l.set(CharacterOffsetEndAnnotation.class, token.getEnd());
            l.setWord(token.getCoveredText());
            tokensInDocument.add(l);
        }

        // The sentence splitter (probably) requires the escaped text, so we prepare it here
        PTBEscapingProcessor escaper = new PTBEscapingProcessor();
        escaper.apply(tokensInDocument);

        // Apply the WordToSentenceProcessor to find the sentence boundaries
        WordToSentenceProcessor<CoreLabel> proc = new WordToSentenceProcessor<CoreLabel>(boundaryTokenRegex,
                boundaryFollowers, boundariesToDiscard, xmlBreakElementsToDiscard, regionElementRegex,
                newlineIsSentenceBreak, null, tokenRegexesToDiscard, isOneSentence, allowEmptySentences);

        List<List<CoreLabel>> sentencesInDocument = proc.process(tokensInDocument);
        for (List<CoreLabel> sentence : sentencesInDocument) {
            int begin = sentence.get(0).get(CharacterOffsetBeginAnnotation.class);
            int end = sentence.get(sentence.size() - 1).get(CharacterOffsetEndAnnotation.class);

            createSentence(aJCas, begin, end);
        }
    }
}

From source file:de.tudarmstadt.ukp.dkpro.core.stanfordnlp.util.CoreNlpUtils.java

License:Open Source License

/**
 * Converts a UIMA {@link Token} into a Stanford {@link CoreLabel}, copying the covered
 * text, the character offsets and - when present - the lemma and the POS tag.
 */
public static CoreLabel tokenToWord(Token aToken) {
    final CoreLabel label = new CoreLabel();
    final String coveredText = aToken.getCoveredText();

    label.setOriginalText(coveredText);
    label.setWord(coveredText);
    label.setBeginPosition(aToken.getBegin());
    label.setEndPosition(aToken.getEnd());

    // Optional layers: only copied when annotated on the token.
    if (aToken.getLemma() != null) {
        label.setLemma(aToken.getLemma().getValue());
    }
    if (aToken.getPos() != null) {
        label.setTag(aToken.getPos().getPosValue());
    }

    return label;
}

From source file:edu.cmu.ml.rtw.users.ssrivastava.RegexExtractor.java

/**
 * Rebuilds a Stanford {@link CoreMap} sentence from the tokens, POS tags and NER spans
 * stored in a {@code DocumentNLP} for the sentence at {@code sentIdx}.
 */
public static CoreMap getStanfordSentence(DocumentNLP document, int sentIdx) {
    final List<String> words = document.getSentenceTokenStrs(sentIdx);
    final List<PoSTag> posTags = document.getSentencePoSTags(sentIdx);

    final List<CoreLabel> tokenList = new ArrayList<CoreLabel>();
    for (int idx = 0; idx < words.size(); idx++) {
        // Re-create a Stanford token for each word of the sentence.
        final CoreLabel stanfordToken = new CoreLabel();
        stanfordToken.setWord(words.get(idx));
        stanfordToken.setTag(posTags.get(idx).toString());
        stanfordToken.setNER("O"); // default: outside any entity
        stanfordToken.setDocID(document.getName());
        stanfordToken.setSentIndex(sentIdx);
        stanfordToken.setBeginPosition(document.getToken(sentIdx, idx).getCharSpanStart());
        stanfordToken.setEndPosition(document.getToken(sentIdx, idx).getCharSpanEnd());
        tokenList.add(stanfordToken);
    }

    // Overlay the document's NER labels onto the tokens covered by each entity span.
    final List<Pair<TokenSpan, String>> ners = document.getNer(sentIdx);
    for (Pair<TokenSpan, String> ner : ners) {
        for (int k = ner.getFirst().getStartTokenIndex(); k < ner.getFirst().getEndTokenIndex(); k++) {
            tokenList.get(k).setNER(ner.getSecond());
        }
    }

    // Assemble the Stanford sentence: token list plus character span of the sentence.
    final CoreMap sentence = new ArrayCoreMap();
    sentence.set(TokensAnnotation.class, tokenList);
    sentence.set(CharacterOffsetBeginAnnotation.class, tokenList.get(0).beginPosition());
    sentence.set(CharacterOffsetEndAnnotation.class, tokenList.get(words.size() - 1).endPosition());
    return sentence;
}

From source file:gate.stanford.NER.java

License:Open Source License

@Override
public void execute() throws ExecutionException {
    // check the parameters
    if (document == null)
        throw new ExecutionException("No document to process!");

    AnnotationSet inputAS = document.getAnnotations(inputASName);
    AnnotationSet outputAS = document.getAnnotations(outputASName);

    if (baseTokenAnnotationType == null || baseTokenAnnotationType.trim().length() == 0) {
        throw new ExecutionException("No base Token Annotation Type provided!");
    }/*from  ww  w.  ja v  a 2 s .  c  om*/

    if (baseSentenceAnnotationType == null || baseSentenceAnnotationType.trim().length() == 0) {
        throw new ExecutionException("No base Sentence Annotation Type provided!");
    }

    AnnotationSet sentencesAS = inputAS.get(baseSentenceAnnotationType);
    AnnotationSet tokensAS = inputAS.get(baseTokenAnnotationType);
    if (sentencesAS != null && sentencesAS.size() > 0 && tokensAS != null && tokensAS.size() > 0) {
        long startTime = System.currentTimeMillis();
        fireStatusChanged("NER searching " + document.getName());
        fireProgressChanged(0);

        // prepare the input for CRFClassifier
        List<CoreLabel> sentenceForTagger = new ArrayList<CoreLabel>();

        // define a comparator for annotations by start offset
        OffsetComparator offsetComparator = new OffsetComparator();

        // read all the tokens and all the sentences
        List<Annotation> sentencesList = new ArrayList<Annotation>(sentencesAS);
        Collections.sort(sentencesList, offsetComparator);
        List<Annotation> tokensList = new ArrayList<Annotation>(tokensAS);
        Collections.sort(tokensList, offsetComparator);

        Iterator<Annotation> sentencesIter = sentencesList.iterator();
        ListIterator<Annotation> tokensIter = tokensList.listIterator();

        List<Annotation> tokensInCurrentSentence = new ArrayList<Annotation>();
        Annotation currentToken = tokensIter.next();
        int sentIndex = 0;
        int sentCnt = sentencesAS.size();

        // go through sentence annotations in the document
        while (sentencesIter.hasNext()) {
            Annotation currentSentence = sentencesIter.next();

            // reset sentence-level processing variables
            tokensInCurrentSentence.clear();
            sentenceForTagger.clear();

            // while we have sane tokens
            while (currentToken != null && currentToken.getEndNode().getOffset()
                    .compareTo(currentSentence.getEndNode().getOffset()) <= 0) {

                // If we're only labelling Tokens within baseSentenceAnnotationType,
                // don't add the sentence if the Tokens aren't within the span of
                // baseSentenceAnnotationType
                if (currentToken.withinSpanOf(currentSentence)) {
                    tokensInCurrentSentence.add(currentToken);

                    // build a stanford nlp representation of the token and add it to the sequence
                    CoreLabel currentLabel = new CoreLabel();
                    currentLabel.setWord((String) currentToken.getFeatures().get(TOKEN_STRING_FEATURE_NAME));

                    sentenceForTagger.add(currentLabel);
                }
                currentToken = (tokensIter.hasNext() ? tokensIter.next() : null);
            }

            // if the sentence doesn't contain any tokens (which is a bit weird but
            // is possible) then don't try running the labeller
            if (sentenceForTagger.isEmpty())
                continue;

            // run the labeller
            List<CoreLabel> taggerResults = tagger.classifySentence(sentenceForTagger);

            // add the results
            // make sure no malfunction occurred
            if (taggerResults.size() != tokensInCurrentSentence.size())
                throw new ExecutionException("NER labeller malfunction: the output size ("
                        + taggerResults.size() + ") is different from the input size ("
                        + tokensInCurrentSentence.size() + ")!");

            // proceed through the annotated sequence
            Iterator<CoreLabel> resIter = taggerResults.iterator();
            Iterator<Annotation> tokIter = tokensInCurrentSentence.iterator();

            String previousLabel = outsideLabel;
            Long previousEnd = new Long(-1);
            Long entityStart = new Long(-1);
            Long entityEnd = new Long(-1);

            Annotation annot;
            String nerLabel = "";

            while (resIter.hasNext()) {

                // for each labelled token..
                annot = tokIter.next();
                CoreLabel word = resIter.next();
                nerLabel = word.get(CoreAnnotations.AnswerAnnotation.class);

                // falling edge transition: entity ends
                // guard against this triggering at document start
                if (!nerLabel.equals(previousLabel) && !previousLabel.equals(outsideLabel)
                        && entityStart != -1) {

                    //            System.out.println("falling edge");
                    // get final bound; add new annotation in output AS
                    try {
                        outputAS.add(entityStart, previousEnd, previousLabel, new SimpleFeatureMapImpl());
                    } catch (InvalidOffsetException e) {
                        System.out.println("Token alignment problem:" + e);
                    }

                }

                // rising edge transition: entity starts
                if (!nerLabel.equals(previousLabel) && !nerLabel.equals(outsideLabel)) {
                    //            System.out.println("rising edge");
                    entityStart = annot.getStartNode().getOffset();
                }
                //          System.out.println(word.word() + "/" + nerLabel);

                previousLabel = nerLabel;
                previousEnd = annot.getEndNode().getOffset();

            }

            // clean up, in case last token in sentence was in an entity
            if (!nerLabel.equals(outsideLabel)) {
                try {
                    outputAS.add(entityStart, previousEnd, previousLabel, new SimpleFeatureMapImpl());
                } catch (InvalidOffsetException e) {
                    System.out.println("Token alignment problem:" + e);
                }
            }

            fireProgressChanged(sentIndex++ * 100 / sentCnt);

        }

        fireProcessFinished();
        fireStatusChanged(document.getName() + " tagged in "
                + NumberFormat.getInstance().format((double) (System.currentTimeMillis() - startTime) / 1000)
                + " seconds!");
    } else {
        if (failOnMissingInputAnnotations) {
            throw new ExecutionException("No sentences or tokens to process in document " + document.getName()
                    + "\n" + "Please run a sentence splitter " + "and tokeniser first!");
        } else {
            Utils.logOnce(logger, Level.INFO,
                    "NE labeller: no sentence or token annotations in input document - see debug log for details.");
            logger.debug("No input annotations in document " + document.getName());
        }
    }

}

From source file:lv.pipe.MorphoTagger.java

License:Open Source License

/**
 * Runs Latvian morphological analysis and tagging over one sentence annotation:
 * converts the sentence tokens to CoreLabels, appends the "&lt;s&gt;" boundary marker,
 * classifies them, and replaces the sentence's token list with annotations enriched
 * with lemma, POS tags and pipe-delimited morphology features.
 */
public Annotation processSentence(Annotation sentence) {
    if (sentence.has(LabelTokens.class)) {
        List<Annotation> tokens = sentence.get(LabelTokens.class);
        // NOTE(review): an earlier attempt (removed commented-out code) built the input as
        // a List<Word>, but that returned "xf" for every token - hence the CoreLabel path.
        List<CoreLabel> sent = new ArrayList<CoreLabel>(tokens.size());
        for (Annotation token : tokens) {
            String word = token.get(LabelText.class);
            CoreLabel wi = new CoreLabel();
            wi.setWord(word);
            sent.add(wi);
        }
        // Append the sentence-boundary marker expected by the morphology analyzer.
        CoreLabel sEnd = new CoreLabel();
        sEnd.setWord("<s>");
        sent.add(sEnd);
        List<CoreLabel> coreLabels = LVMorphologyReaderAndWriter.analyzeLabels(sent);

        morphoClassifier.classify(coreLabels);
        sentence.remove(LabelTokens.class);
        List<Annotation> tLabels = new ArrayList<Annotation>(coreLabels.size());
        int counter = 1;
        for (CoreLabel w : coreLabels) {
            Annotation tLabel = new Annotation();
            String token = w.getString(TextAnnotation.class);
            // Skip the artificial sentence-boundary token(s).
            if (token.contains("<s>"))
                continue;
            tLabel.setText(token);
            tLabel.set(LabelIndex.class, counter++);

            // Pick the wordform matching the classifier's answer tag.
            Word analysis = w.get(LVMorphologyAnalysis.class);
            Wordform mainwf = analysis.getMatchingWordform(w.getString(AnswerAnnotation.class), false);

            if (mainwf != null) {
                String lemma = mainwf.getValue(AttributeNames.i_Lemma);
                if (lemma == null || lemma.trim().isEmpty()) {
                    lemma = "_"; // placeholder: no empty lemma allowed
                    log.log(Level.SEVERE, "Empty lemma for {0}", token);
                }
                tLabel.setLemma(lemma);

                String answer = w.getString(AnswerAnnotation.class);
                if (answer == null || answer.trim().isEmpty()) {
                    answer = "_"; // no empty tag
                    log.log(Level.SEVERE, "Empty simple pos tag for {0}", token);
                }
                tLabel.set(LabelPosTagSimple.class, answer);

                String posTag = mainwf.getTag();
                if (posTag == null || posTag.trim().isEmpty()) {
                    posTag = "_"; // placeholder: no empty tag allowed
                    log.log(Level.SEVERE, "Empty pos tag for {0}", token);
                }
                tLabel.set(LabelPosTag.class, posTag);

                // Feature attribute filters (original comments were in Latvian).
                if (MINI_TAG)
                    mainwf.removeNonlexicalAttributes();
                if (LETA_FEATURES) {
                    addLETAfeatures(mainwf);
                    // FIXME (translated): for derived words the source lemma may still matter
                    // (de-prefixed lemmas), so it is not removed here.
                    mainwf.removeTechnicalAttributes();
                }

                // Word features as pipe-delimited entries.
                StringBuilder s = mainwf.pipeDelimitedEntries();
                if (FEATURES) {
                    // All features that were used during training.
                    // NOTE(review): 'counter' was already incremented past this token and does
                    // not count skipped "<s>" tokens, while coreLabels includes them - confirm
                    // this is the position index makeDatum expects.
                    Datum<String, String> d = morphoClassifier.makeDatum(coreLabels, counter,
                            morphoClassifier.featureFactory);

                    for (String feature : d.asFeatures()) {
                        // Strip the trailing "|C" suffix that these features carry.
                        s.append(feature.substring(0, feature.length() - 2).replace(' ', '_'));
                        s.append('|');
                    }
                }
                // Remove the final '|' separator, which is redundant.
                s.deleteCharAt(s.length() - 1);
                s.append('\t');
                String morphoFeatures = s.toString();
                tLabel.set(LabelMorphoFeatures.class, morphoFeatures);

            } else {
                log.log(Level.SEVERE, "Empty main word form for {0}", token);
            }
            tLabels.add(tLabel);
        }
        sentence.set(LabelTokens.class, tLabels);
    }
    return sentence;
}

From source file:lv.pipe.NerTagger.java

License:Open Source License

/**
 * Builds a Stanford {@link CoreLabel} from a pipeline {@link Annotation}. Boundary
 * annotations (no text, or text equal to {@code BOUNDARY}) get the boundary word and
 * {@code OTHER} answer/gold labels; all other fields are copied with "_"/-1 defaults.
 */
public static CoreLabel makeCoreLabel(Annotation a) {
    final CoreLabel label = new CoreLabel();

    final boolean isBoundary = !a.has(LabelText.class) || a.getText().equals(BOUNDARY);
    if (isBoundary) {
        label.setWord(BOUNDARY);
        label.set(AnswerAnnotation.class, OTHER);
        label.set(NamedEntityTagGoldAnnotation.class, OTHER);
        label.setLemma("_"); // kept for parity with the original; overwritten below
    } else {
        label.setWord(a.getText());
    }

    // Common fields, with placeholder defaults when the annotation lacks them.
    label.setIndex(a.get(LabelIndex.class, -1));
    label.setLemma(a.get(LabelLemma.class, "_"));
    label.set(LVFullTagAnnotation.class, a.get(LabelPosTag.class, "_"));
    label.setTag(a.get(LabelPosTagSimple.class, "_"));
    label.set(MorphologyFeatureStringAnnotation.class, a.get(LabelMorphoFeatures.class, "_"));
    label.set(ParentAnnotation.class, Integer.toString((Integer) a.get(LabelParent.class, -1)));
    label.set(LabelAnnotation.class, a.get(LabelDependency.class, "_"));
    return label;
}

From source file:opendial.bn.values.RelationalVal.java

License:Open Source License

/**
 * Adds a new node holding {@code value} to the relational graph and returns its index.
 */
public int addNode(String value) {
    final CoreLabel label = new CoreLabel();
    label.setWord(value);
    label.setValue(value);

    final IndexedWord node = new IndexedWord(label);
    node.setIndex(graph.size());
    graph.addVertex(node);

    cachedHashCode = 0; // graph changed: invalidate the memoized hash code
    return node.index();
}

From source file:org.exist.xquery.corenlp.TrainClassifier.java

License:Open Source License

/**
 * Reads an ODS spreadsheet of annotated tokens into training documents. Column 0 is the
 * token text, column 1 the NER label (also stored as the answer annotation), and an
 * optional column 2 the POS tag. A row with an empty token cell separates documents.
 *
 * @param localFilePath path of the spreadsheet, used when no uploaded file is present
 * @return one token list per document found in the sheet
 * @throws XPathException if the spreadsheet cannot be read
 */
private Collection<List<CoreLabel>> readODSSpreadsheet(final String localFilePath) throws XPathException {
    Collection<List<CoreLabel>> documents = new ArrayList<>();
    List<CoreLabel> document = new ArrayList<>();

    // Prefer the uploaded file; fall back to the local resource path.
    try (InputStream is = uploadedFileBase64String != null ? uploadedFileBase64String.getInputStream()
            : new Resource(localFilePath).getInputStream()) {
        SpreadSheet spreadSheet = ODPackage.createFromStream(is, "UserAnnotatedDocument").getSpreadSheet();

        Sheet sheet = spreadSheet.getSheet(0);

        for (int i = 0; i < sheet.getRowCount(); i++) {
            CoreLabel row = new CoreLabel();
            String value1 = sheet.getValueAt(0, i).toString(); // token text
            String value2 = sheet.getValueAt(1, i).toString(); // NER label

            row.setWord(value1);
            row.setNER(value2);
            row.set(CoreAnnotations.AnswerAnnotation.class, value2);
            if (sheet.getColumnCount() > 2) {
                String value3 = sheet.getValueAt(2, i).toString(); // optional POS tag
                if (!"".equals(value3) && tagCol > -1) {
                    row.setTag(value3);
                }
            }

            if (!"".equals(value1)) {
                document.add(row);
            } else {
                // Empty token cell acts as a document separator.
                documents.add(document);
                document = new ArrayList<>();
            }
        }

        // BUGFIX: flush the final document - previously it was silently dropped when the
        // sheet did not end with a blank separator row.
        if (!document.isEmpty()) {
            documents.add(document);
        }
    } catch (IOException ioe) {
        throw new XPathException(this, "Error while reading spreadsheet document: " + ioe.getMessage(), ioe);
    }
    return documents;
}

From source file:org.exist.xquery.corenlp.TrainClassifier.java

License:Open Source License

/**
 * Reads an Excel spreadsheet (XLS or XLSX) of annotated tokens into training documents.
 * Column 0 is the token text, column 1 the NER label (also stored as the answer
 * annotation), and column 2 the POS tag. A row with an empty token cell separates
 * documents.
 *
 * @param localFilePath path of the spreadsheet, used when no uploaded file is present
 * @param inputFormat   whether to parse the stream as XLSX or legacy XLS
 * @return one token list per document found in the sheet
 * @throws XPathException if the spreadsheet cannot be found or read
 */
private Collection<List<CoreLabel>> readXLSXSpreadsheet(final String localFilePath,
        final InputDocType inputFormat) throws XPathException {
    Workbook workbook = null;
    Collection<List<CoreLabel>> documents = new ArrayList<>();
    List<CoreLabel> document = new ArrayList<>();
    String extraSuffix = (inputFormat != InputDocType.XLSX) ? "" : "x";
    // BUGFIX: the original tested 'uploadedFileBase64String == null' and then invoked a
    // method on it, guaranteeing a NullPointerException; the condition must be '!=',
    // matching readODSSpreadsheet.
    // BUGFIX: the original assigned the literal string "localFilePath" to the fallback
    // file name instead of using the localFilePath parameter.
    try (InputStream is = uploadedFileBase64String != null ? uploadedFileBase64String.getInputStream()
            : new Resource(localFilePath + extraSuffix).getInputStream()) {
        if (inputFormat == InputDocType.XLSX) {
            workbook = new XSSFWorkbook(is);
        } else {
            workbook = new HSSFWorkbook(is);
        }
    } catch (FileNotFoundException fe) {
        LOG.error(fe);
        // BUGFIX: rethrow - previously this was only logged, leaving 'workbook' null and
        // causing a NullPointerException at getSheetAt(0) below.
        throw new XPathException(this, "Spreadsheet document not found: " + fe.getMessage(), fe);
    } catch (IOException ioe) {
        LOG.error(ioe);
        throw new XPathException(this, "Error while reading spreadsheet document: " + ioe.getMessage(), ioe);
    }
    org.apache.poi.ss.usermodel.Sheet sheet = workbook.getSheetAt(0);
    Row row;
    Cell cell;
    Iterator rows = sheet.rowIterator();
    while (rows.hasNext()) {
        CoreLabel tok = new CoreLabel();
        row = (Row) rows.next();
        Iterator cells = row.cellIterator();
        int cellPos = 0;
        while (cells.hasNext()) {
            cell = (Cell) cells.next();
            switch (cellPos) {
            case 0: // token text
                tok.setWord(cell.getStringCellValue());
                break;
            case 1: // NER label, also used as the training answer
                tok.setNER(cell.getStringCellValue());
                tok.set(CoreAnnotations.AnswerAnnotation.class, cell.getStringCellValue());
                break;
            case 2: // POS tag
                tok.setTag(cell.getStringCellValue());
                break;
            default:
                break;
            }
            cellPos++;
        }
        if (!"".equals(tok.word())) {
            document.add(tok);
        } else {
            // Empty token cell acts as a document separator.
            documents.add(document);
            document = new ArrayList<>();
        }
    }
    // BUGFIX: flush the final document - previously it was silently dropped when the
    // sheet did not end with a blank separator row.
    if (!document.isEmpty()) {
        documents.add(document);
    }
    return documents;
}