List of usage examples for edu.stanford.nlp.ling CoreLabel originalText
@Override
public String originalText()
From source file:com.project.NLP.Requirement.ParserTreeGenerator.java
/** * nameEntityAnnotation for track the Location and Person name * Return the word if the tokens contains Location, person, organization,misc, time, money, percent, date * // w w w .j a v a 2 s . co m * @return arrayList */ public ArrayList generateNamedEntityTagAnnotation() { sentences = document.get(SentencesAnnotation.class); ArrayList nameEntity = new ArrayList(); String annotations = ""; for (CoreMap sentence : sentences) { for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) { // this is the NER label of the token annotations = token.get(CoreAnnotations.NamedEntityTagAnnotation.class); if (annotations.equals("LOCATION") || annotations.equals("PERSON") || annotations.equals("ORGANIZATION") || annotations.equals("MISC") || annotations.equals("TIME") || annotations.equals("MONEY") || annotations.equals("PERCENT") || annotations.equals("DATE")) { nameEntity.add(token.originalText()); } } } return nameEntity; }
From source file:de.uni_leipzig.informatik.pcai042.boa.gui.goldstandard.BoaSentence.java
License:Open Source License
/**
 * Tokenizes the sentence with StanfordCoreNLP and generates an XML representation.
 *
 * @param sentence the raw sentence text to tokenize
 * @throws IllegalArgumentException
 *             thrown when StanfordCoreNLP couldn't tokenize the sentence
 *             properly (i.e. it did not produce exactly one sentence)
 */
public BoaSentence(String sentence) throws IllegalArgumentException {
    this.sentence = sentence;
    annotations = new ArrayList<BoaAnnotation>();
    tokens = new ArrayList<String>();
    // NOTE(review): the original labels this "double checked locking", but it
    // is not: the null check runs outside any lock, there is no second check
    // inside the synchronized block, and the field is presumably not volatile.
    // Two threads may race through initPipeline() concurrently -- confirm
    // initPipeline() is itself thread-safe or move the check under a lock.
    if (pipeline == null)
        initPipeline();
    synchronized (pipeline) {
        // generate tokens with StanfordCoreNLP
        Annotation document = new Annotation(sentence);
        pipeline.annotate(document);
        List<CoreMap> sentences = document.get(SentencesAnnotation.class);
        // since we always deal with single sentences, there can be only one
        // sentence in the output; otherwise StanfordCoreNLP had problems
        // tokenizing the sentence
        if (sentences.size() != 1) {
            throw new IllegalArgumentException();
        }
        for (CoreLabel token : sentences.get(0).get(TokensAnnotation.class)) {
            // this is the text of the token exactly as it appeared in the input
            String word = token.originalText();
            tokens.add(word);
        }
        xmlDoc = pipeline.annotationToDoc(document);
    }
}
From source file:de.uni_leipzig.informatik.pcai042.boa.manager.BoaSentence.java
License:Open Source License
/** * Creates a sentence from a CoreMap returned by a {@link Tokenizer}. * /*from w ww . j a v a2 s .c o m*/ * @param sentence * the original text of the sentence * @param coreMap * the CoreMap */ public BoaSentence(CoreMap coreMap) { sentence = coreMap.get(CoreAnnotations.TextAnnotation.class); tokens = new ArrayList<String>(coreMap.get(TokensAnnotation.class).size()); beginPos = new ArrayList<Integer>(coreMap.get(TokensAnnotation.class).size()); endPos = new ArrayList<Integer>(coreMap.get(TokensAnnotation.class).size()); for (CoreLabel token : coreMap.get(TokensAnnotation.class)) { String word = token.originalText(); tokens.add(word); beginPos.add(token.beginPosition()); endPos.add(token.endPosition()); } annotations = new ArrayList<BoaAnnotation>(); }
From source file:eu.fbk.dh.tint.tokenizer.ItalianTokenizer.java
License:Apache License
public static void main(String argv[]) throws IOException { ItalianTokenizer tokenizer = new ItalianTokenizer(); // byte[] file = Files.readAllBytes((new File("/Users/alessio/Desktop/milano.txt")).toPath()); // String text = new String(file); String text = "Clinton in testa nei sondaggi dopo lassoluzione dellFbi sulluso di un server di posta privato quando era Segretario di stato."; // text = "``Determinato, pronto a fare tutto il necessario per mantenere la stabilit dei prezzi.''" // + " Ma anche allarmato per come le conseguenze del referendum britannico minacciano leconomia e i mercati europei." // + " Sono nato nel 200 S.p.A." // + " Il mio indirizzo e-mail alessio@apnetwork.it." // + " Il blog http://www.ziorufus.it e mi piace molto."; // text = "Questo un test per una sigla qualsiasi tipo a.B.C. che non ha senso."; // text = "Milano (/milano/ ascolta[?info], in milanese: Milan[4], /mil?/[5]) una citt italiana di 1 346 153 abitanti[2], capoluogo dell'omonima citt metropolitana e della regione Lombardia, secondo comune italiano per numero di abitanti, tredicesimo comune dell'Unione europea e diciannovesimo del continente e, con l'agglomerato urbano, quarta area metropolitana pi popolata d'Europa dopo Londra, Madrid e Parigi[6].\n" // + "\n" // + "Fondata dagli Insubri all'inizio del VI secolo a.C.[7], fu conquistata dai Romani nel 222 a.C."; // System.out.println(text); long time = System.currentTimeMillis(); List<List<CoreLabel>> sentences = tokenizer.parse(text); time = System.currentTimeMillis() - time; for (int i = 0; i < Math.min(10, sentences.size()); i++) { List<CoreLabel> sentence = sentences.get(i); for (CoreLabel token : sentence) { System.out.println(token.word() + " -- " + token.originalText() + " -- " + token.beginPosition()); }//w w w . 
ja v a 2s .c om System.out.println(); } int sentenceSize = sentences.size(); int lastTokenIndex = sentences.get(sentenceSize - 1).get(sentences.get(sentenceSize - 1).size() - 1) .index(); System.out.println("Length: " + text.length()); System.out.println("Time: " + time); System.out.println("Sentences: " + sentenceSize); System.out.println("Tokens: " + lastTokenIndex); }
From source file:eu.modelwriter.semantic.stanford_corenlp.MorphologySimilarityProvider.java
License:Open Source License
/**
 * {@inheritDoc}
 *
 * Joins all label words into one pseudo-sentence, runs the shared pipeline
 * over it, and groups each input label's concept set under the lemma of the
 * corresponding token.
 *
 * @see eu.modelwriter.semantic.ISemanticSimilarityProvider#getSemanticSimilarities(java.util.Map)
 */
public Map<String, Set<Object>> getSemanticSimilarities(Map<String, Set<Object>> labels) {
    final Map<String, Set<Object>> result = new LinkedHashMap<String, Set<Object>>();
    final Set<String> words = labels.keySet();
    if (words.isEmpty()) {
        return result;
    }
    // Build a single space-separated string of all label words.
    final StringBuilder joined = new StringBuilder();
    for (String word : words) {
        if (joined.length() > 0) {
            joined.append(' ');
        }
        joined.append(word);
    }
    final Annotation document = new Annotation(joined.toString());
    PIPELINE.annotate(document);
    for (CoreMap sentence : document.get(SentencesAnnotation.class)) {
        for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
            final String surface = token.originalText();
            final String lemma = token.lemma();
            // Bucket per lemma, created lazily so tokens sharing a lemma merge.
            Set<Object> bucket = result.get(lemma);
            if (bucket == null) {
                bucket = new LinkedHashSet<Object>();
                result.put(lemma, bucket);
            }
            // The token's surface form keys back into the caller's label map;
            // tokens the tokenizer introduced or split have no concepts.
            final Set<Object> concepts = labels.get(surface);
            if (concepts != null) {
                bucket.addAll(concepts);
            }
        }
    }
    return result;
}
From source file:ims.cs.corenlp.TokenAligner.java
License:Open Source License
/**
 * Splits a CoreNLP token based on a position. We split only the word form
 * (and original text) as we don't have sufficient information to split the
 * lemma, which is instead copied unchanged to both halves.
 *
 * @param token       the token to split
 * @param absPosition absolute (document-level) character offset of the split point
 * @return a two-element array: [0] = left half ending at absPosition,
 *         [1] = right half starting at absPosition
 */
private CoreLabel[] splitToken(CoreLabel token, int absPosition) {
    String word = token.word();
    String origText = token.originalText();
    // initialize parts: both halves start as full copies of the token
    CoreLabel[] splitting = new CoreLabel[2];
    splitting[0] = new CoreLabel(token);
    splitting[1] = new CoreLabel(token);
    // calculate split position relative to the token's start offset
    // NOTE(review): a negative relPosition (absPosition before the token)
    // would make the substring calls below throw -- presumably callers
    // guarantee the split point lies inside the token; verify at call sites.
    int relPosition = absPosition - token.beginPosition();
    // cut up original text (only when the split point falls within it)
    if (origText.length() >= relPosition) {
        String origText1 = origText.substring(0, relPosition);
        String origText2 = origText.substring(relPosition);
        splitting[0].setOriginalText(origText1);
        splitting[1].setOriginalText(origText2);
    }
    // cut up predicted text; checked separately because the normalized word
    // form may differ in length from the original text
    if (word.length() >= relPosition) {
        String word1 = word.substring(0, relPosition);
        String word2 = word.substring(relPosition);
        splitting[0].setWord(word1);
        splitting[1].setWord(word2);
    }
    // we could do the same with POS and lemma, but that would be complicated ...
    splitting[0].setEndPosition(absPosition); /* set a new end as we just shortened this token */
    splitting[1].setBeginPosition(absPosition); /* set a new position as we just shortened this token */
    // copy lemmas unchanged to both halves
    splitting[0].setLemma(token.lemma());
    splitting[1].setLemma(token.lemma());
    return splitting;
}
From source file:it.uniroma2.sag.kelp.input.parser.impl.StanfordParserWrapper.java
License:Apache License
@Override public DependencyGraph parse(String sentenceString) { Annotation document = new Annotation(sentenceString); pipeline.annotate(document);// ww w . j a v a 2 s. co m List<CoreMap> sentences = document.get(SentencesAnnotation.class); CoreMap sentence = sentences.get(0); DependencyGraph graph = new DependencyGraph(); graph.setSentence(sentenceString); graph.setParserName("StanfordParser"); graph.setParserVersion("3.6.0"); graph.setNodes(new ArrayList<DGNode>()); int nId = 1; for (CoreLabel token : sentence.get(TokensAnnotation.class)) { DGNode node = new DGNode(); Map<String, Object> nodeProps = new HashMap<String, Object>(); nodeProps.put("surface", token.originalText()); nodeProps.put("lemma", token.lemma()); nodeProps.put("pos", token.tag()); nodeProps.put("start", token.beginPosition()); nodeProps.put("end", token.endPosition()); nodeProps.put("id", nId); nId++; graph.getNodes().add(node); node.setProperties(nodeProps); } SemanticGraph dependencies = null; switch (dependencyType) { case BASIC: dependencies = sentence.get(BasicDependenciesAnnotation.class); break; case COLLAPSED: dependencies = sentence.get(CollapsedDependenciesAnnotation.class); break; case COLLAPSED_CCPROCESSED: dependencies = sentence.get(CollapsedCCProcessedDependenciesAnnotation.class); break; default: dependencies = sentence.get(BasicDependenciesAnnotation.class); break; } dependencies.edgeListSorted(); List<DGRelation> relations = new ArrayList<DGRelation>(); for (IndexedWord node : dependencies.vertexListSorted()) { DGRelation relation = new DGRelation(); relation.setProperties(new HashMap<String, Object>()); DGNode child = graph.getDGNodeById(node.index()); relation.setTarget(child); Collection<IndexedWord> parentsTmp = dependencies.getParents(node); ArrayList<IndexedWord> parents = new ArrayList<IndexedWord>(); for (IndexedWord par : parentsTmp) { SemanticGraphEdge edge = dependencies.getEdge(par, node); DGNode parent = graph.getDGNodeById(edge.getGovernor().index()); if 
(parent.getProperties().get("id") != child.getProperties().get("id")) parents.add(par); } if (parents.isEmpty()) { relation.getProperties().put("type", "root"); relation.getProperties().put("fromId", new Integer(0)); relation.setSource(null); graph.setRoot(relation); } else { Iterator<IndexedWord> it = parents.iterator(); while (it.hasNext()) { IndexedWord par = it.next(); SemanticGraphEdge edge = dependencies.getEdge(par, node); DGNode parent = graph.getDGNodeById(edge.getGovernor().index()); relation.setSource(parent); relation.getProperties().put("fromId", parent.getProperties().get("id")); relation.getProperties().put("type", edge.getRelation().toString()); } } relations.add(relation); } graph.setRelations(relations); return graph; }
From source file:it.uniud.ailab.dcore.wrappers.external.StanfordBootstrapperAnnotator.java
License:Open Source License
/**
 * Annotate the document by splitting the document, tokenizing it, performing
 * PoS tagging and Named Entity Recognition using the Stanford Core NLP tools.
 * Additionally runs coreference resolution and adds each non-trivial
 * coreference chain to the blackboard as a Mention gram.
 *
 * @param blackboard the blackboard receiving the extracted mentions
 * @param component  the component to annotate.
 */
@Override
public void annotate(Blackboard blackboard, DocumentComponent component) {
    if (pipeline == null) {
        // creates a StanfordCoreNLP object, with POS tagging, lemmatization,
        // NER, parsing, and coreference resolution (lazily, on first use)
        Properties props = new Properties();
        props.put("annotators", "tokenize, ssplit, pos, parse, lemma, ner, dcoref");
        pipeline = new StanfordCoreNLP(props);
    }
    // read some text in the text variable
    String text = component.getText();
    // create an empty Annotation just with the given text
    Annotation document = new Annotation(text);
    // run all Annotators on this text
    pipeline.annotate(document);
    // get the graph for coreference resolution
    Map<Integer, CorefChain> graph = document.get(CorefCoreAnnotations.CorefChainAnnotation.class);
    // prepare the map for coreference graph of document
    // NOTE(review): this map is populated nowhere and never read -- apparent
    // dead code; confirm before removing.
    Map<String, Collection<Set<CorefChain.CorefMention>>> coreferenceGraph = new HashMap<>();
    for (CorefChain corefChain : graph.values()) {
        // get the representative mention, that is the word recalled in other sentences
        CorefChain.CorefMention cm = corefChain.getRepresentativeMention();
        // eliminate auto-references: chains with a single mention carry no links
        if (corefChain.getMentionMap().size() <= 1) {
            continue;
        }
        // get the tokens of the sentence containing the representative mention
        // (sentNum is 1-based, the sentence list 0-based)
        List<CoreLabel> tks = document.get(SentencesAnnotation.class).get(cm.sentNum - 1)
                .get(TokensAnnotation.class);
        // list of tokens which compose the anaphor
        List<Token> anaphorsTokens = new ArrayList<>();
        for (int i = cm.startIndex - 1; i < cm.endIndex - 1; i++) {
            CoreLabel current = tks.get(i);
            Token t = new Token(current.word());
            t.setPoS(current.tag());
            t.setLemma(current.lemma());
            anaphorsTokens.add(t);
        }
        // the mention n-gram which is formed by the anaphor and a list of references
        Mention mention = new Mention(cm.mentionSpan, anaphorsTokens, cm.mentionSpan);
        // get map of the references to the corefchain obj
        Collection<Set<CorefChain.CorefMention>> mentionMap = corefChain.getMentionMap().values();
        for (Set<CorefChain.CorefMention> mentions : mentionMap) {
            for (CorefChain.CorefMention reference : mentions) {
                // eliminate self-references (same span as the representative)
                if (reference.mentionSpan.equalsIgnoreCase(cm.mentionSpan)) {
                    continue;
                }
                List<CoreLabel> tokens = document.get(SentencesAnnotation.class).get(reference.sentNum - 1)
                        .get(TokensAnnotation.class);
                // list of tokens which compose the mention
                List<Token> mentionTokens = new ArrayList<>();
                for (int i = reference.startIndex - 1; i < reference.endIndex - 1; i++) {
                    CoreLabel current = tokens.get(i);
                    // set token features
                    Token t = new Token(current.word());
                    t.setPoS(current.tag());
                    t.setLemma(current.lemma());
                    mentionTokens.add(t);
                }
                // add to mention a new reference
                mention.addReference(reference.mentionSpan, mentionTokens,
                        reference.mentionType.toString());
            }
        }
        // assign to the document a new coreference obj containing the anaphor
        // and its mentions
        blackboard.addGram(mention);
    }
    // these are all the sentences in this document;
    // a CoreMap is essentially a Map that uses class objects as keys and
    // has values with custom types
    List<CoreMap> sentences = document.get(SentencesAnnotation.class);
    // A counter that keeps track of the number of phrases in a sentence.
    // NOTE(review): it is never reset inside the loop, so each sentence is
    // annotated with the CUMULATIVE phrase count of all sentences so far --
    // confirm whether this running total is intended or the counter should be
    // zeroed per sentence.
    int phraseCounter = 0;
    for (CoreMap stanfordSentence : sentences) {
        Sentence distilledSentence = new Sentence(stanfordSentence.toString(), "" + sentenceCounter++);
        distilledSentence.setLanguage(Locale.ENGLISH);
        // getting the dependency graph of the document so to count the number
        // of phrases: ROOT sentences are the first level children in the parse
        // tree; every ROOT sentence is constituted by a group of clauses which
        // can be principal (main clauses) or not (coordinate and subordinate).
        // We use ROOT sentences as a starting point to find out all the
        // phrases present in the sentences themselves, checking for the tag "S".
        Tree sentenceTree = stanfordSentence.get(TreeCoreAnnotations.TreeAnnotation.class);
        for (Tree sub : sentenceTree.subTreeList()) {
            if (sub.label().value().equals("S")) {
                phraseCounter++;
            }
        }
        // annotate the sentence with a new feature counting all the phrases
        // contained in the sentence
        distilledSentence
                .addAnnotation(new FeatureAnnotation(DefaultAnnotations.PHRASES_COUNT, phraseCounter));
        // traversing the words in the current sentence:
        // for each token in the text, we create a new token and annotate it
        // with the word representing it, its pos tag and its lemma
        for (CoreLabel token : stanfordSentence.get(TokensAnnotation.class)) {
            // this is the text of the token
            Token t = new Token(token.originalText());
            // this is the POS tag of the token
            t.setPoS(token.tag());
            // this is the lemma of the token
            t.setLemma(token.lemma());
            // NER label; "O" marks a token outside any named entity
            String ner = token.get(NamedEntityTagAnnotation.class);
            if (!ner.equalsIgnoreCase("O")) {
                t.addAnnotation(new NERAnnotation(DefaultAnnotations.IS_NER, ner));
            }
            // add the token to the sentence
            distilledSentence.addToken(t);
        }
        // add the sentence to document
        ((DocumentComposite) component).addComponent(distilledSentence);
    }
}
From source file:it.uniud.ailab.dcore.wrappers.external.StanfordFastBootstrapperAnnotator.java
License:Open Source License
/** * Annotate the document by splitting the document, tokenizing it, * performing PoS tagging and Named Entity Recognition using the Stanford * Core NLP tools.//from ww w. j ava 2s . co m * * @param component the component to annotate. */ @Override public void annotate(Blackboard blackboard, DocumentComponent component) { if (pipeline == null) { // creates a StanfordCoreNLP object, with POS tagging, lemmatization, //NER, parsing, and coreference resolution Properties props = new Properties(); props.put("annotators", "tokenize, ssplit, pos, lemma"); pipeline = new StanfordCoreNLP(props); } // read some text in the text variable String text = component.getText(); // create an empty Annotation just with the given text Annotation document = new Annotation(text); // run all Annotators on this text pipeline.annotate(document); // these are all the sentences in this document // a CoreMap is essentially a Map that uses class objects as keys and //has values with custom types List<CoreMap> sentences = document.get(SentencesAnnotation.class); for (CoreMap stanfordSentence : sentences) { Sentence distilledSentence = new Sentence(stanfordSentence.toString(), "" + sentenceCounter++); distilledSentence.setLanguage(Locale.ENGLISH); // traversing the words in the current sentence // for each token in the text, we create a new token annotate it // with the word representing it, its pos tag and its lemma for (CoreLabel token : stanfordSentence.get(TokensAnnotation.class)) { // this is the text of the token Token t = new Token(token.originalText()); // this is the POS tag of the token t.setPoS(token.tag()); // this is the lemma of the ttoken t.setLemma(token.lemma()); //add the token to the sentence distilledSentence.addToken(t); } //add the sentence to document ((DocumentComposite) component).addComponent(distilledSentence); } }
From source file:linguistic.Frame.java
public void createMap(String text, Map<String, WordInfo> map) throws IOException, ClassNotFoundException { String tag = ""; Annotation document = null;// w ww .j a v a 2 s.c om document = new Annotation(text); pipeline.annotate(document); List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class); for (CoreMap sentence : sentences) { for (CoreLabel word1 : sentence.get(CoreAnnotations.TokensAnnotation.class)) { tag = word1.tag(); textTag += word1.originalText() + "_" + tag + " "; String word = word1.originalText().toLowerCase(); char c = word.charAt(0); if (!tag.equals("CD") && !tag.equals("CC") && !tag.equals("NNS") && !tag.equals("NNP") && !tag.equals(".") && !tag.equals(":") && !tag.equals(";") && !tag.equals(",") && isLetter(c)) { if (map.containsKey(word)) { int fr = map.get(word).getFrequency() + 1; if (map.get(word).getTag().contains("/" + tag + "/")) { wordTagAmountMap.put(word + " - " + tag, wordTagAmountMap.get(word + " - " + tag) + 1); map.put(word, new WordInfo(fr, map.get(word).getTag(), getBasicForm(word))); } else { wordTagAmountMap.put(word + " - " + tag, 1); map.put(word, new WordInfo(fr, map.get(word).getTag() + tag + "/", getBasicForm(word))); } } else { wordTagAmountMap.put(word + " - " + tag, 1); map.put(word, new WordInfo(1, "/" + tag + "/", getBasicForm(word))); } } } } /*StringBuffer word = new StringBuffer(); for (int i = 0; i < text.length(); i++) { char c = text.charAt(i); if ( isLetter(c) ) { word.append(toLower(c)); } else { if ( word.length() != 0 ) { if(c == 150 || c == 151 || c == 45){ if(c > 0 && (text.charAt(i - 1) == 150 || text.charAt(i - 1) == 151 || text.charAt(i - 1) == 45 || isLetter(text.charAt(i + 1)) == false) ){ continue; } word.append(c); } else{ String tag = getTagger(word.toString()); /*if(wordsDictionaryMap.get(word).getTag().contains("/" + tag + "/")) { wordTagAmountMap.put(word + " - " + tag, wordTagAmountMap.get(word + " - " + tag) + 1); wordsDictionaryMap.put(word, new WordInfo(fr, 
wordsDictionaryMap.get(word).getTag(), getBasicForm(word))); } else { wordTagAmountMap.put(word + " - " + tag, 1); wordsDictionaryMap.put(word, new WordInfo(fr, wordsDictionaryMap.get(word).getTag() + tag + "/", getBasicForm(word))); }*/ /*if (wordsDictionaryMap.containsKey(word.toString())) { int fr = wordsDictionaryMap.get(word.toString()).getFrequency() + 1; if(wordsDictionaryMap.get(word.toString()).getTag().contains("/" + tag + "/")) { wordTagAmountMap.put(word.toString() + " - " + tag, wordTagAmountMap.get(word.toString() + " - " + tag) + 1); wordsDictionaryMap.put(word.toString(), new WordInfo(fr, wordsDictionaryMap.get(word.toString()).getTag(), getBasicForm(word.toString()))); } else { wordTagAmountMap.put(word + " - " + tag, 1); wordsDictionaryMap.put(word.toString(), new WordInfo(fr, wordsDictionaryMap.get(word.toString()).getTag() + tag + "/", getBasicForm(word.toString()))); } //wordsMap.put(word.toString(), new WordInfo(fr, getTagger(word.toString()), getBasicForm(word.toString()))); } else { wordTagAmountMap.put(word + " - " + tag, 1); wordsDictionaryMap.put(word.toString(), new WordInfo(1, "/" + getTagger(word.toString() + "/"), getBasicForm(word.toString()))); } word.delete(0, word.length()); } } } }*/ }