Example usage for edu.stanford.nlp.ling CoreLabel toString

List of usage examples for edu.stanford.nlp.ling CoreLabel toString

Introduction

On this page you can find example usages of edu.stanford.nlp.ling CoreLabel toString.

Prototype

@Override
    public String toString() 

Source Link

Usage

From source file:SentencePair.java

License:Open Source License

/**
 * Runs the NLP pipeline over {@code text} and appends one
 * {@link POSTaggedToken} per token (token string plus translated POS tag)
 * to the supplied {@code sentence} list.
 */
private void createSentence(String text, List<POSTaggedToken> sentence) {
    Annotation annotated = new Annotation(text);
    nlp.annotate(annotated);

    List<CoreMap> sents = annotated.get(CoreAnnotations.SentencesAnnotation.class);
    for (CoreMap sent : sents) {
        List<CoreLabel> tokens = sent.get(CoreAnnotations.TokensAnnotation.class);
        for (CoreLabel tok : tokens) {
            sentence.add(new POSTaggedToken(tok.toString(), translateTag(tok.tag())));
        }
    }
}

From source file:edu.cmu.geolocator.nlp.StanfordCoreTools.StanfordNLP.java

License:Apache License

/**
 * Annotates {@code data} with the tagging pipeline and fills the
 * caller-supplied output arrays with, per token, its string form, POS tag
 * and lemma. Also populates the dependency-edge maps from the first
 * sentence's basic dependency tree.
 *
 * NOTE(review): assumes {@code TokenizedData}, {@code POSTags} and
 * {@code LEMMA} are at least as long as the token count, otherwise an
 * ArrayIndexOutOfBoundsException is thrown — confirm with callers.
 */
public void DoAll(String data, String[] TokenizedData, String[] POSTags, String[] LEMMA,
        Map<String, String> parentEdge, Map<String, ArrayList<String>> childrenEdge) {
    //if(document == null)
    {
        document = new Annotation(data);
        pipelineTags.annotate(document);
    }
    int i = 0;
    List<CoreLabel> tokens = document.get(TokensAnnotation.class);

    for (CoreLabel token : tokens) {
        // The NER tag used to be fetched here but was never used; the dead
        // lookup has been removed.
        TokenizedData[i] = token.toString();
        POSTags[i] = token.get(PartOfSpeechAnnotation.class);
        LEMMA[i] = token.get(LemmaAnnotation.class);
        i++;
    }
    List<CoreMap> sentences = document.get(SentencesAnnotation.class);
    // Guard against a missing sentence annotation as well as an empty result
    // (the original would NPE if the annotation was absent).
    if (sentences != null && !sentences.isEmpty()) {
        SemanticGraph tree = sentences.get(0).get(BasicDependenciesAnnotation.class);
        createEdgeMap(tree, parentEdge, childrenEdge);
    }
}

From source file:edu.cmu.geolocator.nlp.StanfordCoreTools.StanfordNLP.java

License:Apache License

/**
 * Tokenizes {@code data} with the Stanford pipeline and returns the token
 * strings in document order.
 */
public String[] StringTokenizer(String data) {
    document = new Annotation(data);
    pipeline.annotate(document);
    List<CoreLabel> tokenList = document.get(TokensAnnotation.class);
    String[] result = new String[tokenList.size()];
    for (int idx = 0; idx < tokenList.size(); idx++) {
        result[idx] = tokenList.get(idx).toString();
    }
    return result;
}

From source file:eu.fbk.dkm.sectionextractor.pantheon.WikipediaGoodTextExtractor.java

License:Apache License

/**
 * Extracts the plain text of one Wikipedia page and writes it to disk,
 * either as raw text or as a NAF document, sharded into numbered
 * sub-folders (optionally grouped by category).
 *
 * @param text   raw wiki markup of the page
 * @param title  page title (underscores are normalized to spaces)
 * @param wikiID numeric page id, used for folder sharding and file name
 */
@Override
public void contentPage(String text, String title, int wikiID) {

    // Skip pages we were not asked to process.
    if (pagesToConsider != null && !pagesToConsider.contains(title)) {
        return;
    }
    if (idCategory != null && !idCategory.keySet().contains(wikiID)) {
        return;
    }

    try {

        String okTitle = title.trim().replace('_', ' ');

        String folderNumberName = Integer.toString(wikiID / MAX_FILES_PER_FOLDER);
        String fileName = Integer.toString(wikiID);

        String folderName = baseFolder.getAbsolutePath() + File.separator + folderNumberName;
        // BUGFIX: guard against idCategory being null — the filter above only
        // applies when idCategory is non-null, so execution can reach this
        // point with idCategory == null and the original code threw an NPE.
        if (idCategory != null && idCategory.get(wikiID) != null) {
            folderName = baseFolder.getAbsolutePath() + File.separator + idCategory.get(wikiID) + File.separator
                    + folderNumberName;
        }
        File folder = new File(folderName);
        if (!folder.exists()) {
            folder.mkdirs();
        }

        fileName += NAFformat ? ".naf" : ".txt";
        String defFileName = folder.getAbsolutePath() + File.separator + fileName;

        File file = new File(defFileName);

        // Parse the wiki markup into raw text (append() handles whatever
        // type parse() returns, exactly as before).
        StringBuilder buffer = new StringBuilder();
        WikipediaText wikipediaText = new WikipediaText();
        buffer.append(wikipediaText.parse(text, null));

        String rawText = buffer.toString();

        // Prepend the title and drop category and "new:" lines.
        buffer = new StringBuilder();
        buffer.append(okTitle).append("\n").append("\n");
        List<String> strings = Splitter.on('\n').trimResults()./*omitEmptyStrings().*/splitToList(rawText);
        for (String line : strings) {
            if (line.startsWith(categoryPrefix) || line.startsWith("new:")) {
                continue;
            }
            buffer.append(line).append("\n");
        }

        rawText = buffer.toString();

        if (useStanford) {
            // Re-tokenize with Stanford CoreNLP: one token per line, with an
            // eosTag line after each sentence.
            Annotation myDoc = new Annotation(rawText);
            pipeline.annotate(myDoc);

            StringBuilder tokenizedString = new StringBuilder();

            List<CoreMap> sents = myDoc.get(CoreAnnotations.SentencesAnnotation.class);
            for (CoreMap thisSent : sents) {
                List<CoreLabel> tokens = thisSent.get(CoreAnnotations.TokensAnnotation.class);
                for (CoreLabel token : tokens) {
                    tokenizedString.append(codeToParenthesis(token.toString())).append("\n");
                }
                tokenizedString.append(eosTag).append("\n");
            }

            rawText = tokenizedString.toString();
        }

        if (NAFformat) {

            final KAFDocument document = new KAFDocument("en", "v3");

            document.setRawText(rawText);

            document.createPublic();
            document.getPublic().uri = String.format("https://%s.wikipedia.org/wiki/%s",
                    getLocale().getLanguage(), title);

            document.createFileDesc();
            document.getFileDesc().author = "MediaWiki";
            document.getFileDesc().filename = fileName;
            document.getFileDesc().filetype = "Wikipedia article";
            document.getFileDesc().title = okTitle;

            document.save(file.getAbsolutePath());
        } else {
            try {
                logger.debug("Writing file " + file.getAbsolutePath());
                // try-with-resources: the original leaked the writer when
                // write() threw before close().
                try (BufferedWriter writer = new BufferedWriter(new FileWriter(file))) {
                    writer.write(rawText);
                }
            } catch (Exception e) {
                logger.error(e.getMessage());
            }
        }

    } catch (Exception e) {
        // Include the exception message — the original discarded all detail.
        logger.error("Error processing page " + title + " (" + wikiID + "): " + e.getMessage());
    }
}

From source file:hmt.hocmay.utility.Ultilities.java

/**
 * Reads the file at {@code filePath}, tokenizes it with the PTB tokenizer,
 * lower-cases each token and returns the distinct tokens sorted in natural
 * String order. Returns an empty list when the file cannot be read
 * (best-effort, as in the original).
 */
public static ArrayList<String> getWords(String filePath) {
    ArrayList<String> str = new ArrayList<String>();
    // try-with-resources: the original never closed the FileReader (leak).
    try (FileReader reader = new FileReader(filePath)) {
        PTBTokenizer<CoreLabel> ptbt = new PTBTokenizer<>(reader, new CoreLabelTokenFactory(),
                "untokenizable=allDelete");
        while (ptbt.hasNext()) {
            str.add(ptbt.next().toString().toLowerCase());
        }
        // Sort the words. The original supplied a comparator that merely
        // reproduced natural String ordering, so plain sort() is equivalent.
        Collections.sort(str);
        // Remove duplicates: the list is sorted, so duplicates are adjacent.
        // Iterating backwards avoids the index-rewind dance of the original.
        for (int i = str.size() - 1; i > 0; i--) {
            if (str.get(i).equals(str.get(i - 1))) {
                str.remove(i);
            }
        }
    } catch (FileNotFoundException ex) {
        // Best-effort: a missing input file yields an empty list, as before.
    } catch (java.io.IOException ex) {
        // close() failure: keep whatever was read (best-effort).
    }
    return str;
}

From source file:nlp.pipeline.SentenceUtil.java

License:Open Source License

/** *************************************************************
 * Returns a list of strings that add tense, number and aspect information
 * about words in the input, derived from POS tags and aux() dependencies.
 * ex.  tense(PAST, Verb)
 *      number(SINGULAR, Noun)
 *
 * @param tokens           tokens of the sentence, in order
 * @param dependenciesList textual dependency relations, e.g. "aux(run-2, will-1)"
 * @return binary relationship strings describing tense/number/aspect
 */
public static List<String> findPOSInformation(List<CoreLabel> tokens, List<String> dependenciesList) {

    List<String> posInformation = Lists.newArrayList();
    for (CoreLabel label : tokens) {
        // BUGFIX: quote the token text — the original interpolated it raw
        // into the regex, so tokens containing metacharacters (".", "(",
        // "$", ...) corrupted the pattern or mis-matched.
        Pattern auxPattern = Pattern.compile("aux\\(.*, " + Pattern.quote(label.toString()) + "\\)");
        // Auxiliaries themselves carry no tense/number facts; skip them.
        boolean isAux = false;
        for (String dep : dependenciesList) {
            if (auxPattern.matcher(dep).find()) {
                isAux = true;
                break;
            }
        }
        if (!isAux) {
            boolean progressive = false;
            boolean perfect = false;
            String pos = label.get(PartOfSpeechAnnotation.class);
            if (LangLib.POS_VBD.equals(pos)) {
                // Simple past.
                posInformation.add(makeBinaryRelationship("tense", LangLib.TENSE_PAST, label.toString()));
            } else if (LangLib.POS_VBP.equals(pos) || LangLib.POS_VBZ.equals(pos)) {
                // Simple present.
                posInformation.add(makeBinaryRelationship("tense", LangLib.TENSE_PRESENT, label.toString()));
            } else if (LangLib.POS_VBG.equals(pos) || LangLib.POS_VB.equals(pos)
                    || LangLib.POS_VBN.equals(pos)) {
                // Participles/base forms: derive tense/aspect from their
                // auxiliary via aux(<this token>, <aux>-<index>).
                Pattern reverseAuxPattern = Pattern
                        .compile("aux\\(" + Pattern.quote(label.toString()) + ", .*-(\\d+)\\)");
                for (String dep : dependenciesList) {
                    Matcher auxMatcher = reverseAuxPattern.matcher(dep);
                    if (auxMatcher.find()) {
                        // group(1) is the auxiliary's 1-based token index.
                        int i = Integer.parseInt(auxMatcher.group(1));
                        CoreLabel t = tokens.get(i - 1);
                        if (t.get(LemmaAnnotation.class).equals("be")) {
                            // "be" aux => progressive; its POS fixes the tense.
                            if (t.get(PartOfSpeechAnnotation.class).equals(LangLib.POS_VBP)
                                    || t.get(PartOfSpeechAnnotation.class).equals(LangLib.POS_VBZ)) {
                                posInformation.add(makeBinaryRelationship("tense", LangLib.TENSE_PRESENT,
                                        label.toString()));
                            } else if (t.get(PartOfSpeechAnnotation.class).equals(LangLib.POS_VBD)) {
                                posInformation.add(
                                        makeBinaryRelationship("tense", LangLib.TENSE_PAST, label.toString()));
                            }
                            progressive = true;
                        } else if (t.get(LemmaAnnotation.class).equals("will")) {
                            // "will" aux => future tense.
                            posInformation.add(
                                    makeBinaryRelationship("tense", LangLib.TENSE_FUTURE, label.toString()));
                        } else if (t.get(LemmaAnnotation.class).equals("have")) {
                            // "have" aux => perfect; its POS fixes the tense.
                            if (t.get(PartOfSpeechAnnotation.class).equals(LangLib.POS_VBP)
                                    || t.get(PartOfSpeechAnnotation.class).equals(LangLib.POS_VBZ)) {
                                posInformation.add(makeBinaryRelationship("tense", LangLib.TENSE_PRESENT,
                                        label.toString()));
                            } else if (t.get(PartOfSpeechAnnotation.class).equals(LangLib.POS_VBD)) {
                                posInformation.add(
                                        makeBinaryRelationship("tense", LangLib.TENSE_PAST, label.toString()));
                            }
                            perfect = true;
                        }
                    }
                }
            } else if (LangLib.POS_NN.equals(pos) || LangLib.POS_NNP.equals(pos)) {
                posInformation.add(makeBinaryRelationship("number", LangLib.NUMBER_SINGULAR, label.toString()));
            } else if (LangLib.POS_NNS.equals(pos) || LangLib.POS_NNPS.equals(pos)) {
                posInformation.add(makeBinaryRelationship("number", LangLib.NUMBER_PLURAL, label.toString()));
            }

            // Aspect is determined by which auxiliaries were seen above.
            if (progressive && perfect) {
                posInformation.add(
                        makeBinaryRelationship("aspect", LangLib.ASPECT_PROGRESSIVE_PERFECT, label.toString()));
            } else if (progressive) {
                posInformation
                        .add(makeBinaryRelationship("aspect", LangLib.ASPECT_PROGRESSIVE, label.toString()));
            } else if (perfect) {
                posInformation.add(makeBinaryRelationship("aspect", LangLib.ASPECT_PERFECT, label.toString()));
            }
        }
    }
    return posInformation;
}

From source file:pltag.util.Utils.java

License:Open Source License

/**
 * Tokenizes {@code line} with the PTB tokenizer (ASCII quotes, dropping
 * untokenizable characters) and returns the tokens joined by single spaces,
 * with quote and bracket placeholder tokens (``, '', ", -LRB- etc.) removed.
 */
public static String tokenizeStanford(String line) {
    StringBuilder str = new StringBuilder();
    // FIX: parameterize the constructor — the original used the raw type
    // `new PTBTokenizer(...)`, producing an unchecked-conversion warning.
    Tokenizer<CoreLabel> tokenizer = new PTBTokenizer<CoreLabel>(new StringReader(line),
            new CoreLabelTokenFactory(), "asciiQuotes=true untokenizable=allDelete");
    while (tokenizer.hasNext()) {
        CoreLabel label = tokenizer.next();
        // Skip quote tokens and bracket placeholders (-LRB-, -RRB-, -LCB-, -RCB-).
        if (!label.toString().matches("``|\'\'|\"|-[LR][RCR]B-"))
            str.append(label).append(" ");
    }
    return str.toString().trim();
}

From source file:process.PTBTokenizer.java

License:Open Source License

/**
 * Tokenizes everything read from {@code r} with a PTBTokenizer and writes
 * the tokens to {@code writer}, returning the total token count.
 *
 * @param r                  input to tokenize
 * @param writer             destination for token output
 * @param parseInsidePattern when non-null, a tag-matching pattern whose
 *                           group(1) is empty for a start tag; tokens are
 *                           only printed between matching start/end tags
 * @param options            PTBTokenizer option string
 * @param preserveLines      reproduce the input's line breaks (tokens on one
 *                           line separated by spaces) instead of one token
 *                           per output line
 * @param dump               print the token's full CoreLabel dump instead of
 *                           just its text
 * @param lowerCase          lower-case each token (Locale.ENGLISH)
 * @return number of tokens seen (counted even while printing is suppressed)
 */
private static int tokReader(Reader r, BufferedWriter writer, Pattern parseInsidePattern, String options,
        boolean preserveLines, boolean dump, boolean lowerCase) throws IOException {
    int numTokens = 0;
    boolean beginLine = true;
    // With no filter pattern we print every token; otherwise stay silent
    // until the pattern's start element is seen.
    boolean printing = (parseInsidePattern == null);
    Matcher m = null;
    if (parseInsidePattern != null) {
        m = parseInsidePattern.matcher(""); // create once as a performance hack; reset per token
    }
    for (PTBTokenizer<CoreLabel> tokenizer = new PTBTokenizer<CoreLabel>(r, new CoreLabelTokenFactory(),
            options); tokenizer.hasNext();) {
        CoreLabel obj = tokenizer.next();
        // String origStr =
        // obj.get(CoreAnnotations.TextAnnotation.class).replaceFirst("\n+$",
        // ""); // DanC added this to fix a lexer bug, hopefully now
        // corrected
        String origStr = obj.get(CoreAnnotations.TextAnnotation.class);
        String str;
        if (lowerCase) {
            str = origStr.toLowerCase(Locale.ENGLISH);
            obj.set(CoreAnnotations.TextAnnotation.class, str);
        } else {
            str = origStr;
        }
        if (m != null && m.reset(origStr).matches()) {
            // Tag token: group(1) captures the end-tag slash — empty means a
            // start tag (turn printing on), non-empty an end tag (turn it off).
            printing = m.group(1).isEmpty();
        } else if (printing) {
            if (dump) {
                // After having checked for tags, switch str to the token's
                // exhaustive CoreLabel representation.
                str = obj.toString();
            }
            if (preserveLines) {
                if (PTBLexer.NEWLINE_TOKEN.equals(origStr)) {
                    beginLine = true;
                    writer.newLine();
                } else {
                    // Space-separate tokens within a line.
                    if (!beginLine) {
                        writer.write(' ');
                    } else {
                        beginLine = false;
                    }
                    // writer.write(str.replace("\n", ""));
                    writer.write(str);
                }
            } else {
                // One token per output line.
                writer.write(str);
                writer.newLine();
            }
        }
        numTokens++;
    }
    return numTokens;
}

From source file:semRewrite.substitutor.MUC.java

License:Open Source License

/****************************************************************
 * Annotates {@code input} with the Stanford pipeline, prints the coref
 * chains, and returns the document as a list of sentences, each a list of
 * token strings with PTB bracket/quote placeholders mapped back to their
 * literal characters.
 *
 * @return a list of sentences with tokens
 */
public ArrayList<ArrayList<String>> toCoref(String input) {

    //System.out.println("INFO in MUC.toCoref(): " + input);
    //System.out.println("INFO in MUC.toCoref(): " + input);
    // NOTE(review): 'corefs' is never read below; kept because
    // buildCorefList() may have side effects — confirm and remove if not.
    List<Coref> corefs = buildCorefList(input);
    ArrayList<ArrayList<String>> results = new ArrayList<ArrayList<String>>();
    StanfordCoreNLP pipeline = initPipeline();
    document2 = new Annotation(input);
    System.out.println("MUC.toCoref(): after annotation");
    try {
        pipeline.annotate(document2);
        //HybridCorefAnnotator hcoref = new HybridCorefAnnotator(props);
        //hcoref.annotate(document);
    } catch (Exception e) {
        // Best-effort: log and continue with whatever was annotated.
        System.out.println("input: " + input);
        System.out.println(e.getMessage());
        e.printStackTrace();
    }
    List<CoreMap> sentences = document2.get(CoreAnnotations.SentencesAnnotation.class);
    //SentenceUtil.printCorefChain(document);
    System.out.println("Stanford corefs: ");
    Map<Integer, CorefChain> graph = document2.get(CorefCoreAnnotations.CorefChainAnnotation.class);
    printStanfordCorefList(graph);

    for (CoreMap sentence : sentences) {
        //System.out.println(sentence);
        ArrayList<String> tokenList = new ArrayList<>();
        //results.add(sentence.get(CoreAnnotations.TextAnnotation.class));
        List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
        for (CoreLabel t : tokens) {
            String t2 = t.toString();
            // Map PTB placeholder tokens back to their literal characters.
            if (t2.startsWith("-LRB-"))
                t2 = t2.replace("-LRB-", "(");
            if (t2.startsWith("-RRB-"))
                t2 = t2.replace("-RRB-", ")");
            if (t2.startsWith("``"))
                t2 = t2.replace("``", "\"");
            if (t2.startsWith("''"))
                t2 = t2.replace("''", "\"");
            // -LCB-, -RCB- (curly-brace placeholders) are not handled here.
            System.out.print(t2 + " ");
            tokenList.add(t2);
        }
        results.add(tokenList);
        System.out.println();
    }
    return results;
}

From source file:tr.edu.gsu.nerwip.recognition.internal.modelbased.stanford.StanfordConverter.java

License:Open Source License

/**
 * Writes the raw recognizer output for {@code article}: one line per token
 * that carries an entity type (tokens without an AnswerAnnotation, or marked
 * {@code NOT_ENTITY}, are skipped).
 *
 * @param article the processed article
 * @param data    recognizer output, one token list per sentence
 * @throws IOException if writing the result file fails
 */
@Override
protected void writeRawResults(Article article, List<List<CoreLabel>> data) throws IOException {
    // Local, single-threaded accumulation: StringBuilder over StringBuffer.
    StringBuilder buffer = new StringBuilder();

    for (List<CoreLabel> sentence : data) {
        for (CoreLabel expression : sentence) {
            String typeStr = expression.get(CoreAnnotations.AnswerAnnotation.class);
            // BUGFIX: also skip tokens with no annotation at all — the
            // original called equals() on a possibly-null typeStr (NPE).
            if (typeStr != null && !typeStr.equals(NOT_ENTITY)) {
                buffer.append(expression.toString()).append("\n");
            }
        }
    }

    writeRawResultsStr(article, buffer.toString());
}