Usage examples for the method edu.stanford.nlp.ling.CoreLabel.toString(), collected from open-source projects.
@Override
public String toString()
From source file:SentencePair.java
License:Open Source License
private void createSentence(String text, List<POSTaggedToken> sentence) { Annotation d = new Annotation(text); nlp.annotate(d);// w w w . j a v a 2 s .c om for (CoreMap ss : d.get(CoreAnnotations.SentencesAnnotation.class)) { for (CoreLabel token : ss.get(CoreAnnotations.TokensAnnotation.class)) { sentence.add(new POSTaggedToken(token.toString(), translateTag(token.tag()))); } } }
From source file:edu.cmu.geolocator.nlp.StanfordCoreTools.StanfordNLP.java
License:Apache License
public void DoAll(String data, String[] TokenizedData, String[] POSTags, String[] LEMMA, Map<String, String> parentEdge, Map<String, ArrayList<String>> childrenEdge) { //if(document == null) {// w ww . j av a2 s . c o m document = new Annotation(data); pipelineTags.annotate(document); } int i = 0; List<CoreLabel> tokens = document.get(TokensAnnotation.class); for (CoreLabel token : tokens) { String wPOS = token.get(PartOfSpeechAnnotation.class); String wNER = token.get(NamedEntityTagAnnotation.class); String wLEMMA = token.get(LemmaAnnotation.class); TokenizedData[i] = token.toString(); POSTags[i] = wPOS; LEMMA[i] = wLEMMA; i++; } List<CoreMap> sentences = document.get(SentencesAnnotation.class); if (sentences.size() > 0) { SemanticGraph tree = sentences.get(0).get(BasicDependenciesAnnotation.class); //System.out.println(data+"\n"+tree.toString()+"\n"); createEdgeMap(tree, parentEdge, childrenEdge); } }
From source file:edu.cmu.geolocator.nlp.StanfordCoreTools.StanfordNLP.java
License:Apache License
/**
 * Tokenizes {@code data} with the CoreNLP pipeline and returns the token
 * strings in document order.
 *
 * @param data raw input text
 * @return one string per token, as produced by {@code CoreLabel.toString()}
 */
public String[] StringTokenizer(String data) {
    document = new Annotation(data);
    pipeline.annotate(document);
    List<CoreLabel> labels = document.get(TokensAnnotation.class);
    String[] result = new String[labels.size()];
    int idx = 0;
    for (CoreLabel label : labels) {
        result[idx++] = label.toString();
    }
    return result;
}
From source file:eu.fbk.dkm.sectionextractor.pantheon.WikipediaGoodTextExtractor.java
License:Apache License
/**
 * Extracts the cleaned text of one Wikipedia page and writes it to disk, grouped
 * into numbered subfolders (and per-category folders when a category mapping
 * exists), either as a NAF document or as a plain-text file, optionally
 * tokenized with Stanford CoreNLP.
 *
 * Fixes over the original: guards the {@code idCategory} dereference against
 * null (the original only null-checked it on the early-return path), uses
 * {@code containsKey} instead of {@code keySet().contains}, closes the writer
 * via try-with-resources, logs exceptions with their cause, and uses
 * StringBuilder (no synchronization needed here).
 *
 * @param text   raw wiki markup of the page
 * @param title  page title (underscores are normalized to spaces for display)
 * @param wikiID numeric page id; also determines the output folder/file names
 */
@Override
public void contentPage(String text, String title, int wikiID) {
    if (pagesToConsider != null && !pagesToConsider.contains(title)) {
        return; // page not in the whitelist
    }
    if (idCategory != null && !idCategory.containsKey(wikiID)) {
        return; // page has no category mapping
    }
    try {
        String okTitle = title.trim().replace('_', ' ');
        String folderNumberName = Integer.toString(wikiID / MAX_FILES_PER_FOLDER);
        String fileName = Integer.toString(wikiID);

        String folderName = baseFolder.getAbsolutePath() + File.separator + folderNumberName;
        // BUGFIX: original dereferenced idCategory here without a null check.
        if (idCategory != null && idCategory.get(wikiID) != null) {
            folderName = baseFolder.getAbsolutePath() + File.separator + idCategory.get(wikiID)
                    + File.separator + folderNumberName;
        }
        File folder = new File(folderName);
        if (!folder.exists()) {
            folder.mkdirs();
        }

        fileName += NAFformat ? ".naf" : ".txt";
        File file = new File(folder.getAbsolutePath() + File.separator + fileName);

        // Parse the wiki markup, then drop category lines and "new:" lines,
        // keeping empty lines, with the normalized title as a two-line header.
        WikipediaText wikipediaText = new WikipediaText();
        String rawText = wikipediaText.parse(text, null);
        StringBuilder cleaned = new StringBuilder();
        cleaned.append(okTitle).append("\n").append("\n");
        for (String line : Splitter.on('\n').trimResults().splitToList(rawText)) {
            if (line.startsWith(categoryPrefix) || line.startsWith("new:")) {
                continue;
            }
            cleaned.append(line).append("\n");
        }
        rawText = cleaned.toString();

        if (useStanford) {
            // Replace the raw text with one token per line, sentences separated by eosTag.
            Annotation myDoc = new Annotation(rawText);
            pipeline.annotate(myDoc);
            StringBuilder tokenized = new StringBuilder();
            for (CoreMap sent : myDoc.get(CoreAnnotations.SentencesAnnotation.class)) {
                for (CoreLabel token : sent.get(CoreAnnotations.TokensAnnotation.class)) {
                    tokenized.append(codeToParenthesis(token.toString())).append("\n");
                }
                tokenized.append(eosTag).append("\n");
            }
            rawText = tokenized.toString();
        }

        if (NAFformat) {
            final KAFDocument document = new KAFDocument("en", "v3");
            document.setRawText(rawText);
            document.createPublic();
            document.getPublic().uri = String.format("https://%s.wikipedia.org/wiki/%s",
                    getLocale().getLanguage(), title);
            document.createFileDesc();
            document.getFileDesc().author = "MediaWiki";
            document.getFileDesc().filename = fileName;
            document.getFileDesc().filetype = "Wikipedia article";
            document.getFileDesc().title = okTitle;
            document.save(file.getAbsolutePath());
        } else {
            logger.debug("Writing file " + file.getAbsolutePath());
            // try-with-resources closes the writer even if write() throws.
            try (BufferedWriter writer = new BufferedWriter(new FileWriter(file))) {
                writer.write(rawText);
            } catch (Exception e) {
                logger.error(e.getMessage(), e);
            }
        }
    } catch (Exception e) {
        // Best-effort batch processing: record the failure and continue with other pages.
        logger.error("Error processing page " + title + " (" + wikiID + ")", e);
    }
}
From source file:hmt.hocmay.utility.Ultilities.java
public static ArrayList<String> getWords(String filePath) { ArrayList<String> str = new ArrayList<String>(); PTBTokenizer<CoreLabel> ptbt;// www.j a v a2s . com try { ptbt = new PTBTokenizer<>(new FileReader(filePath), new CoreLabelTokenFactory(), "untokenizable=allDelete"); while (ptbt.hasNext()) { CoreLabel label = ptbt.next(); str.add(label.toString().toLowerCase()); // System.out.println(label.toString().toLowerCase()); } //Sp xp t Collections.sort(str, new Comparator<String>() { public int compare(String str1, String str2) { // System.out.println(str1+" "+str2); if (str1.compareTo(str2) > 0) { return 1; } else if (str1.compareTo(str2) == 0) { return 0; } else { return -1; } } }); //Loi b? t ging nhau for (int i = 0; i < str.size() - 1; i++) { if (str.get(i).equals(str.get(i + 1))) { str.remove(i); i--; } } } catch (FileNotFoundException ex) { } return str; }
From source file:nlp.pipeline.SentenceUtil.java
License:Open Source License
/**
 * Returns a list of relation strings recording tense, number and aspect
 * information about the input words, e.g. {@code tense(PAST, Verb)} or
 * {@code number(SINGULAR, Noun)}.
 *
 * Auxiliary tokens themselves (dependents of an aux(...) relation) are skipped;
 * non-finite verbs (VBG/VB/VBN) derive their tense and aspect from the lemma and
 * POS of their governing auxiliary, looked up by token index.
 *
 * @param tokens           tokens of one sentence, in order
 * @param dependenciesList dependency relations rendered as strings, e.g. "aux(word-i, word-j)"
 * @return relation strings of the form tense(...), number(...), aspect(...)
 */
public static List<String> findPOSInformation(List<CoreLabel> tokens, List<String> dependenciesList) {
    List<String> posInformation = Lists.newArrayList();
    for (CoreLabel label : tokens) {
        // NOTE(review): label.toString() is interpolated unescaped into the regex; a
        // token containing regex metacharacters would corrupt the pattern — confirm
        // upstream token strings are regex-safe (Pattern.quote would be safer).
        Pattern auxPattern = Pattern.compile("aux\\(.*, " + label.toString() + "\\)");
        boolean isAux = false;
        for (String dep : dependenciesList) {
            if (auxPattern.matcher(dep).find()) {
                isAux = true;
                break;
            }
        }
        if (!isAux) {
            boolean progressive = false;
            boolean perfect = false;
            String pos = label.get(PartOfSpeechAnnotation.class);
            if (LangLib.POS_VBD.equals(pos)) {
                // Finite past-tense verb.
                posInformation.add(makeBinaryRelationship("tense", LangLib.TENSE_PAST, label.toString()));
            } else if (LangLib.POS_VBP.equals(pos) || LangLib.POS_VBZ.equals(pos)) {
                // Finite present-tense verb.
                posInformation.add(makeBinaryRelationship("tense", LangLib.TENSE_PRESENT, label.toString()));
            } else if (LangLib.POS_VBG.equals(pos) || LangLib.POS_VB.equals(pos)
                    || LangLib.POS_VBN.equals(pos)) {
                // Non-finite verb: find its auxiliary via aux(thisToken, aux-N);
                // the trailing -(\d+) captures the auxiliary's 1-based token index.
                Pattern reverseAuxPattern = Pattern.compile("aux\\(" + label.toString() + ", .*-(\\d+)\\)");
                for (String dep : dependenciesList) {
                    Matcher auxMatcher = reverseAuxPattern.matcher(dep);
                    if (auxMatcher.find()) {
                        int i = Integer.parseInt(auxMatcher.group(1));
                        CoreLabel t = tokens.get(i - 1); // dependency indices are 1-based
                        if (t.get(LemmaAnnotation.class).equals("be")) {
                            // "be" auxiliary: its POS fixes the tense; marks progressive aspect.
                            if (t.get(PartOfSpeechAnnotation.class).equals(LangLib.POS_VBP)
                                    || t.get(PartOfSpeechAnnotation.class).equals(LangLib.POS_VBZ)) {
                                posInformation.add(makeBinaryRelationship("tense",
                                        LangLib.TENSE_PRESENT, label.toString()));
                            } else if (t.get(PartOfSpeechAnnotation.class).equals(LangLib.POS_VBD)) {
                                posInformation.add(makeBinaryRelationship("tense",
                                        LangLib.TENSE_PAST, label.toString()));
                            }
                            progressive = true;
                        } else if (t.get(LemmaAnnotation.class).equals("will")) {
                            // "will" auxiliary marks the future tense.
                            posInformation.add(makeBinaryRelationship("tense",
                                    LangLib.TENSE_FUTURE, label.toString()));
                        } else if (t.get(LemmaAnnotation.class).equals("have")) {
                            // "have" auxiliary: its POS fixes the tense; marks perfect aspect.
                            if (t.get(PartOfSpeechAnnotation.class).equals(LangLib.POS_VBP)
                                    || t.get(PartOfSpeechAnnotation.class).equals(LangLib.POS_VBZ)) {
                                posInformation.add(makeBinaryRelationship("tense",
                                        LangLib.TENSE_PRESENT, label.toString()));
                            } else if (t.get(PartOfSpeechAnnotation.class).equals(LangLib.POS_VBD)) {
                                posInformation.add(makeBinaryRelationship("tense",
                                        LangLib.TENSE_PAST, label.toString()));
                            }
                            perfect = true;
                        }
                    }
                }
            } else if (LangLib.POS_NN.equals(pos) || LangLib.POS_NNP.equals(pos)) {
                // Singular (proper) noun.
                posInformation.add(makeBinaryRelationship("number", LangLib.NUMBER_SINGULAR, label.toString()));
            } else if (LangLib.POS_NNS.equals(pos) || LangLib.POS_NNPS.equals(pos)) {
                // Plural (proper) noun.
                posInformation.add(makeBinaryRelationship("number", LangLib.NUMBER_PLURAL, label.toString()));
            }
            // Aspect flags set above (only reachable via the non-finite-verb branch).
            if (progressive && perfect) {
                posInformation.add(makeBinaryRelationship("aspect",
                        LangLib.ASPECT_PROGRESSIVE_PERFECT, label.toString()));
            } else if (progressive) {
                posInformation.add(makeBinaryRelationship("aspect",
                        LangLib.ASPECT_PROGRESSIVE, label.toString()));
            } else if (perfect) {
                posInformation.add(makeBinaryRelationship("aspect",
                        LangLib.ASPECT_PERFECT, label.toString()));
            }
        }
    }
    return posInformation;
}
From source file:pltag.util.Utils.java
License:Open Source License
/**
 * Tokenizes {@code line} with the PTB tokenizer (ASCII quotes, deleting
 * untokenizable characters) and returns the tokens joined by single spaces,
 * dropping quote meta-tokens and PTB bracket tokens (-LRB-, -RRB-, -LCB-, -RCB-).
 *
 * Fix over the original: the raw-type {@code new PTBTokenizer(...)} construction
 * is replaced by the diamond form, removing the unchecked-conversion warning.
 *
 * @param line raw input line
 * @return space-joined tokens, with quote/bracket meta-tokens removed
 */
public static String tokenizeStanford(String line) {
    StringBuilder str = new StringBuilder();
    Tokenizer<CoreLabel> tokenizer = new PTBTokenizer<>(new StringReader(line),
            new CoreLabelTokenFactory(), "asciiQuotes=true untokenizable=allDelete");
    while (tokenizer.hasNext()) {
        CoreLabel label = tokenizer.next();
        // Skip `` '' " and -LRB-/-RRB-/-LCB-/-RCB- style bracket tokens.
        if (!label.toString().matches("``|\'\'|\"|-[LR][RCR]B-")) {
            str.append(label).append(" ");
        }
    }
    return str.toString().trim();
}
From source file:process.PTBTokenizer.java
License:Open Source License
/**
 * Tokenizes the contents of {@code r} with a PTBTokenizer and writes tokens to
 * {@code writer}, returning the number of tokens seen. Tokens are counted even
 * while printing is suppressed by {@code parseInsidePattern}.
 *
 * @param r                 source of text to tokenize
 * @param parseInsidePattern when non-null, a pattern whose group(1) is empty for a
 *                          start element and non-empty (a slash) for an end element;
 *                          printing is enabled only between matching start/end tokens
 * @param options           PTBTokenizer option string
 * @param preserveLines     keep the input's line structure (space-joined tokens per line)
 *                          instead of one token per output line
 * @param dump              write the exhaustive CoreLabel dump instead of the token text
 * @param lowerCase         lower-case token text (English locale) before writing
 * @return number of tokens produced by the tokenizer
 * @throws IOException if writing to {@code writer} fails
 */
private static int tokReader(Reader r, BufferedWriter writer, Pattern parseInsidePattern, String options,
        boolean preserveLines, boolean dump, boolean lowerCase) throws IOException {
    int numTokens = 0;
    boolean beginLine = true;
    // Start off printing, unless we are waiting for a start entity.
    boolean printing = (parseInsidePattern == null);
    Matcher m = null;
    if (parseInsidePattern != null) {
        m = parseInsidePattern.matcher(""); // created once, reset per token, as a performance hack
    }
    for (PTBTokenizer<CoreLabel> tokenizer = new PTBTokenizer<CoreLabel>(r, new CoreLabelTokenFactory(),
            options); tokenizer.hasNext();) {
        CoreLabel obj = tokenizer.next();
        // Original had a replaceFirst("\n+$", "") here for a since-corrected lexer bug.
        String origStr = obj.get(CoreAnnotations.TextAnnotation.class);
        String str;
        if (lowerCase) {
            str = origStr.toLowerCase(Locale.ENGLISH);
            obj.set(CoreAnnotations.TextAnnotation.class, str);
        } else {
            str = origStr;
        }
        if (m != null && m.reset(origStr).matches()) {
            // Turn printing on if there is no end-element slash; turn it off if there is.
            printing = m.group(1).isEmpty();
        } else if (printing) {
            if (dump) {
                // After having checked for tags, switch to the exhaustive representation.
                str = obj.toString();
            }
            if (preserveLines) {
                if (PTBLexer.NEWLINE_TOKEN.equals(origStr)) {
                    beginLine = true;
                    writer.newLine();
                } else {
                    // Space-separate tokens within a line.
                    if (!beginLine) {
                        writer.write(' ');
                    } else {
                        beginLine = false;
                    }
                    writer.write(str);
                }
            } else {
                // One token per output line.
                writer.write(str);
                writer.newLine();
            }
        }
        numTokens++;
    }
    return numTokens;
}
From source file:semRewrite.substitutor.MUC.java
License:Open Source License
/**************************************************************** * @return a list of sentences with tokens *//*from w w w. jav a2 s. c o m*/ public ArrayList<ArrayList<String>> toCoref(String input) { //System.out.println("INFO in MUC.toCoref(): " + input); //System.out.println("INFO in MUC.toCoref(): " + input); List<Coref> corefs = buildCorefList(input); ArrayList<ArrayList<String>> results = new ArrayList<ArrayList<String>>(); StanfordCoreNLP pipeline = initPipeline(); document2 = new Annotation(input); System.out.println("MUC.toCoref(): after annotation"); try { pipeline.annotate(document2); //HybridCorefAnnotator hcoref = new HybridCorefAnnotator(props); //hcoref.annotate(document); } catch (Exception e) { System.out.println("input: " + input); System.out.println(e.getMessage()); e.printStackTrace(); } List<CoreMap> sentences = document2.get(CoreAnnotations.SentencesAnnotation.class); //SentenceUtil.printCorefChain(document); System.out.println("Stanford corefs: "); Map<Integer, CorefChain> graph = document2.get(CorefCoreAnnotations.CorefChainAnnotation.class); printStanfordCorefList(graph); for (CoreMap sentence : sentences) { //System.out.println(sentence); ArrayList<String> tokenList = new ArrayList<>(); //results.add(sentence.get(CoreAnnotations.TextAnnotation.class)); List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class); for (CoreLabel t : tokens) { String t2 = t.toString(); if (t2.startsWith("-LRB-")) t2 = t2.replace("-LRB-", "("); if (t2.startsWith("-RRB-")) t2 = t2.replace("-RRB-", ")"); if (t2.startsWith("``")) t2 = t2.replace("``", "\""); if (t2.startsWith("''")) t2 = t2.replace("''", "\""); // -LCB-, -RCB-, ??? System.out.print(t2 + " "); tokenList.add(t2); } results.add(tokenList); System.out.println(); } return results; }
From source file:tr.edu.gsu.nerwip.recognition.internal.modelbased.stanford.StanfordConverter.java
License:Open Source License
@Override protected void writeRawResults(Article article, List<List<CoreLabel>> data) throws IOException { StringBuffer buffer = new StringBuffer(); for (List<CoreLabel> sentence : data) { for (CoreLabel expression : sentence) { String typeStr = expression.get(CoreAnnotations.AnswerAnnotation.class); // we ignore tokens without type if (!typeStr.equals(NOT_ENTITY)) { String string = expression.toString(); buffer.append(string + "\n"); }// w w w . j a v a2s. com } } writeRawResultsStr(article, buffer.toString()); }