List of usage examples for `edu.stanford.nlp.process.Tokenizer#tokenize()`
List<T> tokenize();
From source file:ErrorCorrection.java
/** Tokenizes {@code str} into {@link CoreLabel}s using the shared tokenizer factory. */
private static List<CoreLabel> tokenize(String str) {
    return tokenizerFactory.getTokenizer(new StringReader(str)).tokenize();
}
From source file:artinex.TypDep.java
/**
 * Runs the configured tokenizer factory over the raw input text.
 *
 * @param str raw sentence text
 * @return the token sequence as {@link CoreLabel}s
 */
private List<CoreLabel> tokenize(String str) {
    final Tokenizer<CoreLabel> tok = tokenizerFactory.getTokenizer(new StringReader(str));
    return tok.tokenize();
}
From source file:de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordPtbTransformer.java
License:Open Source License
@Override public void process(JCas aInput, JCas aOutput) throws AnalysisEngineProcessException { Tokenizer<CoreLabel> tokenizer = new PTBTokenizer<CoreLabel>(new StringReader(aInput.getDocumentText()), new CoreLabelTokenFactory(), "invertible"); for (CoreLabel label : tokenizer.tokenize()) { replace(label.beginPosition(), label.endPosition(), label.word()); }//from w w w .j a v a 2 s.co m }
From source file:de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordSegmenter.java
License:Open Source License
@Override protected void process(JCas aJCas, String aText, int aZoneBegin) throws AnalysisEngineProcessException { List<Token> casTokens = null; // Use value from language parameter, document language or fallback language - whatever // is available String language = getLanguage(aJCas); if (isWriteToken()) { casTokens = new ArrayList<Token>(); final String text = aText; final Tokenizer<?> tokenizer = getTokenizer(language, aText); int offsetInSentence = 0; List<?> tokens = tokenizer.tokenize(); outer: for (int i = 0; i < tokens.size(); i++) { final Object token = tokens.get(i); // System.out.println("Token class: "+token.getClass()); String t = null;/*from www . ja v a 2s .c o m*/ if (token instanceof String) { t = (String) token; } if (token instanceof CoreLabel) { CoreLabel l = (CoreLabel) token; t = l.word(); int begin = l.get(CharacterOffsetBeginAnnotation.class); int end = l.get(CharacterOffsetEndAnnotation.class); casTokens.add(createToken(aJCas, aZoneBegin + begin, aZoneBegin + end, i)); offsetInSentence = end; continue; } if (token instanceof Word) { Word w = (Word) token; t = w.word(); } if (t == null) { throw new AnalysisEngineProcessException( new IllegalStateException("Unknown token type: " + token.getClass())); } // Skip whitespace while (isWhitespace(text.charAt(offsetInSentence))) { offsetInSentence++; if (offsetInSentence >= text.length()) { break outer; } } // Match if (text.startsWith(t, offsetInSentence)) { casTokens.add(createToken(aJCas, aZoneBegin + offsetInSentence, aZoneBegin + offsetInSentence + t.length(), i)); offsetInSentence = offsetInSentence + t.length(); } else { // System.out.println(aText); throw new AnalysisEngineProcessException(new IllegalStateException("Text mismatch. 
Tokenizer: [" + t + "] CAS: [" + text.substring(offsetInSentence, min(offsetInSentence + t.length(), text.length())))); } } } if (isWriteSentence()) { if (casTokens == null) { casTokens = selectCovered(aJCas, Token.class, aZoneBegin, aZoneBegin + aText.length()); } // Prepare the tokens for processing by WordToSentenceProcessor List<CoreLabel> tokensInDocument = new ArrayList<CoreLabel>(); for (Token token : casTokens) { CoreLabel l = new CoreLabel(); l.set(CharacterOffsetBeginAnnotation.class, token.getBegin()); l.set(CharacterOffsetEndAnnotation.class, token.getEnd()); l.setWord(token.getCoveredText()); tokensInDocument.add(l); } // The sentence splitter (probably) requires the escaped text, so we prepare it here PTBEscapingProcessor escaper = new PTBEscapingProcessor(); escaper.apply(tokensInDocument); // Apply the WordToSentenceProcessor to find the sentence boundaries WordToSentenceProcessor<CoreLabel> proc = new WordToSentenceProcessor<CoreLabel>(boundaryTokenRegex, boundaryFollowers, boundariesToDiscard, xmlBreakElementsToDiscard, regionElementRegex, newlineIsSentenceBreak, null, tokenRegexesToDiscard, isOneSentence, allowEmptySentences); List<List<CoreLabel>> sentencesInDocument = proc.process(tokensInDocument); for (List<CoreLabel> sentence : sentencesInDocument) { int begin = sentence.get(0).get(CharacterOffsetBeginAnnotation.class); int end = sentence.get(sentence.size() - 1).get(CharacterOffsetEndAnnotation.class); createSentence(aJCas, begin, end); } } }
From source file:edu.cmu.cs.in.hoop.hoops.transform.HoopSentence2Tokens.java
License:Open Source License
/** * *///from ww w . j a v a 2 s . co m public Boolean runHoop(HoopBase inHoop) { String result = ""; debug("runHoop ()"); TokenizerFactory<Word> factory = PTBTokenizerFactory.newTokenizerFactory(); ArrayList<HoopKV> inData = inHoop.getData(); if (inData != null) { HoopSimpleFeatureMaker featureMaker = new HoopSimpleFeatureMaker(); result = "Number of sentences in input :: " + inData.size(); for (int i = 0; i < inData.size(); i++) { //HoopKVInteger aKV=(HoopKVInteger) inData.get(i); HoopKV aKV = inData.get(i); HoopKV newKV = createKV(aKV); //debug ("Processing item: " + i + " with value: " + aKV.getValueAsString()); //>------------------------------------------------------------------------ if (targetTokenizer.getValue().equalsIgnoreCase("SplitOnCharacter") == true) { //debug ("Using builtin tokenizer ..."); List<String> tokens = featureMaker.unigramTokenizeOnCharacter(aKV.getValueAsString(), splitCharacter.getPropValue()); //debug ("Extracted " + tokens.size()); if (generateMode.getPropValue().equalsIgnoreCase("Add") == true) { //debug ("Generate mode is Add"); //HoopKVInteger newToken=new HoopKVInteger (); for (int j = 0; j < tokens.size(); j++) { String aToken = tokens.get(j); String strippedInput = aToken; //debug ("final input for new token: " + strippedInput); if (removePunctuation.getPropValue() == true) strippedInput = aToken.replaceAll(splitRegEx.getValue(), ""); //newToken.setKey (i); //newToken.setValue (strippedInput, j); Integer keyFormatter = i; newKV.setKeyString(keyFormatter.toString()); newKV.setValue(strippedInput, j); } //addKV (newToken); addKV(newKV); } else { //debug ("Generate mode is New"); for (int j = 0; j < tokens.size(); j++) { String aToken = tokens.get(j); String strippedInput = aToken; if (removePunctuation.getPropValue() == true) strippedInput = aToken.replaceAll(splitRegEx.getValue(), ""); //debug ("final input for new token: " + strippedInput); if (this.reKey.getPropValue() == false) { Integer keyFormatter = j; 
newKV.setKeyString(keyFormatter.toString()); newKV.setValue(strippedInput); addKV(newKV); //addKV (new HoopKVInteger (j,strippedInput)); } else { Integer keyFormatter = i; newKV.setKeyString(keyFormatter.toString()); newKV.setValue(strippedInput); addKV(newKV); //addKV (new HoopKVInteger (i,strippedInput)); } } } } //>------------------------------------------------------------------------ if (targetTokenizer.getValue().equalsIgnoreCase("RegEx") == true) { //debug ("Using builtin tokenizer ..."); List<String> tokens = featureMaker.unigramTokenizeBasic(aKV.getValueAsString()); //debug ("Extracted " + tokens.size()); if (generateMode.getPropValue().equalsIgnoreCase("Add") == true) { //debug ("Generate mode is Add"); //HoopKVInteger newToken=new HoopKVInteger (); for (int j = 0; j < tokens.size(); j++) { String aToken = tokens.get(j); String strippedInput = aToken; //debug ("final input for new token: " + strippedInput); if (removePunctuation.getPropValue() == true) strippedInput = aToken.replaceAll(splitRegEx.getValue(), ""); Integer keyFormatter = i; newKV.setKeyString(keyFormatter.toString()); newKV.setValue(strippedInput, j); } //addKV (newToken); addKV(newKV); } else { //debug ("Generate mode is New"); for (int j = 0; j < tokens.size(); j++) { String aToken = tokens.get(j); String strippedInput = aToken; if (removePunctuation.getPropValue() == true) strippedInput = aToken.replaceAll(splitRegEx.getValue(), ""); //debug ("final input for new token: " + strippedInput); if (this.reKey.getPropValue() == false) { Integer keyFormatter = j; newKV.setKeyString(keyFormatter.toString()); newKV.setValue(strippedInput); addKV(newKV); //addKV (new HoopKVInteger (j,strippedInput)); } else { Integer keyFormatter = i; newKV.setKeyString(keyFormatter.toString()); newKV.setValue(strippedInput); addKV(newKV); //addKV (new HoopKVInteger (i,strippedInput)); } } } } //>------------------------------------------------------------------------ if 
(targetTokenizer.getValue().equalsIgnoreCase("Stanford") == true) { //debug ("Using stanford tokenizer ..."); Tokenizer<Word> tokenizer = factory.getTokenizer(new StringReader(aKV.getValueAsString())); List<Word> sTokens = tokenizer.tokenize(); //debug ("Extracted " + sTokens.size()); for (int t = 0; t < sTokens.size(); t++) { Word aTerm = sTokens.get(t); if (this.reKey.getPropValue() == false) { Integer keyFormatter = t; newKV.setKeyString(keyFormatter.toString()); newKV.setValue(aTerm.toString()); addKV(newKV); //addKV (new HoopKVInteger (j,strippedInput)); } else { Integer keyFormatter = i; newKV.setKeyString(keyFormatter.toString()); newKV.setValue(aTerm.toString()); addKV(newKV); //addKV (new HoopKVInteger (i,strippedInput)); } } } //>------------------------------------------------------------------------ updateProgressStatus(i, inData.size()); } } else return (false); HoopStatisticsPanel statsPanel; if (HoopLink.getWindow("Statistics") != null) { statsPanel = (HoopStatisticsPanel) HoopLink.getWindow("Statistics"); } else { statsPanel = new HoopStatisticsPanel(); } HoopLink.addView("Statistics", statsPanel, HoopLink.bottom); statsPanel.appendString("\n" + result); return (true); }
From source file:edu.uoa.cs.master.cloudmanufacturingnlp.business.nlp.StanfordDependencies.java
License:Apache License
/** * Parse the input raw sentence, output the Stanford Dependencies. * <p>/* w ww. j a va 2 s .c om*/ * An output example is:shares-3={companyC-2=nsubj} * </p> * * @param triples * @param naturalLanguageRule * @return */ public String parseNaturalLanguage(Map<String, Map<String, String>> triples, String naturalLanguageRule) { String action = null; Tokenizer<? extends HasWord> toke = tokenizerFactory.getTokenizer(new StringReader(naturalLanguageRule)); List<? extends HasWord> sentence = toke.tokenize(); Tree parse = lp.parse(sentence); EnglishGrammaticalStructure gs = (EnglishGrammaticalStructure) gsf.newGrammaticalStructure(parse); Collection<TypedDependency> tdl = gs.typedDependencies(); for (TypedDependency dependency : tdl) { String gov = dependency.gov().toString(); String reln = dependency.reln().toString(); String dep = dependency.dep().toString(); if (triples.containsKey(gov)) { triples.get(gov).put(dep, reln); } else { Map<String, String> triple = new HashMap<String, String>(); triple.put(dep, reln); triples.put(gov, triple); } if (reln.equalsIgnoreCase(Constants.Nlp.NSUBJ)) { action = gov; } } return action; }
From source file:englishparser.EnglishParser.java
/** * demoAPI demonstrates other ways of calling the parser with already * tokenized text, or in some cases, raw text that needs to be tokenized as * a single sentence. Output is handled with a TreePrint object. Note that * the options used when creating the TreePrint can determine what results * to print out. Once again, one can capture the output by passing a * PrintWriter to TreePrint.printTree./*from w ww . j a v a2s . c om*/ */ public static void demoAPI(LexicalizedParser lp) { // This option shows parsing a list of correctly tokenized words String[] sent = { "This", "is", "an", "easy", "sentence", "." }; List<CoreLabel> rawWords = Sentence.toCoreLabelList(sent); Tree parse = lp.apply(rawWords); parse.pennPrint(); System.out.println(); // This option shows loading and using an explicit tokenizer String sent2 = "This is another sentence."; TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), ""); Tokenizer<CoreLabel> tok = tokenizerFactory.getTokenizer(new StringReader(sent2)); List<CoreLabel> rawWords2 = tok.tokenize(); parse = lp.apply(rawWords2); TreebankLanguagePack tlp = new PennTreebankLanguagePack(); GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory(); GrammaticalStructure gs = gsf.newGrammaticalStructure(parse); List<TypedDependency> tdl = gs.typedDependenciesCCprocessed(); System.out.println(tdl); System.out.println(); // You can also use a TreePrint object to print trees and dependencies TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed"); tp.printTree(parse); }
From source file:ie.pars.aclrdtec.fileutils.GetStatRawTextFile.java
License:Open Source License
public static void main(String[] ss) throws SAXException, ParserConfigurationException, IOException { String input = ss[0]; //path to the input folder GetFiles gf = new GetFiles(); gf.getCorpusFiles(input);//from ww w . ja v a 2 s .c o m List<String> annotationFiles = gf.getFiles(); System.out.println("There are " + annotationFiles.size() + " files to check!"); TokenizerFactory<Word> newTokenizerFactory = PTBTokenizer.PTBTokenizerFactory.newTokenizerFactory(); int sentenceNumber = 0; int wordSize = 0; for (String file : annotationFiles) { File f = new File(file); Document makeDOM = XMLMethod.makeDOM(file); NodeList elementsByTagName = makeDOM.getElementsByTagName("S"); sentenceNumber += elementsByTagName.getLength(); for (int i = 0; i < elementsByTagName.getLength(); i++) { String sentence = elementsByTagName.item(i).getTextContent(); StringReader sr = new StringReader(sentence); Tokenizer<Word> tokenizer = newTokenizerFactory.getTokenizer(sr); List<Word> tokenize = tokenizer.tokenize(); wordSize += tokenize.size(); } } System.out.println(sentenceNumber); System.out.println(wordSize); }
From source file:ie.pars.bnc.preprocess.ProcessNLP.java
License:Open Source License
/**
 * Tokenizes, POS-tags and parses a single sentence, then renders it as an
 * XML-ish {@code <s>} element with one line per token (via {@code line(...)}).
 *
 * @param sentence   the raw sentence text
 * @param morphology morphology used by {@code line(...)} for lemmatization
 * @param posTagger  tagger applied to the token sequence
 * @param parser     grammar used for parsing and dependency extraction
 * @param sid        sentence id written into the {@code <s id='...'>} tag
 * @return the rendered sentence block, terminated by {@code </s>}
 */
private static StringBuilder parseTheSentence(String sentence, Morphology morphology, MaxentTagger posTagger,
        ParserGrammar parser, String sid) {
    TokenizerFactory<Word> newTokenizerFactory = PTBTokenizerFactory.newTokenizerFactory();
    StringBuilder results = new StringBuilder();
    results.append("<s id='" + sid + "'>\n");
    StringReader sr = new StringReader(sentence);
    Tokenizer<Word> tokenizer = newTokenizerFactory.getTokenizer(sr);
    List<Word> tokenize = tokenizer.tokenize();
    List<TaggedWord> tagSentence = posTagger.tagSentence(tokenize);
    Tree parseTree = parser.parse(tagSentence);
    // Collapsed-tree typed dependencies, turned into a graph for head lookup.
    GrammaticalStructure gs = parser.getTLPParams().getGrammaticalStructure(parseTree,
            parser.treebankLanguagePack().punctuationWordRejectFilter(),
            parser.getTLPParams().typedDependencyHeadFinder());
    Collection<TypedDependency> deps = gs.typedDependenciesCollapsedTree();
    SemanticGraph depTree = new SemanticGraph(deps);
    for (int i = 0; i < tagSentence.size(); ++i) {
        // head = 0 marks the root; head = -1 means the token has no incoming edge.
        int head = -1;
        String deprel = null;
        // NOTE(review): rootSet is recomputed on every iteration although it is
        // loop-invariant - left as-is to keep this a documentation-only change.
        Set<Integer> rootSet = depTree.getRoots().stream().map(IndexedWord::index).collect(Collectors.toSet());
        // SemanticGraph node indices are 1-based, hence i + 1.
        IndexedWord node = depTree.getNodeByIndexSafe(i + 1);
        if (node != null) {
            List<SemanticGraphEdge> edgeList = depTree.getIncomingEdgesSorted(node);
            if (!edgeList.isEmpty()) {
                // A well-formed dependency tree gives each node at most one governor.
                assert edgeList.size() == 1;
                head = edgeList.get(0).getGovernor().index();
                deprel = edgeList.get(0).getRelation().toString();
            } else if (rootSet.contains(i + 1)) {
                head = 0;
                deprel = "ROOT";
            }
        }
        // Write the token; lexHead is the head token's tagged word (null for root/unattached).
        TaggedWord lexHead = null;
        if (head > 0) {
            lexHead = tagSentence.get(head - 1);
        }
        results.append(line(i + 1, tagSentence.get(i), morphology, head, deprel, lexHead)).append("\n");
    }
    results.append("</s>\n");
    return results;
}
From source file:info.atmykitchen.basic_annotation_convert.ConvertToBIO.java
License:Open Source License
/**
 * Converts one annotation file into an SGML-like vertical format: a {@code <doc>}
 * element containing {@code <s>} sentence elements, one token per line
 * (surface form, lemma, POS tag), with {@code <term>} spans opened/closed from
 * the annotation offsets and {@code <g/>} marking the absence of a gap between
 * adjacent tokens.
 *
 * @param file      the annotation file to convert
 * @param annotator annotator id written into the doc/s/term attributes
 * @param printer   destination for the converted output
 */
private static void convertFile(File file, String annotator, PrintWriter printer)
        throws ParserConfigurationException, IOException, Exception {
    System.out.println(file.getAbsolutePath());
    AnnotationFile annotationFile = IOMethods.loadAnnotationFile(file);
    Map<Integer, List<Annotation>> annotationLstMap = annotationFile.getAnnotationMapSentence();
    TokenizerFactory<Word> newTokenizerFactory = PTBTokenizer.PTBTokenizerFactory.newTokenizerFactory();
    String currentLabel = "";
    int previousEnd = 0;
    printer.println("<doc id=\"" + annotationFile.getAclid() + "\" title=\"" + annotationFile.getTitle()
            + "\" annotatorid=\"" + annotator + "\">");
    for (int i = 0; i < annotationFile.getSentences().size(); i++) {
        String sid = (i + 1) + "-" + annotationFile.getAclid();
        printer.println("<s id=\"" + sid + "\" annotatorid=\"" + annotator + "\">");
        String sentence = annotationFile.getSentences().get(i);
        System.out.println(sentence);
        StringReader sr = new StringReader(sentence);
        Tokenizer<Word> tokenizer = newTokenizerFactory.getTokenizer(sr);
        List<Word> tokenize = tokenizer.tokenize();
        List<TaggedWord> tagSentence = tagger.tagSentence(tokenize);
        // Annotations for this sentence, ordered by start offset below.
        List<Annotation> sentenceAnnotationList = new ArrayList<>();
        if (annotationLstMap.containsKey(i)) {
            sentenceAnnotationList = annotationLstMap.get(i);
        }
        System.out.println(sentenceAnnotationList.size());
        Collections.sort(sentenceAnnotationList, Annotation.sentnceOrderComp());
        // End offsets of currently-open <term> spans, closed as tokens pass them.
        List<Integer> toEnd = new ArrayList();
        for (int j = 0; j < tagSentence.size(); j++) {
            // Emit <g/> when this token starts exactly where the previous one ended
            // (i.e. there is no whitespace gap between them).
            if (j == 0) {
                previousEnd = tagSentence.get(j).endPosition();
            } else {
                if (previousEnd == tagSentence.get(j).beginPosition()) {
                    printer.println("<g/>");
                }
                previousEnd = tagSentence.get(j).endPosition();
            }
            int startoffset = tagSentence.get(j).beginPosition();
            // Close any open <term> whose end offset we have reached.
            if (!toEnd.isEmpty()) {
                Collections.sort(toEnd);
                while (!toEnd.isEmpty() && startoffset >= toEnd.get(0)) {
                    currentLabel = "";
                    printer.println("</term>");
                    toEnd.remove(0);
                }
            }
            // Open every annotation that starts at or before this token. This relies on
            // the corpus currently having no nested annotations; nested spans would need
            // a multi-value labeling scheme (left for future work, per the original author).
            if (!sentenceAnnotationList.isEmpty()) {
                while (!sentenceAnnotationList.isEmpty()
                        && sentenceAnnotationList.get(0).getStartOffsetSentence() <= startoffset) {
                    Annotation remove = sentenceAnnotationList.remove(0);
                    toEnd.add(remove.getStartOffsetSentence() + remove.getContent().length());
                    printer.println("<term class=\"" + remove.getType() + "\" id=\"" + j + "-" + sid
                            + "\" annotatorid=\"" + annotator + "\">");
                    currentLabel = remove.getType();
                }
            }
            // token line: surface form <TAB> lemma <TAB> POS tag
            printer.println(
                    sentence.substring(tagSentence.get(j).beginPosition(), tagSentence.get(j).endPosition())
                            + "\t" + m.lemma(tagSentence.get(j).word(), tagSentence.get(j).tag()) + "\t"
                            + tagSentence.get(j).tag());
        }
        printer.println("</s>");
    }
    printer.println("</doc>");
}