Usage examples for the `getTokenizer` method of `edu.stanford.nlp.process.TokenizerFactory`.
Tokenizer<T> getTokenizer(Reader r);
From source file:com.left8.evs.preprocessingmodule.nlp.Tokenizer.java
License:Open Source License
/**
 * Constructor with minimum parameters. It only tokenizes a given String
 * with the Stanford PTB tokenizer and stores every token verbatim in
 * {@code cleanTokens}, without removing stopwords, name handles etc.
 *
 * @param config A Config object, retained for later use.
 * @param text   The text to be tokenized.
 */
public Tokenizer(Config config, String text) {
    this.config = config;
    TokenizerFactory<Word> tf = PTBTokenizer.factory();
    List<Word> tokens = tf.getTokenizer(new StringReader(text)).tokenize();
    for (Word token : tokens) {
        cleanTokens.add(token.toString());
    }
}
From source file:com.left8.evs.preprocessingmodule.nlp.Tokenizer.java
License:Open Source License
/**
 * Public constructor. It tokenizes a given String and routes each token
 * into the matching bucket: hashtags, name handles, URLs, stopwords,
 * common symbols / non-printable characters, or clean tokens.
 *
 * @param config A Config object.
 * @param text   The text to be tokenized.
 * @param sw     A StopWords handle used for stopword and non-printable lookups.
 */
public Tokenizer(Config config, String text, StopWords sw) {
    TokenizerFactory<Word> factory = PTBTokenizer.factory();
    List<Word> words = factory.getTokenizer(new StringReader(text)).tokenize();
    this.config = config;
    numberOfTokens = words.size();
    for (Word word : words) {
        String token = word.toString();
        if (isHashtag(token)) {
            hashtags.add(token);
            cleanTokensAndHashtags.add(token.replace("#", "")); // strip '#' character
        } else if (isNameHandle(token)) {
            nameHandles.add(token.replace("@", "")); // strip '@' character
        } else if (isURL(token)) {
            urls.add(token);
        } else if (sw.isStopWord(token)) {
            // Common stopwords
            stopWords.add(token);
        } else if (isCommonSymbol(token)) {
            // Common symbols not caught by the earlier checks
            symbolsAndNonPrintableChars.add(token);
        } else if (sw.isNonPrintableCharacter(
                "\\u" + Integer.toHexString(token.toCharArray()[0]).substring(1))) {
            // NOTE(review): substring(1) drops the leading hex digit of the first
            // character's code point — presumably matching the key format that
            // StopWords.isNonPrintableCharacter expects; confirm against that class.
            symbolsAndNonPrintableChars.add(token);
        } else {
            cleanTokens.add(token);
            cleanTokensAndHashtags.add(token);
        }
    }
}
From source file:csav2.pkg0.ParserTagging.java
public String dependency(String text) { String output = ""; Annotation document = new Annotation(text); try {// ww w .j av a 2 s .c o m pipeline.annotate(document); } catch (Exception e) { System.out.println("Exception while calling method annotate(Method dependency):" + e); } List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class); for (CoreMap sentence : sentences) { TokenizerFactory tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), ""); List wordList = tokenizerFactory.getTokenizer(new StringReader(sentence.toString())).tokenize(); //System.out.println(wordList.toString()); Tree tree = lp.apply(wordList); GrammaticalStructure gs = gsf.newGrammaticalStructure(tree); Collection cc = gs.typedDependenciesCCprocessed(true); output += cc.toString() + "\n"; } //System.out.println(output); return output; }
From source file:DependencyParser.Parser.java
public void CallParser(String text) // start of the main method { try {/*from w w w. j a v a2 s.c om*/ TreebankLanguagePack tlp = new PennTreebankLanguagePack(); GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory(); LexicalizedParser lp = LexicalizedParser .loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"); lp.setOptionFlags(new String[] { "-maxLength", "500", "-retainTmpSubcategories" }); TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), ""); List<CoreLabel> wordList = tokenizerFactory.getTokenizer(new StringReader(text)).tokenize(); Tree tree = lp.apply(wordList); GrammaticalStructure gs = gsf.newGrammaticalStructure(tree); Collection<TypedDependency> tdl = gs.typedDependenciesCCprocessed(true); System.out.println(tdl); PrintWriter pw = new PrintWriter("H:\\Thesis Development\\Thesis\\NLP\\src\\nlp\\Text-Parsed.txt"); TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed"); tp.printTree(tree, pw); pw.close(); Main.writeImage(tree, tdl, "H:\\Thesis Development\\Thesis\\NLP\\src\\nlp\\image.png", 3); assert (new File("image.png").exists()); } catch (FileNotFoundException f) { } catch (Exception ex) { Logger.getLogger(Parser.class.getName()).log(Level.SEVERE, null, ex); } }
From source file:Engines.Test.StanfordParser.TreeHandling.java
License:Open Source License
/**
 * Smoke-tests the Stanford parser pipeline: tokenizes {@code text}, parses
 * it with the English PCFG model and derives its CC-processed typed
 * dependencies. All results are discarded.
 *
 * @param text the input to run through the parser.
 */
public static void test(String text) {
    TreebankLanguagePack langPack = new PennTreebankLanguagePack();
    GrammaticalStructureFactory structureFactory = langPack.grammaticalStructureFactory();
    LexicalizedParser parser = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
    parser.setOptionFlags(new String[] { "-maxLength", "500", "-retainTmpSubcategories" });
    TokenizerFactory<CoreLabel> factory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    List<CoreLabel> tokens = factory.getTokenizer(new StringReader(text)).tokenize();
    Tree parseTree = parser.apply(tokens);
    GrammaticalStructure structure = structureFactory.newGrammaticalStructure(parseTree);
    Collection<TypedDependency> dependencies = structure.typedDependenciesCCprocessed(true);
}
From source file:englishparser.EnglishParser.java
/** * demoAPI demonstrates other ways of calling the parser with already * tokenized text, or in some cases, raw text that needs to be tokenized as * a single sentence. Output is handled with a TreePrint object. Note that * the options used when creating the TreePrint can determine what results * to print out. Once again, one can capture the output by passing a * PrintWriter to TreePrint.printTree./*from w w w . jav a2 s . com*/ */ public static void demoAPI(LexicalizedParser lp) { // This option shows parsing a list of correctly tokenized words String[] sent = { "This", "is", "an", "easy", "sentence", "." }; List<CoreLabel> rawWords = Sentence.toCoreLabelList(sent); Tree parse = lp.apply(rawWords); parse.pennPrint(); System.out.println(); // This option shows loading and using an explicit tokenizer String sent2 = "This is another sentence."; TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), ""); Tokenizer<CoreLabel> tok = tokenizerFactory.getTokenizer(new StringReader(sent2)); List<CoreLabel> rawWords2 = tok.tokenize(); parse = lp.apply(rawWords2); TreebankLanguagePack tlp = new PennTreebankLanguagePack(); GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory(); GrammaticalStructure gs = gsf.newGrammaticalStructure(parse); List<TypedDependency> tdl = gs.typedDependenciesCCprocessed(); System.out.println(tdl); System.out.println(); // You can also use a TreePrint object to print trees and dependencies TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed"); tp.printTree(parse); }
From source file:ie.pars.aclrdtec.fileutils.GetStatRawTextFile.java
License:Open Source License
public static void main(String[] ss) throws SAXException, ParserConfigurationException, IOException { String input = ss[0]; //path to the input folder GetFiles gf = new GetFiles(); gf.getCorpusFiles(input);// w ww . jav a2 s.c om List<String> annotationFiles = gf.getFiles(); System.out.println("There are " + annotationFiles.size() + " files to check!"); TokenizerFactory<Word> newTokenizerFactory = PTBTokenizer.PTBTokenizerFactory.newTokenizerFactory(); int sentenceNumber = 0; int wordSize = 0; for (String file : annotationFiles) { File f = new File(file); Document makeDOM = XMLMethod.makeDOM(file); NodeList elementsByTagName = makeDOM.getElementsByTagName("S"); sentenceNumber += elementsByTagName.getLength(); for (int i = 0; i < elementsByTagName.getLength(); i++) { String sentence = elementsByTagName.item(i).getTextContent(); StringReader sr = new StringReader(sentence); Tokenizer<Word> tokenizer = newTokenizerFactory.getTokenizer(sr); List<Word> tokenize = tokenizer.tokenize(); wordSize += tokenize.size(); } } System.out.println(sentenceNumber); System.out.println(wordSize); }
From source file:ie.pars.bnc.preprocess.ProcessNLP.java
License:Open Source License
private static StringBuilder parseTheSentence(String sentence, Morphology morphology, MaxentTagger posTagger, ParserGrammar parser, String sid) { TokenizerFactory<Word> newTokenizerFactory = PTBTokenizerFactory.newTokenizerFactory(); // TokenizerFactory<WordLemmaTag> tokenizerFactory; // TokenizerFactory<CoreLabel> factory = PTBTokenizer.factory(new CoreLabelTokenFactory() , ""); // TokenizerFactory<Word> factory1 = PTBTokenizer.factory(); StringBuilder results = new StringBuilder(); results.append("<s id='" + sid + "'>\n"); StringReader sr = new StringReader(sentence); Tokenizer<Word> tokenizer = newTokenizerFactory.getTokenizer(sr); List<Word> tokenize = tokenizer.tokenize(); List<TaggedWord> tagSentence = posTagger.tagSentence(tokenize); Tree parseTree = parser.parse(tagSentence); GrammaticalStructure gs = parser.getTLPParams().getGrammaticalStructure(parseTree, parser.treebankLanguagePack().punctuationWordRejectFilter(), parser.getTLPParams().typedDependencyHeadFinder()); Collection<TypedDependency> deps = gs.typedDependenciesCollapsedTree(); SemanticGraph depTree = new SemanticGraph(deps); for (int i = 0; i < tagSentence.size(); ++i) { int head = -1; String deprel = null;//from w w w. j a v a2s .c o m // if (depTree != null) { Set<Integer> rootSet = depTree.getRoots().stream().map(IndexedWord::index).collect(Collectors.toSet()); IndexedWord node = depTree.getNodeByIndexSafe(i + 1); if (node != null) { List<SemanticGraphEdge> edgeList = depTree.getIncomingEdgesSorted(node); if (!edgeList.isEmpty()) { assert edgeList.size() == 1; head = edgeList.get(0).getGovernor().index(); deprel = edgeList.get(0).getRelation().toString(); } else if (rootSet.contains(i + 1)) { head = 0; deprel = "ROOT"; } } // } // Write the token TaggedWord lexHead = null; if (head > 0) { lexHead = tagSentence.get(head - 1); } results.append(line(i + 1, tagSentence.get(i), morphology, head, deprel, lexHead)).append("\n"); } results.append("</s>\n"); return results; }
From source file:info.atmykitchen.basic_annotation_convert.ConvertToBIO.java
License:Open Source License
private static void convertFile(File file, String annotator, PrintWriter printer) throws ParserConfigurationException, IOException, Exception { System.out.println(file.getAbsolutePath()); AnnotationFile annotationFile = IOMethods.loadAnnotationFile(file); Map<Integer, List<Annotation>> annotationLstMap = annotationFile.getAnnotationMapSentence(); TokenizerFactory<Word> newTokenizerFactory = PTBTokenizer.PTBTokenizerFactory.newTokenizerFactory(); String currentLabel = ""; int previousEnd = 0; printer.println("<doc id=\"" + annotationFile.getAclid() + "\" title=\"" + annotationFile.getTitle() + "\" annotatorid=\"" + annotator + "\">"); for (int i = 0; i < annotationFile.getSentences().size(); i++) { String sid = (i + 1) + "-" + annotationFile.getAclid(); printer.println("<s id=\"" + sid + "\" annotatorid=\"" + annotator + "\">"); String sentence = annotationFile.getSentences().get(i); System.out.println(sentence); StringReader sr = new StringReader(sentence); Tokenizer<Word> tokenizer = newTokenizerFactory.getTokenizer(sr); List<Word> tokenize = tokenizer.tokenize(); List<TaggedWord> tagSentence = tagger.tagSentence(tokenize); List<Annotation> sentenceAnnotationList = new ArrayList<>(); if (annotationLstMap.containsKey(i)) { sentenceAnnotationList = annotationLstMap.get(i); }/*from w w w. ja va 2s . 
c o m*/ System.out.println(sentenceAnnotationList.size()); Collections.sort(sentenceAnnotationList, Annotation.sentnceOrderComp()); List<Integer> toEnd = new ArrayList(); for (int j = 0; j < tagSentence.size(); j++) { //to add <g/> gap tags if (j == 0) { previousEnd = tagSentence.get(j).endPosition(); } else { if (previousEnd == tagSentence.get(j).beginPosition()) { printer.println("<g/>"); } previousEnd = tagSentence.get(j).endPosition(); } int startoffset = tagSentence.get(j).beginPosition(); if (!toEnd.isEmpty()) { Collections.sort(toEnd); while (!toEnd.isEmpty() && startoffset >= toEnd.get(0)) { currentLabel = ""; //System.out.println("** "+toEnd.get(0)); printer.println("</term>"); toEnd.remove(0); } } // this is based on the fact that currently we do not have nested annotations, // while the inner annotations work assignin labels to them for ske engine is going to be a bit problamatic, the best solution is to use multivalue feature of ske but this is something to be dealt in the future if (!sentenceAnnotationList.isEmpty()) { while (!sentenceAnnotationList.isEmpty() && sentenceAnnotationList.get(0).getStartOffsetSentence() <= startoffset) { Annotation remove = sentenceAnnotationList.remove(0); toEnd.add(remove.getStartOffsetSentence() + remove.getContent().length()); printer.println("<term class=\"" + remove.getType() + "\" id=\"" + j + "-" + sid + "\" annotatorid=\"" + annotator + "\">"); currentLabel = remove.getType(); } } printer.println( sentence.substring(tagSentence.get(j).beginPosition(), tagSentence.get(j).endPosition()) + "\t" + m.lemma(tagSentence.get(j).word(), tagSentence.get(j).tag()) + "\t" + tagSentence.get(j).tag()); } printer.println("</s>"); } printer.println("</doc>"); }
From source file:io.anserini.qa.RetrieveSentences.java
License:Apache License
public void getRankedPassages(Args args) throws Exception { Map<String, Float> scoredDocs = retrieveDocuments(args); Map<String, Float> sentencesMap = new LinkedHashMap<>(); IndexUtils util = new IndexUtils(args.index); TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), ""); for (Map.Entry<String, Float> doc : scoredDocs.entrySet()) { List<Sentence> sentences = util.getSentDocument(doc.getKey()); for (Sentence sent : sentences) { List<CoreLabel> tokens = tokenizerFactory.getTokenizer(new StringReader(sent.text())).tokenize(); String answerTokens = tokens.stream().map(CoreLabel::toString).collect(Collectors.joining(" ")); sentencesMap.put(answerTokens, doc.getValue()); }// ww w .j ava2 s .c om } String queryTokens = tokenizerFactory.getTokenizer(new StringReader(args.query)).tokenize().stream() .map(CoreLabel::toString).collect(Collectors.joining(" ")); scorer.score(queryTokens, sentencesMap); List<ScoredPassage> topPassages = scorer.extractTopPassages(); for (ScoredPassage s : topPassages) { System.out.println(s.getSentence() + " " + s.getScore()); } }