List of usage examples for edu.stanford.nlp.util.logging.RedwoodConfiguration.current()
public static RedwoodConfiguration current()
From source file: com.github.sharispe.slib.dsm.utils.StanfordLemmatizer.java
License: Open Source License
/** * Lemmatize a document and save the result in another file * @param inputFile the file to lemmatize * @param outputFile the result /*from ww w.j a va 2s . c om*/ * @param path_to_pos_model the path to the POS model to consider * @throws IOException if an IO error occurs */ public static void lemmatize(String inputFile, String outputFile, String path_to_pos_model) throws IOException { // https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html String[] pennTags = { "NN", "NNS", "NNP", "VB" }; List<String> acceptedPennTag = Arrays.asList(pennTags); String textContent = readFile(inputFile, StandardCharsets.UTF_8); String textContentProcess = ""; // To remove the annoying log RedwoodConfiguration.empty().capture(System.err).apply(); Properties props = new Properties(); props.put("pos.model", path_to_pos_model); props.put("annotators", "tokenize, ssplit, pos, lemma"); StanfordCoreNLP pipeline = new StanfordCoreNLP(props); // create an empty Annotation just with the given text Annotation document = new Annotation(textContent); // run all Annotators on this text pipeline.annotate(document); List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class); String sentenceLem; for (CoreMap sentence : sentences) { sentenceLem = ""; boolean f = true; for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) { String lemma = token.get(CoreAnnotations.LemmaAnnotation.class); String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class); if (acceptedPennTag.contains(pos)) { if (!f) { sentenceLem += " "; } sentenceLem += lemma; f = false; } } textContentProcess += sentenceLem + "\n"; } // enable log RedwoodConfiguration.current().clear().apply(); FileUtils.writeStringToFile(new File(outputFile), textContentProcess, false); }
From source file: org.lambda3.tagger.TopLevelTagger.java
License: Open Source License
private List<List<String>> split(List<String> sentences, boolean verbose) throws IOException { if (verbose) { System.out.println("Splitting sentences..."); }/* w w w. j a v a 2 s . co m*/ List<List<String>> chunksLists = new ArrayList<>(); //Word stemmer WordnetStemmer stemmer = new WordnetStemmer(dict); //POS tagger RedwoodConfiguration.empty().capture(System.err).apply(); MaxentTagger tagger = new MaxentTagger( "edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger"); RedwoodConfiguration.current().clear().apply(); for (String text : sentences) { List<String> chunks = new ArrayList<String>(); text = text.replaceAll("''", "\"").replaceAll("[\\W&&[^-']]", " ").replaceAll("[\\s]+", " ").trim(); //Replace all non-alphanumerics but dashes and single apostrophes by blanks String entry = text.replaceAll(" ", "_"); String currentEntry = entry; IWord word; String synsetID; String chunk; //Scans the sentence from left to right. Initially, the whole sentence is considered an entry; //if it is not found in WN, the leftmost word is recursively removed until a valid entry is identified while (entry.length() > 0) { while (entry.length() >= 1) { boolean skip = false; boolean isVerbForm = false; POS pos = POS.NOUN; String newEntry = entry; List<String> wordStems = stemmer.findStems(entry, pos); //Get the word/phrase stem if (wordStems.size() > 0) { newEntry = wordStems.get(0); } if (!entry.contains("_")) { //a single word //Get the POS tag String tagged = tagger.tagString(entry); String pt = tagged.substring(tagged.indexOf('_') + 1, tagged.length()).trim(); if (!validPOS.contains(pt)) { //not a noun, verb, adjective or adverb chunk = entry + ";00000000;null"; chunks.add(chunk); entry = removeLastWords(currentEntry, 1); currentEntry = entry; skip = true; break; } else { if (verbForm.contains(pt)) { //ensure that words that are both a noun and a verb will be correctly located if the POS tagger has already classified them as verbs pos = POS.VERB; 
wordStems = stemmer.findStems(entry, pos); //Get the verb stem if (wordStems.size() > 0) { newEntry = wordStems.get(0); } isVerbForm = true; } } } if (!skip) { if (isVerbForm) { //single-word verbs IIndexWord words = dict.getIndexWord(newEntry, pos); try { word = dict.getWord(words.getWordIDs().get(0)); synsetID = word.getSynset().getID().toString(); chunk = entry.replaceAll("_", " ") + ";" + synsetID + ";verb"; chunks.add(chunk); entry = removeLastWords(currentEntry, entry.contains("_") ? entry.split("_").length : 1); currentEntry = entry; break; } catch (NullPointerException npen) { //verb not in WordNet chunk = entry + ";00000000;null"; chunks.add(chunk); entry = removeLastWords(currentEntry, 1); currentEntry = entry; break; } } else { //single-word nouns, adjectives and adverbs, and all multiple-words expressions IIndexWord nouns = dict.getIndexWord(newEntry, POS.NOUN); try { word = dict.getWord(nouns.getWordIDs().get(0)); synsetID = word.getSynset().getID().toString(); chunk = entry.replaceAll("_", " ") + ";" + synsetID + ";noun"; chunks.add(chunk); entry = removeLastWords(currentEntry, entry.contains("_") ? entry.split("_").length : 1); currentEntry = entry; break; } catch (NullPointerException npen) { IIndexWord verbs = dict.getIndexWord(newEntry, POS.VERB); try { word = dict.getWord(verbs.getWordIDs().get(0)); synsetID = word.getSynset().getID().toString(); chunk = entry.replaceAll("_", " ") + ";" + synsetID + ";verb"; chunks.add(chunk); entry = removeLastWords(currentEntry, entry.contains("_") ? entry.split("_").length : 1); currentEntry = entry; break; } catch (NullPointerException npev) { IIndexWord adjs = dict.getIndexWord(newEntry, POS.ADJECTIVE); try { word = dict.getWord(adjs.getWordIDs().get(0)); synsetID = word.getSynset().getID().toString(); chunk = entry.replaceAll("_", " ") + ";" + synsetID + ";null"; chunks.add(chunk); entry = removeLastWords(currentEntry, entry.contains("_") ? 
entry.split("_").length : 1); currentEntry = entry; break; } catch (NullPointerException npea) { IIndexWord advs = dict.getIndexWord(newEntry, POS.ADVERB); try { word = dict.getWord(advs.getWordIDs().get(0)); synsetID = word.getSynset().getID().toString(); chunk = entry.replaceAll("_", " ") + ";" + synsetID + ";null"; chunks.add(chunk); entry = removeLastWords(currentEntry, entry.contains("_") ? entry.split("_").length : 1); currentEntry = entry; break; } catch (NullPointerException nper) { // word not found in any grammatical class if (entry.contains("_")) { entry = entry.substring(entry.indexOf("_") + 1, entry.length()); } else { chunk = entry.replaceAll("_", " ") + ";00000000;null"; chunks.add(chunk); entry = removeLastWords(currentEntry, 1); currentEntry = entry; break; } } } } } } } } } chunksLists.add(chunks); } return chunksLists; }