List of usage examples for edu.stanford.nlp.util.logging.RedwoodConfiguration.empty()
public static RedwoodConfiguration empty()
From source file:com.github.sharispe.slib.dsm.utils.StanfordLemmatizer.java
License:Open Source License
/** * Lemmatize a document and save the result in another file * @param inputFile the file to lemmatize * @param outputFile the result //from w w w . ja va 2 s.co m * @param path_to_pos_model the path to the POS model to consider * @throws IOException if an IO error occurs */ public static void lemmatize(String inputFile, String outputFile, String path_to_pos_model) throws IOException { // https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html String[] pennTags = { "NN", "NNS", "NNP", "VB" }; List<String> acceptedPennTag = Arrays.asList(pennTags); String textContent = readFile(inputFile, StandardCharsets.UTF_8); String textContentProcess = ""; // To remove the annoying log RedwoodConfiguration.empty().capture(System.err).apply(); Properties props = new Properties(); props.put("pos.model", path_to_pos_model); props.put("annotators", "tokenize, ssplit, pos, lemma"); StanfordCoreNLP pipeline = new StanfordCoreNLP(props); // create an empty Annotation just with the given text Annotation document = new Annotation(textContent); // run all Annotators on this text pipeline.annotate(document); List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class); String sentenceLem; for (CoreMap sentence : sentences) { sentenceLem = ""; boolean f = true; for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) { String lemma = token.get(CoreAnnotations.LemmaAnnotation.class); String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class); if (acceptedPennTag.contains(pos)) { if (!f) { sentenceLem += " "; } sentenceLem += lemma; f = false; } } textContentProcess += sentenceLem + "\n"; } // enable log RedwoodConfiguration.current().clear().apply(); FileUtils.writeStringToFile(new File(outputFile), textContentProcess, false); }
From source file:org.knime.ext.textprocessing.nodes.tagging.stanfordnlpnescorer.StanfordNlpNeScorerNodeModel.java
License:Open Source License
/**
 * {@inheritDoc}
 *
 * Scores the NER model from input port 1 against the documents of input
 * port 0: the documents are re-tagged with the dictionary the model was
 * built from, written out as an annotated TSV test file, validated via the
 * model's own classifyAndWriteAnswers(), and the precision/recall/F1 and
 * TP/FP/FN counts are parsed back out of the captured Redwood log file.
 */
@Override
protected PortObject[] execute(final PortObject[] inObjects, final ExecutionContext exec) throws Exception {
    m_inputModelPortObject = (StanfordNERModelPortObject) inObjects[1];
    m_inputModel = m_inputModelPortObject.getNERModel();
    m_usedDict = m_inputModelPortObject.getDictSet();
    m_tag = m_inputModelPortObject.getTag();
    m_tokenizerName = m_inputModelPortObject.getTokenizerName();

    //create a BufferedDataContainer for the scoring values
    BufferedDataContainer accTable = exec.createDataContainer(new DataTableSpec(QUALITY_MEASURES_SPECS));

    // build pattern set from dictionary
    // NOTE(review): dictionary words are compiled verbatim as regexes; words
    // containing regex metacharacters are not quoted here — presumably they
    // are plain terms, verify against the model-builder node.
    DataTableSpec docTableSpec = (DataTableSpec) inObjects[0].getSpec();
    BufferedDataTable docDataInput = (BufferedDataTable) inObjects[0];
    Set<Pattern> knownEntitiesPatternSet = new LinkedHashSet<Pattern>();
    for (String word : m_usedDict) {
        knownEntitiesPatternSet.add(Pattern.compile(word));
    }

    // create dictionary tagger to tag the input documents with the dictionary used for building the model
    MultiTermRegexDocumentTagger tagger = new MultiTermRegexDocumentTagger(true, knownEntitiesPatternSet, m_tag,
            true, m_tokenizerName);

    // create UUID to add them to the file path to avoid cases where two instances of the node model used the same file path at the same time
    String tempDir = KNIMEConstants.getKNIMETempDir() + "/";
    String m_annotatedTestFilePath = tempDir + "aD-" + UUID.randomUUID().toString() + ".tsv";

    // create the annotated test file
    File m_annotatedTestFile = new File(m_annotatedTestFilePath);
    // NOTE(review): not closed in a finally/try-with-resources — the temp
    // file handle leaks if an exception is thrown before close() below.
    PrintWriter sentenceFileWriter = new PrintWriter(m_annotatedTestFile, "UTF-8");

    int missingValueCounter = 0;

    // tag documents and transform sentences to strings while tagged terms get StanfordNLP annotation
    // iterate through columns
    for (int i = 0; i < docTableSpec.getNumColumns(); i++) {
        // iterate through rows if column with correct name has been found
        if (docTableSpec.getColumnSpec(i).getName().equals(m_docColumnModel.getStringValue())) {
            int counter = 0;
            Set<String> countMultiWordTerms = new HashSet<String>();
            for (DataRow row : docDataInput) {
                //set progress bar
                counter++;
                // preparation accounts for the first third of overall progress
                double progress = (counter / (double) docDataInput.size()) / (3.0);
                exec.setProgress(progress, "Preparing documents for validation");
                exec.checkCanceled();
                if (!row.getCell(i).isMissing() && row.getCell(i).getType().isCompatible(DocumentValue.class)) {
                    Document doc = ((DocumentValue) row.getCell(i)).getDocument();
                    Document taggedDoc = tagger.tag(doc);
                    Iterator<Sentence> si = taggedDoc.sentenceIterator();
                    while (si.hasNext()) {
                        Sentence s = si.next();
                        List<Term> termList = s.getTerms();
                        Iterator<Term> ti = termList.iterator();
                        while (ti.hasNext()) {
                            Term t = ti.next();
                            String termText = t.getText();
                            String termTextWithWsSuffix = t.getTextWithWsSuffix();
                            if (m_usedDict.contains(termText) || m_usedDict.contains(termTextWithWsSuffix)) {
                                if (t.getWords().size() > 1) {
                                    // multi-word terms should not be written in one line in the training file
                                    countMultiWordTerms.add(t.getText());
                                    // so skip it by splitting the term and writing each word in one line
                                    for (Word w : t.getWords()) {
                                        sentenceFileWriter.println(w.getText() + "\t" + m_tag.getTagValue());
                                    }
                                } else {
                                    sentenceFileWriter.println(termText + "\t" + m_tag.getTagValue());
                                }
                            // NOTE(review): this condition is always true when the
                            // branch above failed (De Morgan) — a plain `else`
                            // would behave identically.
                            } else if (!m_usedDict.contains(termText) || !m_usedDict.contains(termTextWithWsSuffix)) {
                                sentenceFileWriter.println(termText + "\tO");
                            }
                        }
                    }
                } else {
                    missingValueCounter++;
                }
            }
        }
    }

    if (missingValueCounter == 1) {
        setWarningMessage(missingValueCounter + " row has been ignored due to missing value.");
    } else if (missingValueCounter > 1) {
        setWarningMessage(missingValueCounter + " rows have been ignored due to missing values.");
    }

    sentenceFileWriter.close();

    exec.setProgress(0.5, "Validate model");

    // create logger configuration and catch the scores which will be printed to the log file
    File tmpLogFile = new File(KNIMEConstants.getKNIMETempDir() + "/scores.log");
    RedwoodConfiguration conf = RedwoodConfiguration.empty();
    conf.handlers(Handlers.chain(Handlers.hideDebug, Handlers.file(tmpLogFile))).apply();

    // classify the documents with our model
    DocumentReaderAndWriter<CoreLabel> raw = m_inputModel.makeReaderAndWriter();
    Triple<Double, Double, Double> prfScores = m_inputModel.classifyAndWriteAnswers(m_annotatedTestFilePath,
            new ByteArrayOutputStream(), raw, true);

    // default: all-missing row in case the scores cannot be parsed below
    DataRow stats = new DefaultRow(new RowKey("Row0"),
            new DataCell[] { DataType.getMissingCell(), DataType.getMissingCell(), DataType.getMissingCell(),
                    DataType.getMissingCell(), DataType.getMissingCell(), DataType.getMissingCell() });

    // the score line is the last line of the log, hence the reversed reader
    ReversedLinesFileReader logReader = new ReversedLinesFileReader(tmpLogFile, StandardCharsets.UTF_8);
    try {
        // get values from output stream
        String[] scores = logReader.readLine().split("\t");
        if (scores.length >= 7) {
            // prfScores are percentages; normalize to [0,1]
            Double precision = prfScores.first() / 100;
            Double recall = prfScores.second() / 100;
            Double f1 = prfScores.third() / 100;
            int tp = Integer.parseInt(scores[4].trim());
            int fp = Integer.parseInt(scores[5].trim());
            int fn = Integer.parseInt(scores[6].trim());
            // create the scores row and add it to the BufferedDataContainer we created in the beginning
            stats = new DefaultRow(new RowKey("Row0"), new DataCell[] { new DoubleCell(precision),
                    new DoubleCell(recall), new DoubleCell(f1), new IntCell(tp), new IntCell(fp), new IntCell(fn) });
            // this exact combination is the degenerate "nothing classified"
            // output, treated as a parse failure
            if (tp == 0 && fp == 0 && fn == 0 && precision == 0 && recall == 1 && f1 == 0) {
                setWarningMessage("Could not parse quality measures of model validation.");
            }
        }
    } catch (NumberFormatException e) {
        setWarningMessage("Could not parse quality measures of model validation.");
    } finally {
        // always clean up the temp log and annotated test file
        logReader.close();
        tmpLogFile.delete();
        m_annotatedTestFile.delete();
    }

    accTable.addRowToTable(stats);
    accTable.close();
    return new BufferedDataTable[] { accTable.getTable() };
}
From source file:org.lambda3.tagger.TopLevelTagger.java
License:Open Source License
/**
 * Splits each input sentence into WordNet-anchored chunks.
 * <p>
 * Each sentence is normalized (non-alphanumerics except dashes/apostrophes
 * removed, whitespace collapsed, spaces replaced by underscores) and then
 * scanned right-to-left: the longest suffix-trimmed entry found in WordNet
 * (tried as noun, then verb, adjective, adverb) becomes one chunk of the
 * form {@code "text;synsetID;pos"}; entries not found anywhere yield
 * {@code "text;00000000;null"}.
 * <p>
 * NOTE(review): lookup failures are detected by catching
 * {@link NullPointerException} from {@code getIndexWord(...) == null} —
 * exception-as-control-flow; kept as-is because the nested fall-through
 * order is load-bearing.
 *
 * @param sentences the sentences to split
 * @param verbose   if true, prints a progress message to stdout
 * @return one list of chunk strings per input sentence
 * @throws IOException if an IO error occurs
 */
private List<List<String>> split(List<String> sentences, boolean verbose) throws IOException {
    if (verbose) {
        System.out.println("Splitting sentences...");
    }

    List<List<String>> chunksLists = new ArrayList<>();

    //Word stemmer
    WordnetStemmer stemmer = new WordnetStemmer(dict);

    //POS tagger; Redwood is emptied around construction to suppress the
    //tagger's loading log on stderr, then restored.
    RedwoodConfiguration.empty().capture(System.err).apply();
    MaxentTagger tagger = new MaxentTagger(
            "edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger");
    RedwoodConfiguration.current().clear().apply();

    for (String text : sentences) {
        List<String> chunks = new ArrayList<String>();

        text = text.replaceAll("''", "\"").replaceAll("[\\W&&[^-']]", " ").replaceAll("[\\s]+", " ").trim(); //Replace all non-alphanumerics but dashes and single apostrophes by blanks

        String entry = text.replaceAll(" ", "_");
        String currentEntry = entry;
        IWord word;
        String synsetID;
        String chunk;

        //Scans the sentence from left to right. Initially, the whole sentence is considered an entry;
        //if it is not found in WN, the leftmost word is recursively removed until a valid entry is identified
        while (entry.length() > 0) {
            while (entry.length() >= 1) {
                boolean skip = false;
                boolean isVerbForm = false;
                POS pos = POS.NOUN;
                String newEntry = entry;

                List<String> wordStems = stemmer.findStems(entry, pos); //Get the word/phrase stem
                if (wordStems.size() > 0) {
                    newEntry = wordStems.get(0);
                }

                if (!entry.contains("_")) { //a single word
                    //Get the POS tag; MaxentTagger output is "word_TAG"
                    String tagged = tagger.tagString(entry);
                    String pt = tagged.substring(tagged.indexOf('_') + 1, tagged.length()).trim();
                    if (!validPOS.contains(pt)) { //not a noun, verb, adjective or adverb
                        chunk = entry + ";00000000;null";
                        chunks.add(chunk);
                        entry = removeLastWords(currentEntry, 1);
                        currentEntry = entry;
                        skip = true;
                        break;
                    } else {
                        if (verbForm.contains(pt)) { //ensure that words that are both a noun and a verb will be correctly located if the POS tagger has already classified them as verbs
                            pos = POS.VERB;
                            wordStems = stemmer.findStems(entry, pos); //Get the verb stem
                            if (wordStems.size() > 0) {
                                newEntry = wordStems.get(0);
                            }
                            isVerbForm = true;
                        }
                    }
                }

                if (!skip) {
                    if (isVerbForm) { //single-word verbs
                        IIndexWord words = dict.getIndexWord(newEntry, pos);
                        try {
                            word = dict.getWord(words.getWordIDs().get(0));
                            synsetID = word.getSynset().getID().toString();
                            chunk = entry.replaceAll("_", " ") + ";" + synsetID + ";verb";
                            chunks.add(chunk);
                            entry = removeLastWords(currentEntry,
                                    entry.contains("_") ? entry.split("_").length : 1);
                            currentEntry = entry;
                            break;
                        } catch (NullPointerException npen) { //verb not in WordNet
                            chunk = entry + ";00000000;null";
                            chunks.add(chunk);
                            entry = removeLastWords(currentEntry, 1);
                            currentEntry = entry;
                            break;
                        }
                    } else { //single-word nouns, adjectives and adverbs, and all multiple-words expressions
                        //Fall-through lookup order: noun -> verb -> adjective -> adverb
                        IIndexWord nouns = dict.getIndexWord(newEntry, POS.NOUN);
                        try {
                            word = dict.getWord(nouns.getWordIDs().get(0));
                            synsetID = word.getSynset().getID().toString();
                            chunk = entry.replaceAll("_", " ") + ";" + synsetID + ";noun";
                            chunks.add(chunk);
                            entry = removeLastWords(currentEntry,
                                    entry.contains("_") ? entry.split("_").length : 1);
                            currentEntry = entry;
                            break;
                        } catch (NullPointerException npen) {
                            IIndexWord verbs = dict.getIndexWord(newEntry, POS.VERB);
                            try {
                                word = dict.getWord(verbs.getWordIDs().get(0));
                                synsetID = word.getSynset().getID().toString();
                                chunk = entry.replaceAll("_", " ") + ";" + synsetID + ";verb";
                                chunks.add(chunk);
                                entry = removeLastWords(currentEntry,
                                        entry.contains("_") ? entry.split("_").length : 1);
                                currentEntry = entry;
                                break;
                            } catch (NullPointerException npev) {
                                IIndexWord adjs = dict.getIndexWord(newEntry, POS.ADJECTIVE);
                                try {
                                    word = dict.getWord(adjs.getWordIDs().get(0));
                                    synsetID = word.getSynset().getID().toString();
                                    chunk = entry.replaceAll("_", " ") + ";" + synsetID + ";null";
                                    chunks.add(chunk);
                                    entry = removeLastWords(currentEntry,
                                            entry.contains("_") ? entry.split("_").length : 1);
                                    currentEntry = entry;
                                    break;
                                } catch (NullPointerException npea) {
                                    IIndexWord advs = dict.getIndexWord(newEntry, POS.ADVERB);
                                    try {
                                        word = dict.getWord(advs.getWordIDs().get(0));
                                        synsetID = word.getSynset().getID().toString();
                                        chunk = entry.replaceAll("_", " ") + ";" + synsetID + ";null";
                                        chunks.add(chunk);
                                        entry = removeLastWords(currentEntry,
                                                entry.contains("_") ? entry.split("_").length : 1);
                                        currentEntry = entry;
                                        break;
                                    } catch (NullPointerException nper) {
                                        // word not found in any grammatical class
                                        if (entry.contains("_")) {
                                            //multi-word entry: drop the leftmost word and retry
                                            entry = entry.substring(entry.indexOf("_") + 1, entry.length());
                                        } else {
                                            chunk = entry.replaceAll("_", " ") + ";00000000;null";
                                            chunks.add(chunk);
                                            entry = removeLastWords(currentEntry, 1);
                                            currentEntry = entry;
                                            break;
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
        chunksLists.add(chunks);
    }
    return chunksLists;
}