Example usage for edu.stanford.nlp.util.logging RedwoodConfiguration empty

List of usage examples for edu.stanford.nlp.util.logging RedwoodConfiguration empty

Introduction

On this page you can find example usage for edu.stanford.nlp.util.logging RedwoodConfiguration empty.

Prototype

public static RedwoodConfiguration empty() 

Source Link

Document

An empty Redwood configuration.

Usage

From source file:com.github.sharispe.slib.dsm.utils.StanfordLemmatizer.java

License:Open Source License

/**
 * Lemmatize a document and save the result in another file.
 * Only tokens tagged NN, NNS, NNP or VB (Penn Treebank tag set) are kept;
 * lemmas are joined with single spaces, one sentence per output line.
 *
 * @param inputFile the file to lemmatize (read as UTF-8)
 * @param outputFile the file in which the lemmatized text is written
 * @param path_to_pos_model the path to the POS model to consider
 * @throws IOException if an IO error occurs
 */
public static void lemmatize(String inputFile, String outputFile, String path_to_pos_model) throws IOException {

    // https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
    String[] pennTags = { "NN", "NNS", "NNP", "VB" };
    List<String> acceptedPennTag = Arrays.asList(pennTags);
    String textContent = readFile(inputFile, StandardCharsets.UTF_8);
    // StringBuilder instead of += on String: repeated concatenation in a loop is O(n^2)
    StringBuilder textContentProcess = new StringBuilder();

    // Silence the (annoying) StanfordNLP startup log while the pipeline runs
    RedwoodConfiguration.empty().capture(System.err).apply();

    Properties props = new Properties();
    props.put("pos.model", path_to_pos_model);
    props.put("annotators", "tokenize, ssplit, pos, lemma");

    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    // create an empty Annotation just with the given text
    Annotation document = new Annotation(textContent);

    // run all Annotators on this text
    pipeline.annotate(document);

    List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);

    StringBuilder sentenceLem = new StringBuilder();

    for (CoreMap sentence : sentences) {
        sentenceLem.setLength(0); // reuse the builder for each sentence

        for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {

            String lemma = token.get(CoreAnnotations.LemmaAnnotation.class);
            String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);

            // keep only nouns (NN, NNS, NNP) and base-form verbs (VB)
            if (acceptedPennTag.contains(pos)) {
                if (sentenceLem.length() > 0) {
                    sentenceLem.append(' ');
                }
                sentenceLem.append(lemma);
            }
        }
        textContentProcess.append(sentenceLem).append('\n');
    }
    // re-enable logging
    RedwoodConfiguration.current().clear().apply();
    FileUtils.writeStringToFile(new File(outputFile), textContentProcess.toString(), false);
}

From source file:org.knime.ext.textprocessing.nodes.tagging.stanfordnlpnescorer.StanfordNlpNeScorerNodeModel.java

License:Open Source License

/**
 * {@inheritDoc}/*w ww.j  av  a  2s  . co  m*/
 */
@Override
protected PortObject[] execute(final PortObject[] inObjects, final ExecutionContext exec) throws Exception {

    m_inputModelPortObject = (StanfordNERModelPortObject) inObjects[1];
    m_inputModel = m_inputModelPortObject.getNERModel();
    m_usedDict = m_inputModelPortObject.getDictSet();
    m_tag = m_inputModelPortObject.getTag();
    m_tokenizerName = m_inputModelPortObject.getTokenizerName();

    //create a BufferedDataContainer for the scoring values
    BufferedDataContainer accTable = exec.createDataContainer(new DataTableSpec(QUALITY_MEASURES_SPECS));

    // build pattern set from dictionary
    DataTableSpec docTableSpec = (DataTableSpec) inObjects[0].getSpec();
    BufferedDataTable docDataInput = (BufferedDataTable) inObjects[0];
    Set<Pattern> knownEntitiesPatternSet = new LinkedHashSet<Pattern>();
    for (String word : m_usedDict) {
        knownEntitiesPatternSet.add(Pattern.compile(word));
    }

    // create dictionary tagger to tag the input documents with the dictionary used for building the model
    MultiTermRegexDocumentTagger tagger = new MultiTermRegexDocumentTagger(true, knownEntitiesPatternSet, m_tag,
            true, m_tokenizerName);

    // create UUID to add them to the file path to avoid cases where two instances of the node model used the same file path at the same time
    String tempDir = KNIMEConstants.getKNIMETempDir() + "/";
    String m_annotatedTestFilePath = tempDir + "aD-" + UUID.randomUUID().toString() + ".tsv";

    // create the annotated test file
    File m_annotatedTestFile = new File(m_annotatedTestFilePath);
    PrintWriter sentenceFileWriter = new PrintWriter(m_annotatedTestFile, "UTF-8");

    int missingValueCounter = 0;

    // tag documents and transform sentences to strings while tagged terms get StanfordNLP annotation
    // iterate through columns
    for (int i = 0; i < docTableSpec.getNumColumns(); i++) {
        // iterate through rows if column with correct name has been found
        if (docTableSpec.getColumnSpec(i).getName().equals(m_docColumnModel.getStringValue())) {
            int counter = 0;
            Set<String> countMultiWordTerms = new HashSet<String>();
            for (DataRow row : docDataInput) {
                //set progress bar
                counter++;
                double progress = (counter / (double) docDataInput.size()) / (3.0);
                exec.setProgress(progress, "Preparing documents for validation");
                exec.checkCanceled();

                if (!row.getCell(i).isMissing() && row.getCell(i).getType().isCompatible(DocumentValue.class)) {
                    Document doc = ((DocumentValue) row.getCell(i)).getDocument();
                    Document taggedDoc = tagger.tag(doc);
                    Iterator<Sentence> si = taggedDoc.sentenceIterator();
                    while (si.hasNext()) {
                        Sentence s = si.next();
                        List<Term> termList = s.getTerms();
                        Iterator<Term> ti = termList.iterator();
                        while (ti.hasNext()) {
                            Term t = ti.next();
                            String termText = t.getText();
                            String termTextWithWsSuffix = t.getTextWithWsSuffix();
                            if (m_usedDict.contains(termText) || m_usedDict.contains(termTextWithWsSuffix)) {
                                if (t.getWords().size() > 1) {
                                    // multi-word terms should not be written in one line in the training file
                                    countMultiWordTerms.add(t.getText());

                                    // so skip it by splitting the term and writing each word in one line
                                    for (Word w : t.getWords()) {
                                        sentenceFileWriter.println(w.getText() + "\t" + m_tag.getTagValue());
                                    }
                                } else {
                                    sentenceFileWriter.println(termText + "\t" + m_tag.getTagValue());
                                }
                            } else if (!m_usedDict.contains(termText)
                                    || !m_usedDict.contains(termTextWithWsSuffix)) {
                                sentenceFileWriter.println(termText + "\tO");
                            }
                        }
                    }
                } else {
                    missingValueCounter++;
                }
            }
        }
    }

    if (missingValueCounter == 1) {
        setWarningMessage(missingValueCounter + " row has been ignored due to missing value.");
    } else if (missingValueCounter > 1) {
        setWarningMessage(missingValueCounter + " rows have been ignored due to missing values.");
    }

    sentenceFileWriter.close();

    exec.setProgress(0.5, "Validate model");
    // create logger configuration and catch the scores which will be printed to the log file
    File tmpLogFile = new File(KNIMEConstants.getKNIMETempDir() + "/scores.log");
    RedwoodConfiguration conf = RedwoodConfiguration.empty();
    conf.handlers(Handlers.chain(Handlers.hideDebug, Handlers.file(tmpLogFile))).apply();

    // classify the documents with our model
    DocumentReaderAndWriter<CoreLabel> raw = m_inputModel.makeReaderAndWriter();
    Triple<Double, Double, Double> prfScores = m_inputModel.classifyAndWriteAnswers(m_annotatedTestFilePath,
            new ByteArrayOutputStream(), raw, true);

    DataRow stats = new DefaultRow(new RowKey("Row0"),
            new DataCell[] { DataType.getMissingCell(), DataType.getMissingCell(), DataType.getMissingCell(),
                    DataType.getMissingCell(), DataType.getMissingCell(), DataType.getMissingCell() });

    ReversedLinesFileReader logReader = new ReversedLinesFileReader(tmpLogFile, StandardCharsets.UTF_8);

    try {
        // get values from output stream
        String[] scores = logReader.readLine().split("\t");
        if (scores.length >= 7) {
            Double precision = prfScores.first() / 100;
            Double recall = prfScores.second() / 100;
            Double f1 = prfScores.third() / 100;
            int tp = Integer.parseInt(scores[4].trim());
            int fp = Integer.parseInt(scores[5].trim());
            int fn = Integer.parseInt(scores[6].trim());
            // create the scores row and add it to the BufferedDataContainer we created in the beginning
            stats = new DefaultRow(new RowKey("Row0"),
                    new DataCell[] { new DoubleCell(precision), new DoubleCell(recall), new DoubleCell(f1),
                            new IntCell(tp), new IntCell(fp), new IntCell(fn) });
            if (tp == 0 && fp == 0 && fn == 0 && precision == 0 && recall == 1 && f1 == 0) {
                setWarningMessage("Could not parse quality measures of model validation.");
            }
        }
    } catch (NumberFormatException e) {
        setWarningMessage("Could not parse quality measures of model validation.");
    } finally {
        logReader.close();
        tmpLogFile.delete();
        m_annotatedTestFile.delete();
    }
    accTable.addRowToTable(stats);

    accTable.close();

    return new BufferedDataTable[] { accTable.getTable() };
}

From source file:org.lambda3.tagger.TopLevelTagger.java

License:Open Source License

/**
 * Splits each input sentence into WordNet-groundable chunks.
 * For every sentence, the text is normalized, joined with underscores, and then
 * scanned greedily: the longest prefix entry found in WordNet becomes a chunk
 * of the form {@code "text;synsetID;label"} (label is "noun", "verb" or "null";
 * synset 00000000 means "not found"); otherwise words are trimmed from the
 * entry until a match or a single unknown word remains.
 *
 * NOTE(review): lookup failures are driven by catching NullPointerException
 * from {@code dict.getIndexWord(...) == null} — exception-as-control-flow is
 * intentional here; the statement order and the entry/currentEntry updates are
 * tightly coupled, so the structure is left untouched.
 *
 * @param sentences the sentences to split
 * @param verbose   if true, prints a progress message to stdout
 * @return one list of chunk strings per input sentence
 * @throws IOException if the POS tagger model cannot be loaded
 */
private List<List<String>> split(List<String> sentences, boolean verbose) throws IOException {

    if (verbose) {
        System.out.println("Splitting sentences...");
    }

    List<List<String>> chunksLists = new ArrayList<>();

    //Word stemmer
    WordnetStemmer stemmer = new WordnetStemmer(dict);

    //POS tagger — Redwood logging is silenced while the model loads, then restored
    RedwoodConfiguration.empty().capture(System.err).apply();
    MaxentTagger tagger = new MaxentTagger(
            "edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger");
    RedwoodConfiguration.current().clear().apply();

    for (String text : sentences) {
        List<String> chunks = new ArrayList<String>();
        text = text.replaceAll("''", "\"").replaceAll("[\\W&&[^-']]", " ").replaceAll("[\\s]+", " ").trim(); //Replace all non-alphanumerics but dashes and single apostrophes by blanks

        // underscore-joined form is the WordNet multi-word entry format
        String entry = text.replaceAll(" ", "_");

        String currentEntry = entry;
        IWord word;
        String synsetID;
        String chunk;

        //Scans the sentence from left to right. Initially, the whole sentence is considered an entry;
        //if it is not found in WN, the leftmost word is recursively removed until a valid entry is identified
        while (entry.length() > 0) {
            while (entry.length() >= 1) {
                boolean skip = false;       // set when the word has an unusable POS tag
                boolean isVerbForm = false; // set when the tagger classified the word as a verb
                POS pos = POS.NOUN;         // default lookup class is NOUN
                String newEntry = entry;

                List<String> wordStems = stemmer.findStems(entry, pos);

                //Get the word/phrase stem
                if (wordStems.size() > 0) {
                    newEntry = wordStems.get(0);
                }

                if (!entry.contains("_")) { //a single word
                    //Get the POS tag; MaxentTagger output looks like "word_TAG"
                    String tagged = tagger.tagString(entry);
                    String pt = tagged.substring(tagged.indexOf('_') + 1, tagged.length()).trim();

                    if (!validPOS.contains(pt)) { //not a noun, verb, adjective or adverb
                        // emit a placeholder chunk and move past this word
                        chunk = entry + ";00000000;null";
                        chunks.add(chunk);
                        entry = removeLastWords(currentEntry, 1);
                        currentEntry = entry;
                        skip = true;
                        break;
                    } else {
                        if (verbForm.contains(pt)) { //ensure that words that are both a noun and a verb will be correctly located if the POS tagger has already classified them as verbs
                            pos = POS.VERB;
                            wordStems = stemmer.findStems(entry, pos);

                            //Get the verb stem
                            if (wordStems.size() > 0) {
                                newEntry = wordStems.get(0);
                            }

                            isVerbForm = true;
                        }
                    }
                }

                if (!skip) {
                    if (isVerbForm) { //single-word verbs
                        IIndexWord words = dict.getIndexWord(newEntry, pos);
                        try {
                            // getIndexWord returns null when absent; the NPE below is the "not found" path
                            word = dict.getWord(words.getWordIDs().get(0));
                            synsetID = word.getSynset().getID().toString();
                            chunk = entry.replaceAll("_", " ") + ";" + synsetID + ";verb";
                            chunks.add(chunk);
                            entry = removeLastWords(currentEntry,
                                    entry.contains("_") ? entry.split("_").length : 1);
                            currentEntry = entry;
                            break;
                        } catch (NullPointerException npen) { //verb not in WordNet
                            chunk = entry + ";00000000;null";
                            chunks.add(chunk);
                            entry = removeLastWords(currentEntry, 1);
                            currentEntry = entry;
                            break;
                        }
                    } else { //single-word nouns, adjectives and adverbs, and all multiple-words expressions
                        // lookup cascade: NOUN -> VERB -> ADJECTIVE -> ADVERB, each失败 signalled by NPE
                        IIndexWord nouns = dict.getIndexWord(newEntry, POS.NOUN);
                        try {
                            word = dict.getWord(nouns.getWordIDs().get(0));
                            synsetID = word.getSynset().getID().toString();
                            chunk = entry.replaceAll("_", " ") + ";" + synsetID + ";noun";
                            chunks.add(chunk);
                            entry = removeLastWords(currentEntry,
                                    entry.contains("_") ? entry.split("_").length : 1);
                            currentEntry = entry;
                            break;
                        } catch (NullPointerException npen) {
                            IIndexWord verbs = dict.getIndexWord(newEntry, POS.VERB);
                            try {
                                word = dict.getWord(verbs.getWordIDs().get(0));
                                synsetID = word.getSynset().getID().toString();
                                chunk = entry.replaceAll("_", " ") + ";" + synsetID + ";verb";
                                chunks.add(chunk);
                                entry = removeLastWords(currentEntry,
                                        entry.contains("_") ? entry.split("_").length : 1);
                                currentEntry = entry;
                                break;
                            } catch (NullPointerException npev) {
                                IIndexWord adjs = dict.getIndexWord(newEntry, POS.ADJECTIVE);
                                try {
                                    word = dict.getWord(adjs.getWordIDs().get(0));
                                    synsetID = word.getSynset().getID().toString();
                                    chunk = entry.replaceAll("_", " ") + ";" + synsetID + ";null";
                                    chunks.add(chunk);
                                    entry = removeLastWords(currentEntry,
                                            entry.contains("_") ? entry.split("_").length : 1);
                                    currentEntry = entry;
                                    break;
                                } catch (NullPointerException npea) {
                                    IIndexWord advs = dict.getIndexWord(newEntry, POS.ADVERB);
                                    try {
                                        word = dict.getWord(advs.getWordIDs().get(0));
                                        synsetID = word.getSynset().getID().toString();
                                        chunk = entry.replaceAll("_", " ") + ";" + synsetID + ";null";
                                        chunks.add(chunk);
                                        entry = removeLastWords(currentEntry,
                                                entry.contains("_") ? entry.split("_").length : 1);
                                        currentEntry = entry;
                                        break;
                                    } catch (NullPointerException nper) { // word not found in any grammatical class
                                        if (entry.contains("_")) {
                                            // multi-word entry: drop the leftmost word and retry
                                            entry = entry.substring(entry.indexOf("_") + 1, entry.length());
                                        } else {
                                            // single unknown word: emit placeholder and advance
                                            chunk = entry.replaceAll("_", " ") + ";00000000;null";
                                            chunks.add(chunk);
                                            entry = removeLastWords(currentEntry, 1);
                                            currentEntry = entry;
                                            break;
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
        chunksLists.add(chunks);
    }
    return chunksLists;
}