Example usage for edu.stanford.nlp.util.logging RedwoodConfiguration current

List of usage examples for edu.stanford.nlp.util.logging RedwoodConfiguration current

Introduction

On this page you can find example usages of edu.stanford.nlp.util.logging.RedwoodConfiguration.current().

Prototype

public static RedwoodConfiguration current() 

Source Link

Document

The current Redwood configuration; this is used to make incremental changes to an existing custom configuration.

Usage

From source file:com.github.sharispe.slib.dsm.utils.StanfordLemmatizer.java

License:Open Source License

/**
 * Lemmatize a document and save the result in another file
 * @param inputFile the file to lemmatize
 * @param outputFile the result /*from  ww w.j a va  2s . c om*/
 * @param path_to_pos_model the path to the POS model to consider
 * @throws IOException if an IO error occurs
 */
public static void lemmatize(String inputFile, String outputFile, String path_to_pos_model) throws IOException {

    // https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
    String[] pennTags = { "NN", "NNS", "NNP", "VB" };
    List<String> acceptedPennTag = Arrays.asList(pennTags);
    String textContent = readFile(inputFile, StandardCharsets.UTF_8);
    String textContentProcess = "";

    // To remove the annoying log
    RedwoodConfiguration.empty().capture(System.err).apply();

    Properties props = new Properties();
    props.put("pos.model", path_to_pos_model);
    props.put("annotators", "tokenize, ssplit, pos, lemma");

    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    // create an empty Annotation just with the given text
    Annotation document = new Annotation(textContent);

    // run all Annotators on this text
    pipeline.annotate(document);

    List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);

    String sentenceLem;

    for (CoreMap sentence : sentences) {
        sentenceLem = "";

        boolean f = true;
        for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {

            String lemma = token.get(CoreAnnotations.LemmaAnnotation.class);
            String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);

            if (acceptedPennTag.contains(pos)) {
                if (!f) {
                    sentenceLem += " ";
                }
                sentenceLem += lemma;
                f = false;
            }
        }
        textContentProcess += sentenceLem + "\n";
    }
    // enable log
    RedwoodConfiguration.current().clear().apply();
    FileUtils.writeStringToFile(new File(outputFile), textContentProcess, false);
}

From source file:org.lambda3.tagger.TopLevelTagger.java

License:Open Source License

/**
 * Splits each sentence into chunks and labels every chunk with a WordNet synset ID.
 *
 * <p>Each sentence is normalized (two consecutive apostrophes become a double quote, every
 * non-word character except dashes and single apostrophes becomes a blank, whitespace is
 * collapsed) and its words are joined with underscores. The sentence is then scanned from left
 * to right: initially the whole sentence is considered an entry; if it is not found in WordNet,
 * the leftmost word is removed and the lookup is retried until a valid entry is identified.
 *
 * <p>Every chunk is a string of the form {@code text;synsetId;label}. Entries that cannot be
 * resolved are emitted as {@code word;00000000;null}. The label is {@code noun} or {@code verb}
 * for noun/verb matches and {@code null} for adjective and adverb matches.
 *
 * @param sentences the sentences to split
 * @param verbose   whether to print a progress message to standard output
 * @return one list of chunks per input sentence, in input order
 * @throws IOException declared by the original signature; kept for callers
 */
private List<List<String>> split(List<String> sentences, boolean verbose) throws IOException {

    if (verbose) {
        System.out.println("Splitting sentences...");
    }

    final String unknownSuffix = ";00000000;null";

    List<List<String>> chunksLists = new ArrayList<>();

    //Word stemmer
    WordnetStemmer stemmer = new WordnetStemmer(dict);

    //POS tagger; Redwood output is captured while the model loads and the configuration
    //is cleared again afterwards, even if loading fails
    MaxentTagger tagger;
    RedwoodConfiguration.empty().capture(System.err).apply();
    try {
        tagger = new MaxentTagger(
                "edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger");
    } finally {
        RedwoodConfiguration.current().clear().apply();
    }

    for (String text : sentences) {
        List<String> chunks = new ArrayList<>();

        //Replace all non-alphanumerics but dashes and single apostrophes by blanks
        String normalized = text.replaceAll("''", "\"").replaceAll("[\\W&&[^-']]", " ")
                .replaceAll("[\\s]+", " ").trim();

        //currentEntry: the part of the sentence still to be chunked
        //entry: the candidate currently looked up (currentEntry minus some leading words)
        String currentEntry = normalized.replaceAll(" ", "_");
        String entry = currentEntry;

        while (entry.length() > 0) {
            //Get the word/phrase stem (falls back to the entry itself when there is none)
            List<String> nounStems = stemmer.findStems(entry, POS.NOUN);
            String lemma = nounStems.isEmpty() ? entry : nounStems.get(0);
            boolean isVerbForm = false;

            if (!entry.contains("_")) { //a single word
                //Get the POS tag: everything after the first '_' of the tagger output
                String tagged = tagger.tagString(entry);
                String tag = tagged.substring(tagged.indexOf('_') + 1).trim();

                if (!validPOS.contains(tag)) { //not a noun, verb, adjective or adverb
                    chunks.add(entry + unknownSuffix);
                    entry = removeLastWords(currentEntry, 1);
                    currentEntry = entry;
                    continue;
                }

                if (verbForm.contains(tag)) {
                    //ensure that words that are both a noun and a verb will be correctly located
                    //if the POS tagger has already classified them as verbs
                    List<String> verbStems = stemmer.findStems(entry, POS.VERB);
                    if (!verbStems.isEmpty()) {
                        lemma = verbStems.get(0);
                    }
                    isVerbForm = true;
                }
            }

            String synsetId;
            String label;
            if (isVerbForm) { //single-word verbs: only the verb index is consulted
                synsetId = findSynsetIdOrNull(lemma, POS.VERB);
                label = "verb";
            } else { //single-word nouns, adjectives and adverbs, and all multiple-words expressions
                synsetId = findSynsetIdOrNull(lemma, POS.NOUN);
                label = "noun";
                if (synsetId == null) {
                    synsetId = findSynsetIdOrNull(lemma, POS.VERB);
                    label = "verb";
                }
                if (synsetId == null) {
                    synsetId = findSynsetIdOrNull(lemma, POS.ADJECTIVE);
                    label = "null";
                }
                if (synsetId == null) {
                    synsetId = findSynsetIdOrNull(lemma, POS.ADVERB);
                    label = "null";
                }
            }

            if (synsetId != null) {
                //found: emit the chunk and drop as many words as the entry holds
                chunks.add(entry.replaceAll("_", " ") + ";" + synsetId + ";" + label);
                entry = removeLastWords(currentEntry, entry.contains("_") ? entry.split("_").length : 1);
                currentEntry = entry;
            } else if (entry.contains("_")) {
                //multi-word entry not found in any grammatical class: drop the leftmost word and retry
                entry = entry.substring(entry.indexOf("_") + 1);
            } else {
                //single word not found in any grammatical class
                chunks.add(entry + unknownSuffix);
                entry = removeLastWords(currentEntry, 1);
                currentEntry = entry;
            }
        }
        chunksLists.add(chunks);
    }
    return chunksLists;
}

/**
 * Looks up the synset ID of the first word listed for {@code lemma} under {@code pos}.
 *
 * <p>Uses explicit null checks instead of catching {@code NullPointerException}, so that
 * unrelated null-pointer bugs are not silently treated as "word not in WordNet".
 *
 * @param lemma the stem to look up
 * @param pos   the part of speech whose index is consulted
 * @return the synset ID as a string, or {@code null} when the lookup yields nothing
 */
private String findSynsetIdOrNull(String lemma, POS pos) {
    IIndexWord indexWord = dict.getIndexWord(lemma, pos);
    if (indexWord == null || indexWord.getWordIDs() == null) {
        return null;
    }
    IWord word = dict.getWord(indexWord.getWordIDs().get(0));
    if (word == null || word.getSynset() == null || word.getSynset().getID() == null) {
        return null;
    }
    return word.getSynset().getID().toString();
}