Example usage for edu.stanford.nlp.tagger.maxent MaxentTagger tokenizeText


Introduction

On this page you can find example usage for edu.stanford.nlp.tagger.maxent MaxentTagger tokenizeText.

Prototype

public static List<List<HasWord>> tokenizeText(Reader r) 

Document

Reads data from r, tokenizes it with the default (Penn Treebank) tokenizer, and returns a List of sentences, each a List of HasWord tokens, which can then be fed into tagSentence.
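
Before the collected examples, here is a minimal, self-contained sketch of the tokenizeText/tagSentence pipeline described above. The model path and sample text are placeholders, not taken from any of the examples below:

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Sentence;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.tagger.maxent.MaxentTagger;

import java.io.StringReader;
import java.util.List;

public class TokenizeTextDemo {
    public static void main(String[] args) {
        // Placeholder model path; point this at a tagger model available locally.
        MaxentTagger tagger = new MaxentTagger("models/english-left3words-distsim.tagger");
        // tokenizeText sentence-splits and tokenizes the character stream.
        List<List<HasWord>> sentences = MaxentTagger.tokenizeText(
                new StringReader("The quick brown fox jumps. It was quick."));
        for (List<HasWord> sentence : sentences) {
            // Each tokenized sentence can be fed directly into tagSentence.
            List<TaggedWord> tagged = tagger.tagSentence(sentence);
            System.out.println(Sentence.listToString(tagged, false));
        }
    }
}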

Usage

From source file:asap.textprocessing.TextProcessPOSTagsStanford.java

/**
 * Uses the loaded tagger model to compute POS tags for the given sentence
 * tokens.
 *
 * @param tokens
 * @return tags
 */
@Override
protected synchronized String[] getTags(String[] tokens) {
    // Join the tokens with single spaces to rebuild the sentence string.
    String sentence = String.join(" ", tokens);

    List<List<HasWord>> sentences = MaxentTagger.tokenizeText(new StringReader(sentence));
    String[] tags = null;
    // Note: if tokenizeText splits the input into more than one sentence,
    // only the tags of the last sentence are returned; the caller is
    // expected to pass the tokens of a single sentence.
    for (List<HasWord> sentenceL : sentences) {
        List<TaggedWord> taggedSentence = tagger.tagSentence(sentenceL);
        tags = new String[taggedSentence.size()];
        for (int j = 0; j < taggedSentence.size(); j++) {
            TaggedWord taggedWord = taggedSentence.get(j);
            tags[j] = taggedWord.tag();
        }
    }
    return tags;
}

From source file:cc.clabs.stratosphere.mlp.contracts.SentenceEmitter.java

License:BEER-WARE LICENSE

@Override
public void map(Record record, Collector<Record> collector) {
    target.clear();
    // field 0 remains the same (id of the document)
    target.setField(0, record.getField(0, IntValue.class));
    String plaintext = record.getField(1, StringValue.class).getValue();
    // tokenize the plaintext
    List<List<HasWord>> tokenized = MaxentTagger.tokenizeText(new StringReader(plaintext));
    int position = -1;
    for (List<HasWord> tokens : tokenized) {
        position += 1;
        // for each detected sentence
        PactSentence sentence = new PactSentence();
        // populate a wordlist/sentence
        for (TaggedWord word : tagger.tagSentence(tokens))
            sentence.add(new PactWord(word));
        // postprocess the sentence
        sentence = SentenceUtils.joinByTagPattern(sentence, "\" * \"", "ENTITY");
        sentence = SentenceUtils.replaceAllByTag(sentence, "ENTITY", "[\"]", "");
        sentence = SentenceUtils.replaceAllByTag(sentence, "ENTITY", "^ | $", "");
        sentence = SentenceUtils.replaceAllByPattern(sentence, "MATH[0-9A-F]+", "FORMULA");
        // emit the final sentence
        target.setField(1, sentence);
        target.setField(2, new DoubleValue((double) position / (double) tokenized.size()));
        collector.collect(target);
    }
}

From source file:corpusProcessor.TokenCorpusXmlBuilder.java

License:Open Source License

public void characters(char[] buf, int offset, int len) throws SAXException {
    posTokenBuffer = new StringBuffer();
    String s = new String(buf, offset, len);

    if (textBuffer == null) {
        textBuffer = new StringBuffer(s);
    } else {
        textBuffer.append(s);
    }
    String model = "/project/nlp/dingcheng/nlplab/models/bidirectional-wsj-0-18.tagger";
    try {
        // Note: loading the tagger model on every characters() callback is costly;
        // in practice the tagger should be created once and reused.
        MaxentTagger tagger = new MaxentTagger(model);
        List<List<HasWord>> sentences = MaxentTagger
                .tokenizeText(new StringReader(textBuffer.toString()));
        for (List<HasWord> sentence : sentences) {
            List<TaggedWord> tSentence = tagger.tagSentence(sentence);
            String[] tokenPosArray = Sentence.listToString(tSentence, false).split(" ");
            //for(int i=0;i<tokenPosArray.length;i++)
            for (String tokenPosStr : tokenPosArray) {
                String[] tokenPosPair = tokenPosStr.split("/");

                posTokenBuffer.append("<TOKEN id=\"" + tokenCount + "\" pos=\"" + tokenPosPair[1] + "\">"
                        + tokenPosPair[0] + "</TOKEN>");
                posTokenBuffer.append("\n");
                //emit("<TOKEN id=\""+tokenCount);
                tokenCount++;
            }

            //posTokenBuffer.append(tSentence.toString(false));
            //System.out.println(tSentence.toString(false));
        }
    } catch (Exception e) {
        e.printStackTrace();
    }

}

From source file:edu.cmu.geolocator.nlp.ner.FeatureExtractor.ACE_En_FeatureGenerator.java

License:Apache License

/**
 * Main function for extracting features.
 * 
 * @param sent the tokenized sentence to extract features for
 * @return feature lists, one per token
 * @throws IOException 
 */

public ArrayList<ArrayList<Feature>> extractFeature(Sentence sent) throws IOException {

    int len = sent.tokenLength();

    ArrayList<ArrayList<Feature>> instances = new ArrayList<ArrayList<Feature>>(len);
    ArrayList<Feature> f = new ArrayList<Feature>();
    Token[] originalTokens = sent.getTokens();
    // lemmatize norm field. store in lemma field.// originally lemmat_tweet;
    // stored in lemma field
    //lemmatizer.lemmatize(sent);

    // pos tagging, originally postags. input is t_tweet
    // stored in pos field

    Reader r = new StringReader(sent.getSentenceString());
    List<List<HasWord>> POSsentences = MaxentTagger.tokenizeText(r);
    for (List<HasWord> POSsentence : POSsentences) {
        List<TaggedWord> tSentence = tagger.tagSentence(POSsentence);
        if (tSentence != null) {
            for (int i = 0; i < sent.getTokens().length && i < tSentence.size(); i++) {
                // TaggedWord.toString() yields "word/tag"; tag() stores just the POS tag.
                sent.getTokens()[i].setPOS(tSentence.get(i).tag());
            }
        }

        //System.out.println(edu.stanford.nlp.ling.Sentence.listToString(tSentence, false));
    }

    // normalize tweet norm_tweet
    for (int i = 0; i < len; i++)
        sent.getTokens()[i]
                .setNorm(StringUtil.getDeAccentLoweredString(tokentype(sent.getTokens()[i].getToken())));

    // String[] f_pos = postags.toArray(new String[] {});

    // f_gaz originally. filled in inGaz Field in token. check norm_tweet
    // field.
    // boolean[] f_gaz =
    // gazTag(tweetSentence, this.index);

    // use norm_tweet field to tag countries. don't remove f_country,
    // because it's not a type in
    // token.
    // boolean[] f_country = countryTag(tweetSentence);

    for (int i = 0; i < len; i++) {
        // clear feature list for this loop
        f = new ArrayList<Feature>();
        // /////////////////////////////// MORPH FEATURES
        // use lemma_tweet to get token features.
        genTokenFeatures(f, sent, i);
        // ////////////////////////////// SEMANTIC FEATURES
        genPosFeatures(f, sent, i);

        //genTagFeatures(f, originalTokens, i);

        genWordShapeFeatures(f, originalTokens, i);

        //genBrownClusterFeatures(f, originalTokens, i);

        // ////////////////////////////////// GAZ AND DICT LOOK UP
        // genGazFeatures(f, sent, i);
        // f7: STREET SUFFIX
        // f8 PREPOSITION

        // genCountryFeatures(f, f_country, i);

        // f10 directions

        // FEATURES are not stored in tweetsentence in advance. Those are
        // generated in those features.
        // use t_tweet to get cap.
        genCapFeatures(f, sent, i);

        // use t_tweet to generate preposition tags.
        genPrepFeatures(f, sent, i, preposition);

        //         genSuffixFeatures(f, sent, i);
        // f9: COUNTRY
        // f11: DISTANCE
        // f12: STOPWORDS
        // f13: BUILDING

        instances.add(f);
        //System.out.println(f);
    }

    return instances;

}

From source file:edu.cmu.geolocator.nlp.ner.FeatureExtractor.SkipChainFeature.java

License:Apache License

/**
 * Main function for extracting features.
 * 
 * @param sent the tokenized sentence to extract features for
 * @return feature lists, one per token
 * @throws IOException 
 */

public ArrayList<ArrayList<Feature>> extractFeature(Sentence sent) throws IOException {

    int len = sent.tokenLength();

    ArrayList<ArrayList<Feature>> instances = new ArrayList<ArrayList<Feature>>(len);
    ArrayList<Feature> f = new ArrayList<Feature>();
    Token[] originalTokens = sent.getTokens();
    // lemmatize norm field. store in lemma field.// originally lemmat_tweet;
    // stored in lemma field
    //lemmatizer.lemmatize(sent);

    // pos tagging, originally postags. input is t_tweet
    // stored in pos field

    Reader r = new StringReader(sent.getSentenceString());
    List<List<HasWord>> POSsentences = MaxentTagger.tokenizeText(r);
    for (List<HasWord> POSsentence : POSsentences) {
        List<TaggedWord> tSentence = tagger.tagSentence(POSsentence);
        if (tSentence != null) {
            for (int i = 0; i < sent.getTokens().length && i < tSentence.size(); i++) {
                // TaggedWord.toString() yields "word/tag"; tag() stores just the POS tag.
                sent.getTokens()[i].setPOS(tSentence.get(i).tag());
            }
        }

        //System.out.println(edu.stanford.nlp.ling.Sentence.listToString(tSentence, false));
    }

    // normalize tweet norm_tweet
    for (int i = 0; i < len; i++)
        sent.getTokens()[i]
                .setNorm(StringUtil.getDeAccentLoweredString(tokentype(sent.getTokens()[i].getToken())));

    // String[] f_pos = postags.toArray(new String[] {});

    // f_gaz originally. filled in inGaz Field in token. check norm_tweet
    // field.
    // boolean[] f_gaz =
    // gazTag(tweetSentence, this.index);

    // use norm_tweet field to tag countries. don't remove f_country,
    // because it's not a type in
    // token.
    // boolean[] f_country = countryTag(tweetSentence);

    for (int i = 0; i < len; i++) {
        // clear feature list for this loop
        f = new ArrayList<Feature>();
        // /////////////////////////////// MORPH FEATURES
        // use lemma_tweet to get token features.
        genTokenFeatures(f, sent, i);
        // ////////////////////////////// SEMANTIC FEATURES
        genPosFeatures(f, sent, i);

        //genTagFeatures(f, originalTokens, i);

        genWordShapeFeatures(f, originalTokens, i);
        // ////////////////////////////////// GAZ AND DICT LOOK UP
        // genGazFeatures(f, sent, i);
        // f7: STREET SUFFIX
        // f8 PREPOSITION

        // genCountryFeatures(f, f_country, i);

        // f10 directions

        // FEATURES are not stored in tweetsentence in advance. Those are
        // generated in those features.
        // use t_tweet to get cap.
        genCapFeatures(f, sent, i);

        // use t_tweet to generate preposition tags.
        genPrepFeatures(f, sent, i, preposition);

        //         genSuffixFeatures(f, sent, i);
        // f9: COUNTRY
        // f11: DISTANCE
        // f12: STOPWORDS
        // f13: BUILDING

        instances.add(f);
        //System.out.println(f);
    }

    return instances;

}

From source file:final_dissertation.POStag.java

public static void TextToXml(MaxentTagger mt, String input, String output, HashMap<Integer, String> sentences)
        throws IOException {
    String str;
    File fc = new File("E:\\Disertation\\Reviews\\Texts\\" + input);
    FileWriter file;
    file = new FileWriter("E:\\Disertation\\Reviews\\XMLs\\" + output);
    try (FileReader fl = new FileReader(fc)) {
        BufferedReader bf = new BufferedReader(fl);
        StringReader reader;
        int sentenceNum = 0;
        file.write("<");
        file.write(XMLUtils.escapeElementXML("Text"));
        file.write(">");

        while ((str = bf.readLine()) != null) {
            sentences.put(sentenceNum, str);
            reader = new StringReader(str);
            for (List<HasWord> sentence : MaxentTagger.tokenizeText(reader)) {
                ArrayList<TaggedWord> taggedSentence = mt.tagSentence(sentence);
                file.write(getXMLWords(taggedSentence, sentenceNum));
                sentenceNum++;
            }

        }
        file.write(System.lineSeparator());
        file.write("<");
        file.write(XMLUtils.escapeElementXML("/Text"));
        file.write(">");
        file.close();
    }
}

From source file:flight_ranker.TaggerDemo.java

public static void main(String[] args) throws Exception {
    //    if (args.length != 2) {
    //      System.err.println("usage: java TaggerDemo modelFile fileToTag");
    //      return;
    //    }
    MaxentTagger tagger = new MaxentTagger("taggers\\english-left3words-distsim.tagger");
    List<List<HasWord>> sentences = MaxentTagger.tokenizeText(new BufferedReader(new FileReader("G:\\t.txt")));
    for (List<HasWord> sentence : sentences) {
        List<TaggedWord> tSentence = tagger.tagSentence(sentence);
        System.out.println(Sentence.listToString(tSentence, false));
    }
}

From source file:it.cnr.jatecs.nlp.utils.StanfordPOSTagger.java

License:Open Source License

public Vector<ArrayList<TaggedWord>> tag(String input) {
    Vector<ArrayList<TaggedWord>> returnVector = new Vector<ArrayList<TaggedWord>>();
    List<List<HasWord>> sentences = MaxentTagger.tokenizeText(new BufferedReader(new StringReader(input)));
    for (List<? extends HasWord> sentence : sentences) {
        returnVector.add(tagger.tagSentence(sentence));
    }
    return returnVector;
}

From source file:jp.xii.fn.jaen.JAENTokenizer.java

License:Apache License

/**
 * Tokenizes and tags an English string.
 * @param str the input string
 * @return token list
 */
public List<JAENToken> tokenizeEN(String str) {
    List<JAENToken> results = new ArrayList<JAENToken>();
    for (List<HasWord> sentence : MaxentTagger.tokenizeText(new StringReader(str))) {
        for (TaggedWord tw : tagger.tagSentence(sentence)) {
            results.add(new JAENToken(tw.word().replaceAll("\\\\/", "/"), tw.tag()));
        }
    }
    return results;
}

From source file:net.sourceforge.doddle_owl.ui.InputDocumentSelectionPanel.java

License:Open Source License

private String runStanfordParser(File docFile) {
    File dir = new File(STANFORD_PARSER_MODELS_HOME);
    if (!dir.exists()) {
        dir.mkdir();
    }
    BufferedWriter bw = null;
    StringBuilder builder = new StringBuilder();
    try {
        String modelName = "english-left3words-distsim.tagger";
        String modelPath = STANFORD_PARSER_MODELS_HOME + File.separator + modelName;
        File modelFile = new File(modelPath);
        if (!modelFile.exists()) {
            URL url = DODDLE_OWL.class.getClassLoader()
                    .getResource(Utils.RESOURCE_DIR + "stanford_parser_models/" + modelName);
            if (url != null) {
                FileUtils.copyURLToFile(url, modelFile);
                // System.out.println("copy: " +
                // modelFile.getAbsolutePath());
            }
        }
        bw = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(STANFORD_PARSER_MODELS_HOME + File.separator + "tmpTagger.txt"), "UTF-8"));
        MaxentTagger tagger = new MaxentTagger(modelFile.getAbsolutePath());
        List<List<HasWord>> sentences = MaxentTagger.tokenizeText(new BufferedReader(new FileReader(docFile)));
        for (List<HasWord> sentence : sentences) {
            List<TaggedWord> tSentence = tagger.tagSentence(sentence);
            bw.write(Sentence.listToString(tSentence, false));
            builder.append(Sentence.listToString(tSentence, false));
        }
        bw.close();
    } catch (IOException ioe) {
        DODDLE_OWL.getLogger().log(Level.DEBUG, "Stanford Parser can not be executed.");
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        try {
            if (bw != null) {
                bw.close();
            }
        } catch (IOException ioe2) {
            ioe2.printStackTrace();
        }
    }
    return builder.toString();
}