List of usage examples for edu.stanford.nlp.tagger.maxent MaxentTagger tokenizeText
public static List<List<HasWord>> tokenizeText(Reader r)
From source file:asap.textprocessing.TextProcessPOSTagsStanford.java
/** * Uses loaded tagger model to calculate POS tags for the given sentence * tokens/*from w w w .ja v a2 s . c o m*/ * * @param tokens * @return tags */ @Override protected synchronized String[] getTags(String[] tokens) { String sentence = ""; for (int i = 0; i < tokens.length; i++) { String token = tokens[i]; sentence += token; if (i + 1 < tokens.length) { sentence += " "; } } List<List<HasWord>> sentences = MaxentTagger.tokenizeText(new StringReader(sentence)); String tags[] = null; for (List<HasWord> sentenceL : sentences) { List<TaggedWord> taggedSentence = tagger.tagSentence(sentenceL); tags = new String[taggedSentence.size()]; for (int j = 0; j < taggedSentence.size(); j++) { TaggedWord taggedWord = taggedSentence.get(j); tags[j] = taggedWord.tag(); } } return tags; }
From source file:cc.clabs.stratosphere.mlp.contracts.SentenceEmitter.java
License:BEER-WARE LICENSE
@Override public void map(Record record, Collector<Record> collector) { target.clear();/*www. j av a 2s . com*/ // field 0 remains the same (id of the document) target.setField(0, record.getField(0, IntValue.class)); String plaintext = record.getField(1, StringValue.class).getValue(); // tokenize the plaintext List<List<HasWord>> tokenized = MaxentTagger.tokenizeText(new StringReader(plaintext)); Integer position = -1; for (List<HasWord> tokens : tokenized) { position += 1; // for each detected sentence PactSentence sentence = new PactSentence(); // populate a wordlist/sentence for (TaggedWord word : tagger.tagSentence(tokens)) sentence.add(new PactWord(word)); // postprocess the sentence sentence = SentenceUtils.joinByTagPattern(sentence, "\" * \"", "ENTITY"); sentence = SentenceUtils.replaceAllByTag(sentence, "ENTITY", "[\"]", ""); sentence = SentenceUtils.replaceAllByTag(sentence, "ENTITY", "^ | $", ""); sentence = SentenceUtils.replaceAllByPattern(sentence, "MATH[0-9A-F]+", "FORMULA"); // emit the final sentence target.setField(1, sentence); target.setField(2, new DoubleValue((double) position / (double) tokenized.size())); collector.collect(target); } }
From source file:corpusProcessor.TokenCorpusXmlBuilder.java
License:Open Source License
public void characters(char[] buf, int offset, int len) throws SAXException { posTokenBuffer = new StringBuffer(); String s = new String(buf, offset, len); if (textBuffer == null) { textBuffer = new StringBuffer(s); } else {//from w w w. ja v a 2 s. c o m textBuffer.append(s); } String model = "/project/nlp/dingcheng/nlplab/models/bidirectional-wsj-0-18.tagger"; try { MaxentTagger tagger = new MaxentTagger(model); List<Sentence<? extends HasWord>> sentences = MaxentTagger .tokenizeText(new StringReader(textBuffer.toString())); for (Sentence<? extends HasWord> sentence : sentences) { Sentence<TaggedWord> tSentence = MaxentTagger.tagSentence(sentence); //out.append(tSentence.toString(false)); String[] tokenPosArray = tSentence.toString(false).split(" "); //for(int i=0;i<tokenPosArray.length;i++) for (String tokenPosStr : tokenPosArray) { String[] tokenPosPair = tokenPosStr.split("/"); posTokenBuffer.append("<TOKEN id=\"" + tokenCount + "\" pos=\"" + tokenPosPair[1] + "\">" + tokenPosPair[0] + "</TOKEN>"); posTokenBuffer.append("\n"); //emit("<TOKEN id=\""+tokenCount); tokenCount++; } //posTokenBuffer.append(tSentence.toString(false)); //System.out.println(tSentence.toString(false)); } } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } }
From source file:edu.cmu.geolocator.nlp.ner.FeatureExtractor.ACE_En_FeatureGenerator.java
License:Apache License
/**
 * Main feature-extraction entry point: builds one feature list per token of
 * the given sentence.
 *
 * <p>Side effects on {@code sent}: each token's POS field is filled from the
 * Stanford tagger output, and each token's norm field is set to a
 * de-accented, lower-cased form of the token text.
 *
 * @param sent the sentence whose tokens are to be featurized
 * @return one {@code ArrayList<Feature>} per token, in token order
 * @throws IOException if feature generation fails
 */
public ArrayList<ArrayList<Feature>> extractFeature(Sentence sent) throws IOException {
    int len = sent.tokenLength();
    ArrayList<ArrayList<Feature>> instances = new ArrayList<ArrayList<Feature>>(len);
    ArrayList<Feature> f = new ArrayList<Feature>();
    Token[] originalTokens = sent.getTokens();
    // lemmatization of the norm field into the lemma field (disabled)
    //lemmatizer.lemmatize(sent);
    // POS tagging: re-tokenize the raw sentence string with the Stanford
    // tokenizer, tag it, and store the result in each token's pos field.
    Reader r = new StringReader(sent.getSentenceString());
    List<List<HasWord>> POSsentences = MaxentTagger.tokenizeText(r);
    for (List<HasWord> POSsentence : POSsentences) {
        List<TaggedWord> tSentence = tagger.tagSentence(POSsentence);
        if (tSentence != null) {
            // NOTE(review): tagged words are aligned to sent's tokens purely by
            // index; if Stanford's tokenization differs from sent's own token
            // boundaries the POS tags will be misaligned — TODO confirm.
            for (int i = 0; i < sent.getTokens().length && i < tSentence.size(); i++) {
                sent.getTokens()[i].setPOS(tSentence.get(i).toString());
            }
        }
        //System.out.println(edu.stanford.nlp.ling.Sentence.listToString(tSentence, false));
    }
    // normalization: de-accent and lower-case every token into its norm field
    for (int i = 0; i < len; i++)
        sent.getTokens()[i]
                .setNorm(StringUtil.getDeAccentLoweredString(tokentype(sent.getTokens()[i].getToken())));
    // (disabled) gazetteer and country lookups over the normalized tokens:
    // boolean[] f_gaz = gazTag(tweetSentence, this.index);
    // boolean[] f_country = countryTag(tweetSentence);
    for (int i = 0; i < len; i++) {
        // fresh feature list for this token
        f = new ArrayList<Feature>();
        // MORPH features from the token text itself
        genTokenFeatures(f, sent, i);
        // SEMANTIC features: POS and word shape
        genPosFeatures(f, sent, i);
        //genTagFeatures(f, originalTokens, i);
        genWordShapeFeatures(f, originalTokens, i);
        //genBrownClusterFeatures(f, originalTokens, i);
        // GAZ / DICT lookups (disabled): genGazFeatures, genCountryFeatures,
        // genSuffixFeatures — street suffix, preposition, direction, country,
        // distance, stopword, and building features.
        // capitalization features from the original (unnormalized) token
        genCapFeatures(f, sent, i);
        // preposition features against the preposition dictionary
        genPrepFeatures(f, sent, i, preposition);
        // genSuffixFeatures(f, sent, i);
        instances.add(f);
        //System.out.println(f);
    }
    return instances;
}
From source file:edu.cmu.geolocator.nlp.ner.FeatureExtractor.SkipChainFeature.java
License:Apache License
/**
 * Main feature-extraction entry point (skip-chain variant): builds one
 * feature list per token of the given sentence.
 *
 * <p>Side effects on {@code sent}: each token's POS field is filled from the
 * Stanford tagger output, and each token's norm field is set to a
 * de-accented, lower-cased form of the token text.
 *
 * @param sent the sentence whose tokens are to be featurized
 * @return one {@code ArrayList<Feature>} per token, in token order
 * @throws IOException if feature generation fails
 */
public ArrayList<ArrayList<Feature>> extractFeature(Sentence sent) throws IOException {
    int len = sent.tokenLength();
    ArrayList<ArrayList<Feature>> instances = new ArrayList<ArrayList<Feature>>(len);
    ArrayList<Feature> f = new ArrayList<Feature>();
    Token[] originalTokens = sent.getTokens();
    // lemmatization of the norm field into the lemma field (disabled)
    //lemmatizer.lemmatize(sent);
    // POS tagging: re-tokenize the raw sentence string with the Stanford
    // tokenizer, tag it, and store the result in each token's pos field.
    Reader r = new StringReader(sent.getSentenceString());
    List<List<HasWord>> POSsentences = MaxentTagger.tokenizeText(r);
    for (List<HasWord> POSsentence : POSsentences) {
        List<TaggedWord> tSentence = tagger.tagSentence(POSsentence);
        if (tSentence != null) {
            // NOTE(review): tagged words are aligned to sent's tokens purely by
            // index; if Stanford's tokenization differs from sent's own token
            // boundaries the POS tags will be misaligned — TODO confirm.
            for (int i = 0; i < sent.getTokens().length && i < tSentence.size(); i++) {
                sent.getTokens()[i].setPOS(tSentence.get(i).toString());
            }
        }
        //System.out.println(edu.stanford.nlp.ling.Sentence.listToString(tSentence, false));
    }
    // normalization: de-accent and lower-case every token into its norm field
    for (int i = 0; i < len; i++)
        sent.getTokens()[i]
                .setNorm(StringUtil.getDeAccentLoweredString(tokentype(sent.getTokens()[i].getToken())));
    // (disabled) gazetteer and country lookups over the normalized tokens:
    // boolean[] f_gaz = gazTag(tweetSentence, this.index);
    // boolean[] f_country = countryTag(tweetSentence);
    for (int i = 0; i < len; i++) {
        // fresh feature list for this token
        f = new ArrayList<Feature>();
        // MORPH features from the token text itself
        genTokenFeatures(f, sent, i);
        // SEMANTIC features: POS and word shape
        genPosFeatures(f, sent, i);
        //genTagFeatures(f, originalTokens, i);
        genWordShapeFeatures(f, originalTokens, i);
        // GAZ / DICT lookups (disabled): genGazFeatures, genCountryFeatures,
        // genSuffixFeatures — street suffix, preposition, direction, country,
        // distance, stopword, and building features.
        // capitalization features from the original (unnormalized) token
        genCapFeatures(f, sent, i);
        // preposition features against the preposition dictionary
        genPrepFeatures(f, sent, i, preposition);
        // genSuffixFeatures(f, sent, i);
        instances.add(f);
        //System.out.println(f);
    }
    return instances;
}
From source file:final_dissertation.POStag.java
public static void TextToXml(MaxentTagger mt, String input, String output, HashMap<Integer, String> sentences) throws IOException { String str;//from ww w . java2s .c o m File fc = new File("E:\\Disertation\\Reviews\\Texts\\" + input); FileWriter file; file = new FileWriter("E:\\Disertation\\Reviews\\XMLs\\" + output); try (FileReader fl = new FileReader(fc)) { BufferedReader bf = new BufferedReader(fl); StringReader reader; int sentenceNum = 0; file.write("<"); file.write(XMLUtils.escapeElementXML("Text")); file.write(">"); while ((str = bf.readLine()) != null) { sentences.put(sentenceNum, str); reader = new StringReader(str); for (List sentence : MaxentTagger.tokenizeText(reader)) { ArrayList<TaggedWord> taggedSentence = mt.tagSentence(sentence); file.write(getXMLWords(taggedSentence, sentenceNum)); sentenceNum++; } } file.write(System.lineSeparator()); file.write("<"); file.write(XMLUtils.escapeElementXML("/Text")); file.write(">"); file.close(); } }
From source file:flight_ranker.TaggerDemo.java
public static void main(String[] args) throws Exception { // if (args.length != 2) { // System.err.println("usage: java TaggerDemo modelFile fileToTag"); // return; // }// w ww. j a v a2 s . c o m MaxentTagger tagger = new MaxentTagger("taggers\\english-left3words-distsim.tagger"); List<List<HasWord>> sentences = MaxentTagger.tokenizeText(new BufferedReader(new FileReader("G:\\t.txt"))); for (List<HasWord> sentence : sentences) { List<TaggedWord> tSentence = tagger.tagSentence(sentence); System.out.println(Sentence.listToString(tSentence, false)); } }
From source file:it.cnr.jatecs.nlp.utils.StanfordPOSTagger.java
License:Open Source License
/**
 * POS-tags the given text: splits it into sentences with the Stanford
 * tokenizer and tags each sentence with the configured tagger.
 *
 * @param input the raw text to tag
 * @return one list of tagged words per detected sentence, in order
 */
public Vector<ArrayList<TaggedWord>> tag(String input) {
    Vector<ArrayList<TaggedWord>> tagged = new Vector<ArrayList<TaggedWord>>();
    BufferedReader reader = new BufferedReader(new StringReader(input));
    for (List<? extends HasWord> tokenizedSentence : MaxentTagger.tokenizeText(reader)) {
        tagged.add(tagger.tagSentence(tokenizedSentence));
    }
    return tagged;
}
From source file:jp.xii.fn.jaen.JAENTokenizer.java
License:Apache License
/**
 * Tokenizes and POS-tags an English string.
 *
 * <p>Escaped slashes ({@code \/}) in tokenizer output are unescaped back to
 * plain {@code /} before the token is stored.
 *
 * @param str the English text to tokenize
 * @return the tokens with their POS tags, in reading order
 */
public List<JAENToken> tokenizeEN(String str) {
    List<JAENToken> tokens = new ArrayList<JAENToken>();
    List<List<HasWord>> sentences = MaxentTagger.tokenizeText(new StringReader(str));
    for (List<HasWord> sentence : sentences) {
        List<TaggedWord> taggedSentence = tagger.tagSentence(sentence);
        for (TaggedWord taggedWord : taggedSentence) {
            String surface = taggedWord.word().replaceAll("\\\\/", "/");
            tokens.add(new JAENToken(surface, taggedWord.tag()));
        }
    }
    return tokens;
}
From source file:net.sourceforge.doddle_owl.ui.InputDocumentSelectionPanel.java
License:Open Source License
private String runStanfordParser(File docFile) { File dir = new File(STANFORD_PARSER_MODELS_HOME); if (!dir.exists()) { dir.mkdir();/*from ww w . j a v a2s. c o m*/ } BufferedWriter bw = null; StringBuilder builder = new StringBuilder(); try { String modelName = "english-left3words-distsim.tagger"; String modelPath = STANFORD_PARSER_MODELS_HOME + File.separator + modelName; File modelFile = new File(modelPath); if (!modelFile.exists()) { URL url = DODDLE_OWL.class.getClassLoader() .getResource(Utils.RESOURCE_DIR + "stanford_parser_models/" + modelName); if (url != null) { FileUtils.copyURLToFile(url, modelFile); // System.out.println("copy: " + // modelFile.getAbsolutePath()); } } bw = new BufferedWriter(new OutputStreamWriter( new FileOutputStream(STANFORD_PARSER_MODELS_HOME + File.separator + "tmpTagger.txt"), "UTF-8")); MaxentTagger tagger = new MaxentTagger(modelFile.getAbsolutePath()); List<List<HasWord>> sentences = MaxentTagger.tokenizeText(new BufferedReader(new FileReader(docFile))); for (List<HasWord> sentence : sentences) { List<TaggedWord> tSentence = tagger.tagSentence(sentence); bw.write(Sentence.listToString(tSentence, false)); builder.append(Sentence.listToString(tSentence, false)); } bw.close(); } catch (IOException ioe) { DODDLE_OWL.getLogger().log(Level.DEBUG, "Stanford Parser can not be executed."); } catch (Exception e) { e.printStackTrace(); } finally { try { if (bw != null) { bw.close(); } } catch (IOException ioe2) { ioe2.printStackTrace(); } } return builder.toString(); }