List of usage examples for org.apache.lucene.analysis TokenStream getAttribute
public final <T extends Attribute> T getAttribute(Class<T> attClass)
The caller must pass in a Class<? extends Attribute> value; the method returns the instance of that attribute contained in the stream.
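Before the collected examples, here is a minimal, self-contained sketch of the typical getAttribute workflow (not taken from any of the source files below). It assumes a recent Lucene release (5.x or later, where StandardAnalyzer has a no-argument constructor) and only requests attributes that StandardAnalyzer's tokenizer is known to register, since getAttribute does not create missing attributes the way addAttribute does.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class GetAttributeDemo {
    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new StandardAnalyzer();
             TokenStream stream = analyzer.tokenStream("field", "Lucene token streams expose attributes")) {
            // getAttribute returns the attribute instances already registered by the tokenizer chain
            CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
            OffsetAttribute offset = stream.getAttribute(OffsetAttribute.class);
            stream.reset();                        // required before the first incrementToken()
            while (stream.incrementToken()) {
                System.out.println(term.toString() + " [" + offset.startOffset() + "," + offset.endOffset() + "]");
            }
            stream.end();                          // record end-of-stream state before close()
        }
    }
}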
From source file:de.uni_koeln.spinfo.maalr.lucene.util.TokenizerHelper.java
License:Apache License
public static String tokenizeString(Analyzer analyzer, String string) {
    // Inspired by stackoverflow:
    // http://stackoverflow.com/questions/6334692/how-to-use-a-lucene-analyzer-to-tokenize-a-string
    StringBuilder builder = new StringBuilder();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(string));
        stream.reset();
        while (stream.incrementToken()) {
            builder.append(stream.getAttribute(CharTermAttribute.class).toString());
            builder.append(" ");
        }
        stream.close();
    } catch (IOException e) {
        // not thrown b/c we're using a string reader...
        throw new RuntimeException(e);
    }
    return builder.toString().trim();
}
From source file:dependencies.ReviewDependencyAnalyzer.java
License:Open Source License
public ArrayList<ArrayList<Token>> getSentences(Reader reader) {
    try {
        // Send reader data through the analyzer
        TokenStream tokstr = reusableTokenStream("", reader);
        TermAttribute tok_term = tokstr.getAttribute(TermAttribute.class);
        TypeAttribute tok_type = tokstr.getAttribute(TypeAttribute.class);
        FlagsAttribute tok_flags = tokstr.getAttribute(FlagsAttribute.class);
        PayloadAttribute tok_payload = tokstr.getAttribute(PayloadAttribute.class);

        // Split the tokenstream returned by the analyzer into sentences. Convert each sentence
        // into a linked list of tokens
        ArrayList<ArrayList<Token>> sentence_list = new ArrayList<ArrayList<Token>>();
        ArrayList<Token> current_sentence = new ArrayList<Token>();
        while (tokstr.incrementToken()) {
            Token current_token = new Token(tok_term.term(), tok_type.type(), tok_flags.getFlags(),
                    new ReviewTermPayload(tok_payload.getPayload()));
            current_sentence.add(current_token);

            // End of sentence reached. Add current sentence to the sentence list
            if (current_token.isDelim(true)) {
                if (current_sentence.size() > 1) {
                    sentence_list.add(current_sentence);
                }
                current_sentence = new ArrayList<Token>();
            }
        }

        // At the end of the token stream, if there is an incomplete sentence, add it to the
        // sentence list. This case could occur when the last sentence of a given passage does
        // not end with a period or other sentence delimiter.
        if (!current_sentence.isEmpty()) {
            sentence_list.add(current_sentence);
        }
        return sentence_list;
    } catch (IOException e) {
        AppLogger.error.log(Level.SEVERE,
                "Error reading data from reader. Analyzing text for typed dependencies could not be completed");
        return null;
    }
}
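Note that this example (and several of the ones that follow) targets the pre-4.0 Lucene API: TermAttribute was deprecated in 3.1 and removed in 4.0 in favor of CharTermAttribute, and Analyzer.reusableTokenStream was dropped because tokenStream now handles reuse itself. A rough, hedged equivalent of the attribute lookups above for Lucene 4.x and later is sketched below; it assumes the surrounding custom analyzer still registers the flags and payload attributes, otherwise getAttribute would not find them.

// Sketch of the same lookups on Lucene 4.x+ (assumption: the analyzer chain registers all four attributes)
TokenStream tokstr = analyzer.tokenStream("", reader);
CharTermAttribute tok_term = tokstr.getAttribute(CharTermAttribute.class);   // replaces TermAttribute
TypeAttribute tok_type = tokstr.getAttribute(TypeAttribute.class);
FlagsAttribute tok_flags = tokstr.getAttribute(FlagsAttribute.class);
PayloadAttribute tok_payload = tokstr.getAttribute(PayloadAttribute.class);
tokstr.reset();
while (tokstr.incrementToken()) {
    String termText = tok_term.toString();            // replaces tok_term.term()
    BytesRef payload = tok_payload.getPayload();      // payloads are BytesRef in 4.x+
    // ... build sentence tokens as above ...
}
tokstr.end();
tokstr.close();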
From source file:di.uniba.it.wsd.RevisedLesk.java
License:Open Source License
/**
 * @param text
 * @return
 * @throws IOException
 */
public Map<String, Float> buildBag(String text) throws IOException {
    Map<String, Float> bag = new HashMap<>();
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
    SnowballStemmer stemmer = null;
    if (stemming) {
        stemmer = getStemmer(language);
        if (stemmer == null) {
            Logger.getLogger(RevisedLesk.class.getName()).log(Level.WARNING,
                    "No stemmer for language {0}", language);
        }
    }
    TokenStream tokenStream = analyzer.tokenStream("gloss", new StringReader(text));
    while (tokenStream.incrementToken()) {
        TermAttribute token = (TermAttribute) tokenStream.getAttribute(TermAttribute.class);
        String term = token.term();
        if (stemmer != null) {
            stemmer.setCurrent(term);
            if (stemmer.stem()) {
                term = stemmer.getCurrent();
            }
        }
        Float c = bag.get(term);
        if (c == null) {
            bag.put(term, 1f);
        } else {
            bag.put(term, c + 1f);
        }
    }
    return bag;
}
From source file:Document.DocumentProcessor.java
public final void processDocument(Document doc) {
    try {
        CharArraySet ch = new CharArraySet(Version.LUCENE_48, stopWords, true);
        TokenStream tokenStream = new StandardTokenizer(Version.LUCENE_48, new StringReader(doc.getContent()));
        tokenStream = new StopFilter(Version.LUCENE_36, tokenStream, ch);
        tokenStream = new PorterStemFilter(tokenStream);
        CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        Set<String> uniqueWords = new HashSet<>();
        Map<String, Integer> wordFrequency = new HashMap<String, Integer>();
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String word = charTermAttribute.toString();
            uniqueWords.add(word);
            if (wordFrequency.containsKey(word))
                wordFrequency.put(word, wordFrequency.get(word) + 1);
            else
                wordFrequency.put(word, 1);
            dictionary.add(word);
        }
        doc.setUniqueWords(uniqueWords);
        doc.setWordFrequency(wordFrequency);
    } catch (IOException ex) {
        Logger.getLogger(DocumentProcessor.class.getName()).log(Level.SEVERE, null, ex);
    }
}
From source file:edu.isi.pfindr.learn.util.CleanDataUtil.java
License:Apache License
public static String preprocessStemAndTokenize(String data) {
    Set<String> transformedSet = new HashSet<String>(); // Set will make sure only unique terms are kept
    StringBuilder strBuilder = new StringBuilder();
    Tokenizer analyzer = new Tokenizer(Version.LUCENE_30);
    TokenStream tokenStream = analyzer.tokenStream("", new StringReader(data));
    TermAttribute termAttribute;
    String term;
    try {
        while (tokenStream.incrementToken()) {
            termAttribute = tokenStream.getAttribute(TermAttribute.class);
            term = termAttribute.term();
            if (stopwords.contains(term)) // ignore stopwords
                continue;
            if (digitPattern.matcher(term).find()) // ignore digits
                continue;
            if (term.length() <= 1) // ignore 1 letter words
                continue;
            if (!digitPattern.matcher(term).find()) { // ignore digits
                stemmer.setCurrent(term);
                stemmer.stem();
                transformedSet.add(stemmer.getCurrent());
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    for (Object token : transformedSet.toArray()) {
        strBuilder.append(token).append(" ");
    }
    return strBuilder.toString();
}
From source file:edu.isi.pfindr.learn.util.CleanDataUtil.java
License:Apache License
public static String preprocessRemoveStopWords(String data) {
    StringBuilder strBuilder = new StringBuilder();
    Tokenizer analyzer = new Tokenizer(Version.LUCENE_30);
    TokenStream tokenStream = analyzer.tokenStream("", new StringReader(data));
    TermAttribute termAttribute;
    String term;
    try {
        while (tokenStream.incrementToken()) {
            termAttribute = tokenStream.getAttribute(TermAttribute.class);
            term = termAttribute.term();
            if (digitPattern.matcher(term).find()) // ignore digits
                continue;
            if (term.length() <= 1) // ignore 1 letter words
                continue;
            if (stopwords.contains(term)) // ignore stopwords
                continue;
            strBuilder.append(term).append(" ");
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return strBuilder.toString().trim();
}
From source file:edu.isi.pfindr.learn.util.CleanDataUtil.java
License:Apache License
public static Set<String> preprocessStemAndTokenizeAddBigramsInSet(String data) {
    // Preprocess data: remove stop words, stem, tokenize and get bi-grams
    Set<String> transformedSet = new LinkedHashSet<String>();
    List<String> stemmedList = new ArrayList<String>();
    Tokenizer analyzer = new Tokenizer(Version.LUCENE_30);
    TokenStream tokenStream = analyzer.tokenStream("", new StringReader(data));
    TermAttribute termAttribute;
    String term;
    try {
        while (tokenStream.incrementToken()) {
            termAttribute = tokenStream.getAttribute(TermAttribute.class);
            term = termAttribute.term();
            if (digitPattern.matcher(term).find()) // ignore digits
                continue;
            if (stopwords.contains(term)) // ignore stopwords
                continue;
            if (term.length() <= 1) // ignore single letter words
                continue;
            stemmer.setCurrent(term);
            stemmer.stem();
            stemmedList.add(stemmer.getCurrent());
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    String[] ds = stemmedList.toArray(new String[0]);

    // add bi-grams
    final int size = 2;
    for (int i = 0; i < ds.length; i++) {
        transformedSet.add(ds[i]); // add single words
        if (i + size <= ds.length) {
            String t = "";
            for (int j = i; j < i + size; j++) {
                t += " " + ds[j];
            }
            t = t.trim().replaceAll("\\s+", "_");
            transformedSet.add(t); // add bi-gram combined with "_"
        }
    }
    stemmedList.clear();
    stemmedList = null;
    ds = null;
    return transformedSet;
}
From source file:edu.isi.pfindr.learn.util.CleanDataUtil.java
License:Apache License
public static String preprocessStemAndTokenizeReturnDistinctTokens(String data) {
    // Preprocess data: remove stop words, stem, tokenize
    Set<String> transformedSet = new LinkedHashSet<String>();
    List<String> stemmedList = new ArrayList<String>();
    Tokenizer analyzer = new Tokenizer(Version.LUCENE_30);
    TokenStream tokenStream = analyzer.tokenStream("", new StringReader(data));
    TermAttribute termAttribute;
    String term;
    try {
        while (tokenStream.incrementToken()) {
            termAttribute = tokenStream.getAttribute(TermAttribute.class);
            term = termAttribute.term();
            if (digitPattern.matcher(term).find()) // ignore digits
                continue;
            if (stopwords.contains(term)) // ignore stopwords
                continue;
            if (term.length() <= 1) // ignore single letter words
                continue;
            stemmer.setCurrent(term);
            stemmer.stem();
            stemmedList.add(stemmer.getCurrent());
        }
        transformedSet.addAll(stemmedList);
    } catch (Exception e) {
        e.printStackTrace();
    }
    stemmedList.clear();
    stemmedList = null;
    return StringUtils.join(transformedSet.toArray(), " ");
}
From source file:edu.isi.pfindr.learn.util.CleanDataUtil.java
License:Apache License
public static String preprocessStemAndTokenizeAddBigramsInString(String data) {
    // Preprocess data: remove stop words, stem, tokenize and get bi-grams
    Set<String> transformedSet = new LinkedHashSet<String>();
    List<String> stemmedList = new ArrayList<String>();
    Tokenizer analyzer = new Tokenizer(Version.LUCENE_30);
    TokenStream tokenStream = analyzer.tokenStream("", new StringReader(data));
    TermAttribute termAttribute;
    String term;
    try {
        while (tokenStream.incrementToken()) {
            termAttribute = tokenStream.getAttribute(TermAttribute.class);
            term = termAttribute.term();
            if (digitPattern.matcher(term).find()) // ignore digits
                continue;
            if (stopwords.contains(term)) // ignore stopwords
                continue;
            if (term.length() <= 1) // ignore single letter words
                continue;
            stemmer.setCurrent(term);
            stemmer.stem();
            stemmedList.add(stemmer.getCurrent());
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    String[] ds = stemmedList.toArray(new String[0]);

    // add bi-grams
    final int size = 2;
    for (int i = 0; i < ds.length; i++) {
        transformedSet.add(ds[i]); // add single words
        if (i + size <= ds.length) {
            String t = "";
            for (int j = i; j < i + size; j++) {
                t += " " + ds[j];
            }
            t = t.trim().replaceAll("\\s+", "_");
            transformedSet.add(t); // add bi-gram combined with "_"
        }
    }
    return StringUtils.join(transformedSet.toArray(new String[transformedSet.size()]), " ");
}
From source file:edu.sdsc.scigraph.annotation.ShingleProducer.java
License:Apache License
@Override
public void run() {
    Deque<Token<String>> buffer = new LinkedList<>();
    try {
        TokenStream stream = analyzer.tokenStream("", reader);
        OffsetAttribute offset = stream.getAttribute(OffsetAttribute.class);
        CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
        try {
            while (stream.incrementToken()) {
                Token<String> token = new Token<String>(term.toString(), offset.startOffset(),
                        offset.endOffset());
                buffer.offer(token);
                if (buffer.size() < shingleCount) {
                    // Fill the buffer first, before offering anything to the queue
                    continue;
                }
                addBufferToQueue(buffer);
                if (shingleCount == buffer.size()) {
                    buffer.pop();
                }
            }
        } catch (IOException e) {
            logger.log(Level.WARNING, "Failed to produces singles", e);
        }
        while (!buffer.isEmpty()) {
            addBufferToQueue(buffer);
            buffer.pop();
        }
        queue.put(END_TOKEN);
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
    }
}