Usage examples for org.apache.lucene.analysis.TokenStream#incrementToken()
public abstract boolean incrementToken() throws IOException;
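For orientation before the per-project examples below, here is a minimal, hedged sketch of the standard consumption loop for incrementToken() on recent Lucene versions (5.x and later); the field name and sample text are illustrative. The stream must be reset() before the first call, incrementToken() returns false once the tokens are exhausted, and end() plus close() finish the stream.

import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IncrementTokenExample {
    public static void main(String[] args) throws Exception {
        Analyzer analyzer = new StandardAnalyzer();
        // try-with-resources closes the stream even if an exception is thrown
        try (TokenStream ts = analyzer.tokenStream("body", new StringReader("The quick brown fox"))) {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                    // required before the first incrementToken() call
            while (ts.incrementToken()) {  // false once the stream is exhausted
                System.out.println(termAtt.toString());
            }
            ts.end();                      // records end-of-stream state (e.g. final offset)
        }
        analyzer.close();
    }
}

Several of the examples below use the older Lucene 3.x TermAttribute API instead of CharTermAttribute, but the loop structure is the same.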
From source file:edu.isi.pfindr.learn.util.CleanDataUtil.java
License:Apache License
public static String preprocessStemAndTokenize(String data) {
    Set<String> transformedSet = new HashSet<String>(); // Set will make sure only unique terms are kept
    StringBuilder strBuilder = new StringBuilder();
    Tokenizer analyzer = new Tokenizer(Version.LUCENE_30);
    TokenStream tokenStream = analyzer.tokenStream("", new StringReader(data));
    TermAttribute termAttribute;
    String term;
    //System.out.println("The value of data in tokenizeAndStem: " + data);
    try {
        while (tokenStream.incrementToken()) {
            termAttribute = tokenStream.getAttribute(TermAttribute.class);
            term = termAttribute.term();
            if (stopwords.contains(term)) { // ignore stopwords
                //System.out.println("Contains stopword: " + term);
                continue;
            }
            if (digitPattern.matcher(term).find()) // ignore digits
                continue;
            if (term.length() <= 1) // ignore 1-letter words
                continue;
            if (!digitPattern.matcher(term).find()) { // ignore digits
                stemmer.setCurrent(term);
                stemmer.stem();
                transformedSet.add(stemmer.getCurrent());
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    //System.out.println("transformed set size in tokenizeAndStem: " + transformedSet.size());
    for (Object token : transformedSet.toArray()) {
        strBuilder.append(token).append(" ");
    }
    //System.out.println("String returned in tokenizeAndStem: " + strBuilder.toString());
    return strBuilder.toString();
}
From source file:edu.isi.pfindr.learn.util.CleanDataUtil.java
License:Apache License
public static String preprocessRemoveStopWords(String data) {
    StringBuilder strBuilder = new StringBuilder();
    Tokenizer analyzer = new Tokenizer(Version.LUCENE_30);
    TokenStream tokenStream = analyzer.tokenStream("", new StringReader(data));
    TermAttribute termAttribute;
    String term;
    //System.out.println("The value of data in tokenizeAndStem: " + data);
    try {
        while (tokenStream.incrementToken()) {
            termAttribute = tokenStream.getAttribute(TermAttribute.class);
            term = termAttribute.term();
            if (digitPattern.matcher(term).find()) // ignore digits
                continue;
            if (term.length() <= 1) // ignore 1-letter words
                continue;
            if (stopwords.contains(term)) // ignore stopwords
                continue;
            strBuilder.append(term).append(" ");
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    //System.out.println("String returned in tokenizeAndStem: " + strBuilder.toString());
    return strBuilder.toString().trim();
}
From source file:edu.isi.pfindr.learn.util.CleanDataUtil.java
License:Apache License
public static Set<String> preprocessStemAndTokenizeAddBigramsInSet(String data) {
    //System.out.println("Preprocess data, remove stop words, stem, tokenize and get bi-grams ..");
    Set<String> transformedSet = new LinkedHashSet<String>();
    List<String> stemmedList = new ArrayList<String>();
    //System.out.println("Stop words length: " + stopwords.size());
    Tokenizer analyzer = new Tokenizer(Version.LUCENE_30);
    TokenStream tokenStream = analyzer.tokenStream("", new StringReader(data));
    TermAttribute termAttribute;
    String term;
    try {
        while (tokenStream.incrementToken()) {
            termAttribute = tokenStream.getAttribute(TermAttribute.class);
            term = termAttribute.term();
            if (digitPattern.matcher(term).find()) // ignore digits
                continue;
            if (stopwords.contains(term)) // ignore stopwords
                continue;
            if (term.length() <= 1) // ignore single-letter words
                continue;
            stemmer.setCurrent(term);
            stemmer.stem();
            stemmedList.add(stemmer.getCurrent());
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    String[] ds = stemmedList.toArray(new String[0]);
    /*for (int i = 0; i < stemmedList.size(); i++)
        System.out.print(ds[i] + "\t");*/

    // add bi-grams
    final int size = 2;
    for (int i = 0; i < ds.length; i++) {
        transformedSet.add(ds[i]); // add single words
        if (i + size <= ds.length) {
            String t = "";
            for (int j = i; j < i + size; j++) {
                t += " " + ds[j];
            }
            t = t.trim().replaceAll("\\s+", "_");
            transformedSet.add(t); // add bi-gram joined with "_"
        }
    }
    stemmedList.clear();
    stemmedList = null;
    ds = null;
    return transformedSet;
}
From source file:edu.isi.pfindr.learn.util.CleanDataUtil.java
License:Apache License
public static String preprocessStemAndTokenizeReturnDistinctTokens(String data) {
    //System.out.println("Preprocess data, remove stop words, stem, tokenize ..");
    Set<String> transformedSet = new LinkedHashSet<String>();
    List<String> stemmedList = new ArrayList<String>();
    Tokenizer analyzer = new Tokenizer(Version.LUCENE_30);
    TokenStream tokenStream = analyzer.tokenStream("", new StringReader(data));
    TermAttribute termAttribute;
    String term;
    try {
        while (tokenStream.incrementToken()) {
            termAttribute = tokenStream.getAttribute(TermAttribute.class);
            term = termAttribute.term();
            if (digitPattern.matcher(term).find()) // ignore digits
                continue;
            if (stopwords.contains(term)) // ignore stopwords
                continue;
            if (term.length() <= 1) // ignore single-letter words
                continue;
            stemmer.setCurrent(term);
            stemmer.stem();
            stemmedList.add(stemmer.getCurrent());
        }
        transformedSet.addAll(stemmedList);
    } catch (Exception e) {
        e.printStackTrace();
    }
    stemmedList.clear();
    stemmedList = null;
    return StringUtils.join(transformedSet.toArray(), " ");
}
From source file:edu.isi.pfindr.learn.util.CleanDataUtil.java
License:Apache License
public static String preprocessStemAndTokenizeAddBigramsInString(String data) {
    //System.out.println("Preprocess data, remove stop words, stem, tokenize and get bi-grams ..");
    Set<String> transformedSet = new LinkedHashSet<String>();
    List<String> stemmedList = new ArrayList<String>();
    Tokenizer analyzer = new Tokenizer(Version.LUCENE_30);
    TokenStream tokenStream = analyzer.tokenStream("", new StringReader(data));
    TermAttribute termAttribute;
    String term;
    try {
        while (tokenStream.incrementToken()) {
            termAttribute = tokenStream.getAttribute(TermAttribute.class);
            term = termAttribute.term();
            if (digitPattern.matcher(term).find()) // ignore digits
                continue;
            if (stopwords.contains(term)) // ignore stopwords
                continue;
            if (term.length() <= 1) // ignore single-letter words
                continue;
            stemmer.setCurrent(term);
            stemmer.stem();
            stemmedList.add(stemmer.getCurrent());
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    String[] ds = stemmedList.toArray(new String[0]);
    /*for (int i = 0; i < stemmedList.size(); i++)
        System.out.print(ds[i] + "\t");*/

    // add bi-grams
    final int size = 2;
    for (int i = 0; i < ds.length; i++) {
        transformedSet.add(ds[i]); // add single words
        if (i + size <= ds.length) {
            String t = "";
            for (int j = i; j < i + size; j++) {
                t += " " + ds[j];
            }
            t = t.trim().replaceAll("\\s+", "_");
            transformedSet.add(t); // add bi-gram joined with "_"
        }
    }
    return StringUtils.join(transformedSet.toArray(new String[transformedSet.size()]), " ");
}
From source file:edu.mit.ll.vizlinc.highlight.Highlighter.java
License:Apache License
/**
 * Low level api to get the most relevant (formatted) sections of the document.
 * This method has been made public to allow visibility of score information held in TextFragment objects.
 * Thanks to Jason Calabrese for help in redefining the interface.
 * @param tokenStream
 * @param text
 * @param maxNumFragments
 * @param mergeContiguousFragments
 * @throws IOException
 * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
 */
public final TextFragment[] getBestTextFragments(TokenStream tokenStream, String text,
        boolean mergeContiguousFragments, int maxNumFragments)
        throws IOException, InvalidTokenOffsetsException {
    ArrayList<TextFragment> docFrags = new ArrayList<TextFragment>();
    StringBuilder newText = new StringBuilder();

    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
    tokenStream.addAttribute(PositionIncrementAttribute.class);
    tokenStream.reset();

    TextFragment currentFrag = new TextFragment(newText, newText.length(), docFrags.size());

    if (fragmentScorer instanceof QueryScorer) {
        ((QueryScorer) fragmentScorer).setMaxDocCharsToAnalyze(maxDocCharsToAnalyze);
    }

    TokenStream newStream = fragmentScorer.init(tokenStream);
    if (newStream != null) {
        tokenStream = newStream;
    }
    fragmentScorer.startFragment(currentFrag);
    docFrags.add(currentFrag);

    FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);

    try {
        String tokenText;
        int startOffset;
        int endOffset;
        int lastEndOffset = 0;
        textFragmenter.start(text, tokenStream);

        TokenGroup tokenGroup = new TokenGroup(tokenStream);

        for (boolean next = tokenStream.incrementToken(); next
                && (offsetAtt.startOffset() < maxDocCharsToAnalyze); next = tokenStream.incrementToken()) {
            if ((offsetAtt.endOffset() > text.length()) || (offsetAtt.startOffset() > text.length())) {
                throw new InvalidTokenOffsetsException("Token " + termAtt.toString()
                        + " exceeds length of provided text sized " + text.length());
            }
            if ((tokenGroup.numTokens > 0) && (tokenGroup.isDistinct())) {
                // the current token is distinct from previous tokens -
                // markup the cached token group info
                startOffset = tokenGroup.matchStartOffset;
                endOffset = tokenGroup.matchEndOffset;
                tokenText = text.substring(startOffset, endOffset);
                String markedUpText = formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
                // store any whitespace etc from between this and last group
                if (startOffset > lastEndOffset)
                    newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
                newText.append(markedUpText);
                lastEndOffset = Math.max(endOffset, lastEndOffset);
                tokenGroup.clear();

                // check if current token marks the start of a new fragment
                if (textFragmenter.isNewFragment()) {
                    currentFrag.setScore(fragmentScorer.getFragmentScore());
                    // record stats for a new fragment
                    currentFrag.textEndPos = newText.length();
                    currentFrag = new TextFragment(newText, newText.length(), docFrags.size());
                    fragmentScorer.startFragment(currentFrag);
                    docFrags.add(currentFrag);
                }
            }

            tokenGroup.addToken(fragmentScorer.getTokenScore());

            // if (lastEndOffset > maxDocBytesToAnalyze)
            // {
            //     break;
            // }
        }
        currentFrag.setScore(fragmentScorer.getFragmentScore());

        if (tokenGroup.numTokens > 0) {
            // flush the accumulated text (same code as in above loop)
            startOffset = tokenGroup.matchStartOffset;
            endOffset = tokenGroup.matchEndOffset;
            tokenText = text.substring(startOffset, endOffset);
            String markedUpText = formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
            // store any whitespace etc from between this and last group
            if (startOffset > lastEndOffset)
                newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
            newText.append(markedUpText);
            lastEndOffset = Math.max(lastEndOffset, endOffset);
        }

        // Test what remains of the original text beyond the point where we stopped analyzing
        if (
        // if there is text beyond the last token considered..
        (lastEndOffset < text.length()) &&
        // and that text is not too large...
                (text.length() <= maxDocCharsToAnalyze)) {
            // append it to the last fragment
            newText.append(encoder.encodeText(text.substring(lastEndOffset)));
        }

        currentFrag.textEndPos = newText.length();

        // sort the most relevant sections of the text
        for (Iterator<TextFragment> i = docFrags.iterator(); i.hasNext();) {
            currentFrag = i.next();

            // If you are running with a version of Lucene before 11th Sept 03
            // you do not have PriorityQueue.insert() - so uncomment the code below
            /*
            if (currentFrag.getScore() >= minScore) {
                fragQueue.put(currentFrag);
                if (fragQueue.size() > maxNumFragments) { // if hit queue overfull
                    fragQueue.pop(); // remove lowest in hit queue
                    minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore
                }
            }
            */
            // The above code caused a problem as a result of Christoph Goller's 11th Sept 03
            // fix to PriorityQueue. The correct method to use here is the new "insert" method
            // USE ABOVE CODE IF THIS DOES NOT COMPILE!
            fragQueue.insertWithOverflow(currentFrag);
        }

        // return the most relevant fragments
        TextFragment frag[] = new TextFragment[fragQueue.size()];
        for (int i = frag.length - 1; i >= 0; i--) {
            frag[i] = fragQueue.pop();
        }

        // merge any contiguous fragments to improve readability
        if (mergeContiguousFragments) {
            mergeContiguousFragments(frag);
            ArrayList<TextFragment> fragTexts = new ArrayList<TextFragment>();
            for (int i = 0; i < frag.length; i++) {
                if ((frag[i] != null) && (frag[i].getScore() > 0)) {
                    fragTexts.add(frag[i]);
                }
            }
            frag = fragTexts.toArray(new TextFragment[0]);
        }
        return frag;
    } finally {
        if (tokenStream != null) {
            try {
                tokenStream.end();
                tokenStream.close();
            } catch (Exception e) {
            }
        }
    }
}
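The method above is the low-level entry point, so a hedged usage sketch may help. It assumes the edu.mit.ll.vizlinc.highlight classes mirror the standard org.apache.lucene.search.highlight API (Highlighter, QueryScorer, SimpleHTMLFormatter, TextFragment); the analyzer, Version constant, field name, query, and text are illustrative, not from the VizLinc source.

// Hedged sketch: wires a QueryScorer-backed Highlighter to getBestTextFragments(...).
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_40); // version constant is illustrative
String text = "Lucene is a search library. Lucene highlighting marks query terms.";
Query query = new TermQuery(new Term("body", "lucene"));

Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(query));
TokenStream tokenStream = analyzer.tokenStream("body", new StringReader(text));

// false = do not merge contiguous fragments, 3 = maximum number of fragments to return
TextFragment[] frags = highlighter.getBestTextFragments(tokenStream, text, false, 3);
for (TextFragment frag : frags) {
    if (frag != null && frag.getScore() > 0) {
        System.out.println(frag.toString()); // fragment text with the query terms marked up
    }
}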
From source file:edu.sdsc.scigraph.annotation.ShingleProducer.java
License:Apache License
@Override
public void run() {
    Deque<Token<String>> buffer = new LinkedList<>();
    try {
        TokenStream stream = analyzer.tokenStream("", reader);
        OffsetAttribute offset = stream.getAttribute(OffsetAttribute.class);
        CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
        try {
            while (stream.incrementToken()) {
                Token<String> token = new Token<String>(term.toString(), offset.startOffset(),
                        offset.endOffset());
                buffer.offer(token);
                if (buffer.size() < shingleCount) {
                    // Fill the buffer first, before offering anything to the queue
                    continue;
                }
                addBufferToQueue(buffer);
                if (shingleCount == buffer.size()) {
                    buffer.pop();
                }
            }
        } catch (IOException e) {
            logger.log(Level.WARNING, "Failed to produce shingles", e);
        }
        while (!buffer.isEmpty()) {
            addBufferToQueue(buffer);
            buffer.pop();
        }
        queue.put(END_TOKEN);
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
    }
}
From source file:edu.sdsc.scigraph.lucene.LuceneUtils.java
License:Apache License
public static List<String> getTokenization(Analyzer analyzer, CharSequence term) {
    List<String> ret = Lists.newArrayList();
    try {
        TokenStream stream = analyzer.tokenStream("", new StringReader(term.toString()));
        CharTermAttribute token = stream.getAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            ret.add(token.toString());
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return ret;
}
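A brief, hedged usage sketch for this helper; the analyzer choice, Version constant, and input string are illustrative and not part of the SciGraph source.

// Assumes the Lucene 4.x classes already used by LuceneUtils are on the classpath.
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_46); // version constant is illustrative
List<String> tokens = LuceneUtils.getTokenization(analyzer, "Hereditary breast cancer");
System.out.println(tokens); // e.g. [hereditary, breast, cancer] with StandardAnalyzer's defaults
analyzer.close();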
From source file:edu.stanford.lucene.analysis.TestCJKFoldingFilter.java
License:Open Source License
void assertTermEquals(String expected, TokenStream stream, CharTermAttribute termAtt) throws Exception {
    assertTrue(stream.incrementToken());
    assertEquals(expected, termAtt.toString());
}
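A hypothetical test fragment showing how a helper like this is typically driven; the WhitespaceTokenizer, Version constant, and expected terms are illustrative and not taken from the Stanford test class.

// Illustrative only: feeds a plain WhitespaceTokenizer through the helper above (Lucene 4.x-era API).
Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_46, new StringReader("foo bar"));
TokenStream stream = tokenizer;
CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
stream.reset();
assertTermEquals("foo", stream, termAtt);
assertTermEquals("bar", stream, termAtt);
assertFalse(stream.incrementToken()); // stream exhausted
stream.end();
stream.close();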
From source file:edu.stanford.rad.naivebayes.ClassifyLines.java
License:Apache License
public static void main(String[] args) throws Exception {
    // if (args.length < 5) {
    //     System.out.println("Arguments: [model] [label index] [dictionnary] [document frequency] [tweet file]");
    //     return;
    // }
    // String modelPath = args[0];
    // String labelIndexPath = args[1];
    // String dictionaryPath = args[2];
    // String documentFrequencyPath = args[3];
    // String tweetsPath = args[4];
    String modelPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/classification/nb";
    String labelIndexPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/classification/nb/labelindex";
    String dictionaryPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/vectors/TFIDFsparseSeqdir/dictionary.file-0";
    String documentFrequencyPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/vectors/TFIDFsparseSeqdir/df-count/part-r-00000";
    String tweetsPath = "/Users/saeedhp/Desktop/tweet/tweet.txt";

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);
    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration, new Path(documentFrequencyPath));

    // analyzer used to extract words from tweets
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_46);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    BufferedReader reader = new BufferedReader(new FileReader(tweetsPath));
    while (true) {
        String line = reader.readLine();
        if (line == null) {
            break;
        }

        String[] tokens = line.split("\t", 2);
        String tweetId = tokens[0];
        String tweet = tokens[1];

        System.out.println("Tweet: " + tweetId + "\t" + tweet);

        Multiset<String> words = ConcurrentHashMultiset.create();

        // extract words from tweet
        TokenStream ts = analyzer.tokenStream("text", new StringReader(tweet));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        int wordCount = 0;
        while (ts.incrementToken()) {
            if (termAtt.length() > 0) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                Integer wordId = dictionary.get(word);
                // if the word is not in the dictionary, skip it
                if (wordId != null) {
                    words.add(word);
                    wordCount++;
                }
            }
        }
        // Fixed error: close the TokenStream
        ts.end();
        ts.close();

        // create vector wordId => weight using tfidf
        Vector vector = new RandomAccessSparseVector(10000);
        TFIDF tfidf = new TFIDF();
        for (Multiset.Entry<String> entry : words.entrySet()) {
            String word = entry.getElement();
            int count = entry.getCount();
            Integer wordId = dictionary.get(word);
            Long freq = documentFrequency.get(wordId);
            double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
            vector.setQuick(wordId, tfIdfValue);
        }

        // With the classifier, we get one score for each label.
        // The label with the highest score is the one the tweet is most likely to be associated with.
        Vector resultVector = classifier.classifyFull(vector);
        double bestScore = -Double.MAX_VALUE;
        int bestCategoryId = -1;
        for (Element element : resultVector.all()) {
            int categoryId = element.index();
            double score = element.get();
            if (score > bestScore) {
                bestScore = score;
                bestCategoryId = categoryId;
            }
            System.out.print(" " + labels.get(categoryId) + ": " + score);
        }
        System.out.println(" => " + labels.get(bestCategoryId));
    }
    analyzer.close();
    reader.close();
}