List of usage examples for org.apache.lucene.analysis TokenStream reset
public void reset() throws IOException
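reset() must be called once, after the consumer has obtained its attributes and before the first call to incrementToken(); end() and close() finish the stream. Every example below follows that contract, and in modern Lucene versions skipping reset() typically fails with an IllegalStateException about a "TokenStream contract violation". As a minimal, self-contained sketch of the pattern (assuming a recent Lucene where StandardAnalyzer has a no-argument constructor; the field name "f" is arbitrary):

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ResetUsageSketch {

    // Tokenize text with the given analyzer, following the reset/incrementToken/end/close contract.
    public static List<String> tokens(Analyzer analyzer, String text) throws IOException {
        List<String> out = new ArrayList<>();
        try (TokenStream ts = analyzer.tokenStream("f", text)) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                          // mandatory before the first incrementToken()
            while (ts.incrementToken()) {
                out.add(term.toString());
            }
            ts.end();                            // perform end-of-stream operations (e.g. final offset)
        }                                        // try-with-resources closes the stream
        return out;
    }

    public static void main(String[] args) throws IOException {
        System.out.println(tokens(new StandardAnalyzer(), "TokenStream reset usage example"));
    }
}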
From source file:edu.mit.ll.vizlinc.highlight.Highlighter.java
License:Apache License
/**
 * Low level api to get the most relevant (formatted) sections of the document.
 * This method has been made public to allow visibility of score information held in TextFragment objects.
 * Thanks to Jason Calabrese for help in redefining the interface.
 * @param tokenStream
 * @param text
 * @param maxNumFragments
 * @param mergeContiguousFragments
 * @throws IOException
 * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
 */
public final TextFragment[] getBestTextFragments(TokenStream tokenStream, String text,
        boolean mergeContiguousFragments, int maxNumFragments)
        throws IOException, InvalidTokenOffsetsException {
    ArrayList<TextFragment> docFrags = new ArrayList<TextFragment>();
    StringBuilder newText = new StringBuilder();

    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
    tokenStream.addAttribute(PositionIncrementAttribute.class);
    tokenStream.reset();

    TextFragment currentFrag = new TextFragment(newText, newText.length(), docFrags.size());

    if (fragmentScorer instanceof QueryScorer) {
        ((QueryScorer) fragmentScorer).setMaxDocCharsToAnalyze(maxDocCharsToAnalyze);
    }

    TokenStream newStream = fragmentScorer.init(tokenStream);
    if (newStream != null) {
        tokenStream = newStream;
    }
    fragmentScorer.startFragment(currentFrag);
    docFrags.add(currentFrag);

    FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);

    try {
        String tokenText;
        int startOffset;
        int endOffset;
        int lastEndOffset = 0;
        textFragmenter.start(text, tokenStream);

        TokenGroup tokenGroup = new TokenGroup(tokenStream);

        for (boolean next = tokenStream.incrementToken(); next
                && (offsetAtt.startOffset() < maxDocCharsToAnalyze); next = tokenStream.incrementToken()) {
            if ((offsetAtt.endOffset() > text.length()) || (offsetAtt.startOffset() > text.length())) {
                throw new InvalidTokenOffsetsException("Token " + termAtt.toString()
                        + " exceeds length of provided text sized " + text.length());
            }
            if ((tokenGroup.numTokens > 0) && (tokenGroup.isDistinct())) {
                // the current token is distinct from previous tokens -
                // markup the cached token group info
                startOffset = tokenGroup.matchStartOffset;
                endOffset = tokenGroup.matchEndOffset;
                tokenText = text.substring(startOffset, endOffset);
                String markedUpText = formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
                // store any whitespace etc from between this and last group
                if (startOffset > lastEndOffset)
                    newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
                newText.append(markedUpText);
                lastEndOffset = Math.max(endOffset, lastEndOffset);
                tokenGroup.clear();

                // check if current token marks the start of a new fragment
                if (textFragmenter.isNewFragment()) {
                    currentFrag.setScore(fragmentScorer.getFragmentScore());
                    // record stats for a new fragment
                    currentFrag.textEndPos = newText.length();
                    currentFrag = new TextFragment(newText, newText.length(), docFrags.size());
                    fragmentScorer.startFragment(currentFrag);
                    docFrags.add(currentFrag);
                }
            }

            tokenGroup.addToken(fragmentScorer.getTokenScore());

            // if(lastEndOffset>maxDocBytesToAnalyze)
            // {
            //     break;
            // }
        }
        currentFrag.setScore(fragmentScorer.getFragmentScore());

        if (tokenGroup.numTokens > 0) {
            // flush the accumulated text (same code as in above loop)
            startOffset = tokenGroup.matchStartOffset;
            endOffset = tokenGroup.matchEndOffset;
            tokenText = text.substring(startOffset, endOffset);
            String markedUpText = formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
            // store any whitespace etc from between this and last group
            if (startOffset > lastEndOffset)
                newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
            newText.append(markedUpText);
            lastEndOffset = Math.max(lastEndOffset, endOffset);
        }

        // Test what remains of the original text beyond the point where we stopped analyzing
        if (
        // if there is text beyond the last token considered..
        (lastEndOffset < text.length()) &&
        // and that text is not too large...
                (text.length() <= maxDocCharsToAnalyze)) {
            // append it to the last fragment
            newText.append(encoder.encodeText(text.substring(lastEndOffset)));
        }

        currentFrag.textEndPos = newText.length();

        // sort the most relevant sections of the text
        for (Iterator<TextFragment> i = docFrags.iterator(); i.hasNext();) {
            currentFrag = i.next();

            // If you are running with a version of Lucene before 11th Sept 03
            // you do not have PriorityQueue.insert() - so uncomment the code below
            /*
            if (currentFrag.getScore() >= minScore) {
                fragQueue.put(currentFrag);
                if (fragQueue.size() > maxNumFragments) { // if hit queue overfull
                    fragQueue.pop(); // remove lowest in hit queue
                    minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore
                }
            }
            */
            // The above code caused a problem as a result of Christoph Goller's 11th Sept 03
            // fix to PriorityQueue. The correct method to use here is the new "insert" method
            // USE ABOVE CODE IF THIS DOES NOT COMPILE!
            fragQueue.insertWithOverflow(currentFrag);
        }

        // return the most relevant fragments
        TextFragment frag[] = new TextFragment[fragQueue.size()];
        for (int i = frag.length - 1; i >= 0; i--) {
            frag[i] = fragQueue.pop();
        }

        // merge any contiguous fragments to improve readability
        if (mergeContiguousFragments) {
            mergeContiguousFragments(frag);
            ArrayList<TextFragment> fragTexts = new ArrayList<TextFragment>();
            for (int i = 0; i < frag.length; i++) {
                if ((frag[i] != null) && (frag[i].getScore() > 0)) {
                    fragTexts.add(frag[i]);
                }
            }
            frag = fragTexts.toArray(new TextFragment[0]);
        }
        return frag;
    } finally {
        if (tokenStream != null) {
            try {
                tokenStream.end();
                tokenStream.close();
            } catch (Exception e) {
            }
        }
    }
}
From source file:edu.sdsc.scigraph.lucene.LuceneUtils.java
License:Apache License
public static List<String> getTokenization(Analyzer analyzer, CharSequence term) {
    List<String> ret = Lists.newArrayList();
    try {
        TokenStream stream = analyzer.tokenStream("", new StringReader(term.toString()));
        CharTermAttribute token = stream.getAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            ret.add(token.toString());
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return ret;
}
From source file:edu.stanford.rad.naivebayes.ClassifyLines.java
License:Apache License
public static void main(String[] args) throws Exception {
//  if (args.length < 5) {
//      System.out.println("Arguments: [model] [label index] [dictionnary] [document frequency] [tweet file]");
//      return;
//  }
//  String modelPath = args[0];
//  String labelIndexPath = args[1];
//  String dictionaryPath = args[2];
//  String documentFrequencyPath = args[3];
//  String tweetsPath = args[4];
    String modelPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/classification/nb";
    String labelIndexPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/classification/nb/labelindex";
    String dictionaryPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/vectors/TFIDFsparseSeqdir/dictionary.file-0";
    String documentFrequencyPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/vectors/TFIDFsparseSeqdir/df-count/part-r-00000";
    String tweetsPath = "/Users/saeedhp/Desktop/tweet/tweet.txt";

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);

    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration, new Path(documentFrequencyPath));

    // analyzer used to extract word from tweet
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_46);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    BufferedReader reader = new BufferedReader(new FileReader(tweetsPath));
    while (true) {
        String line = reader.readLine();
        if (line == null) {
            break;
        }

        String[] tokens = line.split("\t", 2);
        String tweetId = tokens[0];
        String tweet = tokens[1];

        System.out.println("Tweet: " + tweetId + "\t" + tweet);

        Multiset<String> words = ConcurrentHashMultiset.create();

        // extract words from tweet
        TokenStream ts = analyzer.tokenStream("text", new StringReader(tweet));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        int wordCount = 0;
        while (ts.incrementToken()) {
            if (termAtt.length() > 0) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                Integer wordId = dictionary.get(word);
                // if the word is not in the dictionary, skip it
                if (wordId != null) {
                    words.add(word);
                    wordCount++;
                }
            }
        }
        // Fixed error : close ts:TokenStream
        ts.end();
        ts.close();

        // create vector wordId => weight using tfidf
        Vector vector = new RandomAccessSparseVector(10000);
        TFIDF tfidf = new TFIDF();
        for (Multiset.Entry<String> entry : words.entrySet()) {
            String word = entry.getElement();
            int count = entry.getCount();
            Integer wordId = dictionary.get(word);
            Long freq = documentFrequency.get(wordId);
            double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
            vector.setQuick(wordId, tfIdfValue);
        }

        // With the classifier, we get one score for each label
        // The label with the highest score is the one the tweet is more likely to
        // be associated to
        Vector resultVector = classifier.classifyFull(vector);
        double bestScore = -Double.MAX_VALUE;
        int bestCategoryId = -1;
        for (Element element : resultVector.all()) {
            int categoryId = element.index();
            double score = element.get();
            if (score > bestScore) {
                bestScore = score;
                bestCategoryId = categoryId;
            }
            System.out.print(" " + labels.get(categoryId) + ": " + score);
        }
        System.out.println(" => " + labels.get(bestCategoryId));
    }
    analyzer.close();
    reader.close();
}
From source file:edu.upenn.library.solrplugins.CaseInsensitiveSortingTextField.java
License:Apache License
@Override
public BytesRef normalizeQueryTarget(String val, boolean strict, String fieldName, boolean appendExtraDelim)
        throws IOException {
    TokenStream ts = getQueryAnalyzer().tokenStream(fieldName, val);
    try {
        ts.reset();
        CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
        TypeAttribute typeAtt = ts.getAttribute(TypeAttribute.class);
        String matchType = strict ? INDEXED_TOKEN_TYPE : NORMALIZED_TOKEN_TYPE;
        while (ts.incrementToken()) {
            if (matchType.equals(typeAtt.type())) {
                BytesRefBuilder ret = new BytesRefBuilder();
                ret.copyChars(termAtt.toString());
                if (!strict || appendExtraDelim) {
                    ret.append(delimBytes, 0, delimBytes.length);
                }
                return ret.get();
            }
        }
        return new BytesRef(BytesRef.EMPTY_BYTES);
    } finally {
        ts.close();
    }
}
From source file:edu.utsa.sifter.DocMaker.java
License:Apache License
public static boolean addBodyField(final Document doc, final String body, final Analyzer analyzer,
        boolean testEmpty) throws IOException {
    final Field f = new Field("body", body, BodyOptions);
    if (testEmpty) {
        // System.out.println("testing if doc has empty body");
        final TokenStream toks = f.tokenStream(analyzer);
        toks.reset();
        if (!toks.incrementToken()) {
            // System.out.println("empty body, won't index");
            toks.close();
            return false;
        }
    }
    doc.add(new Field("body", body, BodyOptions));
    doc.add(new LongField("body-len", body.length(), Field.Store.YES));
    return true;
}
From source file:edu.virginia.cs.utility.StringTokenizer.java
/**
 * Method that generates list of tokens from the parameter string.
 *
 * @param string
 * @return list of tokens generated
 */
public List<String> TokenizeString(String string) {
    List<String> result = new ArrayList<>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(string));
        stream.reset();
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return result;
}
From source file:elhuyar.bilakit.PayloadQParserPlugin.java
License:Open Source License
@Override
protected Query getFieldQuery(String field, String queryText, boolean quoted) throws SyntaxError {
    SchemaField sf = this.schema.getFieldOrNull(field);
    if (!quoted && sf != null && sf.getType().getTypeName().endsWith("_payloads")) {
        // analyze queryText
        List<String> result = new ArrayList<String>();
        try {
            TokenStream stream = getAnalyzer().tokenStream(field, new StringReader(queryText));
            stream.reset();
            while (stream.incrementToken()) {
                result.add(stream.getAttribute(CharTermAttribute.class).toString());
            }
            stream.end();
            stream.close();
        } catch (IOException e) {
            // not thrown b/c we're using a string reader...
            throw new RuntimeException(e);
        }
        String analyzedqueryText = "";
        analyzedqueryText = result.toString().replaceAll("\\[|\\]", "").replaceAll(", ", " ");
        queryText = analyzedqueryText;

        // Note that this will work for any field defined with the
        // <fieldType> of "*_payloads"
        Query plter = new PayloadTermQuery(new Term(field, queryText), new AveragePayloadFunction(), true);
        return plter;
    }
    return super.getFieldQuery(field, queryText, quoted);
}
From source file:engine.easy.indexer.writer.EasySearchIndexWriter.java
License:Apache License
/**
 * Count the token stream tokens.
 *
 * @return it returns the no:of stream tokens.
 * @throws IOException if the file would have any IO operation.
 */
private static int[] countTokenStream(TokenStream tokenStream) throws IOException {
    int v[] = new int[2];
    HashSet countTokenStreamBuffer = new HashSet();
    TermAttribute termAtt = tokenStream.getAttribute(TermAttribute.class);
    while (tokenStream.incrementToken()) {
        v[0]++;
        countTokenStreamBuffer.add(new String(termAtt.termBuffer(), 0, termAtt.termLength()));
    }
    v[1] = countTokenStreamBuffer.size();
    tokenStream.reset();
    countTokenStreamBuffer.clear();
    return v;
}
From source file:fry.future.plugin.example.APP.java
private static List<String> tokenString(Analyzer analyzer, String str) throws IOException {
    List<String> result = new ArrayList<>();
    TokenStream tokenStream = analyzer.tokenStream("Test", new StringReader(str));
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        result.add(tokenStream.getAttribute(CharTermAttribute.class).toString());
    }
    return result;
}
From source file:gr.aueb.demo.PropertyRegistryBean.java
public static String removeStopWords(String textFile) {
    // CharArraySet stopWords = EnglishAnalyzer.getDefaultStopSet();
    CharArraySet stopWords = PropertyRegistryBean.stopSet;
    TokenStream tokenStream = new StandardTokenizer(Version.LUCENE_48, new StringReader(textFile.trim()));
    tokenStream = new StopFilter(Version.LUCENE_48, tokenStream, stopWords);
    StringBuilder sb = new StringBuilder();
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    try {
        tokenStream.reset();
    } catch (IOException ex) {
        Logger.getLogger(PropertyRegistryBean.class.getName()).log(Level.SEVERE, null, ex);
    }
    try {
        while (tokenStream.incrementToken()) {
            String term = charTermAttribute.toString();
            sb.append(term + " ");
        }
    } catch (IOException ex) {
        Logger.getLogger(PropertyRegistryBean.class.getName()).log(Level.SEVERE, null, ex);
    }
    return sb.toString();
}