List of usage examples for org.apache.lucene.analysis.TokenStream#close()
@Override public void close() throws IOException
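Most of the examples below follow the same consumption pattern: obtain a TokenStream from an Analyzer, call reset(), loop over incrementToken(), call end(), then close(). The sketch below is not taken from any of the listed projects; it assumes a Lucene 4.x-style API (matching the examples that use CharTermAttribute), and the StandardAnalyzer, field name, and input text are placeholders. Because TokenStream implements Closeable, try-with-resources can stand in for an explicit close():

    // Minimal sketch (assumed Lucene 4.x API; analyzer, field name and text are illustrative only).
    public static List<String> tokenize(String text) throws IOException {
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_40);
        List<String> tokens = new ArrayList<String>();
        // try-with-resources calls close() even if incrementToken() throws
        try (TokenStream stream = analyzer.tokenStream("field", new StringReader(text))) {
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset();                  // required before the first incrementToken()
            while (stream.incrementToken()) {
                tokens.add(termAtt.toString());
            }
            stream.end();                    // records end-of-stream state before close()
        }
        return tokens;
    }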
From source file:com.zimbra.cs.index.ZimbraAnalyzerTest.java
License:Open Source License
/**
 * We intentionally disable the positionIncrement because we want phrases to match
 * across removed stop words.
 *
 * @see PositionIncrementAttribute
 */
@Test
public void positionIncrement() throws Exception {
    TokenStream stream = ZimbraAnalyzer.getInstance().tokenStream(LuceneFields.L_H_SUBJECT,
            new StringReader("It's a test."));
    PositionIncrementAttribute posIncrAtt = stream.addAttribute(PositionIncrementAttribute.class);
    while (stream.incrementToken()) {
        Assert.assertEquals(posIncrAtt.getPositionIncrement(), 1);
    }
    stream.end();
    stream.close();
}
From source file:CopulaResources.TermCooccurence.java
private static List tokenizeString(Analyzer analyzer, String str) {
    List result = new ArrayList<>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(str));
        stream.reset();
        while (stream.incrementToken())
            result.add(stream.getAttribute(CharTermAttribute.class).toString());
        stream.close();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return result;
}
From source file:de.mirkosertic.desktopsearch.SearchPhraseSuggester.java
License:Open Source License
private String analyze(String aFieldName, String aString) throws IOException {
    TokenStream theTokenStream = analyzer.tokenStream(aFieldName, aString);
    theTokenStream.reset();
    CharTermAttribute theCharTerms = theTokenStream.getAttribute(CharTermAttribute.class);
    try {
        if (theTokenStream.incrementToken()) {
            return theCharTerms.toString();
        }
        return null;
    } finally {
        theTokenStream.end();
        theTokenStream.close();
    }
}
From source file:de.twitterlivesearch.analysis.Tokenizer.java
License:Apache License
/**
 * @param stringToAnalyze
 *            String to be tokenized
 * @param analyzer
 *            {@link org.apache.lucene.analysis.Analyzer Analyzer} to be used for analysis
 *
 * @return list of tokens
 */
public static List<String> getTokensForString(String stringToAnalyze, Analyzer analyzer) {
    List<String> tokens = new ArrayList<String>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(stringToAnalyze));
        stream.reset();
        while (stream.incrementToken()) {
            tokens.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return tokens;
}
From source file:de.uni_koeln.spinfo.maalr.lucene.util.TokenizerHelper.java
License:Apache License
public static String tokenizeString(Analyzer analyzer, String string) {
    // Inspired by stackoverflow:
    // http://stackoverflow.com/questions/6334692/how-to-use-a-lucene-analyzer-to-tokenize-a-string
    StringBuilder builder = new StringBuilder();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(string));
        stream.reset();
        while (stream.incrementToken()) {
            builder.append(stream.getAttribute(CharTermAttribute.class).toString());
            builder.append(" ");
        }
        stream.close();
    } catch (IOException e) {
        // not thrown b/c we're using a string reader...
        throw new RuntimeException(e);
    }
    return builder.toString().trim();
}
From source file:doc2vec.LuceneDocIterator.java
String preProcess(Analyzer analyzer, String text) throws Exception {
    StringBuffer tokenizedContentBuff = new StringBuffer();
    TokenStream stream = analyzer.tokenStream("dummy", new StringReader(text));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        String term = termAtt.toString();
        term = term.toLowerCase();
        if (labelsStoredWithWords) {
            term = term.split("\\" + AMIIndexer.WORD_LABEL_DELIM)[0]; // the first part is the word
        }
        if (!term.trim().equals(""))
            tokenizedContentBuff.append(term).append(" ");
    }
    stream.end();
    stream.close();
    return tokenizedContentBuff.toString();
}
From source file:drakkar.mast.retrieval.analysis.NGramQuery.java
/**
 *
 * @param analyzer
 * @param queryTerm
 * @param field
 * @throws IOException
 */
public NGramQuery(Analyzer analyzer, String queryTerm, String field) throws IOException {
    String words[] = null;
    // split the query on whitespace
    if (queryTerm.contains(" ")) {
        words = queryTerm.split(" ");
    } else {
        words = new String[1];
        words[0] = queryTerm;
    }
    if (words.length > 1) {
        // more than one term: add each word as an optional (SHOULD) term query
        for (int i = 0; i < words.length; i++) {
            String string = words[i];
            Term t = new Term(field, string);
            TermQuery pquery = new TermQuery(t);
            add(pquery, org.apache.lucene.search.BooleanClause.Occur.SHOULD);
        }
    } else {
        // single term: analyze it and add each emitted token as a required (MUST) clause
        for (int i = 0; i < words.length; i++) {
            String wordToAnalyze = words[i];
            TokenStream tokens = analyzer.tokenStream(field, new StringReader(wordToAnalyze));
            TermAttribute termAtt = (TermAttribute) tokens.addAttribute(TermAttribute.class);
            tokens.reset();
            TermQuery pquery;
            for (; tokens.incrementToken(); add(
                    new BooleanClause(pquery, org.apache.lucene.search.BooleanClause.Occur.MUST))) {
                Term t = new Term(field, termAtt.term());
                pquery = new TermQuery(t);
            }
            tokens.end();
            tokens.close();
        }
    }
}
From source file:drakkar.mast.retrieval.ngram.NGramQuery.java
/**
 *
 * @param a
 * @param queryTerm
 * @param field
 * @throws IOException
 */
public NGramQuery(Analyzer a, String queryTerm, String field) throws IOException {
    String words[] = null;
    if (queryTerm.contains(" ")) {
        words = queryTerm.split(" ");
    } else {
        words = new String[1];
        words[0] = queryTerm;
    }
    if (words.length > 1) {
        for (int i = 0; i < words.length; i++) {
            String string = words[i];
            Term t = new Term(field, string);
            TermQuery pquery = new TermQuery(t);
            add(pquery, org.apache.lucene.search.BooleanClause.Occur.SHOULD);
        }
    } else {
        for (int i = 0; i < words.length; i++) {
            String wordToAnalyze = words[i];
            TokenStream tokens = a.tokenStream(field, new StringReader(wordToAnalyze));
            TermAttribute termAtt = (TermAttribute) tokens.addAttribute(TermAttribute.class);
            tokens.reset();
            TermQuery pquery;
            for (; tokens.incrementToken(); add(
                    new BooleanClause(pquery, org.apache.lucene.search.BooleanClause.Occur.MUST))) {
                Term t = new Term(field, termAtt.term());
                pquery = new TermQuery(t);
            }
            tokens.end();
            tokens.close();
        }
    }
}
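Note that both NGramQuery variants above read token text through TermAttribute and its term() method, an API that was deprecated in the Lucene 3.x line and removed in 4.0. On 4.x the same inner loop would go through CharTermAttribute; a hedged sketch of the equivalent loop (not part of either original class):

    // Equivalent token loop using CharTermAttribute (sketch only, not from the original sources)
    TokenStream tokens = a.tokenStream(field, new StringReader(wordToAnalyze));
    CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
    tokens.reset();
    while (tokens.incrementToken()) {
        add(new BooleanClause(new TermQuery(new Term(field, termAtt.toString())),
                org.apache.lucene.search.BooleanClause.Occur.MUST));
    }
    tokens.end();
    tokens.close();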
From source file:edu.ehu.galan.lite.algorithms.ranked.supervised.tfidf.TFIDF.java
License:Open Source License
protected void computeTFIDF(List<TFIDFTerm> wordList, int totalWordsDoc) {
    if (reader != null && searcher != null) {
        double tf;
        double idf;
        double tfidf;
        EnglishAnalyzer analyzer = new EnglishAnalyzer(Version.LUCENE_40);
        TokenStream stream = null;
        CharTermAttribute termAtt;
        String term;
        double totalWikiDocs = (double) reader.numDocs();
        for (TFIDFTerm word : wordList) {
            try {
                term = "";
                stream = analyzer.tokenStream("field", new StringReader(word.word));
                termAtt = stream.addAttribute(CharTermAttribute.class);
                stream.reset();
                // concatenate all tokens until the stream is exhausted
                while (stream.incrementToken()) {
                    term += (termAtt.toString());
                }
                // System.out.println(term);
                stream.end();
                tf = (double) word.count / (double) totalWordsDoc;
                double wikiTermFrec = reader.docFreq(new Term("contents", term));
                if (wikiTermFrec != 0) {
                    idf = Math.log(totalWikiDocs / wikiTermFrec);
                    tfidf = tf * idf;
                } else {
                    tfidf = 0;
                }
                word.tfidf = tfidf;
            } catch (IOException ex) {
                logger.error("Error processing the TFIDF", ex);
            } finally {
                try {
                    if (stream != null) {
                        stream.close();
                    }
                } catch (IOException ex) {
                    logger.error("Error processing the TFIDF", ex);
                }
            }
        }
        try {
            reader.close();
        } catch (IOException ex) {
            logger.warn("Error closing lucene reader", ex);
        }
    }
}
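To make the arithmetic concrete with illustrative numbers (not taken from the project): if a term occurs 5 times in a 100-word document, tf = 5 / 100 = 0.05; if it appears in 1,000 of 1,000,000 indexed documents, idf = ln(1,000,000 / 1,000) ≈ 6.91, so tfidf ≈ 0.05 × 6.91 ≈ 0.35. A term that never occurs in the index gets tfidf = 0, as in the else branch above.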
From source file:edu.mit.ll.vizlinc.highlight.Highlighter.java
License:Apache License
/**
 * Low level api to get the most relevant (formatted) sections of the document.
 * This method has been made public to allow visibility of score information held in TextFragment objects.
 * Thanks to Jason Calabrese for help in redefining the interface.
 * @param tokenStream
 * @param text
 * @param maxNumFragments
 * @param mergeContiguousFragments
 * @throws IOException
 * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
 */
public final TextFragment[] getBestTextFragments(TokenStream tokenStream, String text,
        boolean mergeContiguousFragments, int maxNumFragments)
        throws IOException, InvalidTokenOffsetsException {
    ArrayList<TextFragment> docFrags = new ArrayList<TextFragment>();
    StringBuilder newText = new StringBuilder();

    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
    tokenStream.addAttribute(PositionIncrementAttribute.class);
    tokenStream.reset();

    TextFragment currentFrag = new TextFragment(newText, newText.length(), docFrags.size());

    if (fragmentScorer instanceof QueryScorer) {
        ((QueryScorer) fragmentScorer).setMaxDocCharsToAnalyze(maxDocCharsToAnalyze);
    }

    TokenStream newStream = fragmentScorer.init(tokenStream);
    if (newStream != null) {
        tokenStream = newStream;
    }
    fragmentScorer.startFragment(currentFrag);
    docFrags.add(currentFrag);

    FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);

    try {
        String tokenText;
        int startOffset;
        int endOffset;
        int lastEndOffset = 0;
        textFragmenter.start(text, tokenStream);

        TokenGroup tokenGroup = new TokenGroup(tokenStream);

        for (boolean next = tokenStream.incrementToken(); next
                && (offsetAtt.startOffset() < maxDocCharsToAnalyze); next = tokenStream.incrementToken()) {
            if ((offsetAtt.endOffset() > text.length()) || (offsetAtt.startOffset() > text.length())) {
                throw new InvalidTokenOffsetsException("Token " + termAtt.toString()
                        + " exceeds length of provided text sized " + text.length());
            }
            if ((tokenGroup.numTokens > 0) && (tokenGroup.isDistinct())) {
                // the current token is distinct from previous tokens -
                // markup the cached token group info
                startOffset = tokenGroup.matchStartOffset;
                endOffset = tokenGroup.matchEndOffset;
                tokenText = text.substring(startOffset, endOffset);
                String markedUpText = formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
                // store any whitespace etc from between this and last group
                if (startOffset > lastEndOffset)
                    newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
                newText.append(markedUpText);
                lastEndOffset = Math.max(endOffset, lastEndOffset);
                tokenGroup.clear();

                // check if current token marks the start of a new fragment
                if (textFragmenter.isNewFragment()) {
                    currentFrag.setScore(fragmentScorer.getFragmentScore());
                    // record stats for a new fragment
                    currentFrag.textEndPos = newText.length();
                    currentFrag = new TextFragment(newText, newText.length(), docFrags.size());
                    fragmentScorer.startFragment(currentFrag);
                    docFrags.add(currentFrag);
                }
            }

            tokenGroup.addToken(fragmentScorer.getTokenScore());

            // if (lastEndOffset > maxDocBytesToAnalyze) {
            //     break;
            // }
        }
        currentFrag.setScore(fragmentScorer.getFragmentScore());

        if (tokenGroup.numTokens > 0) {
            // flush the accumulated text (same code as in above loop)
            startOffset = tokenGroup.matchStartOffset;
            endOffset = tokenGroup.matchEndOffset;
            tokenText = text.substring(startOffset, endOffset);
            String markedUpText = formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
            // store any whitespace etc from between this and last group
            if (startOffset > lastEndOffset)
                newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
            newText.append(markedUpText);
            lastEndOffset = Math.max(lastEndOffset, endOffset);
        }

        // Test what remains of the original text beyond the point where we stopped analyzing
        if (
        // if there is text beyond the last token considered..
        (lastEndOffset < text.length()) &&
        // and that text is not too large...
                (text.length() <= maxDocCharsToAnalyze)) {
            // append it to the last fragment
            newText.append(encoder.encodeText(text.substring(lastEndOffset)));
        }

        currentFrag.textEndPos = newText.length();

        // sort the most relevant sections of the text
        for (Iterator<TextFragment> i = docFrags.iterator(); i.hasNext();) {
            currentFrag = i.next();

            // If you are running with a version of Lucene before 11th Sept 03
            // you do not have PriorityQueue.insert() - so uncomment the code below
            /*
            if (currentFrag.getScore() >= minScore) {
                fragQueue.put(currentFrag);
                if (fragQueue.size() > maxNumFragments) { // if hit queue overfull
                    fragQueue.pop(); // remove lowest in hit queue
                    minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore
                }
            }
            */
            // The above code caused a problem as a result of Christoph Goller's 11th Sept 03
            // fix to PriorityQueue. The correct method to use here is the new "insert" method
            // USE ABOVE CODE IF THIS DOES NOT COMPILE!
            fragQueue.insertWithOverflow(currentFrag);
        }

        // return the most relevant fragments
        TextFragment frag[] = new TextFragment[fragQueue.size()];
        for (int i = frag.length - 1; i >= 0; i--) {
            frag[i] = fragQueue.pop();
        }

        // merge any contiguous fragments to improve readability
        if (mergeContiguousFragments) {
            mergeContiguousFragments(frag);
            ArrayList<TextFragment> fragTexts = new ArrayList<TextFragment>();
            for (int i = 0; i < frag.length; i++) {
                if ((frag[i] != null) && (frag[i].getScore() > 0)) {
                    fragTexts.add(frag[i]);
                }
            }
            frag = fragTexts.toArray(new TextFragment[0]);
        }

        return frag;
    } finally {
        if (tokenStream != null) {
            try {
                tokenStream.end();
                tokenStream.close();
            } catch (Exception e) {
            }
        }
    }
}
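A caller would typically hand this method a fresh TokenStream produced by an analyzer; the finally block above then takes care of calling end() and close() on it. A hypothetical usage sketch (field name, analyzer, and fragment count are illustrative and not from the VizLinc source; 'highlighter' is assumed to have been constructed with a formatter and a QueryScorer):

    TokenStream tokenStream = analyzer.tokenStream("body", new StringReader(text));
    TextFragment[] frags = highlighter.getBestTextFragments(tokenStream, text, true, 3);
    for (TextFragment frag : frags) {
        if (frag != null && frag.getScore() > 0) {
            System.out.println(frag.toString());
        }
    }
    // No explicit close() needed here: getBestTextFragments ends and closes the stream in its finally block.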