List of usage examples for org.apache.lucene.analysis TokenStream reset
public void reset() throws IOException
From source file:de.berlinbuzzwords.AnalyzerPrinter.java
License:Apache License
public void printTokenDetails(Analyzer analyzer, String text) throws IOException { // Create token stream from reader TokenStream stream = analyzer.tokenStream("dummy", new StringReader(text)); // Reset stream before token consumption stream.reset(); // Attribute to get the token term text, offset and type CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class); OffsetAttribute offsetAttr = stream.addAttribute(OffsetAttribute.class); TypeAttribute typeAttr = stream.addAttribute(TypeAttribute.class); // Output source text System.out.println("text: " + text); // Analyze text and iterate until end of input while (stream.incrementToken()) { // Output term text, type and offset System.out.println("term: " + termAttr + "\ttype: " + typeAttr.type() + "\tstart offset: " + offsetAttr.startOffset() + "\tend offset: " + offsetAttr.endOffset()); }// ww w . j ava 2s . c o m }
From source file:de.blizzy.documentr.search.PageFinder.java
License:Open Source License
private SearchTextSuggestion getSearchTextSuggestion(String searchText, Authentication authentication, IndexSearcher searcher) throws IOException, ParseException, TimeoutException { List<WordPosition> words = Lists.newArrayList(); TokenStream tokenStream = null; try {//from w ww . ja v a 2 s . c om tokenStream = analyzer.tokenStream(PageIndex.ALL_TEXT_SUGGESTIONS, new StringReader(searchText)); tokenStream.addAttribute(CharTermAttribute.class); tokenStream.addAttribute(OffsetAttribute.class); tokenStream.reset(); while (tokenStream.incrementToken()) { CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class); String text = charTerm.toString(); if (StringUtils.isNotBlank(text)) { OffsetAttribute offset = tokenStream.getAttribute(OffsetAttribute.class); WordPosition word = new WordPosition(text, offset.startOffset(), offset.endOffset()); words.add(word); } } tokenStream.end(); } finally { Util.closeQuietly(tokenStream); } Collections.reverse(words); StringBuilder suggestedSearchText = new StringBuilder(searchText); StringBuilder suggestedSearchTextHtml = new StringBuilder(searchText); boolean foundSuggestions = false; String now = String.valueOf(System.currentTimeMillis()); String startMarker = "__SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$ String endMarker = "__/SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$ DirectSpellChecker spellChecker = new DirectSpellChecker(); IndexReader reader = searcher.getIndexReader(); for (WordPosition word : words) { Term term = new Term(PageIndex.ALL_TEXT_SUGGESTIONS, word.getWord()); SuggestWord[] suggestions = spellChecker.suggestSimilar(term, 1, reader, SuggestMode.SUGGEST_MORE_POPULAR); if (suggestions.length > 0) { String suggestedWord = suggestions[0].string; int start = word.getStart(); int end = word.getEnd(); suggestedSearchText.replace(start, end, suggestedWord); suggestedSearchTextHtml.replace(start, end, startMarker + StringEscapeUtils.escapeHtml4(suggestedWord) + endMarker); 
foundSuggestions = true; } } if (foundSuggestions) { String suggestion = suggestedSearchText.toString(); SearchResult suggestionResult = findPages(suggestion, 1, authentication, searcher); int suggestionTotalHits = suggestionResult.getTotalHits(); if (suggestionTotalHits > 0) { String html = StringEscapeUtils.escapeHtml4(suggestedSearchTextHtml.toString()) .replaceAll(startMarker + "(.*?)" + endMarker, "<strong><em>$1</em></strong>"); //$NON-NLS-1$ //$NON-NLS-2$ return new SearchTextSuggestion(suggestedSearchText.toString(), html, suggestionTotalHits); } } return null; }
From source file:de.blizzy.documentr.search.PageIndex.java
License:Open Source License
private SearchTextSuggestion getSearchTextSuggestion(String searchText, Authentication authentication, IndexSearcher searcher) throws IOException, ParseException, TimeoutException { List<WordPosition> words = Lists.newArrayList(); TokenStream tokenStream = null; try {/*from w ww .j a v a 2 s . c o m*/ tokenStream = analyzer.tokenStream(ALL_TEXT_SUGGESTIONS, new StringReader(searchText)); tokenStream.addAttribute(CharTermAttribute.class); tokenStream.addAttribute(OffsetAttribute.class); tokenStream.reset(); while (tokenStream.incrementToken()) { CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class); String text = charTerm.toString(); if (StringUtils.isNotBlank(text)) { OffsetAttribute offset = tokenStream.getAttribute(OffsetAttribute.class); WordPosition word = new WordPosition(text, offset.startOffset(), offset.endOffset()); words.add(word); } } tokenStream.end(); } finally { Closeables.closeQuietly(tokenStream); } Collections.reverse(words); StringBuilder suggestedSearchText = new StringBuilder(searchText); StringBuilder suggestedSearchTextHtml = new StringBuilder(searchText); boolean foundSuggestions = false; String now = String.valueOf(System.currentTimeMillis()); String startMarker = "__SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$ String endMarker = "__/SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$ DirectSpellChecker spellChecker = new DirectSpellChecker(); IndexReader reader = searcher.getIndexReader(); for (WordPosition word : words) { Term term = new Term(ALL_TEXT_SUGGESTIONS, word.getWord()); SuggestWord[] suggestions = spellChecker.suggestSimilar(term, 1, reader, SuggestMode.SUGGEST_MORE_POPULAR); if (suggestions.length > 0) { String suggestedWord = suggestions[0].string; int start = word.getStart(); int end = word.getEnd(); suggestedSearchText.replace(start, end, suggestedWord); suggestedSearchTextHtml.replace(start, end, startMarker + StringEscapeUtils.escapeHtml4(suggestedWord) + endMarker); foundSuggestions = 
true; } } if (foundSuggestions) { String suggestion = suggestedSearchText.toString(); SearchResult suggestionResult = findPages(suggestion, 1, authentication, searcher); int suggestionTotalHits = suggestionResult.getTotalHits(); if (suggestionTotalHits > 0) { String html = StringEscapeUtils.escapeHtml4(suggestedSearchTextHtml.toString()) .replaceAll(startMarker + "(.*?)" + endMarker, "<strong><em>$1</em></strong>"); //$NON-NLS-1$ //$NON-NLS-2$ return new SearchTextSuggestion(suggestedSearchText.toString(), html, suggestionTotalHits); } } return null; }
From source file:de.catma.indexer.WhitespaceAndPunctuationAnalyzer.java
License:Open Source License
/**
 * Returns the token stream stored via {@code setPreviousTokenStream}, building and
 * storing a new {@code PunctuationTokenizer} chain when none has been stored yet.
 *
 * @param fieldName not used by this method
 * @param reader    input handed to the tokenizer chain when it is first created
 * @return the stored (and reset) stream, or the newly created one
 * @throws IOException if resetting the stored stream fails
 */
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    TokenStream cached = (TokenStream) getPreviousTokenStream();
    if (cached != null) {
        // NOTE(review): only the no-arg reset() is called on the reuse path; the new
        // reader is not handed to the stored stream here - confirm the tokenizer chain
        // picks up the new input some other way.
        cached.reset();
        return cached;
    }

    // First use: wrap the reader in the whitespace tokenizer, then in the punctuation tokenizer
    TokenStream created = new PunctuationTokenizer(
            new CatmaWhitespaceTokenizer(reader, unseparableCharacterSequences),
            unseparableCharacterSequences,
            userDefSeparatingPunctuationPattern,
            locale);
    setPreviousTokenStream(created);
    return created;
}
From source file:de.escidoc.sb.common.lucene.analyzer.EscidocJapaneseAnalyzer.java
License:Open Source License
/** * Constructs a token stream with JapaneseAnalyzer or WhitespaceTokenizer * depending if text is japanese or not. * /*www . j av a 2 s . co m*/ * @param fieldName * name of the Lucene Indexfield. * @param reader * reader with field-value * * @return TokenStream tokenStream * * @sb */ @Override public TokenStream tokenStream(final String fieldName, final Reader reader) { if (log.isDebugEnabled()) { log.debug("tokenizing with EscidocJapaneseAnalyzer"); } //checkJapanese /////////////////////////////////////////////////////// boolean isJapanese = false; TokenStream whitespaceTokens = new WhitespaceTokenizer(Constants.LUCENE_VERSION, reader); Reader reader1 = null; try { StringBuffer tokenBuffer = new StringBuffer(""); CharTermAttribute termAtt = whitespaceTokens.addAttribute(CharTermAttribute.class); whitespaceTokens.reset(); while (whitespaceTokens.incrementToken()) { if (tokenBuffer.length() > 0) { tokenBuffer.append(" "); } tokenBuffer.append(termAtt.toString()); } for (int i = 0; i < tokenBuffer.length(); i++) { int hexInt = Integer.parseInt(charToHex(tokenBuffer.charAt(i)), 16); if (hexInt > 12287 && hexInt < 13328) { isJapanese = true; break; } } reader1 = new StringReader(tokenBuffer.toString()); } catch (Exception e) { log.error(e); } /////////////////////////////////////////////////////////////////////// //No Japanese, so return whitespace-tokens if (!isJapanese) { TokenStream result = new XmlWhitespaceTokenizer(reader1); result = new JunkFilter(result); result = new LowerCaseFilter(Constants.LUCENE_VERSION, result); return result; } //Get Japanese Tokens JapaneseAnalyzer analyzer = new JapaneseAnalyzer(Constants.LUCENE_VERSION); TokenStream japaneseTokens = analyzer.tokenStream("", reader1); if (analyzer != null) { try { analyzer.close(); } catch (Exception e) { } } return japaneseTokens; }
From source file:de.escidoc.sb.common.lucene.analyzer.Test.java
License:Open Source License
/**
 * Tokenizes a fixed sample text with a StandardAnalyzer wrapped in a GreekFilter
 * and prints every resulting token to standard output. Any exception is printed
 * to standard output as well.
 *
 * @param args not used
 */
public static void main(String[] args) {
    try {
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
        TokenStream ts = analyzer.tokenStream("myfield", new StringReader("ome text goes here"));
        ts = new GreekFilter(ts);
        try {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println("token: " + termAtt.toString());
            }
            // signal end of stream once all tokens have been consumed
            ts.end();
        } finally {
            // always release the stream, even if consumption failed
            ts.close();
        }
    } catch (Exception e) {
        System.out.println(e);
    }
}
From source file:de.mirkosertic.desktopsearch.SearchPhraseSuggester.java
License:Open Source License
/**
 * Runs the given text through the analyzer and returns the first token it produces.
 *
 * @param aFieldName field name passed to the analyzer
 * @param aString    text to analyze
 * @return the text of the first token, or {@code null} if the analyzer produces no token
 * @throws IOException if the token stream fails while being reset, read, ended or closed
 */
private String analyze(String aFieldName, String aString) throws IOException {
    TokenStream theTokenStream = analyzer.tokenStream(aFieldName, aString);
    try {
        // reset() and getAttribute() are inside the outer try so the stream is
        // closed even when one of them throws
        theTokenStream.reset();
        CharTermAttribute theCharTerms = theTokenStream.getAttribute(CharTermAttribute.class);
        try {
            if (theTokenStream.incrementToken()) {
                return theCharTerms.toString();
            }
            return null;
        } finally {
            // only reached after a successful reset()
            theTokenStream.end();
        }
    } finally {
        // runs even if end() throws
        theTokenStream.close();
    }
}
From source file:de.twitterlivesearch.analysis.Tokenizer.java
License:Apache License
/**
 * Tokenizes a string with the given analyzer and returns the text of every token
 * in the order the analyzer produced them.
 *
 * @param stringToAnalyze
 *            String to be tokenized
 * @param analyzer
 *            {@link org.apache.lucene.analysis.Analyzer Analyzer} to be used
 *            for analysis
 *
 * @return list of tokens
 * @throws RuntimeException wrapping the {@link IOException} if the token stream fails
 */
public static List<String> getTokensForString(String stringToAnalyze, Analyzer analyzer) {
    List<String> tokens = new ArrayList<String>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(stringToAnalyze));
        try {
            stream.reset();
            while (stream.incrementToken()) {
                tokens.add(stream.getAttribute(CharTermAttribute.class).toString());
            }
            stream.end();
        } finally {
            // release the stream even when reset/incrementToken/end failed
            stream.close();
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return tokens;
}
From source file:de.uni_koeln.spinfo.maalr.lucene.util.TokenizerHelper.java
License:Apache License
public static String tokenizeString(Analyzer analyzer, String string) { // Inspired by stackoverflow: // http://stackoverflow.com/questions/6334692/how-to-use-a-lucene-analyzer-to-tokenize-a-string StringBuilder builder = new StringBuilder(); try {// www.j a v a 2s . com TokenStream stream = analyzer.tokenStream(null, new StringReader(string)); stream.reset(); while (stream.incrementToken()) { builder.append(stream.getAttribute(CharTermAttribute.class).toString()); builder.append(" "); } stream.close(); } catch (IOException e) { // not thrown b/c we're using a string reader... throw new RuntimeException(e); } return builder.toString().trim(); }
From source file:dependencies.ReviewDependencyAnalyzer.java
License:Open Source License
@Override public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException { TokenStream tok_str = (TokenStream) getPreviousTokenStream(); if (tok_str == null) { tok_str = tokenStream(fieldName, reader); setPreviousTokenStream(tok_str); } else {//from w ww.j av a 2 s . c om source_document.reset(reader); tok_str.reset(); } return tok_str; }