List of usage examples for org.apache.lucene.analysis TokenStream incrementToken
public abstract boolean incrementToken() throws IOException;
From source file:ddf.catalog.pubsub.criteria.contextual.ContextualEvaluator.java
License:Open Source License
/**
 * Logs every token the given analyzer produces for the supplied document.
 * Does nothing unless DEBUG logging is enabled, so the tokenization cost is
 * only paid when the output will actually be seen.
 *
 * @param analyzer     analyzer used to tokenize the document
 * @param fieldName    index field name passed to the analyzer
 * @param fullDocument complete document text to tokenize
 * @param analyzerName label used in the start/end log delimiters
 * @throws IOException if the token stream fails during consumption
 */
private static void logTokens(Analyzer analyzer, String fieldName, String fullDocument, String analyzerName)
        throws IOException {
    if (!LOGGER.isDebugEnabled()) {
        return;
    }
    TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(fullDocument));
    // Attribute references must be obtained before consuming the stream; the
    // same instance is updated in place on each incrementToken() call.
    TermAttribute termAttribute = tokenStream.getAttribute(TermAttribute.class);
    LOGGER.debug("----- {} tokens -----", analyzerName);
    try {
        // TokenStream contract: reset() must be called before the first incrementToken().
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            LOGGER.debug(termAttribute.term());
        }
        // end() finalizes stream state (e.g. final offsets) after the last token.
        tokenStream.end();
    } finally {
        // Always release the stream, even if consumption throws.
        tokenStream.close();
    }
    LOGGER.debug("----- END: {} tokens -----", analyzerName);
}
From source file:de.berlinbuzzwords.AnalyzerPrinter.java
License:Apache License
public void printTerms(Analyzer analyzer, String text) throws IOException { // Create token stream from reader TokenStream stream = analyzer.tokenStream("dummyField", new StringReader(text)); // Reset stream before token consumption stream.reset();/*w w w .j av a 2s. c om*/ // Attribute to get the term text for a token CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class); // Output source text System.out.println("\ntext: " + text); // Analyze text and iterate until end of input while (stream.incrementToken()) { // Output term text System.out.println(" term: " + termAttr); } }
From source file:de.berlinbuzzwords.AnalyzerPrinter.java
License:Apache License
public void printTokenDetails(Analyzer analyzer, String text) throws IOException { // Create token stream from reader TokenStream stream = analyzer.tokenStream("dummy", new StringReader(text)); // Reset stream before token consumption stream.reset();// w w w . j ava 2s .c o m // Attribute to get the token term text, offset and type CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class); OffsetAttribute offsetAttr = stream.addAttribute(OffsetAttribute.class); TypeAttribute typeAttr = stream.addAttribute(TypeAttribute.class); // Output source text System.out.println("text: " + text); // Analyze text and iterate until end of input while (stream.incrementToken()) { // Output term text, type and offset System.out.println("term: " + termAttr + "\ttype: " + typeAttr.type() + "\tstart offset: " + offsetAttr.startOffset() + "\tend offset: " + offsetAttr.endOffset()); } }
From source file:de.blizzy.documentr.search.PageFinder.java
License:Open Source License
/**
 * Builds a "did you mean" suggestion for the given search text by spell-checking
 * each tokenized word against the {@code ALL_TEXT_SUGGESTIONS} index field.
 * Returns a suggestion (plain text plus an HTML variant with the corrected words
 * highlighted) only if the corrected query actually yields hits; otherwise null.
 *
 * @param searchText     the user's original query text
 * @param authentication used to re-run the corrected query with the caller's permissions
 * @param searcher       searcher whose index reader backs the spell checker
 * @return a suggestion with hit count, or {@code null} if no better query was found
 */
private SearchTextSuggestion getSearchTextSuggestion(String searchText, Authentication authentication,
        IndexSearcher searcher) throws IOException, ParseException, TimeoutException {

    // Tokenize the query, recording each word with its character offsets so we
    // can later splice replacements back into the original string.
    List<WordPosition> words = Lists.newArrayList();
    TokenStream tokenStream = null;
    try {
        tokenStream = analyzer.tokenStream(PageIndex.ALL_TEXT_SUGGESTIONS, new StringReader(searchText));
        tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.addAttribute(OffsetAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class);
            String text = charTerm.toString();
            if (StringUtils.isNotBlank(text)) {
                OffsetAttribute offset = tokenStream.getAttribute(OffsetAttribute.class);
                WordPosition word = new WordPosition(text, offset.startOffset(), offset.endOffset());
                words.add(word);
            }
        }
        tokenStream.end();
    } finally {
        Util.closeQuietly(tokenStream);
    }

    // Process words back-to-front: replacing a word can change the string's
    // length, which would invalidate the offsets of all words AFTER it — but
    // never of words BEFORE it.
    Collections.reverse(words);

    StringBuilder suggestedSearchText = new StringBuilder(searchText);
    StringBuilder suggestedSearchTextHtml = new StringBuilder(searchText);
    boolean foundSuggestions = false;
    // Timestamp-based markers make accidental collisions with user input unlikely;
    // they contain no HTML-special characters, so they survive escapeHtml4 below.
    String now = String.valueOf(System.currentTimeMillis());
    String startMarker = "__SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$
    String endMarker = "__/SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$
    DirectSpellChecker spellChecker = new DirectSpellChecker();
    IndexReader reader = searcher.getIndexReader();
    for (WordPosition word : words) {
        Term term = new Term(PageIndex.ALL_TEXT_SUGGESTIONS, word.getWord());
        // Ask for the single best, more popular alternative for this word.
        SuggestWord[] suggestions = spellChecker.suggestSimilar(term, 1, reader,
                SuggestMode.SUGGEST_MORE_POPULAR);
        if (suggestions.length > 0) {
            String suggestedWord = suggestions[0].string;
            int start = word.getStart();
            int end = word.getEnd();
            suggestedSearchText.replace(start, end, suggestedWord);
            // Wrap the replacement in markers so it can be highlighted after escaping.
            suggestedSearchTextHtml.replace(start, end,
                    startMarker + StringEscapeUtils.escapeHtml4(suggestedWord) + endMarker);
            foundSuggestions = true;
        }
    }

    if (foundSuggestions) {
        // Only offer the suggestion if the corrected query actually returns results.
        String suggestion = suggestedSearchText.toString();
        SearchResult suggestionResult = findPages(suggestion, 1, authentication, searcher);
        int suggestionTotalHits = suggestionResult.getTotalHits();
        if (suggestionTotalHits > 0) {
            // Escape everything first, then turn the (escape-proof) markers into
            // real highlight tags.
            String html = StringEscapeUtils.escapeHtml4(suggestedSearchTextHtml.toString())
                    .replaceAll(startMarker + "(.*?)" + endMarker, "<strong><em>$1</em></strong>"); //$NON-NLS-1$ //$NON-NLS-2$
            return new SearchTextSuggestion(suggestedSearchText.toString(), html, suggestionTotalHits);
        }
    }
    return null;
}
From source file:de.blizzy.documentr.search.PageIndex.java
License:Open Source License
/**
 * Builds a "did you mean" suggestion for the given search text by spell-checking
 * each tokenized word against the {@code ALL_TEXT_SUGGESTIONS} field of this index.
 * Returns a suggestion (plain text plus an HTML variant with the corrected words
 * highlighted) only if the corrected query actually yields hits; otherwise null.
 *
 * @param searchText     the user's original query text
 * @param authentication used to re-run the corrected query with the caller's permissions
 * @param searcher       searcher whose index reader backs the spell checker
 * @return a suggestion with hit count, or {@code null} if no better query was found
 */
private SearchTextSuggestion getSearchTextSuggestion(String searchText, Authentication authentication,
        IndexSearcher searcher) throws IOException, ParseException, TimeoutException {

    // Tokenize the query, recording each word with its character offsets so we
    // can later splice replacements back into the original string.
    List<WordPosition> words = Lists.newArrayList();
    TokenStream tokenStream = null;
    try {
        tokenStream = analyzer.tokenStream(ALL_TEXT_SUGGESTIONS, new StringReader(searchText));
        tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.addAttribute(OffsetAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class);
            String text = charTerm.toString();
            if (StringUtils.isNotBlank(text)) {
                OffsetAttribute offset = tokenStream.getAttribute(OffsetAttribute.class);
                WordPosition word = new WordPosition(text, offset.startOffset(), offset.endOffset());
                words.add(word);
            }
        }
        tokenStream.end();
    } finally {
        Closeables.closeQuietly(tokenStream);
    }

    // Process words back-to-front: replacing a word can change the string's
    // length, which would invalidate the offsets of all words AFTER it — but
    // never of words BEFORE it.
    Collections.reverse(words);

    StringBuilder suggestedSearchText = new StringBuilder(searchText);
    StringBuilder suggestedSearchTextHtml = new StringBuilder(searchText);
    boolean foundSuggestions = false;
    // Timestamp-based markers make accidental collisions with user input unlikely;
    // they contain no HTML-special characters, so they survive escapeHtml4 below.
    String now = String.valueOf(System.currentTimeMillis());
    String startMarker = "__SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$
    String endMarker = "__/SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$
    DirectSpellChecker spellChecker = new DirectSpellChecker();
    IndexReader reader = searcher.getIndexReader();
    for (WordPosition word : words) {
        Term term = new Term(ALL_TEXT_SUGGESTIONS, word.getWord());
        // Ask for the single best, more popular alternative for this word.
        SuggestWord[] suggestions = spellChecker.suggestSimilar(term, 1, reader,
                SuggestMode.SUGGEST_MORE_POPULAR);
        if (suggestions.length > 0) {
            String suggestedWord = suggestions[0].string;
            int start = word.getStart();
            int end = word.getEnd();
            suggestedSearchText.replace(start, end, suggestedWord);
            // Wrap the replacement in markers so it can be highlighted after escaping.
            suggestedSearchTextHtml.replace(start, end,
                    startMarker + StringEscapeUtils.escapeHtml4(suggestedWord) + endMarker);
            foundSuggestions = true;
        }
    }

    if (foundSuggestions) {
        // Only offer the suggestion if the corrected query actually returns results.
        String suggestion = suggestedSearchText.toString();
        SearchResult suggestionResult = findPages(suggestion, 1, authentication, searcher);
        int suggestionTotalHits = suggestionResult.getTotalHits();
        if (suggestionTotalHits > 0) {
            // Escape everything first, then turn the (escape-proof) markers into
            // real highlight tags.
            String html = StringEscapeUtils.escapeHtml4(suggestedSearchTextHtml.toString())
                    .replaceAll(startMarker + "(.*?)" + endMarker, "<strong><em>$1</em></strong>"); //$NON-NLS-1$ //$NON-NLS-2$
            return new SearchTextSuggestion(suggestedSearchText.toString(), html, suggestionTotalHits);
        }
    }
    return null;
}
From source file:de.catma.indexer.TermExtractor.java
License:Open Source License
private void extractTermInfos(String content, List<String> unseparableCharacterSequences, List<Character> userDefinedSeparatingCharacters, Locale locale) throws IOException { terms = new HashMap<String, List<TermInfo>>(); termsInOrder = new ArrayList<String>(); if (locale == null) { locale = Locale.getDefault(); }/*www. ja v a 2s . c om*/ WhitespaceAndPunctuationAnalyzer analyzer = new WhitespaceAndPunctuationAnalyzer( unseparableCharacterSequences, userDefinedSeparatingCharacters, locale); TokenStream ts = analyzer.tokenStream(null, // our analyzer does not use // the fieldname new StringReader(content)); int positionCounter = 0; while (ts.incrementToken()) { CharTermAttribute termAttr = (CharTermAttribute) ts.getAttribute(CharTermAttribute.class); OffsetAttribute offsetAttr = (OffsetAttribute) ts.getAttribute(OffsetAttribute.class); TermInfo ti = new TermInfo(termAttr.toString(), offsetAttr.startOffset(), offsetAttr.endOffset(), positionCounter); if (!terms.containsKey(ti.getTerm())) { terms.put(ti.getTerm(), new ArrayList<TermInfo>()); } terms.get(ti.getTerm()).add(ti); positionCounter++; termsInOrder.add(ti.getTerm()); } }
From source file:de.catma.indexer.WildcardTermExtractor.java
License:Open Source License
private void extractTermInfosWithWildcards(String content, List<String> unseparableCharacterSequences, List<Character> userDefinedSeparatingCharacters, Locale locale) throws IOException { if (locale == null) { locale = Locale.getDefault(); }/*from w w w. j av a2 s .c om*/ WhitespaceAndPunctuationAnalyzer analyzer = new WhitespaceAndPunctuationAnalyzer( unseparableCharacterSequences, userDefinedSeparatingCharacters, locale); TokenStream ts = analyzer.tokenStream(null, // our analyzer does not use // the fieldname new StringReader(content)); WildcardParser wildcardParser = new WildcardParser(); while (ts.incrementToken()) { CharTermAttribute termAttr = (CharTermAttribute) ts.getAttribute(CharTermAttribute.class); OffsetAttribute offsetAttr = (OffsetAttribute) ts.getAttribute(OffsetAttribute.class); wildcardParser.handle(termAttr, offsetAttr); } wildcardParser.finish(); orderedTerms = new ArrayList<String>(); for (TermInfo ti : wildcardParser.getOrderedTermInfos()) { orderedTerms.add(ti.getTerm()); } }
From source file:de.csw.ontology.XWikiTextEnhancer.java
License:Open Source License
/** * The enhanced text contains links to the Lucene search page of the xWiki * system. The search terms are related to the annotated phrase. */// w w w. j a va 2 s. c om public String enhance(String text) { CSWGermanAnalyzer ga = new CSWGermanAnalyzer(); TokenStream ts = null; StringBuilder result = new StringBuilder(); initializeLinkIndex(text); try { Reader r = new BufferedReader(new StringReader(text)); ts = ga.tokenStream("", r); CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class); OffsetAttribute offsetAttribute = ts.addAttribute(OffsetAttribute.class); TypeAttribute typeAttribute = ts.addAttribute(TypeAttribute.class); String term; int lastEndIndex = 0; while (ts.incrementToken()) { result.append(text.substring(lastEndIndex, offsetAttribute.startOffset())); term = String.copyValueOf(charTermAttribute.buffer(), 0, charTermAttribute.length()); if (typeAttribute.type().equals(ConceptFilter.CONCEPT_TYPE) && isAnnotatable(offsetAttribute)) { log.debug("Annotating concept: " + term); annotateWithSearch(result, text.substring(offsetAttribute.startOffset(), offsetAttribute.endOffset()), term); } else { result.append(text.substring(offsetAttribute.startOffset(), offsetAttribute.endOffset())); } lastEndIndex = offsetAttribute.endOffset(); } result.append(text.subSequence(lastEndIndex, text.length())); } catch (IOException e) { Log.error("Error while processing the page content", e); } ga.close(); return result.toString(); }
From source file:de.escidoc.sb.common.lucene.analyzer.EscidocJapaneseAnalyzer.java
License:Open Source License
/**
 * Constructs a token stream with JapaneseAnalyzer or WhitespaceTokenizer
 * depending if text is japanese or not.
 *
 * @param fieldName
 *            name of the Lucene Indexfield.
 * @param reader
 *            reader with field-value
 *
 * @return TokenStream tokenStream
 *
 * @sb
 */
@Override
public TokenStream tokenStream(final String fieldName, final Reader reader) {
    if (log.isDebugEnabled()) {
        log.debug("tokenizing with EscidocJapaneseAnalyzer");
    }
    // checkJapanese: the input reader can only be consumed once, so the text is
    // first drained through a whitespace tokenizer into a buffer, sniffed for
    // Japanese characters, and then re-wrapped in a new reader for the real pass.
    ///////////////////////////////////////////////////////
    boolean isJapanese = false;
    TokenStream whitespaceTokens = new WhitespaceTokenizer(Constants.LUCENE_VERSION, reader);
    Reader reader1 = null;
    try {
        StringBuffer tokenBuffer = new StringBuffer("");
        CharTermAttribute termAtt = whitespaceTokens.addAttribute(CharTermAttribute.class);
        whitespaceTokens.reset();
        // Rebuild the text as space-joined whitespace tokens. NOTE(review):
        // this normalizes all runs of whitespace to single spaces, and the
        // tokenizer is never end()ed/close()d — presumably tolerated here.
        while (whitespaceTokens.incrementToken()) {
            if (tokenBuffer.length() > 0) {
                tokenBuffer.append(" ");
            }
            tokenBuffer.append(termAtt.toString());
        }
        // Scan for any character in the range 0x3000-0x340F (CJK symbols and
        // punctuation, Hiragana, Katakana, and adjacent blocks) to decide
        // whether the text is Japanese.
        for (int i = 0; i < tokenBuffer.length(); i++) {
            int hexInt = Integer.parseInt(charToHex(tokenBuffer.charAt(i)), 16);
            if (hexInt > 12287 && hexInt < 13328) {
                isJapanese = true;
                break;
            }
        }
        reader1 = new StringReader(tokenBuffer.toString());
    } catch (Exception e) {
        // NOTE(review): swallowed after logging — if this triggers before
        // reader1 is assigned, the code below proceeds with reader1 == null.
        log.error(e);
    }
    ///////////////////////////////////////////////////////////////////////
    // No Japanese, so return whitespace-tokens
    if (!isJapanese) {
        TokenStream result = new XmlWhitespaceTokenizer(reader1);
        result = new JunkFilter(result);
        result = new LowerCaseFilter(Constants.LUCENE_VERSION, result);
        return result;
    }
    // Get Japanese Tokens
    JapaneseAnalyzer analyzer = new JapaneseAnalyzer(Constants.LUCENE_VERSION);
    TokenStream japaneseTokens = analyzer.tokenStream("", reader1);
    // NOTE(review): the null check is always true (analyzer was just
    // constructed), and the analyzer is closed before the returned stream is
    // consumed — presumably safe for this analyzer version; verify.
    if (analyzer != null) {
        try {
            analyzer.close();
        } catch (Exception e) {
            // Intentionally ignored: best-effort cleanup.
        }
    }
    return japaneseTokens;
}
From source file:de.escidoc.sb.common.lucene.analyzer.Test.java
License:Open Source License
/** * @param args/* ww w. j a v a 2s . c o m*/ */ public static void main(String[] args) { try { Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30); TokenStream ts = analyzer.tokenStream("myfield", new StringReader("ome text goes here")); ts = new GreekFilter(ts); CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); ts.reset(); while (ts.incrementToken()) { System.out.println("token: " + termAtt.toString()); } } catch (Exception e) { System.out.println(e); } }