List of usage examples for org.apache.lucene.analysis TokenStream addAttribute
public final <T extends Attribute> T addAttribute(Class<T> attClass)
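addAttribute returns the stream's shared instance of the requested attribute class, creating and registering it if it does not exist yet; the returned reference is then refilled in place on every call to incrementToken(). Below is a minimal, self-contained sketch of this consumer contract (assuming Lucene 5.x or later for the no-argument StandardAnalyzer constructor and the String-based tokenStream overload; the field name "body" and the sample text are placeholders):

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class AddAttributeExample {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        try (TokenStream stream = analyzer.tokenStream("body", "the quick brown fox")) {
            // Acquire attribute references once, before consuming the stream;
            // the same instances are updated in place by each incrementToken() call
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
            stream.reset(); // required before the first incrementToken()
            while (stream.incrementToken()) {
                System.out.println(term + " [" + offset.startOffset() + "-" + offset.endOffset() + "]");
            }
            stream.end(); // records the final offset state
        }
        analyzer.close();
    }
}

The same add-reset-increment-end-close sequence appears, with variations, in the examples below.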
From source file:de.berlinbuzzwords.AnalyzerPrinter.java
License:Apache License
public void printTerms(Analyzer analyzer, String text) throws IOException {
    // Create token stream from reader
    TokenStream stream = analyzer.tokenStream("dummyField", new StringReader(text));
    // Reset stream before token consumption
    stream.reset();
    // Attribute to get the term text for a token
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    // Output source text
    System.out.println("\ntext: " + text);
    // Analyze text and iterate until end of input
    while (stream.incrementToken()) {
        // Output term text
        System.out.println("  term: " + termAttr);
    }
}
From source file:de.berlinbuzzwords.AnalyzerPrinter.java
License:Apache License
public void printTokenDetails(Analyzer analyzer, String text) throws IOException {
    // Create token stream from reader
    TokenStream stream = analyzer.tokenStream("dummy", new StringReader(text));
    // Reset stream before token consumption
    stream.reset();
    // Attributes to get the token term text, offset and type
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAttr = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute typeAttr = stream.addAttribute(TypeAttribute.class);
    // Output source text
    System.out.println("text: " + text);
    // Analyze text and iterate until end of input
    while (stream.incrementToken()) {
        // Output term text, type and offset
        System.out.println("term: " + termAttr + "\ttype: " + typeAttr.type()
                + "\tstart offset: " + offsetAttr.startOffset()
                + "\tend offset: " + offsetAttr.endOffset());
    }
}
From source file:de.blizzy.documentr.search.PageFinder.java
License:Open Source License
private SearchTextSuggestion getSearchTextSuggestion(String searchText, Authentication authentication,
        IndexSearcher searcher) throws IOException, ParseException, TimeoutException {

    List<WordPosition> words = Lists.newArrayList();

    TokenStream tokenStream = null;
    try {
        tokenStream = analyzer.tokenStream(PageIndex.ALL_TEXT_SUGGESTIONS, new StringReader(searchText));
        tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.addAttribute(OffsetAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class);
            String text = charTerm.toString();
            if (StringUtils.isNotBlank(text)) {
                OffsetAttribute offset = tokenStream.getAttribute(OffsetAttribute.class);
                WordPosition word = new WordPosition(text, offset.startOffset(), offset.endOffset());
                words.add(word);
            }
        }
        tokenStream.end();
    } finally {
        Util.closeQuietly(tokenStream);
    }

    Collections.reverse(words);

    StringBuilder suggestedSearchText = new StringBuilder(searchText);
    StringBuilder suggestedSearchTextHtml = new StringBuilder(searchText);
    boolean foundSuggestions = false;
    String now = String.valueOf(System.currentTimeMillis());
    String startMarker = "__SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$
    String endMarker = "__/SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$
    DirectSpellChecker spellChecker = new DirectSpellChecker();
    IndexReader reader = searcher.getIndexReader();
    for (WordPosition word : words) {
        Term term = new Term(PageIndex.ALL_TEXT_SUGGESTIONS, word.getWord());
        SuggestWord[] suggestions = spellChecker.suggestSimilar(term, 1, reader,
                SuggestMode.SUGGEST_MORE_POPULAR);
        if (suggestions.length > 0) {
            String suggestedWord = suggestions[0].string;
            int start = word.getStart();
            int end = word.getEnd();
            suggestedSearchText.replace(start, end, suggestedWord);
            suggestedSearchTextHtml.replace(start, end,
                    startMarker + StringEscapeUtils.escapeHtml4(suggestedWord) + endMarker);
            foundSuggestions = true;
        }
    }

    if (foundSuggestions) {
        String suggestion = suggestedSearchText.toString();
        SearchResult suggestionResult = findPages(suggestion, 1, authentication, searcher);
        int suggestionTotalHits = suggestionResult.getTotalHits();
        if (suggestionTotalHits > 0) {
            String html = StringEscapeUtils.escapeHtml4(suggestedSearchTextHtml.toString())
                    .replaceAll(startMarker + "(.*?)" + endMarker, "<strong><em>$1</em></strong>"); //$NON-NLS-1$ //$NON-NLS-2$
            return new SearchTextSuggestion(suggestedSearchText.toString(), html, suggestionTotalHits);
        }
    }

    return null;
}
From source file:de.blizzy.documentr.search.PageIndex.java
License:Open Source License
private SearchTextSuggestion getSearchTextSuggestion(String searchText, Authentication authentication,
        IndexSearcher searcher) throws IOException, ParseException, TimeoutException {

    List<WordPosition> words = Lists.newArrayList();

    TokenStream tokenStream = null;
    try {
        tokenStream = analyzer.tokenStream(ALL_TEXT_SUGGESTIONS, new StringReader(searchText));
        tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.addAttribute(OffsetAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class);
            String text = charTerm.toString();
            if (StringUtils.isNotBlank(text)) {
                OffsetAttribute offset = tokenStream.getAttribute(OffsetAttribute.class);
                WordPosition word = new WordPosition(text, offset.startOffset(), offset.endOffset());
                words.add(word);
            }
        }
        tokenStream.end();
    } finally {
        Closeables.closeQuietly(tokenStream);
    }

    Collections.reverse(words);

    StringBuilder suggestedSearchText = new StringBuilder(searchText);
    StringBuilder suggestedSearchTextHtml = new StringBuilder(searchText);
    boolean foundSuggestions = false;
    String now = String.valueOf(System.currentTimeMillis());
    String startMarker = "__SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$
    String endMarker = "__/SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$
    DirectSpellChecker spellChecker = new DirectSpellChecker();
    IndexReader reader = searcher.getIndexReader();
    for (WordPosition word : words) {
        Term term = new Term(ALL_TEXT_SUGGESTIONS, word.getWord());
        SuggestWord[] suggestions = spellChecker.suggestSimilar(term, 1, reader,
                SuggestMode.SUGGEST_MORE_POPULAR);
        if (suggestions.length > 0) {
            String suggestedWord = suggestions[0].string;
            int start = word.getStart();
            int end = word.getEnd();
            suggestedSearchText.replace(start, end, suggestedWord);
            suggestedSearchTextHtml.replace(start, end,
                    startMarker + StringEscapeUtils.escapeHtml4(suggestedWord) + endMarker);
            foundSuggestions = true;
        }
    }

    if (foundSuggestions) {
        String suggestion = suggestedSearchText.toString();
        SearchResult suggestionResult = findPages(suggestion, 1, authentication, searcher);
        int suggestionTotalHits = suggestionResult.getTotalHits();
        if (suggestionTotalHits > 0) {
            String html = StringEscapeUtils.escapeHtml4(suggestedSearchTextHtml.toString())
                    .replaceAll(startMarker + "(.*?)" + endMarker, "<strong><em>$1</em></strong>"); //$NON-NLS-1$ //$NON-NLS-2$
            return new SearchTextSuggestion(suggestedSearchText.toString(), html, suggestionTotalHits);
        }
    }

    return null;
}
From source file:de.csw.ontology.XWikiTextEnhancer.java
License:Open Source License
/**
 * The enhanced text contains links to the Lucene search page of the xWiki
 * system. The search terms are related to the annotated phrase.
 */
public String enhance(String text) {
    CSWGermanAnalyzer ga = new CSWGermanAnalyzer();
    TokenStream ts = null;
    StringBuilder result = new StringBuilder();

    initializeLinkIndex(text);

    try {
        Reader r = new BufferedReader(new StringReader(text));
        ts = ga.tokenStream("", r);

        CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAttribute = ts.addAttribute(OffsetAttribute.class);
        TypeAttribute typeAttribute = ts.addAttribute(TypeAttribute.class);

        String term;
        int lastEndIndex = 0;

        while (ts.incrementToken()) {
            result.append(text.substring(lastEndIndex, offsetAttribute.startOffset()));
            term = String.copyValueOf(charTermAttribute.buffer(), 0, charTermAttribute.length());

            if (typeAttribute.type().equals(ConceptFilter.CONCEPT_TYPE)
                    && isAnnotatable(offsetAttribute)) {
                log.debug("Annotating concept: " + term);
                annotateWithSearch(result,
                        text.substring(offsetAttribute.startOffset(), offsetAttribute.endOffset()), term);
            } else {
                result.append(text.substring(offsetAttribute.startOffset(), offsetAttribute.endOffset()));
            }

            lastEndIndex = offsetAttribute.endOffset();
        }
        result.append(text.subSequence(lastEndIndex, text.length()));
    } catch (IOException e) {
        Log.error("Error while processing the page content", e);
    }

    ga.close();
    return result.toString();
}
From source file:de.escidoc.sb.common.lucene.analyzer.EscidocJapaneseAnalyzer.java
License:Open Source License
/**
 * Constructs a token stream with JapaneseAnalyzer or WhitespaceTokenizer,
 * depending on whether the text is Japanese or not.
 *
 * @param fieldName
 *            name of the Lucene index field.
 * @param reader
 *            reader with the field value
 *
 * @return TokenStream tokenStream
 *
 * @sb
 */
@Override
public TokenStream tokenStream(final String fieldName, final Reader reader) {
    if (log.isDebugEnabled()) {
        log.debug("tokenizing with EscidocJapaneseAnalyzer");
    }

    // Check whether the text contains Japanese characters
    boolean isJapanese = false;
    TokenStream whitespaceTokens = new WhitespaceTokenizer(Constants.LUCENE_VERSION, reader);
    Reader reader1 = null;
    try {
        StringBuffer tokenBuffer = new StringBuffer("");
        CharTermAttribute termAtt = whitespaceTokens.addAttribute(CharTermAttribute.class);
        whitespaceTokens.reset();
        while (whitespaceTokens.incrementToken()) {
            if (tokenBuffer.length() > 0) {
                tokenBuffer.append(" ");
            }
            tokenBuffer.append(termAtt.toString());
        }
        for (int i = 0; i < tokenBuffer.length(); i++) {
            int hexInt = Integer.parseInt(charToHex(tokenBuffer.charAt(i)), 16);
            // Code points 0x3000-0x340F cover CJK punctuation, Hiragana,
            // Katakana and related blocks
            if (hexInt > 12287 && hexInt < 13328) {
                isJapanese = true;
                break;
            }
        }
        reader1 = new StringReader(tokenBuffer.toString());
    } catch (Exception e) {
        log.error(e);
    }

    // No Japanese, so return whitespace tokens
    if (!isJapanese) {
        TokenStream result = new XmlWhitespaceTokenizer(reader1);
        result = new JunkFilter(result);
        result = new LowerCaseFilter(Constants.LUCENE_VERSION, result);
        return result;
    }

    // Get Japanese tokens
    JapaneseAnalyzer analyzer = new JapaneseAnalyzer(Constants.LUCENE_VERSION);
    TokenStream japaneseTokens = analyzer.tokenStream("", reader1);
    try {
        analyzer.close();
    } catch (Exception e) {
        // ignore
    }
    return japaneseTokens;
}
From source file:de.escidoc.sb.common.lucene.analyzer.Test.java
License:Open Source License
/**
 * @param args
 */
public static void main(String[] args) {
    try {
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
        TokenStream ts = analyzer.tokenStream("myfield", new StringReader("ome text goes here"));
        ts = new GreekFilter(ts);
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println("token: " + termAtt.toString());
        }
    } catch (Exception e) {
        System.out.println(e);
    }
}
From source file:de.ingrid.interfaces.csw.tools.LuceneTools.java
License:EUPL
/**
 * @param term
 * @return filtered term
 * @throws IOException
 */
public String filterTerm(String term) throws IOException {
    String result = "";

    // Always use the same analyzer, NOT a new instance! Is called in the mapping process!
    Analyzer myAnalyzer = getAnalyzer();
    TokenStream ts = myAnalyzer.tokenStream(null, new StringReader(term));
    CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
    // Reset the stream before consuming it, as the TokenStream contract requires
    ts.reset();
    while (ts.incrementToken()) {
        String t = charTermAttribute.toString();
        result = result + " " + t;
    }
    // Finish and release the stream
    ts.end();
    ts.close();
    return result.trim();
}
From source file:di.uniba.it.tri.occ.BuildOccurrence.java
License:Open Source License
private List<String> getTokens(Reader reader) throws IOException {
    List<String> tokens = new ArrayList<>();
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
    TokenStream tokenStream = analyzer.tokenStream("text", reader);
    tokenStream.reset();
    CharTermAttribute cattr = tokenStream.addAttribute(CharTermAttribute.class);
    while (tokenStream.incrementToken()) {
        String token = cattr.toString();
        // Split on apostrophes and keep only the longest fragment
        String[] split = token.split("'");
        if (split.length == 1) {
            tokens.add(token);
        } else {
            int max = 0;
            int index = 0;
            for (int i = 0; i < split.length; i++) {
                if (split[i].length() > max) {
                    max = split[i].length();
                    index = i;
                }
            }
            tokens.add(split[index]);
        }
    }
    tokenStream.end();
    return tokens;
}
From source file:doc2vec.LuceneDocIterator.java
String preProcess(Analyzer analyzer, String text) throws Exception {
    StringBuffer tokenizedContentBuff = new StringBuffer();
    TokenStream stream = analyzer.tokenStream("dummy", new StringReader(text));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();

    while (stream.incrementToken()) {
        String term = termAtt.toString();
        term = term.toLowerCase();

        if (labelsStoredWithWords) {
            term = term.split("\\" + AMIIndexer.WORD_LABEL_DELIM)[0]; // the first part is the word
        }

        if (!term.trim().equals(""))
            tokenizedContentBuff.append(term).append(" ");
    }

    stream.end();
    stream.close();
    return tokenizedContentBuff.toString();
}