List of usage examples for org.apache.lucene.analysis TokenStream getAttribute
public final <T extends Attribute> T getAttribute(Class<T> attClass)
The caller must pass in a Class&lt;? extends Attribute&gt; object specifying which attribute instance to retrieve from the stream.
From source file:de.blizzy.documentr.search.PageIndex.java
License:Open Source License
private SearchTextSuggestion getSearchTextSuggestion(String searchText, Authentication authentication, IndexSearcher searcher) throws IOException, ParseException, TimeoutException { List<WordPosition> words = Lists.newArrayList(); TokenStream tokenStream = null; try {//from w w w . j a v a2 s .c o m tokenStream = analyzer.tokenStream(ALL_TEXT_SUGGESTIONS, new StringReader(searchText)); tokenStream.addAttribute(CharTermAttribute.class); tokenStream.addAttribute(OffsetAttribute.class); tokenStream.reset(); while (tokenStream.incrementToken()) { CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class); String text = charTerm.toString(); if (StringUtils.isNotBlank(text)) { OffsetAttribute offset = tokenStream.getAttribute(OffsetAttribute.class); WordPosition word = new WordPosition(text, offset.startOffset(), offset.endOffset()); words.add(word); } } tokenStream.end(); } finally { Closeables.closeQuietly(tokenStream); } Collections.reverse(words); StringBuilder suggestedSearchText = new StringBuilder(searchText); StringBuilder suggestedSearchTextHtml = new StringBuilder(searchText); boolean foundSuggestions = false; String now = String.valueOf(System.currentTimeMillis()); String startMarker = "__SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$ String endMarker = "__/SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$ DirectSpellChecker spellChecker = new DirectSpellChecker(); IndexReader reader = searcher.getIndexReader(); for (WordPosition word : words) { Term term = new Term(ALL_TEXT_SUGGESTIONS, word.getWord()); SuggestWord[] suggestions = spellChecker.suggestSimilar(term, 1, reader, SuggestMode.SUGGEST_MORE_POPULAR); if (suggestions.length > 0) { String suggestedWord = suggestions[0].string; int start = word.getStart(); int end = word.getEnd(); suggestedSearchText.replace(start, end, suggestedWord); suggestedSearchTextHtml.replace(start, end, startMarker + StringEscapeUtils.escapeHtml4(suggestedWord) + endMarker); foundSuggestions = 
true; } } if (foundSuggestions) { String suggestion = suggestedSearchText.toString(); SearchResult suggestionResult = findPages(suggestion, 1, authentication, searcher); int suggestionTotalHits = suggestionResult.getTotalHits(); if (suggestionTotalHits > 0) { String html = StringEscapeUtils.escapeHtml4(suggestedSearchTextHtml.toString()) .replaceAll(startMarker + "(.*?)" + endMarker, "<strong><em>$1</em></strong>"); //$NON-NLS-1$ //$NON-NLS-2$ return new SearchTextSuggestion(suggestedSearchText.toString(), html, suggestionTotalHits); } } return null; }
From source file:de.catma.indexer.PunctuationTokenizer.java
License:Open Source License
/**
 * Creates a tokenizer that further splits the wrapped stream on punctuation.
 *
 * @param input the upstream token stream to wrap
 * @param unseparableCharacterSequences character sequences that must never be
 *        split; may be {@code null}, in which case an empty tree is used
 * @param userDefSeparatingPunctuationPattern pattern of OR-ed user-defined
 *        separating punctuation characters
 * @param locale the locale of the main language of the content
 */
public PunctuationTokenizer(TokenStream input, CharTree unseparableCharacterSequences,
        Pattern userDefSeparatingPunctuationPattern, Locale locale) {
    super(input);
    termInfoBuffer = new ArrayDeque<TermInfo>();
    // getAttribute is generic, so no explicit casts are required.
    offsetAtt = input.getAttribute(OffsetAttribute.class);
    termAtt = input.getAttribute(CharTermAttribute.class);
    ucAtt = input.getAttribute(UnseparableCharacterSequenceAttribute.class);
    treeRoot = (unseparableCharacterSequences != null) ? unseparableCharacterSequences : CharTree.EMPTY_TREE;
    this.userDefSeparatingPunctuationPattern = userDefSeparatingPunctuationPattern;
    this.locale = locale;
}
From source file:de.catma.indexer.TermExtractor.java
License:Open Source License
private void extractTermInfos(String content, List<String> unseparableCharacterSequences, List<Character> userDefinedSeparatingCharacters, Locale locale) throws IOException { terms = new HashMap<String, List<TermInfo>>(); termsInOrder = new ArrayList<String>(); if (locale == null) { locale = Locale.getDefault(); }//from w w w . j a va 2 s . c om WhitespaceAndPunctuationAnalyzer analyzer = new WhitespaceAndPunctuationAnalyzer( unseparableCharacterSequences, userDefinedSeparatingCharacters, locale); TokenStream ts = analyzer.tokenStream(null, // our analyzer does not use // the fieldname new StringReader(content)); int positionCounter = 0; while (ts.incrementToken()) { CharTermAttribute termAttr = (CharTermAttribute) ts.getAttribute(CharTermAttribute.class); OffsetAttribute offsetAttr = (OffsetAttribute) ts.getAttribute(OffsetAttribute.class); TermInfo ti = new TermInfo(termAttr.toString(), offsetAttr.startOffset(), offsetAttr.endOffset(), positionCounter); if (!terms.containsKey(ti.getTerm())) { terms.put(ti.getTerm(), new ArrayList<TermInfo>()); } terms.get(ti.getTerm()).add(ti); positionCounter++; termsInOrder.add(ti.getTerm()); } }
From source file:de.catma.indexer.WildcardTermExtractor.java
License:Open Source License
private void extractTermInfosWithWildcards(String content, List<String> unseparableCharacterSequences, List<Character> userDefinedSeparatingCharacters, Locale locale) throws IOException { if (locale == null) { locale = Locale.getDefault(); }/* w w w . j a v a 2 s. com*/ WhitespaceAndPunctuationAnalyzer analyzer = new WhitespaceAndPunctuationAnalyzer( unseparableCharacterSequences, userDefinedSeparatingCharacters, locale); TokenStream ts = analyzer.tokenStream(null, // our analyzer does not use // the fieldname new StringReader(content)); WildcardParser wildcardParser = new WildcardParser(); while (ts.incrementToken()) { CharTermAttribute termAttr = (CharTermAttribute) ts.getAttribute(CharTermAttribute.class); OffsetAttribute offsetAttr = (OffsetAttribute) ts.getAttribute(OffsetAttribute.class); wildcardParser.handle(termAttr, offsetAttr); } wildcardParser.finish(); orderedTerms = new ArrayList<String>(); for (TermInfo ti : wildcardParser.getOrderedTermInfos()) { orderedTerms.add(ti.getTerm()); } }
From source file:de.jetwick.es.JetwickQuery.java
License:Apache License
/**
 * Runs the given token stream through an English Snowball stemmer and returns
 * the distinct stemmed terms in first-seen order.
 *
 * @param ts the token stream to stem; it is consumed and closed by this method
 * @return the stemmed terms (insertion-ordered, duplicates removed); possibly
 *         incomplete if an I/O error occurred (the error is logged, not thrown)
 */
public Set<String> doSnowballStemming(TokenStream ts) {
    Set<String> res = new LinkedHashSet<String>();
    ts = new SnowballFilter(ts, "English");
    try {
        while (ts.incrementToken()) {
            res.add(ts.getAttribute(TermAttribute.class).term());
        }
        ts.end(); // BUGFIX: signal end-of-stream per the TokenStream contract
    } catch (IOException ex) {
        logger.error("Exception while stemming to snoball", ex);
    } finally {
        try {
            ts.close(); // BUGFIX: stream was previously never closed (resource leak)
        } catch (IOException ex) {
            logger.error("Exception while stemming to snoball", ex);
        }
    }
    return res;
}
From source file:de.mirkosertic.desktopsearch.SearchPhraseSuggester.java
License:Open Source License
/**
 * Runs the given string through the analyzer for the given field and returns
 * the first token it produces, or {@code null} if there is no token.
 *
 * @param aFieldName field name passed to the analyzer
 * @param aString    text to analyze
 * @throws IOException if tokenization fails
 */
private String analyze(String aFieldName, String aString) throws IOException {
    TokenStream stream = analyzer.tokenStream(aFieldName, aString);
    stream.reset();
    CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
    try {
        return stream.incrementToken() ? term.toString() : null;
    } finally {
        stream.end();
        stream.close();
    }
}
From source file:de.powerstaff.business.service.impl.GoogleStyleQueryParser.java
License:Open Source License
/**
 * Tokenizes {@code aTerm} and adds one MUST clause per token to {@code aQuery}:
 * a {@link WildcardQuery} built from the raw input when it contains wildcard
 * characters, otherwise a {@link TermQuery} for the token text.
 *
 * @param aTerm     the raw query term entered by the user
 * @param aQuery    the boolean query to add clauses to
 * @param aField    the index field to query
 * @param aAnalyzer analyzer used to tokenize {@code aTerm}
 * @throws IOException if tokenization fails
 */
protected void addWildcardOrTermQueries(String aTerm, BooleanQuery aQuery, String aField, Analyzer aAnalyzer)
        throws IOException {
    // Whether the raw input is a wildcard expression cannot change per token,
    // so compute it once instead of once per loop iteration.
    boolean isWildcard = isWildcardTerm(aTerm);
    TokenStream theTokenStream = aAnalyzer.tokenStream(aField, new StringReader(aTerm));
    try {
        // Attribute instance is stable for the stream's lifetime — look it up once.
        TermAttribute theTermAttribute = theTokenStream.getAttribute(TermAttribute.class);
        while (theTokenStream.incrementToken()) {
            Query theTempQuery;
            if (isWildcard) {
                theTempQuery = new WildcardQuery(new Term(aField, getCorrectedWildcardTerm(aTerm)));
            } else {
                theTempQuery = new TermQuery(new Term(aField, theTermAttribute.term()));
            }
            aQuery.add(theTempQuery, Occur.MUST);
        }
    } finally {
        theTokenStream.close(); // BUGFIX: the stream was previously never closed
    }
}
From source file:de.powerstaff.business.service.impl.GoogleStyleQueryParser.java
License:Open Source License
/**
 * Tokenizes {@code aTerm} and adds it to {@code aQuery} as a phrase query.
 * Plain tokens become single phrase positions; tokens containing wildcards are
 * expanded against the index via {@link WildcardTermEnum}, and all matching
 * terms are added as alternatives at that phrase position.
 *
 * @param aTerm     the raw phrase entered by the user
 * @param aQuery    the boolean query to add the phrase clause to
 * @param aField    the index field to query
 * @param aAnalyzer analyzer used to tokenize {@code aTerm}
 * @throws IOException if tokenization or term enumeration fails
 */
protected void addPhraseQuery(String aTerm, BooleanQuery aQuery, String aField, Analyzer aAnalyzer)
        throws IOException {
    MultiPhraseQuery thePhraseQuery = new MultiPhraseQuery();
    TokenStream theTokenStream = aAnalyzer.tokenStream(aField, new StringReader(aTerm));
    try {
        // Attribute instance is stable for the stream's lifetime — look it up once.
        TermAttribute theTermAttribute = theTokenStream.getAttribute(TermAttribute.class);
        while (theTokenStream.incrementToken()) {
            String theTokenText = theTermAttribute.term();
            Term theTerm = new Term(aField, theTokenText);
            if (!isWildcardTerm(theTokenText)) {
                thePhraseQuery.add(theTerm);
            } else {
                Term theWildcardTerm = new Term(theTerm.field(), getCorrectedWildcardTerm(theTerm.text()));
                WildcardTermEnum theEnum = new WildcardTermEnum(reader, theWildcardTerm);
                try {
                    List<Term> theTerms = new ArrayList<Term>();
                    // BUGFIX: the original do/while added theEnum.term() before
                    // checking it, inserting null into the phrase query when the
                    // wildcard matched nothing in the index.
                    do {
                        Term theMatch = theEnum.term();
                        if (theMatch == null) {
                            break;
                        }
                        theTerms.add(theMatch);
                    } while (theEnum.next());
                    // MultiPhraseQuery.add rejects empty arrays; skip positions
                    // with no index matches instead of failing.
                    if (!theTerms.isEmpty()) {
                        thePhraseQuery.add(theTerms.toArray(new Term[0]));
                    }
                } finally {
                    theEnum.close();
                }
            }
        }
    } finally {
        theTokenStream.close(); // BUGFIX: the stream was previously never closed
    }
    aQuery.add(thePhraseQuery, Occur.MUST);
}
From source file:de.twitterlivesearch.analysis.Tokenizer.java
License:Apache License
/**
 * Tokenizes a string with the given analyzer.
 *
 * @param stringToAnalyze the string to be tokenized
 * @param analyzer the {@link org.apache.lucene.analysis.Analyzer Analyzer} to
 *        be used for analysis
 * @return list of tokens in stream order
 * @throws RuntimeException wrapping any {@link IOException} from the analyzer
 */
public static List<String> getTokensForString(String stringToAnalyze, Analyzer analyzer) {
    List<String> tokens = new ArrayList<String>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(stringToAnalyze));
        try {
            stream.reset();
            // Attribute instance is stable for the stream's lifetime — look it up once.
            CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
            while (stream.incrementToken()) {
                tokens.add(term.toString());
            }
            stream.end();
        } finally {
            // BUGFIX: close in finally — previously the stream leaked whenever
            // incrementToken() threw.
            stream.close();
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return tokens;
}
From source file:de.unidue.inf.is.ezdl.dlcore.data.extractor.TermExtractor.java
License:Open Source License
/** * Split the information cause in sense of term it is a standalone word. * TODO this method removes stopwords but don't detect any phrases. * /*from w w w.j a va 2 s . co m*/ * @param result * the list we will append the items * @param item * the item itself. */ private void add(ExtractionResultImpl result, String item) { if (item != null) { inferLanguage(item); List<String> terms = new ArrayList<String>(); TokenStream tokenStream = new StandardTokenizer(Version.LUCENE_30, new StringReader(item)); // OffsetAttribute offsetAttribute = // tokenStream.getAttribute(OffsetAttribute.class); TermAttribute termAttribute = tokenStream.getAttribute(TermAttribute.class); try { while (tokenStream.incrementToken()) { // int startOffset = offsetAttribute.startOffset(); // int endOffset = offsetAttribute.endOffset(); String term = termAttribute.term(); terms.add(term); } } catch (IOException e) { logger.error(e.getMessage(), e); } terms = filter.filter(terms, locale); for (String t : terms) { if (!StringUtils.isEmpty((t))) { Entry e = new EntryImpl(t.toLowerCase(locale)); result.add(e); } } } }