List of usage examples for org.apache.lucene.analysis TokenStream getAttribute
public final <T extends Attribute> T getAttribute(Class<T> attClass)
The caller must pass in a Class&lt;? extends Attribute&gt; token identifying the attribute instance to retrieve; an IllegalArgumentException is thrown if the stream does not contain that attribute.
From source file:org.sc.probro.lucene.BiothesaurusSearcher.java
License:Apache License
public String[] tokenize(String input) { ArrayList<String> tokens = new ArrayList<String>(); try {/*from w w w .ja v a2 s. c o m*/ TokenStream stream = analyzer.tokenStream(null, new StringReader(input)); TermAttribute termattr = (TermAttribute) stream.getAttribute(TermAttribute.class); //stream = new LowerCaseFilter(stream); stream.reset(); while (stream.incrementToken()) { if (stream.hasAttribute(TermAttribute.class)) { String term = termattr.term(); tokens.add(term); } } stream.end(); stream.close(); } catch (IllegalArgumentException e) { System.err.println(String.format("Phrase: \"%s\"", input)); e.printStackTrace(System.err); } catch (IOException e) { System.err.println(String.format("Phrase: \"%s\"", input)); e.printStackTrace(); } return tokens.toArray(new String[0]); }
From source file:org.sc.probro.lucene.BiothesaurusSearcher.java
License:Apache License
public Query createPhraseQuery(String field, String phrase) throws IOException { PhraseQuery query = new PhraseQuery(); /*/*w ww. j a va2s . c o m*/ String[] array = phrase.split("\\s+"); for(int i = 0; i < array.length; i++) { query.add(new Term(field, array[i])); } */ try { TokenStream stream = analyzer.tokenStream(field, new StringReader(phrase)); //stream = new LowerCaseFilter(stream); stream.reset(); while (stream.incrementToken()) { if (stream.hasAttribute(TermAttribute.class)) { TermAttribute termattr = (TermAttribute) stream.getAttribute(TermAttribute.class); Term t = new Term(field, termattr.term()); query.add(t); } } stream.end(); stream.close(); } catch (IllegalArgumentException e) { e.printStackTrace(System.err); System.err.println(String.format("Phrase: \"%s\"", phrase)); } return query; }
From source file:org.sc.probro.lucene.ProteinSearcher.java
License:Apache License
/**
 * Splits the input string into lower-cased tokens using this searcher's analyzer.
 *
 * @param input the raw text to analyze
 * @return the lower-cased terms in stream order; empty when analysis fails
 */
public String[] tokenize(String input) {
    ArrayList<String> tokens = new ArrayList<String>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(input));
        stream = new LowerCaseFilter(stream); // normalize case on top of the analyzer chain
        try {
            stream.reset();
            while (stream.incrementToken()) {
                if (stream.hasAttribute(TermAttribute.class)) {
                    TermAttribute termattr = (TermAttribute) stream.getAttribute(TermAttribute.class);
                    tokens.add(termattr.term());
                }
            }
            stream.end();
        } finally {
            stream.close(); // previously leaked when reset()/incrementToken() threw
        }
    } catch (IllegalArgumentException e) {
        System.err.println(String.format("Phrase: \"%s\"", input));
        e.printStackTrace(System.err);
    } catch (IOException e) {
        System.err.println(String.format("Phrase: \"%s\"", input));
        e.printStackTrace();
    }
    return tokens.toArray(new String[0]);
}
From source file:org.sc.probro.lucene.ProteinSearcher.java
License:Apache License
/**
 * Builds a PhraseQuery over the given field from the lower-cased analyzer tokens of the phrase.
 *
 * @param field  field name each phrase term is matched against
 * @param phrase raw phrase text, analyzed and lower-cased
 * @return the phrase query; empty if analysis produced no terms
 * @throws IOException if reading the token stream fails
 */
public Query createPhraseQuery(String field, String phrase) throws IOException {
    PhraseQuery query = new PhraseQuery();
    try {
        TokenStream stream = analyzer.tokenStream(field, new StringReader(phrase));
        stream = new LowerCaseFilter(stream); // normalize case on top of the analyzer chain
        try {
            stream.reset();
            while (stream.incrementToken()) {
                if (stream.hasAttribute(TermAttribute.class)) {
                    TermAttribute termattr = (TermAttribute) stream.getAttribute(TermAttribute.class);
                    query.add(new Term(field, termattr.term()));
                }
            }
            stream.end();
        } finally {
            stream.close(); // previously leaked when an exception escaped the loop
        }
    } catch (IllegalArgumentException e) {
        e.printStackTrace(System.err);
        System.err.println(String.format("Phrase: \"%s\"", phrase));
    }
    return query;
}
From source file:org.sd.text.lucene.LuceneUtils.java
License:Open Source License
/**
 * Split the string into tokens using the given analyzer.
 *
 * @param analyzer  analyzer producing the tokens; when null the whole string is one token
 * @param fieldName field name handed to the analyzer (may affect per-field analysis)
 * @param string    text to tokenize; null yields null
 * @return token texts in stream order, or null when {@code string} is null
 * @throws IllegalStateException wrapping any IOException from the token stream
 */
public static final List<String> getTokenTexts(Analyzer analyzer, String fieldName, String string) {
    if (string == null)
        return null;

    final List<String> result = new ArrayList<String>();

    if (analyzer != null) {
        final TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(string));
        // NOTE(review): no reset()/end() calls here, consistent with the other
        // LuceneUtils helpers in this file — confirm the target Lucene version
        // does not require them.
        try {
            try {
                while (tokenStream.incrementToken()) {
                    if (tokenStream.hasAttribute(TermAttribute.class)) {
                        final TermAttribute termAttribute = (TermAttribute) tokenStream
                                .getAttribute(TermAttribute.class);
                        result.add(termAttribute.term());
                    }
                }
            } finally {
                tokenStream.close(); // previously skipped when incrementToken() threw
            }
        } catch (IOException e) {
            throw new IllegalStateException(e);
        }
    } else {
        result.add(string); // no analyzer: treat the whole string as a single token
    }

    return result;
}
From source file:org.sd.text.lucene.LuceneUtils.java
License:Open Source License
/**
 * Splits the string into phrases of adjacent tokens: tokens whose start offset
 * equals the previous token's end offset stay in the same phrase; a gap starts
 * a new phrase.
 *
 * @param analyzer  analyzer producing the tokens; when null the whole string is one phrase
 * @param fieldName field name handed to the analyzer
 * @param string    text to tokenize; null yields null
 * @return list of phrases (each a list of token texts), or null when {@code string} is null
 * @throws IllegalStateException wrapping any IOException from the token stream
 */
public static final List<List<String>> getPhraseTexts(Analyzer analyzer, String fieldName, String string) {
    if (string == null)
        return null;

    final List<List<String>> result = new LinkedList<List<String>>();
    List<String> curPhrase = new ArrayList<String>();
    result.add(curPhrase);

    if (analyzer != null) {
        final TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(string));
        int lastEndOffset = 0;
        try {
            try {
                while (tokenStream.incrementToken()) {
                    boolean incPhrase = true;
                    if (tokenStream.hasAttribute(OffsetAttribute.class)) {
                        final OffsetAttribute offsetAttribute = (OffsetAttribute) tokenStream
                                .getAttribute(OffsetAttribute.class);
                        // Adjacent to the previous token: keep it in the current phrase.
                        if (offsetAttribute.startOffset() == lastEndOffset) {
                            incPhrase = false;
                        }
                        lastEndOffset = offsetAttribute.endOffset();
                    }
                    if (tokenStream.hasAttribute(TermAttribute.class)) {
                        final TermAttribute termAttribute = (TermAttribute) tokenStream
                                .getAttribute(TermAttribute.class);
                        if (incPhrase && curPhrase.size() > 0) {
                            curPhrase = new ArrayList<String>();
                            result.add(curPhrase);
                        }
                        curPhrase.add(termAttribute.term());
                    }
                }
            } finally {
                tokenStream.close(); // previously skipped when incrementToken() threw
            }
        } catch (IOException e) {
            throw new IllegalStateException(e);
        }
    } else {
        curPhrase.add(string); // no analyzer: whole string is one single-token phrase
    }

    return result;
}
From source file:org.sd.text.lucene.LuceneUtils.java
License:Open Source License
/** * Build a phrase query from the tokens in the given string using the given * analyzer.// w w w . j av a 2 s .c om * <p> * Use a BooleanClause.Occur.MUST for exact matches and BooleanClause.Occur.SHOULD * for fuzzy matches. */ public static final Query toQuery(Analyzer analyzer, String fieldName, String string, Collection<String> termCollector, BooleanClause.Occur occur) { Query result = null; if (analyzer != null) { final TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(string)); BooleanQuery booleanQuery = null; PhraseQuery phraseQuery = null; int lastEndOffset = 0; try { while (tokenStream.incrementToken()) { if (tokenStream.hasAttribute(TermAttribute.class)) { final TermAttribute termAttribute = (TermAttribute) tokenStream .getAttribute(TermAttribute.class); final String term = termAttribute.term(); // check offset attribute if (tokenStream.hasAttribute(OffsetAttribute.class)) { final OffsetAttribute offsetAttribute = (OffsetAttribute) tokenStream .getAttribute(OffsetAttribute.class); if (offsetAttribute.startOffset() != lastEndOffset) { // time to increment phrase if (phraseQuery != null) { if (booleanQuery == null) booleanQuery = new BooleanQuery(); booleanQuery.add(phraseQuery, occur); phraseQuery = null; } } lastEndOffset = offsetAttribute.endOffset(); } if (phraseQuery == null) phraseQuery = new PhraseQuery(); phraseQuery.add(new Term(fieldName, term)); if (termCollector != null) termCollector.add(term); } } } catch (IOException e) { throw new IllegalStateException(e); } if (phraseQuery != null) { if (booleanQuery == null) booleanQuery = new BooleanQuery(); booleanQuery.add(phraseQuery, BooleanClause.Occur.SHOULD); } result = booleanQuery; } if (result == null) { result = new TermQuery(new Term(fieldName, string)); if (termCollector != null) termCollector.add(string); } return result; }
From source file:org.sd.text.lucene.TestSdTokenStream.java
License:Open Source License
/**
 * Consumes the token stream and checks each term against {@code expected};
 * when {@code expected} is null, prints the terms instead (dump mode).
 *
 * @param tokenStream stream to consume (caller is responsible for closing it)
 * @param expected    expected terms in order, or null to print instead of assert
 * @throws IOException if reading the token stream fails
 */
private final void verifyTokens(TokenStream tokenStream, String[] expected) throws IOException {
    int index = 0;
    // (removed unused local: `Token token = new Token();` was never referenced)
    while (tokenStream.incrementToken()) {
        final TermAttribute termAttribute = (TermAttribute) tokenStream.getAttribute(TermAttribute.class);
        final String term = termAttribute.term();
        if (expected != null) {
            assertTrue("got " + term + " at index=" + index, index < expected.length);
            assertEquals("got " + term + " at index=" + index, expected[index], term);
        } else {
            System.out.println(index + ":" + term);
        }
        ++index;
    }
    if (expected != null) {
        // Stream must yield exactly as many tokens as expected.
        assertEquals(expected.length, index);
    }
}
From source file:org.sift.runtime.impl.LuceneWordSplitterProcessor.java
License:Apache License
/** * Interface method implementation. Emits words as {@link Tuple} values by applying the configured Lucene {@link Analyzer} on specified {@link Tuple} values * @see org.sift.runtime.spi.Processor#process(org.sift.runtime.Tuple, org.sift.runtime.spi.OutputCollector) */// w ww . j a v a2 s.c om public void process(Tuple tuple, OutputCollector collector) { Tuple returnTuple = tuple.clone(); for (Object line : tuple.getList(Fields.VALUES)) { List<String> tokensList = new LinkedList<String>(); try { TokenStream stream = this.analyzer.tokenStream(null, new StringReader(((String) line).toLowerCase())); while (stream.incrementToken()) { tokensList.add(((TermAttribute) stream.getAttribute(TermAttribute.class)).term()); } } catch (IOException e) { throw new RuntimeException("Error parsing input line : " + line, e); } String[] tokens = tokensList.toArray(new String[0]); for (int i = 0; i < tokens.length; i++) { StringBuffer tokenBuffer = new StringBuffer(); for (int j = 0; j < this.getnGram(); j++) { if (i + j < tokens.length) { tokenBuffer.append(tokens[i + j]); tokenBuffer.append(StopWords.WORD_BOUNDARY_STRING); } String word = tokenBuffer.toString().trim(); if (this.getStopWords() != null && !this.getStopWords().isStopWord(word)) { returnTuple.addToList(Fields.VALUES, tokenBuffer.toString().trim()); } } } } collector.emit(returnTuple); }
From source file:org.sindice.siren.analysis.filter.URILocalnameFilter.java
License:Apache License
/**
 * For testing purpose: tokenizes a fixed set of URIs, applies
 * URILocalnameFilter, and prints each term with its position increment.
 */
public static void main(final String[] args) throws IOException {
    final String uris = "" + "<mailto:renaud.delbru@deri.org> <http://renaud.delbru.fr/rdf/foaf> "
            + "<http://renaud.delbru.fr/> <http://xmlns.com/foaf/0.1/workplaceHomepage> "
            + "<http://test.com/M%C3%B6ller>";
    final TupleTokenizer tokenizer = new TupleTokenizer(new StringReader(uris), Integer.MAX_VALUE,
            new WhitespaceAnalyzer(Version.LUCENE_31));
    final TokenStream filtered = new URILocalnameFilter(tokenizer);

    final CharTermAttribute term = filtered.getAttribute(CharTermAttribute.class);
    final PositionIncrementAttribute posIncr = filtered.getAttribute(PositionIncrementAttribute.class);

    while (filtered.incrementToken()) {
        System.out.println(term.toString() + ", " + posIncr.getPositionIncrement());
    }
}