List of usage examples for org.apache.lucene.analysis TokenStream getAttribute
public final <T extends Attribute> T getAttribute(Class<T> attClass)
The caller must pass in a Class&lt;? extends Attribute&gt; value. getAttribute throws an IllegalArgumentException if this stream does not contain the requested attribute; use hasAttribute(Class) to test for presence, or addAttribute(Class) to create the attribute on demand.
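Before the examples from real projects, here is a minimal, self-contained consumption loop (a sketch assuming a recent Lucene where StandardAnalyzer has a no-argument constructor; any stream that carries CharTermAttribute behaves the same way):

import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class GetAttributeExample {
    public static void main(String[] args) throws IOException {
        try (StandardAnalyzer analyzer = new StandardAnalyzer();
                TokenStream ts = analyzer.tokenStream("field", "Hello token streams")) {
            // StandardAnalyzer streams always carry CharTermAttribute, so
            // getAttribute is safe here; it would throw IllegalArgumentException
            // for an attribute the stream does not have.
            CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
            ts.reset();                      // required before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(term.toString());
            }
            ts.end();                        // records end-of-stream state (final offset)
        }                                    // try-with-resources closes the stream
    }
}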
From source file: org.elasticsearch.index.mapper.hashsplitter.HashSplitterFieldMapper.java
License: Apache License

@Override
public Query wildcardQuery(String value, @Nullable MultiTermQuery.RewriteMethod method,
        @Nullable QueryParseContext context) {
    // Use HashSplitterSearch* analysis and post-process it to create the real query
    TokenStream tok = null;
    try {
        tok = searchAnalyzer.reusableTokenStream(names().indexNameClean(), new FastStringReader(value));
        tok.reset();
    } catch (IOException e) {
        return null;
    }
    CharTermAttribute termAtt = tok.getAttribute(CharTermAttribute.class);
    BooleanQuery q = new BooleanQuery();
    try {
        while (tok.incrementToken()) {
            q.add(new WildcardQuery(names().createIndexNameTerm(termAtt.toString()), wildcardOne, wildcardAny),
                    BooleanClause.Occur.MUST);
        }
        tok.end();
        tok.close();
    } catch (IOException e) {
        e.printStackTrace();
        q = null;
    }
    return q;
}
From source file: org.elasticsearch.index.mapper.hashsplitter.HashSplitterFieldMapper.java
License: Apache License

@Override
public Filter wildcardFilter(String value, @Nullable QueryParseContext context) {
    // Use HashSplitterSearch* analysis and post-process it to create the real query
    TokenStream tok = null;
    try {
        tok = searchAnalyzer.reusableTokenStream(names().indexNameClean(), new FastStringReader(value));
        tok.reset();
    } catch (IOException e) {
        return null;
    }
    CharTermAttribute termAtt = tok.getAttribute(CharTermAttribute.class);
    BooleanFilter f = new BooleanFilter();
    try {
        while (tok.incrementToken()) {
            f.add(new WildcardFilter(names().createIndexNameTerm(termAtt.toString()), wildcardOne, wildcardAny),
                    BooleanClause.Occur.MUST);
        }
        tok.end();
        tok.close();
    } catch (IOException e) {
        e.printStackTrace();
        f = null;
    }
    return f;
}
From source file: org.elasticsearch.search.highlight.PlainHighlighter.java
License: Apache License

private static int findGoodEndForNoHighlightExcerpt(int noMatchSize, TokenStream tokenStream)
        throws IOException {
    try {
        if (!tokenStream.hasAttribute(OffsetAttribute.class)) {
            // Can't split on term boundaries without offsets
            return -1;
        }
        int end = -1;
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            OffsetAttribute attr = tokenStream.getAttribute(OffsetAttribute.class);
            if (attr.endOffset() >= noMatchSize) {
                // Jump to the end of this token if it wouldn't put us past the boundary
                if (attr.endOffset() == noMatchSize) {
                    end = noMatchSize;
                }
                return end;
            }
            end = attr.endOffset();
        }
        // We've exhausted the token stream so we should just highlight everything.
        return end;
    } finally {
        tokenStream.end();
        tokenStream.close();
    }
}
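The hasAttribute guard in the example above matters because getAttribute throws IllegalArgumentException when the stream does not contain the requested attribute. If the attribute should instead be created on demand, addAttribute is the lenient alternative (a two-line sketch):

// addAttribute never throws: it registers the attribute on the stream
// if absent and returns the (new or existing) instance.
OffsetAttribute attr = tokenStream.addAttribute(OffsetAttribute.class);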
From source file: org.elasticsearch.search.suggest.CompletionTokenStreamTest.java
License: Apache License

@Test
public void testSuggestTokenFilterProperlyDelegateInputStream() throws Exception {
    TokenStream tokenStream = new MockTokenizer(new StringReader("mykeyword"), MockTokenizer.WHITESPACE, true);
    BytesRef payload = new BytesRef("Surface keyword|friggin payload|10");
    TokenStream suggestTokenStream = new ByteTermAttrToCharTermAttrFilter(
            new CompletionTokenStream(tokenStream, payload, new CompletionTokenStream.ToFiniteStrings() {
                @Override
                public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException {
                    return suggester.toFiniteStrings(suggester.getTokenStreamToAutomaton(), stream);
                }
            }));
    TermToBytesRefAttribute termAtt = suggestTokenStream.getAttribute(TermToBytesRefAttribute.class);
    BytesRef ref = termAtt.getBytesRef();
    assertNotNull(ref);
    suggestTokenStream.reset();
    while (suggestTokenStream.incrementToken()) {
        termAtt.fillBytesRef();
        assertThat(ref.utf8ToString(), equalTo("mykeyword"));
    }
    suggestTokenStream.end();
    suggestTokenStream.close();
}
From source file: org.elasticsearch.search.suggest.CompletionTokenStreamTests.java
License: Apache License

@Test
public void testSuggestTokenFilterProperlyDelegateInputStream() throws Exception {
    Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
    tokenizer.setReader(new StringReader("mykeyword"));
    BytesRef payload = new BytesRef("Surface keyword|friggin payload|10");
    TokenStream suggestTokenStream = new ByteTermAttrToCharTermAttrFilter(
            new CompletionTokenStream(tokenizer, payload, new CompletionTokenStream.ToFiniteStrings() {
                @Override
                public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException {
                    return suggester.toFiniteStrings(stream);
                }
            }));
    TermToBytesRefAttribute termAtt = suggestTokenStream.getAttribute(TermToBytesRefAttribute.class);
    assertNotNull(termAtt.getBytesRef());
    suggestTokenStream.reset();
    while (suggestTokenStream.incrementToken()) {
        assertThat(termAtt.getBytesRef().utf8ToString(), equalTo("mykeyword"));
    }
    suggestTokenStream.end();
    suggestTokenStream.close();
}
From source file: org.elasticsearch.test.unit.index.analysis.AnalysisTestsHelper.java
License: Apache License

public static void assertSimpleTSOutput(TokenStream stream, String[] expected) throws IOException {
    stream.reset();
    CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
    Assert.assertNotNull(termAttr);
    int i = 0;
    while (stream.incrementToken()) {
        Assert.assertTrue(i < expected.length, "got extra term: " + termAttr.toString());
        Assert.assertEquals(termAttr.toString(), expected[i], "expected different term at index " + i);
        i++;
    }
    Assert.assertEquals(i, expected.length, "not all tokens produced");
}
From source file: org.elasticsearch.test.unit.index.analysis.AnalysisTestsHelper.java
License: Apache License

public static void assertSimpleTSOutput(TokenStream stream, String[] expected, int[] posInc)
        throws IOException {
    stream.reset();
    CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncAttr = stream.getAttribute(PositionIncrementAttribute.class);
    Assert.assertNotNull(termAttr);
    int i = 0;
    while (stream.incrementToken()) {
        Assert.assertTrue(i < expected.length, "got extra term: " + termAttr.toString());
        Assert.assertEquals(termAttr.toString(), expected[i], "expected different term at index " + i);
        Assert.assertEquals(posIncAttr.getPositionIncrement(), posInc[i]);
        i++;
    }
    Assert.assertEquals(i, expected.length, "not all tokens produced");
}
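A hypothetical invocation of these helpers; the WhitespaceAnalyzer, field name, and input text are assumptions for illustration (note that the helpers call reset() themselves):

Analyzer analyzer = new WhitespaceAnalyzer();   // assumed analyzer
TokenStream stream = analyzer.tokenStream("field", new StringReader("quick brown fox"));
AnalysisTestsHelper.assertSimpleTSOutput(stream,
        new String[] { "quick", "brown", "fox" },   // expected terms, in order
        new int[] { 1, 1, 1 });                     // expected position increments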
From source file: org.exoplatform.services.jcr.impl.core.query.lucene.AbstractExcerpt.java
License: Apache License

/**
 * @param text the text.
 * @return a <code>TermPositionVector</code> for the given text.
 */
private TermPositionVector createTermPositionVector(String text) {
    // term -> TermVectorOffsetInfo[]
    final SortedMap<String, TermVectorOffsetInfo[]> termMap = new TreeMap<String, TermVectorOffsetInfo[]>();
    Reader r = new StringReader(text);
    TokenStream ts = index.getTextAnalyzer().tokenStream("", r);
    try {
        ts.reset(); // the TokenStream contract requires reset() before incrementToken()
        while (ts.incrementToken()) {
            OffsetAttribute offset = ts.getAttribute(OffsetAttribute.class);
            CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
            String termText = new String(term.buffer(), 0, term.length());
            // grow the offset array by one and append the new offset info
            TermVectorOffsetInfo[] info = termMap.get(termText);
            if (info == null) {
                info = new TermVectorOffsetInfo[1];
            } else {
                TermVectorOffsetInfo[] tmp = info;
                info = new TermVectorOffsetInfo[tmp.length + 1];
                System.arraycopy(tmp, 0, info, 0, tmp.length);
            }
            info[info.length - 1] = new TermVectorOffsetInfo(offset.startOffset(), offset.endOffset());
            termMap.put(termText, info);
        }
        ts.end();
        ts.close();
    } catch (IOException e) {
        // should never happen, we are reading from a string
        if (LOG.isTraceEnabled()) {
            LOG.trace("An exception occurred: " + e.getMessage());
        }
    }
    return new TermPositionVector() {
        private String[] terms = termMap.keySet().toArray(new String[termMap.size()]);

        public int[] getTermPositions(int index) {
            return null;
        }

        public TermVectorOffsetInfo[] getOffsets(int index) {
            TermVectorOffsetInfo[] info = TermVectorOffsetInfo.EMPTY_OFFSET_INFO;
            if (index >= 0 && index < terms.length) {
                info = termMap.get(terms[index]);
            }
            return info;
        }

        public String getField() {
            return "";
        }

        public int size() {
            return terms.length;
        }

        public String[] getTerms() {
            return terms;
        }

        public int[] getTermFrequencies() {
            int[] freqs = new int[terms.length];
            for (int i = 0; i < terms.length; i++) {
                freqs[i] = termMap.get(terms[i]).length;
            }
            return freqs;
        }

        public int indexOf(String term) {
            int res = Arrays.binarySearch(terms, term);
            return res >= 0 ? res : -1;
        }

        public int[] indexesOf(String[] terms, int start, int len) {
            int[] res = new int[len];
            for (int i = 0; i < len; i++) {
                res[i] = indexOf(terms[i]);
            }
            return res;
        }
    };
}
From source file: org.exoplatform.services.jcr.impl.core.query.lucene.MoreLikeThis.java
License: Apache License

/**
 * Adds term frequencies found by tokenizing text from the reader into the map of words.
 * @param r a source of text to be tokenized
 * @param termFreqMap a Map of terms and their frequencies
 * @param fieldName Used by analyzer for any special per-field analysis
 */
private void addTermFrequencies(Reader r, Map<String, Int> termFreqMap, String fieldName) throws IOException {
    TokenStream ts = analyzer.tokenStream(fieldName, r);
    ts.reset(); // the TokenStream contract requires reset() before incrementToken()
    int tokenCount = 0;
    // for every token
    while (ts.incrementToken()) {
        CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
        String word = new String(term.buffer(), 0, term.length());
        tokenCount++;
        if (tokenCount > maxNumTokensParsed) {
            break;
        }
        if (isNoiseWord(word)) {
            continue;
        }
        // increment frequency
        Int cnt = termFreqMap.get(word);
        if (cnt == null) {
            termFreqMap.put(word, new Int());
        } else {
            cnt.x++;
        }
    }
    ts.end();
    ts.close();
}
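The same frequency-counting pattern, restated as a self-contained sketch with a plain HashMap in place of the class's Int helper (the analyzer, field name, and input text are assumptions):

Map<String, Integer> freqs = new HashMap<>();
try (TokenStream ts = analyzer.tokenStream("body", new StringReader("to be or not to be"))) {
    CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        freqs.merge(term.toString(), 1, Integer::sum);  // count each surface form
    }
    ts.end();
}
// freqs now maps {be=2, not=1, or=1, to=2}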
From source file: org.fastcatsearch.ir.index.SearchIndexWriter.java
License: Apache License

private void indexValue(int docNo, int i, Object value, boolean isIgnoreCase, int positionIncrementGap)
        throws IOException, IRException {
    if (value == null) {
        return;
    }
    char[] fieldValue = value.toString().toCharArray();
    TokenStream tokenStream = indexAnalyzerList[i].tokenStream(indexId, new CharArrayReader(fieldValue),
            indexingAnalyzerOption);
    tokenStream.reset();

    CharsRefTermAttribute termAttribute = null;
    PositionIncrementAttribute positionAttribute = null;
    StopwordAttribute stopwordAttribute = null;
    AdditionalTermAttribute additionalTermAttribute = null;
    CharTermAttribute charTermAttribute = null;

    // Look up only the attributes this particular stream actually provides.
    if (tokenStream.hasAttribute(CharsRefTermAttribute.class)) {
        termAttribute = tokenStream.getAttribute(CharsRefTermAttribute.class);
    }
    if (tokenStream.hasAttribute(PositionIncrementAttribute.class)) {
        positionAttribute = tokenStream.getAttribute(PositionIncrementAttribute.class);
    }
    if (tokenStream.hasAttribute(AdditionalTermAttribute.class)) {
        additionalTermAttribute = tokenStream.getAttribute(AdditionalTermAttribute.class);
    }
    // Tokens flagged as stopwords are skipped below.
    if (tokenStream.hasAttribute(StopwordAttribute.class)) {
        stopwordAttribute = tokenStream.getAttribute(StopwordAttribute.class);
    }
    if (tokenStream.hasAttribute(CharTermAttribute.class)) {
        charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
    }

    int lastPosition = 0;
    while (tokenStream.incrementToken()) {
        CharVector key = null;
        if (termAttribute != null) {
            CharsRef charRef = termAttribute.charsRef();
            char[] buffer = new char[charRef.length()];
            System.arraycopy(charRef.chars, charRef.offset, buffer, 0, charRef.length);
            key = new CharVector(buffer, 0, buffer.length);
        } else {
            key = new CharVector(charTermAttribute.buffer(), 0, charTermAttribute.length());
        }
        int position = -1;
        if (positionAttribute != null) {
            position = positionAttribute.getPositionIncrement() + positionIncrementGap;
            lastPosition = position;
        }
        // logger.debug("FIELD#{}: {} >> {} ({})", indexId, key, docNo, position);
        if (stopwordAttribute != null && stopwordAttribute.isStopword()) {
            // ignore stopwords
        } else {
            memoryPosting.add(key, docNo, position);
        }
        // if (synonymAttribute != null) {
        //     CharVector[] synonym = synonymAttribute.getSynonym();
        //     if (synonym != null) {
        //         for (CharVector token : synonym) {
        //             memoryPosting.add(token, docNo, position);
        //         }
        //     }
        // }
        if (additionalTermAttribute != null && additionalTermAttribute.size() > 0) {
            Iterator<String> iter = additionalTermAttribute.iterateAdditionalTerms();
            while (iter.hasNext()) {
                CharVector token = new CharVector(iter.next().toCharArray());
                memoryPosting.add(token, docNo, lastPosition);
            }
        }
    }
}