List of usage examples for org.apache.lucene.analysis TokenStream getAttribute
public final <T extends Attribute> T getAttribute(Class<T> attClass)
The caller must pass in a Class&lt;? extends Attribute&gt; value. getAttribute throws an IllegalArgumentException if this stream does not contain the requested attribute; use hasAttribute(Class) to test for presence, or addAttribute(Class) to create the attribute on demand.
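Before the examples from real projects, here is a minimal, self-contained consumption loop (a sketch assuming a recent Lucene where StandardAnalyzer has a no-argument constructor; any stream that carries CharTermAttribute behaves the same way):

import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class GetAttributeExample {
    public static void main(String[] args) throws IOException {
        try (StandardAnalyzer analyzer = new StandardAnalyzer();
                TokenStream ts = analyzer.tokenStream("field", "Hello token streams")) {
            // StandardAnalyzer streams always carry CharTermAttribute, so
            // getAttribute is safe here; it would throw IllegalArgumentException
            // for an attribute the stream does not have.
            CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
            ts.reset();                      // required before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(term.toString());
            }
            ts.end();                        // records end-of-stream state (final offset)
        }                                    // try-with-resources closes the stream
    }
}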
From source file: org.elasticsearch.index.mapper.hashsplitter.HashSplitterFieldMapper.java
License: Apache License

@Override
public Query wildcardQuery(String value, @Nullable MultiTermQuery.RewriteMethod method,
        @Nullable QueryParseContext context) {
    // Use HashSplitterSearch* analysis and post-process it to create the real query
    TokenStream tok = null;
    try {
        tok = searchAnalyzer.reusableTokenStream(names().indexNameClean(), new FastStringReader(value));
        tok.reset();
    } catch (IOException e) {
        return null;
    }
    CharTermAttribute termAtt = tok.getAttribute(CharTermAttribute.class);
    BooleanQuery q = new BooleanQuery();
    try {
        while (tok.incrementToken()) {
            q.add(new WildcardQuery(names().createIndexNameTerm(termAtt.toString()), wildcardOne, wildcardAny),
                    BooleanClause.Occur.MUST);
        }
        tok.end();
        tok.close();
    } catch (IOException e) {
        e.printStackTrace();
        q = null;
    }
    return q;
}
From source file: org.elasticsearch.index.mapper.hashsplitter.HashSplitterFieldMapper.java
License: Apache License

@Override
public Filter wildcardFilter(String value, @Nullable QueryParseContext context) {
    // Use HashSplitterSearch* analysis and post-process it to create the real query
    TokenStream tok = null;
    try {
        tok = searchAnalyzer.reusableTokenStream(names().indexNameClean(), new FastStringReader(value));
        tok.reset();
    } catch (IOException e) {
        return null;
    }
    CharTermAttribute termAtt = tok.getAttribute(CharTermAttribute.class);
    BooleanFilter f = new BooleanFilter();
    try {
        while (tok.incrementToken()) {
            f.add(new WildcardFilter(names().createIndexNameTerm(termAtt.toString()), wildcardOne, wildcardAny),
                    BooleanClause.Occur.MUST);
        }
        tok.end();
        tok.close();
    } catch (IOException e) {
        e.printStackTrace();
        f = null;
    }
    return f;
}
From source file: org.elasticsearch.search.highlight.PlainHighlighter.java
License: Apache License

private static int findGoodEndForNoHighlightExcerpt(int noMatchSize, TokenStream tokenStream)
        throws IOException {
    try {
        if (!tokenStream.hasAttribute(OffsetAttribute.class)) {
            // Can't split on term boundaries without offsets
            return -1;
        }
        int end = -1;
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            OffsetAttribute attr = tokenStream.getAttribute(OffsetAttribute.class);
            if (attr.endOffset() >= noMatchSize) {
                // Jump to the end of this token if it wouldn't put us past the boundary
                if (attr.endOffset() == noMatchSize) {
                    end = noMatchSize;
                }
                return end;
            }
            end = attr.endOffset();
        }
        // We've exhausted the token stream so we should just highlight everything.
        return end;
    } finally {
        tokenStream.end();
        tokenStream.close();
    }
}
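The hasAttribute guard in the example above matters because getAttribute throws IllegalArgumentException when the stream does not contain the requested attribute. If the attribute should instead be created on demand, addAttribute is the lenient alternative (a two-line sketch):

// addAttribute never throws: it registers the attribute on the stream
// if absent and returns the (new or existing) instance.
OffsetAttribute attr = tokenStream.addAttribute(OffsetAttribute.class);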
From source file: org.elasticsearch.search.suggest.CompletionTokenStreamTest.java
License: Apache License

@Test
public void testSuggestTokenFilterProperlyDelegateInputStream() throws Exception {
    TokenStream tokenStream = new MockTokenizer(new StringReader("mykeyword"), MockTokenizer.WHITESPACE, true);
    BytesRef payload = new BytesRef("Surface keyword|friggin payload|10");
    TokenStream suggestTokenStream = new ByteTermAttrToCharTermAttrFilter(
            new CompletionTokenStream(tokenStream, payload, new CompletionTokenStream.ToFiniteStrings() {
                @Override
                public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException {
                    return suggester.toFiniteStrings(suggester.getTokenStreamToAutomaton(), stream);
                }
            }));
    TermToBytesRefAttribute termAtt = suggestTokenStream.getAttribute(TermToBytesRefAttribute.class);
    BytesRef ref = termAtt.getBytesRef();
    assertNotNull(ref);
    suggestTokenStream.reset();
    while (suggestTokenStream.incrementToken()) {
        termAtt.fillBytesRef();
        assertThat(ref.utf8ToString(), equalTo("mykeyword"));
    }
    suggestTokenStream.end();
    suggestTokenStream.close();
}
From source file: org.elasticsearch.search.suggest.CompletionTokenStreamTests.java
License: Apache License

@Test
public void testSuggestTokenFilterProperlyDelegateInputStream() throws Exception {
    Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
    tokenizer.setReader(new StringReader("mykeyword"));
    BytesRef payload = new BytesRef("Surface keyword|friggin payload|10");
    TokenStream suggestTokenStream = new ByteTermAttrToCharTermAttrFilter(
            new CompletionTokenStream(tokenizer, payload, new CompletionTokenStream.ToFiniteStrings() {
                @Override
                public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException {
                    return suggester.toFiniteStrings(stream);
                }
            }));
    TermToBytesRefAttribute termAtt = suggestTokenStream.getAttribute(TermToBytesRefAttribute.class);
    assertNotNull(termAtt.getBytesRef());
    suggestTokenStream.reset();
    while (suggestTokenStream.incrementToken()) {
        assertThat(termAtt.getBytesRef().utf8ToString(), equalTo("mykeyword"));
    }
    suggestTokenStream.end();
    suggestTokenStream.close();
}
From source file: org.elasticsearch.test.unit.index.analysis.AnalysisTestsHelper.java
License: Apache License

public static void assertSimpleTSOutput(TokenStream stream, String[] expected) throws IOException {
    stream.reset();
    CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
    Assert.assertNotNull(termAttr);
    int i = 0;
    while (stream.incrementToken()) {
        Assert.assertTrue(i < expected.length, "got extra term: " + termAttr.toString());
        Assert.assertEquals(termAttr.toString(), expected[i], "expected different term at index " + i);
        i++;
    }
    Assert.assertEquals(i, expected.length, "not all tokens produced");
}
From source file: org.elasticsearch.test.unit.index.analysis.AnalysisTestsHelper.java
License: Apache License

public static void assertSimpleTSOutput(TokenStream stream, String[] expected, int[] posInc)
        throws IOException {
    stream.reset();
    CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncAttr = stream.getAttribute(PositionIncrementAttribute.class);
    Assert.assertNotNull(termAttr);
    int i = 0;
    while (stream.incrementToken()) {
        Assert.assertTrue(i < expected.length, "got extra term: " + termAttr.toString());
        Assert.assertEquals(termAttr.toString(), expected[i], "expected different term at index " + i);
        Assert.assertEquals(posIncAttr.getPositionIncrement(), posInc[i]);
        i++;
    }
    Assert.assertEquals(i, expected.length, "not all tokens produced");
}
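A hypothetical invocation of these helpers; the WhitespaceAnalyzer, field name, and input text are assumptions for illustration (note that the helpers call reset() themselves):

Analyzer analyzer = new WhitespaceAnalyzer();   // assumed analyzer
TokenStream stream = analyzer.tokenStream("field", new StringReader("quick brown fox"));
AnalysisTestsHelper.assertSimpleTSOutput(stream,
        new String[] { "quick", "brown", "fox" },   // expected terms, in order
        new int[] { 1, 1, 1 });                     // expected position increments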
From source file: org.exoplatform.services.jcr.impl.core.query.lucene.AbstractExcerpt.java
License: Apache License

/**
 * @param text the text.
 * @return a <code>TermPositionVector</code> for the given text.
 */
private TermPositionVector createTermPositionVector(String text) {
    // term -> TermVectorOffsetInfo[]
    final SortedMap<String, TermVectorOffsetInfo[]> termMap = new TreeMap<String, TermVectorOffsetInfo[]>();
    Reader r = new StringReader(text);
    TokenStream ts = index.getTextAnalyzer().tokenStream("", r);
    try {
        ts.reset(); // the TokenStream contract requires reset() before incrementToken()
        while (ts.incrementToken()) {
            OffsetAttribute offset = ts.getAttribute(OffsetAttribute.class);
            CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
            String termText = new String(term.buffer(), 0, term.length());
            // grow the offset array by one and append the new offset info
            TermVectorOffsetInfo[] info = termMap.get(termText);
            if (info == null) {
                info = new TermVectorOffsetInfo[1];
            } else {
                TermVectorOffsetInfo[] tmp = info;
                info = new TermVectorOffsetInfo[tmp.length + 1];
                System.arraycopy(tmp, 0, info, 0, tmp.length);
            }
            info[info.length - 1] = new TermVectorOffsetInfo(offset.startOffset(), offset.endOffset());
            termMap.put(termText, info);
        }
        ts.end();
        ts.close();
    } catch (IOException e) {
        // should never happen, we are reading from a string
        if (LOG.isTraceEnabled()) {
            LOG.trace("An exception occurred: " + e.getMessage());
        }
    }
    return new TermPositionVector() {
        private String[] terms = termMap.keySet().toArray(new String[termMap.size()]);

        public int[] getTermPositions(int index) {
            return null;
        }

        public TermVectorOffsetInfo[] getOffsets(int index) {
            TermVectorOffsetInfo[] info = TermVectorOffsetInfo.EMPTY_OFFSET_INFO;
            if (index >= 0 && index < terms.length) {
                info = termMap.get(terms[index]);
            }
            return info;
        }

        public String getField() {
            return "";
        }

        public int size() {
            return terms.length;
        }

        public String[] getTerms() {
            return terms;
        }

        public int[] getTermFrequencies() {
            int[] freqs = new int[terms.length];
            for (int i = 0; i < terms.length; i++) {
                freqs[i] = termMap.get(terms[i]).length;
            }
            return freqs;
        }

        public int indexOf(String term) {
            int res = Arrays.binarySearch(terms, term);
            return res >= 0 ? res : -1;
        }

        public int[] indexesOf(String[] terms, int start, int len) {
            int[] res = new int[len];
            for (int i = 0; i < len; i++) {
                res[i] = indexOf(terms[i]);
            }
            return res;
        }
    };
}
From source file: org.exoplatform.services.jcr.impl.core.query.lucene.MoreLikeThis.java
License: Apache License

/**
 * Adds term frequencies found by tokenizing text from the reader into the map of words.
 * @param r a source of text to be tokenized
 * @param termFreqMap a Map of terms and their frequencies
 * @param fieldName Used by analyzer for any special per-field analysis
 */
private void addTermFrequencies(Reader r, Map<String, Int> termFreqMap, String fieldName) throws IOException {
    TokenStream ts = analyzer.tokenStream(fieldName, r);
    ts.reset(); // the TokenStream contract requires reset() before incrementToken()
    int tokenCount = 0;
    // for every token
    while (ts.incrementToken()) {
        CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
        String word = new String(term.buffer(), 0, term.length());
        tokenCount++;
        if (tokenCount > maxNumTokensParsed) {
            break;
        }
        if (isNoiseWord(word)) {
            continue;
        }
        // increment frequency
        Int cnt = termFreqMap.get(word);
        if (cnt == null) {
            termFreqMap.put(word, new Int());
        } else {
            cnt.x++;
        }
    }
    ts.end();
    ts.close();
}
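The same frequency-counting pattern, restated as a self-contained sketch with a plain HashMap in place of the class's Int helper (the analyzer, field name, and input text are assumptions):

Map<String, Integer> freqs = new HashMap<>();
try (TokenStream ts = analyzer.tokenStream("body", new StringReader("to be or not to be"))) {
    CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        freqs.merge(term.toString(), 1, Integer::sum);  // count each surface form
    }
    ts.end();
}
// freqs now maps {be=2, not=1, or=1, to=2}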
From source file: org.fastcatsearch.ir.index.SearchIndexWriter.java
License: Apache License

private void indexValue(int docNo, int i, Object value, boolean isIgnoreCase, int positionIncrementGap)
        throws IOException, IRException {
    if (value == null) {
        return;
    }
    char[] fieldValue = value.toString().toCharArray();
    TokenStream tokenStream = indexAnalyzerList[i].tokenStream(indexId, new CharArrayReader(fieldValue),
            indexingAnalyzerOption);
    tokenStream.reset();

    CharsRefTermAttribute termAttribute = null;
    PositionIncrementAttribute positionAttribute = null;
    StopwordAttribute stopwordAttribute = null;
    AdditionalTermAttribute additionalTermAttribute = null;
    CharTermAttribute charTermAttribute = null;

    // Look up only the attributes this particular stream actually provides.
    if (tokenStream.hasAttribute(CharsRefTermAttribute.class)) {
        termAttribute = tokenStream.getAttribute(CharsRefTermAttribute.class);
    }
    if (tokenStream.hasAttribute(PositionIncrementAttribute.class)) {
        positionAttribute = tokenStream.getAttribute(PositionIncrementAttribute.class);
    }
    if (tokenStream.hasAttribute(AdditionalTermAttribute.class)) {
        additionalTermAttribute = tokenStream.getAttribute(AdditionalTermAttribute.class);
    }
    // Tokens flagged as stopwords are skipped below.
    if (tokenStream.hasAttribute(StopwordAttribute.class)) {
        stopwordAttribute = tokenStream.getAttribute(StopwordAttribute.class);
    }
    if (tokenStream.hasAttribute(CharTermAttribute.class)) {
        charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
    }

    int lastPosition = 0;
    while (tokenStream.incrementToken()) {
        CharVector key = null;
        if (termAttribute != null) {
            CharsRef charRef = termAttribute.charsRef();
            char[] buffer = new char[charRef.length()];
            System.arraycopy(charRef.chars, charRef.offset, buffer, 0, charRef.length);
            key = new CharVector(buffer, 0, buffer.length);
        } else {
            key = new CharVector(charTermAttribute.buffer(), 0, charTermAttribute.length());
        }
        int position = -1;
        if (positionAttribute != null) {
            position = positionAttribute.getPositionIncrement() + positionIncrementGap;
            lastPosition = position;
        }
        // logger.debug("FIELD#{}: {} >> {} ({})", indexId, key, docNo, position);
        if (stopwordAttribute != null && stopwordAttribute.isStopword()) {
            // ignore stopwords
        } else {
            memoryPosting.add(key, docNo, position);
        }
        // if (synonymAttribute != null) {
        //     CharVector[] synonym = synonymAttribute.getSynonym();
        //     if (synonym != null) {
        //         for (CharVector token : synonym) {
        //             memoryPosting.add(token, docNo, position);
        //         }
        //     }
        // }
        if (additionalTermAttribute != null && additionalTermAttribute.size() > 0) {
            Iterator<String> iter = additionalTermAttribute.iterateAdditionalTerms();
            while (iter.hasNext()) {
                CharVector token = new CharVector(iter.next().toCharArray());
                memoryPosting.add(token, docNo, lastPosition);
            }
        }
    }
}