Example usage for org.apache.lucene.analysis TokenStream incrementToken

Introduction

This page collects example usages of org.apache.lucene.analysis.TokenStream#incrementToken from open-source projects.

Prototype

public abstract boolean incrementToken() throws IOException;

Document

Consumers (i.e., IndexWriter) use this method to advance the stream to the next token.
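
The typical consumer workflow is: obtain the stream, register the attributes you want to read, call reset(), loop on incrementToken() until it returns false, then call end() and close(). The sketch below is a minimal, generic version of that loop, not taken from the examples that follow; the analyzer instance and field name are placeholders, and it assumes a Lucene 3.1+ style TokenStream API:

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public static List<String> collectTerms(Analyzer analyzer, String text) throws IOException {
    List<String> terms = new ArrayList<String>();
    TokenStream stream = analyzer.tokenStream("field", new StringReader(text));
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    try {
        stream.reset();                     // required before the first incrementToken()
        while (stream.incrementToken()) {   // false means the stream is exhausted
            terms.add(termAttr.toString()); // attributes now reflect the current token
        }
        stream.end();                       // records end-of-stream state, e.g. the final offset
    } finally {
        stream.close();                     // releases underlying resources
    }
    return terms;
}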

Usage

From source file:com.zimbra.cs.index.analysis.UniversalAnalyzerTest.java

License:Open Source License
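
This test advances a CJKAnalyzer stream and a UniversalAnalyzer stream in lock step, asserting that both return the same tokens and, token by token, the same terms, offsets, and position increments.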

private void testCJK(String src) throws IOException {
    TokenStream cjk = cjkAnalyzer.tokenStream(null, new StringReader(src));
    CharTermAttribute cjkTermAttr = cjk.addAttribute(CharTermAttribute.class);
    OffsetAttribute cjkOffsetAttr = cjk.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute cjkPosIncAttr = cjk.addAttribute(PositionIncrementAttribute.class);

    TokenStream uni = universalAnalyzer.tokenStream(null, new StringReader(src));
    CharTermAttribute uniTermAttr = uni.addAttribute(CharTermAttribute.class);
    OffsetAttribute uniOffsetAttr = uni.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute uniPosIncAttr = uni.addAttribute(PositionIncrementAttribute.class);

    // Advance both streams in lock step: each call to incrementToken() must
    // agree on whether another token exists, and the attributes must match.
    while (true) {
        boolean result = cjk.incrementToken();
        Assert.assertEquals(result, uni.incrementToken());
        if (!result) {
            break;
        }
        String term = cjkTermAttr.toString();
        Assert.assertEquals(cjkTermAttr, uniTermAttr);
        if (assertOffset) {
            Assert.assertEquals(term, cjkOffsetAttr, uniOffsetAttr);
        }
        Assert.assertEquals(term, cjkPosIncAttr, uniPosIncAttr);
    }
}

From source file:com.zimbra.cs.index.query.ContactQuery.java

License:Open Source License
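
Here the constructor tokenizes the query text through a contact-specific filter chain, trimming trailing '*' wildcard characters from each term before collecting it.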

public ContactQuery(String text) {
    TokenStream stream = new ContactTokenFilter(
            new AddrCharTokenizer(new HalfwidthKanaVoicedMappingFilter(new StringReader(text))));
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    try {
        stream.reset();
        while (stream.incrementToken()) {
            tokens.add(CharMatcher.is('*').trimTrailingFrom(termAttr)); // remove trailing wildcard characters
        }
        stream.end();
        stream.close();
    } catch (IOException e) { // should never happen
        ZimbraLog.search.error("Failed to tokenize text=%s", text);
    }
}

From source file:com.zimbra.cs.index.query.TextQuery.java

License:Open Source License
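
Here the constructor drains the supplied stream into a list of term strings, following the full reset/incrementToken/end/close life cycle.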

TextQuery(TokenStream stream, String field, String text) {
    this.field = field;
    this.text = text;

    try {
        CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            tokens.add(termAttr.toString());
        }
        stream.end();
        stream.close();
    } catch (IOException e) { // should never happen
        ZimbraLog.search.error("Failed to tokenize text=%s", text);
    }
}

From source file:com.zimbra.cs.index.TermInfo.java

License:Open Source License
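
This method consumes the token stream of an indexed, tokenized field and accumulates a position list per distinct term, using each token's position increment to advance the running position.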

/**
 * Updates {@code term2info} with information from {@code field}.
 *
 * If the field from the Lucene document is indexed and tokenized, then for each token:
 *   a) construct a key based on the field name and info about the token
 *   b) if {@code term2info} has an entry for that key, get it; otherwise create an entry
 *   c) update the entry with position information for this token
 *
 * @param pos the current position
 * @return the new value for {@code pos}
 */
public static int updateMapWithDetailsForField(Analyzer analyzer, Fieldable field,
        Map<String, TermInfo> term2info, int pos) throws IOException {
    if (!field.isIndexed()) {
        return pos;
    }
    Character prefix = LuceneFields.FIELD2PREFIX.get(field.name());
    if (prefix == null) {
        ZimbraLog.index.info("TermInfo.updateMapWithDetailsForField - skipping indexed field " + field.name()
                + " isTokenized=" + field.isTokenized());
        return pos;
    }
    if (field.isTokenized()) {
        TokenStream stream = field.tokenStreamValue();
        if (stream == null) {
            stream = analyzer.tokenStream(field.name(), new StringReader(field.stringValue()));
        }
        CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posAttr = stream.addAttribute(PositionIncrementAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            if (termAttr.length() == 0) {
                continue;
            }
            String term = prefix + termAttr.toString();
            TermInfo info = term2info.get(term);
            if (info == null) {
                info = new TermInfo();
                term2info.put(term, info);
            }
            pos += posAttr.getPositionIncrement();
            info.addPosition(pos);
        }
        stream.end();
        stream.close();
    } else {
        // The whole field is the only "token". This info is potentially stored
        // twice: here as well as where the field itself is stored.
        String term = prefix + field.stringValue();
        TermInfo info = term2info.get(term);
        if (info == null) {
            info = new TermInfo();
            term2info.put(term, info);
        }
    }
    return pos;
}

From source file:com.zimbra.cs.index.ZimbraAnalyzer.java

License:Open Source License
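
This helper concatenates every term the analyzer produces into a single space-separated string.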

public static String getAllTokensConcatenated(String fieldName, Reader reader) {
    StringBuilder toReturn = new StringBuilder();

    TokenStream stream = SINGLETON.tokenStream(fieldName, reader);
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);

    try {
        stream.reset();
        while (stream.incrementToken()) {
            toReturn.append(term);
            toReturn.append(' ');
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        e.printStackTrace(); // print the failure rather than silently swallowing it
    }

    return toReturn.toString();
}

From source file:com.zimbra.cs.index.ZimbraAnalyzerTest.java

License:Open Source License
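
This test asserts that every token carries a position increment of 1, confirming that removed stop words do not leave gaps in the token positions.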

/**
 * We intentionally disable the position increment because we want phrases to match across removed stop words.
 *
 * @see PositionIncrementAttribute
 */
@Test
public void positionIncrement() throws Exception {
    TokenStream stream = ZimbraAnalyzer.getInstance().tokenStream(LuceneFields.L_H_SUBJECT,
            new StringReader("It's a test."));
    PositionIncrementAttribute posIncrAtt = stream.addAttribute(PositionIncrementAttribute.class);
    stream.reset(); // the stream must be reset before the first incrementToken()
    while (stream.incrementToken()) {
        Assert.assertEquals(1, posIncrAtt.getPositionIncrement());
    }
    stream.end();
    stream.close();
}

From source file:com.zimbra.cs.index.ZimbraAnalyzerTest.java

License:Open Source License
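
This helper drains an arbitrary TokenStream into a list of its term strings, leaving it to the caller to close the stream.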

public static List<String> toTokens(TokenStream stream) throws IOException {
    List<String> result = new ArrayList<String>();
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        result.add(termAttr.toString());
    }
    stream.end();
    return result;
}

From source file:com._4dconcept.lucene.highlighter.GenericHighlighter.java

License:Apache License
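
This highlighter pulls tokens one at a time, grouping adjacent tokens and extracting scored text fragments from the original string as it goes.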

public void highlight(String toHighlight, String field) throws IOException, ParseException {

    TokenStream tokenStream = analyzer.reusableTokenStream(field, new StringReader(toHighlight));
    QueryTermScorer queryTermScorer = new QueryTermScorer(query);

    TokenStream newStream = queryTermScorer.init(tokenStream);
    if (newStream != null) {
        tokenStream = newStream;
    }

    //tokenStream.addAttribute(PositionIncrementAttribute.class);
    tokenStream.reset();

    queryTermScorer.startFragment(null);

    int lastEndOffset = 0;

    TokenGroup tokenGroup = new TokenGroup(tokenStream);

    for (boolean next = tokenStream.incrementToken(); next; next = tokenStream.incrementToken()) {

        if ((tokenGroup.numTokens > 0) && tokenGroup.isDistinct()) {
            lastEndOffset = extractText(tokenGroup, toHighlight, lastEndOffset);
        }
        tokenGroup.addToken(queryTermScorer.getTokenScore());
    }

    if (tokenGroup.numTokens > 0) {
        lastEndOffset = extractText(tokenGroup, toHighlight, lastEndOffset);
    }

    //Test what remains of the original text beyond the point where we stopped analyzing
    if (lastEndOffset < toHighlight.length()) {
        //append it to the last fragment
        callback.terms(toHighlight.substring(lastEndOffset), lastEndOffset, tokenGroup.getTotalScore());
    }
}

From source file:CopulaResources.TermCooccurence.java
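
This helper tokenizes a string with the supplied Analyzer, wrapping any IOException in a RuntimeException.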

private static List<String> tokenizeString(Analyzer analyzer, String str) {
    List<String> result = new ArrayList<>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(str));
        stream.reset();
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return result;
}

From source file:crawler.DataSearch.java

License:Apache License
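
This filter counts hashtag versus non-hashtag tokens in a tweet, flagging the tweet as trash when at most one non-hashtag token remains.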

private boolean trash(String tweet) {
    Analyzer analyzer = new TwitterAnalyzer();

    StringReader in = new StringReader(tweet);
    TokenStream ts = analyzer.tokenStream("body", in);
    TypeAttribute typeAttr = ts.addAttribute(TypeAttribute.class);
    int totalCount = 0, hashCount = 0;
    try {
        ts.reset(); // required before the first call to incrementToken()
        while (ts.incrementToken()) {
            if (typeAttr.type().equals("hashtag")) {
                hashCount++;
            }
            totalCount++;
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    // The tweet is considered trash when it contains at most one non-hashtag token.
    return totalCount - hashCount <= 1;
}