Example usage for org.apache.lucene.analysis TokenStream incrementToken

Introduction

This page collects example usages of org.apache.lucene.analysis.TokenStream#incrementToken from open-source projects.

Prototype

public abstract boolean incrementToken() throws IOException;

Document

Consumers (i.e., IndexWriter) use this method to advance the stream to the next token.
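
The typical consumer workflow is: obtain the stream, register the attributes you want to read, call reset(), loop on incrementToken() until it returns false, then call end() and close(). The sketch below is a minimal, generic version of that loop, not taken from the examples that follow; the analyzer instance and field name are placeholders, and it assumes a Lucene 3.1+ style TokenStream API:

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public static List<String> collectTerms(Analyzer analyzer, String text) throws IOException {
    List<String> terms = new ArrayList<String>();
    TokenStream stream = analyzer.tokenStream("field", new StringReader(text));
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    try {
        stream.reset();                     // required before the first incrementToken()
        while (stream.incrementToken()) {   // false means the stream is exhausted
            terms.add(termAttr.toString()); // attributes now reflect the current token
        }
        stream.end();                       // records end-of-stream state, e.g. the final offset
    } finally {
        stream.close();                     // releases underlying resources
    }
    return terms;
}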

Usage

From source file:com.zimbra.cs.index.analysis.UniversalAnalyzerTest.java

License:Open Source License
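
This test advances a CJKAnalyzer stream and a UniversalAnalyzer stream in lock step, asserting that both return the same tokens and, token by token, the same terms, offsets, and position increments.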

private void testCJK(String src) throws IOException {
    TokenStream cjk = cjkAnalyzer.tokenStream(null, new StringReader(src));
    CharTermAttribute cjkTermAttr = cjk.addAttribute(CharTermAttribute.class);
    OffsetAttribute cjkOffsetAttr = cjk.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute cjkPosIncAttr = cjk.addAttribute(PositionIncrementAttribute.class);

    TokenStream uni = universalAnalyzer.tokenStream(null, new StringReader(src));
    CharTermAttribute uniTermAttr = uni.addAttribute(CharTermAttribute.class);
    OffsetAttribute uniOffsetAttr = uni.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute uniPosIncAttr = uni.addAttribute(PositionIncrementAttribute.class);

    // Advance both streams in lock step: each call to incrementToken() must
    // agree on whether another token exists, and the attributes must match.
    while (true) {
        boolean result = cjk.incrementToken();
        Assert.assertEquals(result, uni.incrementToken());
        if (!result) {
            break;
        }
        String term = cjkTermAttr.toString();
        Assert.assertEquals(cjkTermAttr, uniTermAttr);
        if (assertOffset) {
            Assert.assertEquals(term, cjkOffsetAttr, uniOffsetAttr);
        }
        Assert.assertEquals(term, cjkPosIncAttr, uniPosIncAttr);
    }
}

From source file:com.zimbra.cs.index.query.ContactQuery.java

License:Open Source License
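
Here the constructor tokenizes the query text through a contact-specific filter chain, trimming trailing '*' wildcard characters from each term before collecting it.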

public ContactQuery(String text) {
    TokenStream stream = new ContactTokenFilter(
            new AddrCharTokenizer(new HalfwidthKanaVoicedMappingFilter(new StringReader(text))));
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    try {
        stream.reset();
        while (stream.incrementToken()) {
            tokens.add(CharMatcher.is('*').trimTrailingFrom(termAttr)); // remove trailing wildcard characters
        }
        stream.end();
        stream.close();
    } catch (IOException e) { // should never happen
        ZimbraLog.search.error("Failed to tokenize text=%s", text);
    }
}

From source file:com.zimbra.cs.index.query.TextQuery.java

License:Open Source License
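
Here the constructor drains the supplied stream into a list of term strings, following the full reset/incrementToken/end/close life cycle.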

TextQuery(TokenStream stream, String field, String text) {
    this.field = field;
    this.text = text;

    try {
        CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            tokens.add(termAttr.toString());
        }
        stream.end();
        stream.close();
    } catch (IOException e) { // should never happen
        ZimbraLog.search.error("Failed to tokenize text=%s", text);
    }
}

From source file:com.zimbra.cs.index.TermInfo.java

License:Open Source License
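
This method consumes the token stream of an indexed, tokenized field and accumulates a position list per distinct term, using each token's position increment to advance the running position.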

/**
 * Updates {@code term2info} with information from {@code field}.
 *
 * If the field from the Lucene document is indexed and tokenized, then for each token:
 *   a) construct a key based on the field name and info about the token
 *   b) if {@code term2info} has an entry for that key, get it; otherwise create an entry
 *   c) update the entry with position information for this token
 *
 * @param pos the current position
 * @return the new value for {@code pos}
 */
public static int updateMapWithDetailsForField(Analyzer analyzer, Fieldable field,
        Map<String, TermInfo> term2info, int pos) throws IOException {
    if (!field.isIndexed()) {
        return pos;
    }
    Character prefix = LuceneFields.FIELD2PREFIX.get(field.name());
    if (prefix == null) {
        ZimbraLog.index.info("TermInfo.updateMapWithDetailsForField - skipping indexed field " + field.name()
                + " isTokenized=" + field.isTokenized());
        return pos;
    }
    if (field.isTokenized()) {
        TokenStream stream = field.tokenStreamValue();
        if (stream == null) {
            stream = analyzer.tokenStream(field.name(), new StringReader(field.stringValue()));
        }
        CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posAttr = stream.addAttribute(PositionIncrementAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            if (termAttr.length() == 0) {
                continue;
            }
            String term = prefix + termAttr.toString();
            TermInfo info = term2info.get(term);
            if (info == null) {
                info = new TermInfo();
                term2info.put(term, info);
            }
            pos += posAttr.getPositionIncrement();
            info.addPosition(pos);
        }
        stream.end();
        stream.close();
    } else {
        // The whole field is the only "token". This info is potentially stored
        // twice: here as well as where the field itself is stored.
        String term = prefix + field.stringValue();
        TermInfo info = term2info.get(term);
        if (info == null) {
            info = new TermInfo();
            term2info.put(term, info);
        }
    }
    return pos;
}

From source file:com.zimbra.cs.index.ZimbraAnalyzer.java

License:Open Source License
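
This helper concatenates every term the analyzer produces into a single space-separated string.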

public static String getAllTokensConcatenated(String fieldName, Reader reader) {
    StringBuilder toReturn = new StringBuilder();

    TokenStream stream = SINGLETON.tokenStream(fieldName, reader);
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);

    try {
        stream.reset();
        while (stream.incrementToken()) {
            toReturn.append(term);
            toReturn.append(' ');
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        e.printStackTrace(); // print the failure rather than silently swallowing it
    }

    return toReturn.toString();
}

From source file:com.zimbra.cs.index.ZimbraAnalyzerTest.java

License:Open Source License
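
This test asserts that every token carries a position increment of 1, confirming that removed stop words do not leave gaps in the token positions.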

/**
 * We intentionally disable the position increment because we want phrases to match across removed stop words.
 *
 * @see PositionIncrementAttribute
 */
@Test
public void positionIncrement() throws Exception {
    TokenStream stream = ZimbraAnalyzer.getInstance().tokenStream(LuceneFields.L_H_SUBJECT,
            new StringReader("It's a test."));
    PositionIncrementAttribute posIncrAtt = stream.addAttribute(PositionIncrementAttribute.class);
    stream.reset(); // the stream must be reset before the first incrementToken()
    while (stream.incrementToken()) {
        Assert.assertEquals(1, posIncrAtt.getPositionIncrement());
    }
    stream.end();
    stream.close();
}

From source file:com.zimbra.cs.index.ZimbraAnalyzerTest.java

License:Open Source License
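
This helper drains an arbitrary TokenStream into a list of its term strings, leaving it to the caller to close the stream.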

public static List<String> toTokens(TokenStream stream) throws IOException {
    List<String> result = new ArrayList<String>();
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        result.add(termAttr.toString());
    }
    stream.end();
    return result;
}

From source file:com._4dconcept.lucene.highlighter.GenericHighlighter.java

License:Apache License
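
This highlighter pulls tokens one at a time, grouping adjacent tokens and extracting scored text fragments from the original string as it goes.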

public void highlight(String toHighlight, String field) throws IOException, ParseException {

    TokenStream tokenStream = analyzer.reusableTokenStream(field, new StringReader(toHighlight));
    QueryTermScorer queryTermScorer = new QueryTermScorer(query);

    TokenStream newStream = queryTermScorer.init(tokenStream);
    if (newStream != null) {
        tokenStream = newStream;
    }

    //tokenStream.addAttribute(PositionIncrementAttribute.class);
    tokenStream.reset();

    queryTermScorer.startFragment(null);

    int lastEndOffset = 0;

    TokenGroup tokenGroup = new TokenGroup(tokenStream);

    for (boolean next = tokenStream.incrementToken(); next; next = tokenStream.incrementToken()) {

        if ((tokenGroup.numTokens > 0) && tokenGroup.isDistinct()) {
            lastEndOffset = extractText(tokenGroup, toHighlight, lastEndOffset);
        }
        tokenGroup.addToken(queryTermScorer.getTokenScore());
    }

    if (tokenGroup.numTokens > 0) {
        lastEndOffset = extractText(tokenGroup, toHighlight, lastEndOffset);
    }

    //Test what remains of the original text beyond the point where we stopped analyzing
    if (lastEndOffset < toHighlight.length()) {
        //append it to the last fragment
        callback.terms(toHighlight.substring(lastEndOffset), lastEndOffset, tokenGroup.getTotalScore());
    }
}

From source file:CopulaResources.TermCooccurence.java
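
This helper tokenizes a string with the supplied Analyzer, wrapping any IOException in a RuntimeException.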

private static List<String> tokenizeString(Analyzer analyzer, String str) {
    List<String> result = new ArrayList<>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(str));
        stream.reset();
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return result;
}

From source file:crawler.DataSearch.java

License:Apache License
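
This filter counts hashtag versus non-hashtag tokens in a tweet, flagging the tweet as trash when at most one non-hashtag token remains.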

private boolean trash(String tweet) {
    Analyzer analyzer = new TwitterAnalyzer();

    StringReader in = new StringReader(tweet);
    TokenStream ts = analyzer.tokenStream("body", in);
    TypeAttribute typeAttr = ts.addAttribute(TypeAttribute.class);
    int totalCount = 0, hashCount = 0;
    try {
        ts.reset(); // required before the first call to incrementToken()
        while (ts.incrementToken()) {
            if (typeAttr.type().equals("hashtag")) {
                hashCount++;
            }
            totalCount++;
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    // The tweet is considered trash when it contains at most one non-hashtag token.
    return totalCount - hashCount <= 1;
}