Example usage for org.apache.lucene.analysis TokenStream addAttribute

List of usage examples for org.apache.lucene.analysis TokenStream addAttribute

Introduction

In this page you can find the example usage for org.apache.lucene.analysis TokenStream addAttribute.

Prototype

public final <T extends Attribute> T addAttribute(Class<T> attClass) 

Source Link

Document

The caller must pass in a Class&lt;? extends Attribute&gt; token identifying the attribute interface to add to the stream.

Usage

From source file:com.zimbra.cs.index.analysis.UniversalAnalyzerTest.java

License:Open Source License

/**
 * Tokenizes {@code src} with both the standard analyzer and the universal
 * analyzer and asserts that the two streams produce the same tokens:
 * same term text, same position increments, and (when {@code assertOffset}
 * is enabled) same offsets.
 */
private void testSTD(String src) throws IOException {
    TokenStream expected = standardAnalyzer.tokenStream(null, new StringReader(src));
    CharTermAttribute expectedTerm = expected.addAttribute(CharTermAttribute.class);
    OffsetAttribute expectedOffset = expected.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute expectedPosInc = expected.addAttribute(PositionIncrementAttribute.class);

    TokenStream actual = universalAnalyzer.tokenStream(null, new StringReader(src));
    CharTermAttribute actualTerm = actual.addAttribute(CharTermAttribute.class);
    OffsetAttribute actualOffset = actual.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute actualPosInc = actual.addAttribute(PositionIncrementAttribute.class);

    // Both streams must advance in lockstep and end at the same time.
    boolean more = expected.incrementToken();
    Assert.assertEquals(more, actual.incrementToken());
    while (more) {
        String term = expectedTerm.toString();
        Assert.assertEquals(expectedTerm, actualTerm);
        if (assertOffset) {
            Assert.assertEquals(term, expectedOffset, actualOffset);
        }
        Assert.assertEquals(term, expectedPosInc, actualPosInc);
        more = expected.incrementToken();
        Assert.assertEquals(more, actual.incrementToken());
    }
}

From source file:com.zimbra.cs.index.analysis.UniversalAnalyzerTest.java

License:Open Source License

/**
 * Tokenizes {@code src} with both the CJK analyzer and the universal
 * analyzer and asserts that the two streams produce the same tokens:
 * same term text, same position increments, and (when {@code assertOffset}
 * is enabled) same offsets.
 */
private void testCJK(String src) throws IOException {
    TokenStream expected = cjkAnalyzer.tokenStream(null, new StringReader(src));
    CharTermAttribute expectedTerm = expected.addAttribute(CharTermAttribute.class);
    OffsetAttribute expectedOffset = expected.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute expectedPosInc = expected.addAttribute(PositionIncrementAttribute.class);

    TokenStream actual = universalAnalyzer.tokenStream(null, new StringReader(src));
    CharTermAttribute actualTerm = actual.addAttribute(CharTermAttribute.class);
    OffsetAttribute actualOffset = actual.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute actualPosInc = actual.addAttribute(PositionIncrementAttribute.class);

    // Both streams must advance in lockstep and end at the same time.
    boolean more = expected.incrementToken();
    Assert.assertEquals(more, actual.incrementToken());
    while (more) {
        String term = expectedTerm.toString();
        Assert.assertEquals(expectedTerm, actualTerm);
        if (assertOffset) {
            Assert.assertEquals(term, expectedOffset, actualOffset);
        }
        Assert.assertEquals(term, expectedPosInc, actualPosInc);
        more = expected.incrementToken();
        Assert.assertEquals(more, actual.incrementToken());
    }
}

From source file:com.zimbra.cs.index.query.ContactQuery.java

License:Open Source License

/**
 * Tokenizes the query text through the contact analysis chain
 * (halfwidth-kana voiced-sound-mark normalization, address tokenization,
 * contact token filtering) and collects each token with trailing wildcard
 * characters stripped.
 *
 * @param text raw contact query text
 */
public ContactQuery(String text) {
    TokenStream stream = new ContactTokenFilter(
            new AddrCharTokenizer(new HalfwidthKanaVoicedMappingFilter(new StringReader(text))));
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    try {
        stream.reset();
        while (stream.incrementToken()) {
            tokens.add(CharMatcher.is('*').trimTrailingFrom(termAttr)); // remove trailing wildcard characters
        }
        stream.end();
    } catch (IOException e) { // should never happen: the reader is in-memory
        ZimbraLog.search.error("Failed to tokenize text=%s", text);
    } finally {
        // Close in finally so the stream is released even when tokenization fails;
        // the original closed it inside the try block and leaked on exception.
        try {
            stream.close();
        } catch (IOException ignored) {
            // best effort; nothing useful to do if close fails
        }
    }
}

From source file:com.zimbra.cs.index.query.TextQuery.java

License:Open Source License

/**
 * Creates a text query by draining the supplied token stream and recording
 * each term for the given field.
 *
 * @param stream pre-built token stream for {@code text}; consumed and closed here
 * @param field  index field this query targets
 * @param text   original query text (kept for display/logging)
 */
TextQuery(TokenStream stream, String field, String text) {
    this.field = field;
    this.text = text;

    try {
        CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            tokens.add(termAttr.toString());
        }
        stream.end();
    } catch (IOException e) { // should never happen for an in-memory stream
        ZimbraLog.search.error("Failed to tokenize text=%s", text);
    } finally {
        // Close in finally so the stream is released even when tokenization fails;
        // the original closed it inside the try block and leaked on exception.
        try {
            stream.close();
        } catch (IOException ignored) {
            // best effort; nothing useful to do if close fails
        }
    }
}

From source file:com.zimbra.cs.index.TermInfo.java

License:Open Source License

/**
 * Update {@code term2info} with information from {@code field}
 *
 *  if the field from the Lucene document is indexed and tokenized, for each token:
 *      a)   construct a key based on the field name and info about the token
 *      b)   if {@code term2info} has an entry for that key, get it, otherwise create an entry
 *      c)   update the entry with position information for this token
 *
 * @param pos is the current position
 * @return new value for {@code pos}
 */
public static int updateMapWithDetailsForField(Analyzer analyzer, Fieldable field,
        Map<String, TermInfo> term2info, int pos) throws IOException {
    if (!field.isIndexed()) {
        return pos;
    }
    Character prefix = LuceneFields.FIELD2PREFIX.get(field.name());
    if (prefix == null) {
        ZimbraLog.index.info("TermInfo.updateMapWithDetailsForField - skipping indexed field " + field.name()
                + " isTokenized=" + field.isTokenized());
        return pos;
    }
    if (field.isTokenized()) {
        TokenStream stream = field.tokenStreamValue();
        if (stream == null) {
            stream = analyzer.tokenStream(field.name(), new StringReader(field.stringValue()));
        }
        CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posAttr = stream.addAttribute(PositionIncrementAttribute.class);
        try {
            stream.reset();
            while (stream.incrementToken()) {
                if (termAttr.length() == 0) { // skip empty tokens; they carry no term
                    continue;
                }
                String term = prefix + termAttr.toString();
                TermInfo info = term2info.get(term);
                if (info == null) {
                    info = new TermInfo();
                    term2info.put(term, info);
                }
                pos += posAttr.getPositionIncrement();
                info.addPosition(pos);
            }
            stream.end();
        } finally {
            // Always release the stream; the original never called end()/close(),
            // unlike the other tokenization call sites in this codebase.
            stream.close();
        }
    } else {
        // whole field is the only "token".  Info potentially getting stored twice - here as well as where
        // the field is stored.
        String term = prefix + field.stringValue();
        TermInfo info = term2info.get(term);
        if (info == null) {
            info = new TermInfo();
            term2info.put(term, info);
        }
    }
    return pos;
}

From source file:com.zimbra.cs.index.ZimbraAnalyzer.java

License:Open Source License

/**
 * Tokenizes {@code reader} for the given field with the singleton analyzer and
 * returns all term texts concatenated, each followed by a single space.
 *
 * @param fieldName index field name that selects the analysis chain
 * @param reader    source text to tokenize
 * @return space-separated token terms (with a trailing space when any token
 *         was produced); empty string on I/O failure or empty input
 */
public static String getAllTokensConcatenated(String fieldName, Reader reader) {
    StringBuilder toReturn = new StringBuilder();

    TokenStream stream = SINGLETON.tokenStream(fieldName, reader);
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);

    try {
        stream.reset();
        while (stream.incrementToken()) {
            toReturn.append(term);
            toReturn.append(' ');
        }
        stream.end();
    } catch (IOException e) {
        e.printStackTrace(); //otherwise eat it
    } finally {
        // Close in finally so the stream is released even when tokenization fails;
        // the original closed it inside the try block and skipped close on exception.
        try {
            stream.close();
        } catch (IOException ignored) {
            // best effort; nothing useful to do if close fails
        }
    }

    return toReturn.toString();
}

From source file:com.zimbra.cs.index.ZimbraAnalyzerTest.java

License:Open Source License

/**
 * We intentionally disable the positionIncrement because we want phrases to match across removed stop words.
 *
 * @see PositionIncrementAttribute/*from w w w  .j  a v a  2s .com*/
 */
@Test
public void positionIncrement() throws Exception {
    TokenStream stream = ZimbraAnalyzer.getInstance().tokenStream(LuceneFields.L_H_SUBJECT,
            new StringReader("It's a test."));
    PositionIncrementAttribute posIncrAtt = stream.addAttribute(PositionIncrementAttribute.class);
    while (stream.incrementToken()) {
        Assert.assertEquals(posIncrAtt.getPositionIncrement(), 1);
    }
    stream.end();
    stream.close();
}

From source file:com.zimbra.cs.index.ZimbraAnalyzerTest.java

License:Open Source License

/**
 * Drains the given token stream and returns each token's term text in order.
 * The stream is reset and ended here, but not closed.
 *
 * @param stream token stream to consume
 * @return term strings in stream order
 * @throws IOException if reading the stream fails
 */
public static List<String> toTokens(TokenStream stream) throws IOException {
    List<String> terms = new ArrayList<String>();
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        terms.add(term.toString());
    }
    stream.end();
    return terms;
}

From source file:com._4dconcept.lucene.highlighter.TokenGroup.java

License:Apache License

/**
 * Registers the term and offset attributes of the given token stream so the
 * highlighter can read them as the stream is consumed.
 */
public TokenGroup(TokenStream tokenStream) {
    termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
}

From source file:crawler.DataSearch.java

License:Apache License

/**
 * Decides whether a tweet is "trash": after tokenization, a tweet with at
 * most one non-hashtag token carries no usable text content.
 *
 * @param tweet raw tweet text
 * @return {@code true} when the tweet has at most one non-hashtag token
 */
private boolean trash(String tweet) {
    Analyzer analyzer = new TwitterAnalyzer();

    StringReader in = new StringReader(tweet);
    TokenStream ts = analyzer.tokenStream("body", in);
    // Only the type attribute is needed; the original also registered a
    // TermAttribute and copied its buffer/length into unused locals.
    TypeAttribute typeAttr = ts.addAttribute(TypeAttribute.class);
    int totalCount = 0, hashCount = 0;
    try {
        while (ts.incrementToken()) {
            if (typeAttr.type().equals("hashtag")) {
                hashCount++;
            }
            totalCount++;
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    // Trash when the non-hashtag token count is 0 or 1.
    return totalCount - hashCount <= 1;
}