Example usage for org.apache.lucene.analysis TokenStream addAttribute

List of usage examples for org.apache.lucene.analysis TokenStream addAttribute

Introduction

In this page you can find the example usage for org.apache.lucene.analysis TokenStream addAttribute.

Prototype

public final <T extends Attribute> T addAttribute(Class<T> attClass) 

Source Link

Document

The caller must pass in a Class&lt;? extends Attribute&gt; token identifying the attribute interface to add to the stream.

Usage

From source file:com.zimbra.cs.index.analysis.UniversalAnalyzerTest.java

License:Open Source License

/**
 * Tokenizes {@code src} with both the standard analyzer and the universal
 * analyzer and asserts that the two streams produce the same tokens:
 * same term text, same position increments, and (when {@code assertOffset}
 * is enabled) same offsets.
 */
private void testSTD(String src) throws IOException {
    TokenStream expected = standardAnalyzer.tokenStream(null, new StringReader(src));
    CharTermAttribute expectedTerm = expected.addAttribute(CharTermAttribute.class);
    OffsetAttribute expectedOffset = expected.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute expectedPosInc = expected.addAttribute(PositionIncrementAttribute.class);

    TokenStream actual = universalAnalyzer.tokenStream(null, new StringReader(src));
    CharTermAttribute actualTerm = actual.addAttribute(CharTermAttribute.class);
    OffsetAttribute actualOffset = actual.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute actualPosInc = actual.addAttribute(PositionIncrementAttribute.class);

    // Both streams must advance in lockstep and end at the same time.
    boolean more = expected.incrementToken();
    Assert.assertEquals(more, actual.incrementToken());
    while (more) {
        String term = expectedTerm.toString();
        Assert.assertEquals(expectedTerm, actualTerm);
        if (assertOffset) {
            Assert.assertEquals(term, expectedOffset, actualOffset);
        }
        Assert.assertEquals(term, expectedPosInc, actualPosInc);
        more = expected.incrementToken();
        Assert.assertEquals(more, actual.incrementToken());
    }
}

From source file:com.zimbra.cs.index.analysis.UniversalAnalyzerTest.java

License:Open Source License

/**
 * Tokenizes {@code src} with both the CJK analyzer and the universal
 * analyzer and asserts that the two streams produce the same tokens:
 * same term text, same position increments, and (when {@code assertOffset}
 * is enabled) same offsets.
 */
private void testCJK(String src) throws IOException {
    TokenStream expected = cjkAnalyzer.tokenStream(null, new StringReader(src));
    CharTermAttribute expectedTerm = expected.addAttribute(CharTermAttribute.class);
    OffsetAttribute expectedOffset = expected.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute expectedPosInc = expected.addAttribute(PositionIncrementAttribute.class);

    TokenStream actual = universalAnalyzer.tokenStream(null, new StringReader(src));
    CharTermAttribute actualTerm = actual.addAttribute(CharTermAttribute.class);
    OffsetAttribute actualOffset = actual.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute actualPosInc = actual.addAttribute(PositionIncrementAttribute.class);

    // Both streams must advance in lockstep and end at the same time.
    boolean more = expected.incrementToken();
    Assert.assertEquals(more, actual.incrementToken());
    while (more) {
        String term = expectedTerm.toString();
        Assert.assertEquals(expectedTerm, actualTerm);
        if (assertOffset) {
            Assert.assertEquals(term, expectedOffset, actualOffset);
        }
        Assert.assertEquals(term, expectedPosInc, actualPosInc);
        more = expected.incrementToken();
        Assert.assertEquals(more, actual.incrementToken());
    }
}

From source file:com.zimbra.cs.index.query.ContactQuery.java

License:Open Source License

/**
 * Tokenizes the query text through the contact analysis chain
 * (halfwidth-kana voiced-sound-mark normalization, address tokenization,
 * contact token filtering) and collects each token with trailing wildcard
 * characters stripped.
 *
 * @param text raw contact query text
 */
public ContactQuery(String text) {
    TokenStream stream = new ContactTokenFilter(
            new AddrCharTokenizer(new HalfwidthKanaVoicedMappingFilter(new StringReader(text))));
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    try {
        stream.reset();
        while (stream.incrementToken()) {
            tokens.add(CharMatcher.is('*').trimTrailingFrom(termAttr)); // remove trailing wildcard characters
        }
        stream.end();
    } catch (IOException e) { // should never happen: the reader is in-memory
        ZimbraLog.search.error("Failed to tokenize text=%s", text);
    } finally {
        // Close in finally so the stream is released even when tokenization fails;
        // the original closed it inside the try block and leaked on exception.
        try {
            stream.close();
        } catch (IOException ignored) {
            // best effort; nothing useful to do if close fails
        }
    }
}

From source file:com.zimbra.cs.index.query.TextQuery.java

License:Open Source License

/**
 * Creates a text query by draining the supplied token stream and recording
 * each term for the given field.
 *
 * @param stream pre-built token stream for {@code text}; consumed and closed here
 * @param field  index field this query targets
 * @param text   original query text (kept for display/logging)
 */
TextQuery(TokenStream stream, String field, String text) {
    this.field = field;
    this.text = text;

    try {
        CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            tokens.add(termAttr.toString());
        }
        stream.end();
    } catch (IOException e) { // should never happen for an in-memory stream
        ZimbraLog.search.error("Failed to tokenize text=%s", text);
    } finally {
        // Close in finally so the stream is released even when tokenization fails;
        // the original closed it inside the try block and leaked on exception.
        try {
            stream.close();
        } catch (IOException ignored) {
            // best effort; nothing useful to do if close fails
        }
    }
}

From source file:com.zimbra.cs.index.TermInfo.java

License:Open Source License

/**
 * Update {@code term2info} with information from {@code field}
 *
 *  if the field from the Lucene document is indexed and tokenized, for each token:
 *      a)   construct a key based on the field name and info about the token
 *      b)   if {@code term2info} has an entry for that key, get it, otherwise create an entry
 *      c)   update the entry with position information for this token
 *
 * @param pos is the current position
 * @return new value for {@code pos}
 */
public static int updateMapWithDetailsForField(Analyzer analyzer, Fieldable field,
        Map<String, TermInfo> term2info, int pos) throws IOException {
    if (!field.isIndexed()) {
        return pos;
    }
    Character prefix = LuceneFields.FIELD2PREFIX.get(field.name());
    if (prefix == null) {
        ZimbraLog.index.info("TermInfo.updateMapWithDetailsForField - skipping indexed field " + field.name()
                + " isTokenized=" + field.isTokenized());
        return pos;
    }
    if (field.isTokenized()) {
        TokenStream stream = field.tokenStreamValue();
        if (stream == null) {
            stream = analyzer.tokenStream(field.name(), new StringReader(field.stringValue()));
        }
        CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posAttr = stream.addAttribute(PositionIncrementAttribute.class);
        try {
            stream.reset();
            while (stream.incrementToken()) {
                if (termAttr.length() == 0) { // skip empty tokens; they carry no term
                    continue;
                }
                String term = prefix + termAttr.toString();
                TermInfo info = term2info.get(term);
                if (info == null) {
                    info = new TermInfo();
                    term2info.put(term, info);
                }
                pos += posAttr.getPositionIncrement();
                info.addPosition(pos);
            }
            stream.end();
        } finally {
            // Always release the stream; the original never called end()/close(),
            // unlike the other tokenization call sites in this codebase.
            stream.close();
        }
    } else {
        // whole field is the only "token".  Info potentially getting stored twice - here as well as where
        // the field is stored.
        String term = prefix + field.stringValue();
        TermInfo info = term2info.get(term);
        if (info == null) {
            info = new TermInfo();
            term2info.put(term, info);
        }
    }
    return pos;
}

From source file:com.zimbra.cs.index.ZimbraAnalyzer.java

License:Open Source License

/**
 * Tokenizes {@code reader} for the given field with the singleton analyzer and
 * returns all term texts concatenated, each followed by a single space.
 *
 * @param fieldName index field name that selects the analysis chain
 * @param reader    source text to tokenize
 * @return space-separated token terms (with a trailing space when any token
 *         was produced); empty string on I/O failure or empty input
 */
public static String getAllTokensConcatenated(String fieldName, Reader reader) {
    StringBuilder toReturn = new StringBuilder();

    TokenStream stream = SINGLETON.tokenStream(fieldName, reader);
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);

    try {
        stream.reset();
        while (stream.incrementToken()) {
            toReturn.append(term);
            toReturn.append(' ');
        }
        stream.end();
    } catch (IOException e) {
        e.printStackTrace(); //otherwise eat it
    } finally {
        // Close in finally so the stream is released even when tokenization fails;
        // the original closed it inside the try block and skipped close on exception.
        try {
            stream.close();
        } catch (IOException ignored) {
            // best effort; nothing useful to do if close fails
        }
    }

    return toReturn.toString();
}

From source file:com.zimbra.cs.index.ZimbraAnalyzerTest.java

License:Open Source License

/**
 * We intentionally disable the positionIncrement because we want phrases to match across removed stop words.
 *
 * @see PositionIncrementAttribute/*from w w w  .j  a v a  2s .com*/
 */
@Test
public void positionIncrement() throws Exception {
    TokenStream stream = ZimbraAnalyzer.getInstance().tokenStream(LuceneFields.L_H_SUBJECT,
            new StringReader("It's a test."));
    PositionIncrementAttribute posIncrAtt = stream.addAttribute(PositionIncrementAttribute.class);
    while (stream.incrementToken()) {
        Assert.assertEquals(posIncrAtt.getPositionIncrement(), 1);
    }
    stream.end();
    stream.close();
}

From source file:com.zimbra.cs.index.ZimbraAnalyzerTest.java

License:Open Source License

/**
 * Drains the given token stream and returns each token's term text in order.
 * The stream is reset and ended here, but not closed.
 *
 * @param stream token stream to consume
 * @return term strings in stream order
 * @throws IOException if reading the stream fails
 */
public static List<String> toTokens(TokenStream stream) throws IOException {
    List<String> terms = new ArrayList<String>();
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        terms.add(term.toString());
    }
    stream.end();
    return terms;
}

From source file:com._4dconcept.lucene.highlighter.TokenGroup.java

License:Apache License

/**
 * Registers the term and offset attributes of the given token stream so the
 * highlighter can read them as the stream is consumed.
 */
public TokenGroup(TokenStream tokenStream) {
    termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
}

From source file:crawler.DataSearch.java

License:Apache License

/**
 * Decides whether a tweet is "trash": after tokenization, a tweet with at
 * most one non-hashtag token carries no usable text content.
 *
 * @param tweet raw tweet text
 * @return {@code true} when the tweet has at most one non-hashtag token
 */
private boolean trash(String tweet) {
    Analyzer analyzer = new TwitterAnalyzer();

    StringReader in = new StringReader(tweet);
    TokenStream ts = analyzer.tokenStream("body", in);
    // Only the type attribute is needed; the original also registered a
    // TermAttribute and copied its buffer/length into unused locals.
    TypeAttribute typeAttr = ts.addAttribute(TypeAttribute.class);
    int totalCount = 0, hashCount = 0;
    try {
        while (ts.incrementToken()) {
            if (typeAttr.type().equals("hashtag")) {
                hashCount++;
            }
            totalCount++;
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    // Trash when the non-hashtag token count is 0 or 1.
    return totalCount - hashCount <= 1;
}