List of usage examples for org.apache.lucene.analysis TokenStream addAttribute
public final <T extends Attribute> T addAttribute(Class<T> attClass)
From source file:com.zimbra.cs.index.analysis.UniversalAnalyzerTest.java
License:Open Source License
/**
 * Tokenizes {@code src} with both the standard analyzer and the universal analyzer and
 * asserts that they produce identical token sequences: same term text, same position
 * increments, and — when {@code assertOffset} is enabled — same offsets.
 *
 * @param src text to tokenize with both analyzers
 * @throws IOException if either token stream fails
 */
private void testSTD(String src) throws IOException {
    TokenStream std = standardAnalyzer.tokenStream(null, new StringReader(src));
    CharTermAttribute stdTermAttr = std.addAttribute(CharTermAttribute.class);
    OffsetAttribute stdOffsetAttr = std.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute stdPosIncAttr = std.addAttribute(PositionIncrementAttribute.class);

    TokenStream uni = universalAnalyzer.tokenStream(null, new StringReader(src));
    CharTermAttribute uniTermAttr = uni.addAttribute(CharTermAttribute.class);
    OffsetAttribute uniOffsetAttr = uni.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute uniPosIncAttr = uni.addAttribute(PositionIncrementAttribute.class);

    try {
        while (true) {
            boolean result = std.incrementToken();
            // Both streams must run out of tokens at exactly the same point.
            Assert.assertEquals(result, uni.incrementToken());
            if (!result) {
                break;
            }
            // Use the current term as the assertion message so failures are easy to locate.
            String term = stdTermAttr.toString();
            Assert.assertEquals(stdTermAttr, uniTermAttr);
            if (assertOffset) {
                Assert.assertEquals(term, stdOffsetAttr, uniOffsetAttr);
            }
            Assert.assertEquals(term, stdPosIncAttr, uniPosIncAttr);
        }
    } finally {
        // Release both streams even when an assertion fails mid-comparison.
        std.close();
        uni.close();
    }
}
From source file:com.zimbra.cs.index.analysis.UniversalAnalyzerTest.java
License:Open Source License
private void testCJK(String src) throws IOException { TokenStream cjk = cjkAnalyzer.tokenStream(null, new StringReader(src)); CharTermAttribute cjkTermAttr = cjk.addAttribute(CharTermAttribute.class); OffsetAttribute cjkOffsetAttr = cjk.addAttribute(OffsetAttribute.class); PositionIncrementAttribute cjkPosIncAttr = cjk.addAttribute(PositionIncrementAttribute.class); TokenStream uni = universalAnalyzer.tokenStream(null, new StringReader(src)); CharTermAttribute uniTermAttr = uni.addAttribute(CharTermAttribute.class); OffsetAttribute uniOffsetAttr = uni.addAttribute(OffsetAttribute.class); PositionIncrementAttribute uniPosIncAttr = uni.addAttribute(PositionIncrementAttribute.class); while (true) { boolean result = cjk.incrementToken(); Assert.assertEquals(result, uni.incrementToken()); if (!result) { break; }//from ww w . ja v a 2s . co m String term = cjkTermAttr.toString(); Assert.assertEquals(cjkTermAttr, uniTermAttr); if (assertOffset) { Assert.assertEquals(term, cjkOffsetAttr, uniOffsetAttr); } Assert.assertEquals(term, cjkPosIncAttr, uniPosIncAttr); } }
From source file:com.zimbra.cs.index.query.ContactQuery.java
License:Open Source License
/**
 * Builds a contact query by tokenizing {@code text} with the contact analysis chain
 * (half-width kana voiced-mark folding, address tokenization, contact token filtering)
 * and collecting each token with trailing {@code *} wildcard characters stripped.
 *
 * @param text raw query text to tokenize
 */
public ContactQuery(String text) {
    TokenStream stream = new ContactTokenFilter(
            new AddrCharTokenizer(new HalfwidthKanaVoicedMappingFilter(new StringReader(text))));
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    try {
        stream.reset();
        while (stream.incrementToken()) {
            // remove trailing wildcard characters
            tokens.add(CharMatcher.is('*').trimTrailingFrom(termAttr));
        }
        stream.end();
    } catch (IOException e) {
        // should never happen: the source is an in-memory reader
        ZimbraLog.search.error("Failed to tokenize text=%s", text);
    } finally {
        // Close on all paths, not just on success.
        try {
            stream.close();
        } catch (IOException ignored) {
            // best-effort close of an in-memory stream; nothing more to do
        }
    }
}
From source file:com.zimbra.cs.index.query.TextQuery.java
License:Open Source License
TextQuery(TokenStream stream, String field, String text) { this.field = field; this.text = text; try {//ww w.jav a2s . co m CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class); stream.reset(); while (stream.incrementToken()) { tokens.add(termAttr.toString()); } stream.end(); stream.close(); } catch (IOException e) { // should never happen ZimbraLog.search.error("Failed to tokenize text=%s", text); } }
From source file:com.zimbra.cs.index.TermInfo.java
License:Open Source License
/** * Update {@code term2info} with information from {@code field} * * if the field from the Lucene document is indexed and tokenized, for each token: * a) construct a key based on the field name and info about the token * b) if {@code term2info} has an entry for that key, get it, otherwise create an entry * c) update the entry with position information for this token * * @param pos is the current position//from w w w .ja va 2 s . com * @return new value for {@code pos} */ public static int updateMapWithDetailsForField(Analyzer analyzer, Fieldable field, Map<String, TermInfo> term2info, int pos) throws IOException { if (!field.isIndexed()) { return pos; } Character prefix = LuceneFields.FIELD2PREFIX.get(field.name()); if (prefix == null) { ZimbraLog.index.info("TermInfo.updateMapWithDetailsForField - skipping indexed field " + field.name() + " isTokenized=" + field.isTokenized()); return pos; } if (field.isTokenized()) { TokenStream stream = field.tokenStreamValue(); if (stream == null) { stream = analyzer.tokenStream(field.name(), new StringReader(field.stringValue())); } CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class); PositionIncrementAttribute posAttr = stream.addAttribute(PositionIncrementAttribute.class); stream.reset(); while (stream.incrementToken()) { if (termAttr.length() == 0) { continue; } String term = prefix + termAttr.toString(); TermInfo info = term2info.get(term); if (info == null) { info = new TermInfo(); term2info.put(term, info); } pos += posAttr.getPositionIncrement(); info.addPosition(pos); } } else { // whole field is the only "token". Info potentially getting stored twice - here as well as where // the field is stored. String term = prefix + field.stringValue(); TermInfo info = term2info.get(term); if (info == null) { info = new TermInfo(); term2info.put(term, info); } } return pos; }
From source file:com.zimbra.cs.index.ZimbraAnalyzer.java
License:Open Source License
public static String getAllTokensConcatenated(String fieldName, Reader reader) { StringBuilder toReturn = new StringBuilder(); TokenStream stream = SINGLETON.tokenStream(fieldName, reader); CharTermAttribute term = stream.addAttribute(CharTermAttribute.class); try {//from ww w . ja v a 2 s . c om stream.reset(); while (stream.incrementToken()) { toReturn.append(term); toReturn.append(' '); } stream.end(); stream.close(); } catch (IOException e) { e.printStackTrace(); //otherwise eat it } return toReturn.toString(); }
From source file:com.zimbra.cs.index.ZimbraAnalyzerTest.java
License:Open Source License
/** * We intentionally disable the positionIncrement because we want phrases to match across removed stop words. * * @see PositionIncrementAttribute/*from w w w .j a v a 2s .com*/ */ @Test public void positionIncrement() throws Exception { TokenStream stream = ZimbraAnalyzer.getInstance().tokenStream(LuceneFields.L_H_SUBJECT, new StringReader("It's a test.")); PositionIncrementAttribute posIncrAtt = stream.addAttribute(PositionIncrementAttribute.class); while (stream.incrementToken()) { Assert.assertEquals(posIncrAtt.getPositionIncrement(), 1); } stream.end(); stream.close(); }
From source file:com.zimbra.cs.index.ZimbraAnalyzerTest.java
License:Open Source License
public static List<String> toTokens(TokenStream stream) throws IOException { List<String> result = new ArrayList<String>(); CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class); stream.reset();//from ww w. j a v a2s .c o m while (stream.incrementToken()) { result.add(termAttr.toString()); } stream.end(); return result; }
From source file:com._4dconcept.lucene.highlighter.TokenGroup.java
License:Apache License
/**
 * Binds this token group to {@code tokenStream} by capturing its offset and term
 * attributes, so later reads reflect whatever token the stream is currently positioned on.
 *
 * @param tokenStream stream whose attributes this group will observe
 */
public TokenGroup(TokenStream tokenStream) {
    this.offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
    this.termAtt = tokenStream.addAttribute(CharTermAttribute.class);
}
From source file:crawler.DataSearch.java
License:Apache License
private boolean trash(String tweet) { Analyzer analyzer = new TwitterAnalyzer(); StringReader in = new StringReader(tweet); TokenStream ts = analyzer.tokenStream("body", in); TermAttribute termAtt = ts.addAttribute(TermAttribute.class); TypeAttribute typeAttr = ts.addAttribute(TypeAttribute.class); int totalCount = 0, hashCount = 0; //Vector v = new SequentialAccessSparseVector(100); try {//from w ww . ja v a 2 s. co m while (ts.incrementToken()) { char[] termBuffer = termAtt.termBuffer(); int termLen = termAtt.termLength(); //System.out.println(w); if (typeAttr.type().equals("hashtag")) hashCount++; totalCount++; } } catch (IOException e) { e.printStackTrace(); } if (totalCount - hashCount <= 1) return true; else return false; }