List of usage examples for org.apache.lucene.analysis.TokenStream.reset()
public void reset() throws IOException
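All of the examples below follow the same Lucene contract: obtain a TokenStream, call reset() before the first incrementToken(), consume tokens in a loop, then call end() and close(). A minimal, self-contained sketch of that workflow (assuming a Lucene version where StandardAnalyzer has a no-argument constructor; the field name "body" and the sample text are placeholders):

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamResetSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        // try-with-resources calls close() on the stream when done
        try (TokenStream stream = analyzer.tokenStream("body", new StringReader("some sample text"))) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset(); // mandatory before the first incrementToken()
            while (stream.incrementToken()) {
                System.out.println(term.toString());
            }
            stream.end(); // records end-of-stream state such as the final offset
        }
    }
}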
From source file:com.yourcompany.hadoop.mapreduce.lexical.LexicalAnalyzerMapper.java
License:Apache License
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    String row = value.toString();
    TokenStream tokenStream = analyzer.tokenStream("dummy", new StringReader(row));
    tokenStream.reset();
    List<String> tokens = collectExtractedNouns(tokenStream);
    for (String token : tokens) {
        context.write(NullWritable.get(), new Text(token));
    }
}
From source file:com.zimbra.cs.index.analysis.RFC822AddressTokenStreamTest.java
License:Open Source License
@Test
public void reset() throws Exception {
    TokenStream stream = new RFC822AddressTokenStream("user@domain.com");
    stream.reset();
    Assert.assertEquals(
            Arrays.asList("user@domain.com", "user", "@domain.com", "domain.com", "domain", "@domain"),
            ZimbraAnalyzerTest.toTokens(stream));
    // Resetting again replays the same tokens from the start.
    stream.reset();
    Assert.assertEquals(
            Arrays.asList("user@domain.com", "user", "@domain.com", "domain.com", "domain", "@domain"),
            ZimbraAnalyzerTest.toTokens(stream));
}
From source file:com.zimbra.cs.index.query.ContactQuery.java
License:Open Source License
public ContactQuery(String text) {
    TokenStream stream = new ContactTokenFilter(
            new AddrCharTokenizer(new HalfwidthKanaVoicedMappingFilter(new StringReader(text))));
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    try {
        stream.reset();
        while (stream.incrementToken()) {
            tokens.add(CharMatcher.is('*').trimTrailingFrom(termAttr)); // remove trailing wildcard characters
        }
        stream.end();
        stream.close();
    } catch (IOException e) { // should never happen
        ZimbraLog.search.error("Failed to tokenize text=%s", text);
    }
}
From source file:com.zimbra.cs.index.query.TextQuery.java
License:Open Source License
TextQuery(TokenStream stream, String field, String text) {
    this.field = field;
    this.text = text;
    try {
        CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            tokens.add(termAttr.toString());
        }
        stream.end();
        stream.close();
    } catch (IOException e) { // should never happen
        ZimbraLog.search.error("Failed to tokenize text=%s", text);
    }
}
From source file:com.zimbra.cs.index.TermInfo.java
License:Open Source License
/**
 * Update {@code term2info} with information from {@code field}.
 *
 * If the field from the Lucene document is indexed and tokenized, for each token:
 *   a) construct a key based on the field name and info about the token
 *   b) if {@code term2info} has an entry for that key, get it, otherwise create an entry
 *   c) update the entry with position information for this token
 *
 * @param pos is the current position
 * @return new value for {@code pos}
 */
public static int updateMapWithDetailsForField(Analyzer analyzer, Fieldable field,
        Map<String, TermInfo> term2info, int pos) throws IOException {
    if (!field.isIndexed()) {
        return pos;
    }
    Character prefix = LuceneFields.FIELD2PREFIX.get(field.name());
    if (prefix == null) {
        ZimbraLog.index.info("TermInfo.updateMapWithDetailsForField - skipping indexed field "
                + field.name() + " isTokenized=" + field.isTokenized());
        return pos;
    }
    if (field.isTokenized()) {
        TokenStream stream = field.tokenStreamValue();
        if (stream == null) {
            stream = analyzer.tokenStream(field.name(), new StringReader(field.stringValue()));
        }
        CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posAttr = stream.addAttribute(PositionIncrementAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            if (termAttr.length() == 0) {
                continue;
            }
            String term = prefix + termAttr.toString();
            TermInfo info = term2info.get(term);
            if (info == null) {
                info = new TermInfo();
                term2info.put(term, info);
            }
            pos += posAttr.getPositionIncrement();
            info.addPosition(pos);
        }
    } else {
        // The whole field is the only "token". Info potentially getting stored twice - here
        // as well as where the field is stored.
        String term = prefix + field.stringValue();
        TermInfo info = term2info.get(term);
        if (info == null) {
            info = new TermInfo();
            term2info.put(term, info);
        }
    }
    return pos;
}
From source file:com.zimbra.cs.index.ZimbraAnalyzer.java
License:Open Source License
public static String getAllTokensConcatenated(String fieldName, Reader reader) {
    StringBuilder toReturn = new StringBuilder();
    TokenStream stream = SINGLETON.tokenStream(fieldName, reader);
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    try {
        stream.reset();
        while (stream.incrementToken()) {
            toReturn.append(term);
            toReturn.append(' ');
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        e.printStackTrace(); // otherwise eat it
    }
    return toReturn.toString();
}
From source file:com.zimbra.cs.index.ZimbraAnalyzerTest.java
License:Open Source License
public static List<String> toTokens(TokenStream stream) throws IOException {
    List<String> result = new ArrayList<String>();
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        result.add(termAttr.toString());
    }
    stream.end();
    return result;
}
From source file:com._4dconcept.lucene.highlighter.GenericHighlighter.java
License:Apache License
public void highlight(String toHighlight, String field) throws IOException, ParseException {
    TokenStream tokenStream = analyzer.reusableTokenStream(field, new StringReader(toHighlight));
    QueryTermScorer queryTermScorer = new QueryTermScorer(query);
    TokenStream newStream = queryTermScorer.init(tokenStream);
    if (newStream != null) {
        tokenStream = newStream;
    }
    //tokenStream.addAttribute(PositionIncrementAttribute.class);
    tokenStream.reset();
    queryTermScorer.startFragment(null);
    int lastEndOffset = 0;
    TokenGroup tokenGroup = new TokenGroup(tokenStream);
    for (boolean next = tokenStream.incrementToken(); next; next = tokenStream.incrementToken()) {
        if ((tokenGroup.numTokens > 0) && tokenGroup.isDistinct()) {
            lastEndOffset = extractText(tokenGroup, toHighlight, lastEndOffset);
        }
        tokenGroup.addToken(queryTermScorer.getTokenScore());
    }
    if (tokenGroup.numTokens > 0) {
        lastEndOffset = extractText(tokenGroup, toHighlight, lastEndOffset);
    }
    // Test what remains of the original text beyond the point where we stopped analyzing
    if (lastEndOffset < toHighlight.length()) {
        // append it to the last fragment
        callback.terms(toHighlight.substring(lastEndOffset), lastEndOffset, tokenGroup.getTotalScore());
    }
}
From source file:CopulaResources.TermCooccurence.java
private static List<String> tokenizeString(Analyzer analyzer, String str) {
    List<String> result = new ArrayList<>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(str));
        stream.reset();
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
        stream.close();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return result;
}
From source file:de.berlinbuzzwords.AnalyzerPrinter.java
License:Apache License
public void printTerms(Analyzer analyzer, String text) throws IOException {
    // Create token stream from reader
    TokenStream stream = analyzer.tokenStream("dummyField", new StringReader(text));
    // Reset stream before token consumption
    stream.reset();
    // Attribute to get the term text for a token
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    // Output source text
    System.out.println("\ntext: " + text);
    // Analyze text and iterate until end of input
    while (stream.incrementToken()) {
        // Output term text
        System.out.println(" term: " + termAttr);
    }
}