Example usage for org.apache.lucene.analysis TokenStream reset

Introduction

On this page you can find example usages of org.apache.lucene.analysis.TokenStream#reset(), collected from open-source projects.

Prototype

public void reset() throws IOException 

Document

This method is called by a consumer before it begins consumption using #incrementToken(). It resets the stream to a clean state; stateful implementations must reset themselves here so that the stream can be reused as if freshly created.
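
For orientation, here is a minimal sketch of the full consumer contract around reset(): reset, consume via incrementToken(), then end() and close(). The analyzer choice, field name, and input text are illustrative, and the sketch assumes a recent Lucene (5.x or later), where StandardAnalyzer has a no-argument constructor and Analyzer#tokenStream accepts a String directly.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamResetSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        TokenStream stream = analyzer.tokenStream("body", "reset comes before consumption");
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();                   // required before the first incrementToken()
        while (stream.incrementToken()) { // advance to the next token
            System.out.println(term.toString());
        }
        stream.end();                     // record end-of-stream attribute state
        stream.close();                   // release resources held by the stream
        analyzer.close();
    }
}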

Usage

From source file:com.yourcompany.hadoop.mapreduce.lexical.LexicalAnalyzerMapper.java

License:Apache License

@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    String row = value.toString();
    TokenStream tokenStream = analyzer.tokenStream("dummy", new StringReader(row));
    tokenStream.reset();
    List<String> tokens = collectExtractedNouns(tokenStream);
    for (String token : tokens) {
        context.write(NullWritable.get(), new Text(token));
    }
}

From source file:com.zimbra.cs.index.analysis.RFC822AddressTokenStreamTest.java

License:Open Source License

@Test
public void reset() throws Exception {
    TokenStream stream = new RFC822AddressTokenStream("user@domain.com");
    stream.reset();
    Assert.assertEquals(
            Arrays.asList("user@domain.com", "user", "@domain.com", "domain.com", "domain", "@domain"),
            ZimbraAnalyzerTest.toTokens(stream));
    stream.reset();
    Assert.assertEquals(
            Arrays.asList("user@domain.com", "user", "@domain.com", "domain.com", "domain", "@domain"),
            ZimbraAnalyzerTest.toTokens(stream));
}
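
Note that this test consumes the stream twice, calling reset() between passes. That works because RFC822AddressTokenStream evidently buffers its tokens up front; a typical TokenStream reading from a Reader cannot be fully consumed a second time simply by calling reset() again.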

From source file:com.zimbra.cs.index.query.ContactQuery.java

License:Open Source License

public ContactQuery(String text) {
    TokenStream stream = new ContactTokenFilter(
            new AddrCharTokenizer(new HalfwidthKanaVoicedMappingFilter(new StringReader(text))));
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    try {
        stream.reset();
        while (stream.incrementToken()) {
            tokens.add(CharMatcher.is('*').trimTrailingFrom(termAttr)); // remove trailing wildcard characters
        }
        stream.end();
        stream.close();
    } catch (IOException e) { // should never happen
        ZimbraLog.search.error("Failed to tokenize text=%s", text);
    }
}

From source file:com.zimbra.cs.index.query.TextQuery.java

License:Open Source License

TextQuery(TokenStream stream, String field, String text) {
    this.field = field;
    this.text = text;

    try {
        CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            tokens.add(termAttr.toString());
        }
        stream.end();
        stream.close();
    } catch (IOException e) { // should never happen
        ZimbraLog.search.error("Failed to tokenize text=%s", text);
    }
}

From source file:com.zimbra.cs.index.TermInfo.java

License:Open Source License

/**
 * Update {@code term2info} with information from {@code field}
 *
 *  if the field from the Lucene document is indexed and tokenized, for each token:
 *      a)   construct a key based on the field name and info about the token
 *      b)   if {@code term2info} has an entry for that key, get it, otherwise create an entry
 *      c)   update the entry with position information for this token
 *
 * @param pos is the current position
 * @return new value for {@code pos}
 */
public static int updateMapWithDetailsForField(Analyzer analyzer, Fieldable field,
        Map<String, TermInfo> term2info, int pos) throws IOException {
    if (!field.isIndexed()) {
        return pos;
    }
    Character prefix = LuceneFields.FIELD2PREFIX.get(field.name());
    if (prefix == null) {
        ZimbraLog.index.info("TermInfo.updateMapWithDetailsForField - skipping indexed field " + field.name()
                + " isTokenized=" + field.isTokenized());
        return pos;
    }
    if (field.isTokenized()) {
        TokenStream stream = field.tokenStreamValue();
        if (stream == null) {
            stream = analyzer.tokenStream(field.name(), new StringReader(field.stringValue()));
        }
        CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posAttr = stream.addAttribute(PositionIncrementAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            if (termAttr.length() == 0) {
                continue;
            }
            String term = prefix + termAttr.toString();
            TermInfo info = term2info.get(term);
            if (info == null) {
                info = new TermInfo();
                term2info.put(term, info);
            }
            pos += posAttr.getPositionIncrement();
            info.addPosition(pos);
        }
    } else {
        // whole field is the only "token".  Info potentially getting stored twice - here as well as where
        // the field is stored.
        String term = prefix + field.stringValue();
        TermInfo info = term2info.get(term);
        if (info == null) {
            info = new TermInfo();
            term2info.put(term, info);
        }
    }
    return pos;
}

From source file:com.zimbra.cs.index.ZimbraAnalyzer.java

License:Open Source License

public static String getAllTokensConcatenated(String fieldName, Reader reader) {
    StringBuilder toReturn = new StringBuilder();

    TokenStream stream = SINGLETON.tokenStream(fieldName, reader);
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);

    try {
        stream.reset();
        while (stream.incrementToken()) {
            toReturn.append(term);
            toReturn.append(' ');
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        e.printStackTrace(); //otherwise eat it
    }

    return toReturn.toString();
}

From source file:com.zimbra.cs.index.ZimbraAnalyzerTest.java

License:Open Source License

public static List<String> toTokens(TokenStream stream) throws IOException {
    List<String> result = new ArrayList<String>();
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        result.add(termAttr.toString());
    }
    stream.end();
    return result;
}

From source file:com._4dconcept.lucene.highlighter.GenericHighlighter.java

License:Apache License

public void highlight(String toHighlight, String field) throws IOException, ParseException {

    TokenStream tokenStream = analyzer.reusableTokenStream(field, new StringReader(toHighlight));
    QueryTermScorer queryTermScorer = new QueryTermScorer(query);

    TokenStream newStream = queryTermScorer.init(tokenStream);
    if (newStream != null) {
        tokenStream = newStream;
    }

    //tokenStream.addAttribute(PositionIncrementAttribute.class);
    tokenStream.reset();

    queryTermScorer.startFragment(null);

    int lastEndOffset = 0;

    TokenGroup tokenGroup = new TokenGroup(tokenStream);

    for (boolean next = tokenStream.incrementToken(); next; next = tokenStream.incrementToken()) {

        if ((tokenGroup.numTokens > 0) && tokenGroup.isDistinct()) {
            lastEndOffset = extractText(tokenGroup, toHighlight, lastEndOffset);
        }
        tokenGroup.addToken(queryTermScorer.getTokenScore());
    }

    if (tokenGroup.numTokens > 0) {
        lastEndOffset = extractText(tokenGroup, toHighlight, lastEndOffset);
    }

    //Test what remains of the original text beyond the point where we stopped analyzing
    if ((lastEndOffset < toHighlight.length())) {
        //append it to the last fragment
        callback.terms(toHighlight.substring(lastEndOffset), lastEndOffset, tokenGroup.getTotalScore());
    }
}
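
Note that Analyzer#reusableTokenStream is a Lucene 3.x API; it was removed in Lucene 4.0, where Analyzer#tokenStream took over stream reuse via its ReuseStrategy.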

From source file:CopulaResources.TermCooccurence.java

private static List tokenizeString(Analyzer analyzer, String str) {
    List result = new ArrayList<>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(str));
        stream.reset();
        while (stream.incrementToken())
            result.add(stream.getAttribute(CharTermAttribute.class).toString());
        stream.close();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return result;
}
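
Since TokenStream implements Closeable, the same tokenize-to-list pattern can be written with try-with-resources, adding the end() call that the example above omits. A sketch under those assumptions, with the method name illustrative and the usual imports (StringReader, List, ArrayList, CharTermAttribute) assumed as in the examples above:

private static List<String> tokenize(Analyzer analyzer, String text) throws IOException {
    List<String> tokens = new ArrayList<>();
    // try-with-resources closes the stream even if incrementToken() throws
    try (TokenStream stream = analyzer.tokenStream(null, new StringReader(text))) {
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();                   // mandatory before consumption
        while (stream.incrementToken()) {
            tokens.add(term.toString());
        }
        stream.end();                     // finalize offset/position attribute state
    }
    return tokens;
}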

From source file:de.berlinbuzzwords.AnalyzerPrinter.java

License:Apache License

public void printTerms(Analyzer analyzer, String text) throws IOException {
    // Create token stream from reader
    TokenStream stream = analyzer.tokenStream("dummyField", new StringReader(text));

    // Reset stream before token consumption
    stream.reset();

    // Attribute to get the term text for a token
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);

    // Output source text
    System.out.println("\ntext: " + text);

    // Analyze text and iterate until end of input
    while (stream.incrementToken()) {
        // Output term text
        System.out.println("  term: " + termAttr);
    }
}
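
This last example omits end() and close(); outside of a short-lived demo, the stream should be ended and closed as in the earlier examples, or managed with try-with-resources as sketched above.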