Example usage for org.apache.lucene.analysis TokenStream end

Introduction

On this page you can find example usages of org.apache.lucene.analysis.TokenStream#end(), collected from open-source projects.

Prototype

public void end() throws IOException 

Document

This method is called by the consumer after the last token has been consumed, i.e. after #incrementToken() has returned false (using the new TokenStream API). It gives the stream a chance to record end-of-stream state such as the final offset, which can differ from the end offset of the last token when, for example, trailing whitespace was discarded by the tokenizer.
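
The canonical consumer workflow is reset(), then an incrementToken() loop, then end(), then close(). A minimal sketch of that lifecycle, assuming the same Lucene 4.x API and WhitespaceAnalyzer used in the examples below:

Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_46);
TokenStream stream = analyzer.tokenStream("field", new StringReader("some input text"));
CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
stream.reset();                             // prepare the stream before the first incrementToken()
while (stream.incrementToken()) {
    System.out.println(termAtt.toString()); // consume one token per call
}
stream.end();   // record end-of-stream state, e.g. the true final offset
stream.close(); // release the resources held by the stream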

Usage

From source file:org.apache.mahout.text.wikipedia.WikipediaDatasetCreatorMapper.java

License:Apache License

@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    String document = value.toString();
    document = StringEscapeUtils.unescapeHtml4(CLOSE_TEXT_TAG_PATTERN
            .matcher(OPEN_TEXT_TAG_PATTERN.matcher(document).replaceFirst("")).replaceAll(""));
    String catMatch = findMatchingCategory(document);
    if (!"Unknown".equals(catMatch)) {
        StringBuilder contents = new StringBuilder(1000);
        TokenStream stream = analyzer.tokenStream(catMatch, new StringReader(document));
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            contents.append(termAtt.buffer(), 0, termAtt.length()).append(' ');
        }
        context.write(new Text(SPACE_NON_ALPHA_PATTERN.matcher(catMatch).replaceAll("_")),
                new Text(contents.toString()));
        stream.end(); // record end-of-stream state before closing
        Closeables.close(stream, true); // Guava helper: close the stream, swallowing any IOException
    }
}

From source file:org.apache.mahout.utils.nlp.collocations.llr.BloomTokenFilterTest.java

License:Apache License

/** normal case, unfiltered analyzer */
@Test
public void testAnalyzer() throws IOException {
    Reader reader = new StringReader(input);
    Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_46);
    TokenStream ts = analyzer.tokenStream(null, reader);
    ts.reset();
    validateTokens(allTokens, ts);
    ts.end();
    ts.close();
}

From source file:org.apache.mahout.utils.nlp.collocations.llr.BloomTokenFilterTest.java

License:Apache License

/** filtered analyzer */
@Test
public void testNonKeepdAnalyzer() throws IOException {
    Reader reader = new StringReader(input);
    Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_46);
    TokenStream ts = analyzer.tokenStream(null, reader);
    ts.reset();
    TokenStream f = new BloomTokenFilter(getFilter(filterTokens), false /* toss matching tokens */, ts);
    validateTokens(expectedNonKeepTokens, f);
    ts.end();
    ts.close();
}

From source file:org.apache.mahout.utils.nlp.collocations.llr.BloomTokenFilterTest.java

License:Apache License

/** keep analyzer */
@Test
public void testKeepAnalyzer() throws IOException {
    Reader reader = new StringReader(input);
    Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_46);
    TokenStream ts = analyzer.tokenStream(null, reader);
    ts.reset();
    TokenStream f = new BloomTokenFilter(getFilter(filterTokens), true /* keep matching tokens */, ts);
    validateTokens(expectedKeepTokens, f);
    ts.end();
    ts.close();
}

From source file:org.apache.mahout.utils.nlp.collocations.llr.BloomTokenFilterTest.java

License:Apache License

/** shingles, keep those matching whitelist */
@Test
public void testShingleFilteredAnalyzer() throws IOException {
    Reader reader = new StringReader(input);
    Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_46);
    TokenStream ts = analyzer.tokenStream(null, reader);
    ts.reset();
    ShingleFilter sf = new ShingleFilter(ts, 3);
    TokenStream f = new BloomTokenFilter(getFilter(shingleKeepTokens), true, sf);
    validateTokens(expectedShingleTokens, f);
    ts.end();
    ts.close();
}
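
Note that the three filter tests above call end() and close() on the innermost stream ts rather than on the wrapping filter. Because TokenFilter.end() and TokenFilter.close() delegate to the wrapped input, invoking them once on the outermost stream finishes the entire chain; a sketch of that variant for the shingle test above:

ShingleFilter sf = new ShingleFilter(ts, 3);
TokenStream f = new BloomTokenFilter(getFilter(shingleKeepTokens), true, sf);
f.reset();  // TokenFilter.reset() propagates down through sf to ts
validateTokens(expectedShingleTokens, f);
f.end();    // ends f, sf and ts in one call
f.close();  // closes the whole chain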

From source file:org.apache.mahout.utils.regex.AnalyzerTransformer.java

License:Apache License

@Override
public String transformMatch(String match) {
    StringBuilder result = new StringBuilder();
    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream(fieldName, new StringReader(match));
        ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        TokenStreamIterator iter = new TokenStreamIterator(ts);
        while (iter.hasNext()) {
            result.append(iter.next()).append(' ');
        }
        ts.end();
    } catch (IOException e) {
        throw new IllegalStateException(e);
    } finally {
        try {
            Closeables.close(ts, true);
        } catch (IOException e) {
            log.error(e.getMessage(), e);
        }
    }
    return result.toString();
}

From source file:org.apache.mahout.vectorizer.document.SequenceFileTokenizerMapper.java

License:Apache License

@Override
protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
    TokenStream stream = analyzer.tokenStream(key.toString(), new StringReader(value.toString()));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    StringTuple document = new StringTuple();
    while (stream.incrementToken()) {
        if (termAtt.length() > 0) {
            document.add(new String(termAtt.buffer(), 0, termAtt.length()));
        }
    }
    stream.end();
    Closeables.close(stream, true);
    context.write(key, document);
}

From source file:org.apache.maven.index.DefaultQueryCreator.java

License:Apache License

protected int countTerms(final IndexerField indexerField, final String query) {
    try {
        TokenStream ts = nexusAnalyzer.tokenStream(indexerField.getKey(), new StringReader(query));
        ts.reset();

        int result = 0;

        while (ts.incrementToken()) {
            result++;
        }

        ts.end();
        ts.close();

        return result;
    } catch (IOException e) {
        // will not happen
        return 1;
    }
}
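
The catch clause here assumes that analyzing an in-memory string never performs real I/O. Since TokenStream implements Closeable, the same count can also be written with try-with-resources (Java 7+) so that close() is guaranteed even if end() throws; a sketch under that assumption:

protected int countTerms(final IndexerField indexerField, final String query) {
    try (TokenStream ts = nexusAnalyzer.tokenStream(indexerField.getKey(), new StringReader(query))) {
        ts.reset();
        int result = 0;
        while (ts.incrementToken()) {
            result++;
        }
        ts.end(); // close() alone does not signal end-of-stream
        return result;
    } catch (IOException e) {
        return 1; // should not happen for an in-memory reader
    }
}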

From source file:org.apache.solr.handler.AnalysisRequestHandlerBase.java

License:Apache License

/**
 * Analyzes the given text using the given analyzer and returns the produced tokens.
 *
 * @param query    The query to analyze.
 * @param analyzer The analyzer to use.
 */
protected Set<BytesRef> getQueryTokenSet(String query, Analyzer analyzer) {
    TokenStream tokenStream = null;
    try {
        tokenStream = analyzer.tokenStream("", query);
        final Set<BytesRef> tokens = new HashSet<BytesRef>();
        final TermToBytesRefAttribute bytesAtt = tokenStream.getAttribute(TermToBytesRefAttribute.class);
        final BytesRef bytes = bytesAtt.getBytesRef(); // reusable buffer, refilled for each token

        tokenStream.reset();

        while (tokenStream.incrementToken()) {
            bytesAtt.fillBytesRef();                // populate the shared BytesRef for the current token
            tokens.add(BytesRef.deepCopyOf(bytes)); // deep-copy before the buffer is overwritten
        }

        tokenStream.end();
        return tokens;
    } catch (IOException ioe) {
        throw new RuntimeException("Error occured while iterating over tokenstream", ioe);
    } finally {
        IOUtils.closeWhileHandlingException(tokenStream);
    }
}

From source file:org.apache.solr.handler.AnalysisRequestHandlerBase.java

License:Apache License

/**
 * Analyzes the given TokenStream, collecting the Tokens it produces.
 *
 * @param tokenStream TokenStream to analyze
 *
 * @return List of tokens produced from the TokenStream
 */
private List<AttributeSource> analyzeTokenStream(TokenStream tokenStream) {
    final List<AttributeSource> tokens = new ArrayList<AttributeSource>();
    final PositionIncrementAttribute posIncrAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
    final TokenTrackingAttribute trackerAtt = tokenStream.addAttribute(TokenTrackingAttribute.class);
    // for backwards compatibility, add all "common" attributes
    tokenStream.addAttribute(OffsetAttribute.class);
    tokenStream.addAttribute(TypeAttribute.class);
    try {
        tokenStream.reset();
        int position = 0;
        while (tokenStream.incrementToken()) {
            position += posIncrAtt.getPositionIncrement();
            trackerAtt.setActPosition(position);
            tokens.add(tokenStream.cloneAttributes());
        }
        tokenStream.end();
    } catch (IOException ioe) {
        throw new RuntimeException("Error occurred while iterating over tokenstream", ioe);
    } finally {
        IOUtils.closeWhileHandlingException(tokenStream);
    }

    return tokens;
}