de.berlinbuzzwords.AnalyzerPrinter.java Source code

Introduction

Here is the source code for de.berlinbuzzwords.AnalyzerPrinter.java
Source

/**
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.berlinbuzzwords;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

import java.io.IOException;
import java.io.StringReader;

public class AnalyzerPrinter {

    public void printTerms(Analyzer analyzer, String text) throws IOException {
        // Create token stream from reader
        TokenStream stream = analyzer.tokenStream("dummyField", new StringReader(text));

        // Reset stream before token consumption
        stream.reset();

        // Attribute to get the term text for a token
        CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);

        // Output source text
        System.out.println("\ntext: " + text);

        // Analyze text and iterate until end of input
        while (stream.incrementToken()) {
            // Output term text
            System.out.println("  term: " + termAttr);
        }
    }

    public void printTokenDetails(Analyzer analyzer, String text) throws IOException {
        // Create token stream from reader
        TokenStream stream = analyzer.tokenStream("dummy", new StringReader(text));

        // Reset stream before token consumption
        stream.reset();

        // Attribute to get the token term text, offset and type
        CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAttr = stream.addAttribute(OffsetAttribute.class);
        TypeAttribute typeAttr = stream.addAttribute(TypeAttribute.class);

        // Output source text
        System.out.println("text: " + text);

        // Analyze text and iterate until end of input
        while (stream.incrementToken()) {
            // Output term text, type and offset
            System.out.println("term: " + termAttr + "\ttype: " + typeAttr.type() + "\tstart offset: "
                    + offsetAttr.startOffset() + "\tend offset: " + offsetAttr.endOffset());
        }
    }
}