org.dbpedia.spotlight.lucene.analysis.NGramAnalyzer.java Source code


Introduction

Here is the source code for org.dbpedia.spotlight.lucene.analysis.NGramAnalyzer.java, a Lucene Analyzer that splits input text into character n-grams of a configurable size using NGramTokenizer and stacks all n-grams at the same token position with PositionFilter.

Source

/*
 * Copyright 2012 DBpedia Spotlight Development Team
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 *  Check our project website for information on how to acknowledge the authors and how to contribute to the project: http://spotlight.dbpedia.org
 */

package org.dbpedia.spotlight.lucene.analysis;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.analysis.position.PositionFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

/**
 * Analyzer that splits its input into character n-grams of length
 * {@code minGram} to {@code maxGram} using Lucene's {@link NGramTokenizer}.
 * A {@link PositionFilter} then places all n-grams at the same token position.
 *
 * @author pablomendes
 */
public class NGramAnalyzer extends Analyzer {

    private final int minGram;
    private final int maxGram;

    public NGramAnalyzer(int minGram, int maxGram) {
        this.minGram = minGram;
        this.maxGram = maxGram;
    }

    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
        // Break the input into character n-grams of the configured sizes
        TokenStream s = new NGramTokenizer(reader, minGram, maxGram);
        // Stack all n-grams at the same position (position increment 0 after the first token)
        s = new PositionFilter(s);
        return s;
    }

    public static void main(String[] args) throws IOException {
        String myString = "cancer";
        Analyzer analyzer = new NGramAnalyzer(3, 3);
        System.out.println("Analyzing: \"" + myString + "\"");
        StringReader reader = new StringReader(myString);
        TokenStream stream = analyzer.tokenStream("field", reader);
        //        TokenStream stream = new NGramTokenizer(reader, EdgeNGramTokenizer.Side.BACK, 1,2);
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();

        // print each n-gram until the stream is exhausted
        while (stream.incrementToken()) {
            System.out.println("token: " + term.toString());
        }

        stream.end();
        stream.close();
    }
}
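
Run with NGramAnalyzer(3, 3) against the string "cancer", the main method above should print the 3-grams can, anc, nce and cer, all placed at the same position by the PositionFilter.

The tokenStream(String, Reader) override and the contrib NGramTokenizer/PositionFilter classes correspond to the Lucene 3.x API. Under that assumption, the sketch below shows how this analyzer might be plugged into an IndexWriter; the NGramIndexingExample class name, the "surfaceForm" field name and Version.LUCENE_36 are illustrative choices, not part of the project.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

import org.dbpedia.spotlight.lucene.analysis.NGramAnalyzer;

public class NGramIndexingExample {

    public static void main(String[] args) throws Exception {
        // Analyzer that splits field text into 3-character grams (e.g. "cancer" -> can, anc, nce, cer)
        Analyzer analyzer = new NGramAnalyzer(3, 3);

        // In-memory index; a real application would use an FSDirectory instead
        Directory directory = new RAMDirectory();
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36, analyzer);
        IndexWriter writer = new IndexWriter(directory, config);

        // Index a single document whose "surfaceForm" field is analyzed into n-grams
        Document doc = new Document();
        doc.add(new Field("surfaceForm", "cancer", Field.Store.YES, Field.Index.ANALYZED));
        writer.addDocument(doc);

        writer.close();
        directory.close();
    }
}

Because the analyzer is passed through IndexWriterConfig, every analyzed field written by this IndexWriter is broken into the same character n-grams that the main method prints above.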