Java tutorial
/* * Copyright 2012 DBpedia Spotlight Development Team * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * * Check our project website for information on how to acknowledge the authors and how to contribute to the project: http://spotlight.dbpedia.org */ package org.dbpedia.spotlight.lucene.analysis; import java.io.IOException; import java.io.Reader; import java.io.StringReader; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.ngram.NGramTokenizer; import org.apache.lucene.analysis.position.PositionFilter; /** * @author pablomendes */ public class NGramAnalyzer extends Analyzer { private int minGram; private int maxGram; public NGramAnalyzer(int minGram, int maxGram) { this.minGram = minGram; this.maxGram = maxGram; } @Override public TokenStream tokenStream(String fieldName, Reader reader) { TokenStream s = new NGramTokenizer(reader, minGram, maxGram); s = new PositionFilter(s); return s; } public static void main(String[] args) throws IOException { String myString = "cancer"; Analyzer analyzer = new NGramAnalyzer(3, 3); System.out.println("Analyzing: \"" + myString + "\""); StringReader reader = new StringReader(myString); TokenStream stream = analyzer.tokenStream("field", reader); // TokenStream stream = new NGramTokenizer(reader, EdgeNGramTokenizer.Side.BACK, 1,2); stream.reset(); // print all tokens until stream is exhausted while (stream.incrementToken()) { System.out.println("token: " + stream); } stream.end(); stream.close(); } }