Java tutorial
/* * Copyright 2009-2013 Scale Unlimited * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ package bixo.examples.webmining; import java.io.IOException; import java.io.StringReader; import java.util.ArrayList; import java.util.List; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.shingle.ShingleAnalyzerWrapper; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.util.Version; public class PhraseShingleAnalyzer { private static final int MAX_WORDS_IN_SHINGLE = 2; private Analyzer _analyzer; public PhraseShingleAnalyzer(int maxWordsInShingle) { _analyzer = new ShingleAnalyzerWrapper(new StandardAnalyzer(Version.LUCENE_42), maxWordsInShingle); } public PhraseShingleAnalyzer() { this(MAX_WORDS_IN_SHINGLE); } public List<String> getTermList(String contentText) { List<String> result = new ArrayList<String>(contentText.length() / 10); try { TokenStream stream = _analyzer.tokenStream("content", new StringReader(contentText)); CharTermAttribute termAtt = (CharTermAttribute) stream.addAttribute(CharTermAttribute.class); stream.reset(); while (stream.incrementToken()) { if (termAtt.length() > 0) { String term = termAtt.toString(); result.add(term); } } stream.end(); stream.close(); } catch (IOException e) { throw new RuntimeException("Impossible error", e); } return result; } public String getAnalyzedPhrase(String text) { List<String> termList = getTermList(text); StringBuilder builder = new StringBuilder(); int count = 0; for (String term : termList) { if (count > 0) { builder.append(" "); } builder.append(term); count++; } return builder.toString(); } }