// Java tutorial: custom Lucene Analyzer for Polish collocation and synonym analysis
package pl.litwiniuk.rowicki.collocations;

/**
 * Copyright Manning Publications Co.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import morfologik.stemming.PolishStemmer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.*;
import org.apache.lucene.analysis.morfologik.MorfologikFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.synonym.SolrSynonymParser;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.Version;
import pl.litwiniuk.rowicki.modsynonyms.ModificatedSynonymFilter;

import java.io.*;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.text.ParseException;

// From chapter 4
/**
 * Analyzer for Polish text: tokenizes with {@link StandardTokenizer},
 * lower-cases, removes English stop words, stems with the Morfologik Polish
 * dictionary, marks collocations via a {@link CollocationEngine}, and finally
 * expands synonyms loaded from a Solr-format thesaurus file.
 *
 * <p>If the synonym file is missing or unparsable, analysis proceeds without
 * synonym expansion (best-effort behavior).</p>
 */
public class CollocationAnalyzer extends Analyzer {

    /** Default maximum allowed token length. */
    public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;

    /** Default location of the Solr-format synonym ("thesaurus") file. */
    public static final String DEFAULT_SYNONYM_FILE = "./Parsers/thesaurus.txt";

    private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
    private final Version matchVersion;
    private final CollocationEngine engine;
    private final File synonymFile;

    /**
     * Creates an analyzer reading synonyms from {@link #DEFAULT_SYNONYM_FILE}.
     *
     * @param engine       collocation engine used by the {@link CollocationFilter}
     * @param matchVersion Lucene compatibility version for the tokenizer/filters
     */
    public CollocationAnalyzer(CollocationEngine engine, Version matchVersion) {
        this(engine, matchVersion, new File(DEFAULT_SYNONYM_FILE));
    }

    /**
     * Creates an analyzer reading synonyms from the given file.
     *
     * @param engine       collocation engine used by the {@link CollocationFilter}
     * @param matchVersion Lucene compatibility version for the tokenizer/filters
     * @param synonymFile  Solr-format synonym file; silently skipped if absent
     */
    public CollocationAnalyzer(CollocationEngine engine, Version matchVersion, File synonymFile) {
        this.engine = engine;
        this.matchVersion = matchVersion;
        this.synonymFile = synonymFile;
    }

    /**
     * Creates a new {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
     * instance for this analyzer.
     *
     * @param fieldName the name of the fields content passed to the
     *                  {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} sink as a reader
     * @param reader    the reader passed to the {@link org.apache.lucene.analysis.Tokenizer} constructor
     * @return the {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} for this analyzer
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        final StandardTokenizer src = new StandardTokenizer(matchVersion, reader);
        src.setMaxTokenLength(maxTokenLength);
        TokenStream tok = new StandardFilter(matchVersion, src);
        tok = new LowerCaseFilter(matchVersion, tok);
        tok = new StopFilter(matchVersion, tok, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
        tok = new MorfologikFilter(tok, PolishStemmer.DICTIONARY.MORFOLOGIK, Version.LUCENE_43);
        tok = new CollocationFilter(tok, engine);
        try {
            tok = new ModificatedSynonymFilter(tok, loadSolrSynonyms(), true);
        } catch (IOException e) {
            // Best effort: continue without synonym expansion rather than fail analysis.
            System.err.println("CollocationAnalyzer: could not load synonyms from "
                    + synonymFile + ": " + e);
        } catch (ParseException e) {
            System.err.println("CollocationAnalyzer: could not parse synonyms from "
                    + synonymFile + ": " + e);
        }
        return new TokenStreamComponents(src, tok) {
            @Override
            protected void setReader(final Reader reader) throws IOException {
                // Re-apply the configured limit each time the component is reused.
                src.setMaxTokenLength(CollocationAnalyzer.this.maxTokenLength);
                super.setReader(reader);
            }
        };
    }

    /**
     * Load synonyms from the solr format, "format=solr".
     *
     * @return the parsed synonym map (empty if the synonym file does not exist)
     * @throws IOException    on read failure
     * @throws ParseException if the file is not valid Solr synonym syntax
     */
    private SynonymMap loadSolrSynonyms() throws IOException, ParseException {
        final boolean dedup = true;
        // Whitespace + lowercase analyzer so synonym-file entries match the
        // lower-cased tokens produced by this analyzer's main chain.
        Analyzer analyzer = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
                Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_43, reader);
                TokenStream stream = new LowerCaseFilter(Version.LUCENE_43, tokenizer);
                return new TokenStreamComponents(tokenizer, stream);
            }
        };
        SolrSynonymParser parser = new SolrSynonymParser(dedup, true, analyzer);
        if (synonymFile.exists()) {
            // Strict UTF-8: fail loudly on malformed bytes instead of silently
            // substituting replacement characters into synonym entries.
            CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder()
                    .onMalformedInput(CodingErrorAction.REPORT)
                    .onUnmappableCharacter(CodingErrorAction.REPORT);
            Reader in = new InputStreamReader(new FileInputStream(synonymFile), decoder);
            try {
                parser.add(in);
            } finally {
                in.close(); // fix: original leaked this stream
            }
        }
        return parser.build();
    }
}