pl.litwiniuk.rowicki.collocations.CollocationAnalyzer.java Source code

Introduction

Here is the source code for pl.litwiniuk.rowicki.collocations.CollocationAnalyzer.java
Source

package pl.litwiniuk.rowicki.collocations; /**
                                           * Copyright Manning Publications Co.
                                           *
                                           * Licensed under the Apache License, Version 2.0 (the "License");
                                           * you may not use this file except in compliance with the License.
                                           * You may obtain a copy of the License at
                                           *
                                           *     http://www.apache.org/licenses/LICENSE-2.0
                                           *
                                           * Unless required by applicable law or agreed to in writing, software
                                           * distributed under the License is distributed on an "AS IS" BASIS,
                                           * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
                                           * See the License for the specific language governing permissions and
                                           * limitations under the License.
                                           */

import morfologik.stemming.PolishStemmer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.*;
import org.apache.lucene.analysis.morfologik.MorfologikFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.synonym.SolrSynonymParser;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.Version;
import pl.litwiniuk.rowicki.modsynonyms.ModificatedSynonymFilter;

import java.io.*;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.text.ParseException;

// From chapter 4
/**
 * Lucene {@link Analyzer} that tokenizes text with a {@link StandardTokenizer} and then runs it
 * through a fixed filter chain: standard filter, lower-casing, stop-word removal, Morfologik
 * stemming, collocation detection ({@link CollocationFilter}) and finally synonym expansion
 * ({@link ModificatedSynonymFilter}).
 *
 * <p>Synonyms are read from the Solr-format file {@value #SYNONYM_FILE_PATH}. Synonym expansion is
 * best-effort: if the file is missing, empty, unreadable or malformed, the analyzer still works
 * but the synonym stage is left out of the chain.
 */
public class CollocationAnalyzer extends Analyzer {

    /** Default maximum allowed token length */
    public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;

    /** Solr-format thesaurus, resolved relative to the process working directory. */
    private static final String SYNONYM_FILE_PATH = "./Parsers/thesaurus.txt";

    // Fully-qualified so that no additional import is needed at the top of the file.
    private static final java.util.logging.Logger LOG =
            java.util.logging.Logger.getLogger(CollocationAnalyzer.class.getName());

    /** Maximum token length handed to the tokenizer; re-applied every time a reader is set. */
    private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;

    private final Version matchVersion;
    private final CollocationEngine engine;

    /**
     * Creates an analyzer.
     *
     * @param engine       collocation engine handed to the {@link CollocationFilter}
     * @param matchVersion Lucene compatibility version used for the tokenizer and the
     *                     standard / lower-case / stop filters
     */
    public CollocationAnalyzer(CollocationEngine engine, Version matchVersion) {
        this.engine = engine;
        this.matchVersion = matchVersion;
    }

    /**
     * Creates a new {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} instance for this analyzer.
     *
     * @param fieldName the name of the fields content passed to the
     *                  {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} sink as a reader
     * @param reader    the reader passed to the {@link org.apache.lucene.analysis.Tokenizer} constructor
     * @return the {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} for this analyzer.
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        final StandardTokenizer src = new StandardTokenizer(matchVersion, reader);
        src.setMaxTokenLength(maxTokenLength);

        TokenStream tok = new StandardFilter(matchVersion, src);
        tok = new LowerCaseFilter(matchVersion, tok);
        // NOTE(review): English stop words are applied ahead of a Polish stemmer - confirm this is intended.
        tok = new StopFilter(matchVersion, tok, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
        // NOTE(review): this stage is pinned to LUCENE_43 rather than matchVersion - confirm this is intended.
        tok = new MorfologikFilter(tok, PolishStemmer.DICTIONARY.MORFOLOGIK, Version.LUCENE_43);
        tok = new CollocationFilter(tok, engine);
        tok = appendSynonymFilter(tok);

        return new TokenStreamComponents(src, tok) {
            @Override
            protected void setReader(final Reader reader) throws IOException {
                src.setMaxTokenLength(CollocationAnalyzer.this.maxTokenLength);
                super.setReader(reader);
            }
        };
    }

    /**
     * Wraps {@code input} in a {@link ModificatedSynonymFilter} when synonyms are available.
     *
     * <p>Returns {@code input} unchanged when the thesaurus cannot be loaded or contains no rules,
     * so a broken or missing thesaurus degrades the analysis instead of breaking it.
     *
     * @param input the token stream produced by the preceding filters
     * @return {@code input} wrapped in the synonym filter, or {@code input} itself
     */
    private TokenStream appendSynonymFilter(TokenStream input) {
        try {
            // NOTE(review): the thesaurus is re-read for every new set of components; consider caching.
            SynonymMap synonyms = loadSolrSynonyms();
            if (synonyms.fst == null) {
                // No rules were loaded. Lucene's SynonymFilterFactory skips the filter in this case
                // because the stock SynonymFilter rejects a null FST; presumably
                // ModificatedSynonymFilter behaves the same way - TODO confirm.
                return input;
            }
            return new ModificatedSynonymFilter(input, synonyms, true);
        } catch (IOException e) {
            LOG.log(java.util.logging.Level.WARNING,
                    "Could not read synonyms from " + SYNONYM_FILE_PATH + "; synonym expansion disabled", e);
        } catch (ParseException e) {
            LOG.log(java.util.logging.Level.WARNING,
                    "Malformed synonym rules in " + SYNONYM_FILE_PATH + "; synonym expansion disabled", e);
        }
        return input;
    }

    /**
     * Load synonyms from the solr format, "format=solr".
     *
     * <p>Rules are analyzed with a whitespace tokenizer followed by lower-casing. If the thesaurus
     * file does not exist, the map is built from an empty rule set.
     *
     * @return the synonym map built from the thesaurus file
     * @throws IOException    if the file cannot be read or is not valid UTF-8
     * @throws ParseException if a rule in the file is malformed
     */
    private SynonymMap loadSolrSynonyms() throws IOException, ParseException {
        final boolean dedup = true;
        final boolean expand = true;

        Analyzer ruleAnalyzer = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
                Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_43, reader);
                TokenStream stream = new LowerCaseFilter(Version.LUCENE_43, tokenizer);
                return new TokenStreamComponents(tokenizer, stream);
            }
        };

        SolrSynonymParser parser = new SolrSynonymParser(dedup, expand, ruleAnalyzer);

        File synonymFile = new File(SYNONYM_FILE_PATH);
        if (synonymFile.exists()) {
            // Strict decoder: malformed or unmappable bytes raise an error instead of being replaced.
            CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder()
                    .onMalformedInput(CodingErrorAction.REPORT)
                    .onUnmappableCharacter(CodingErrorAction.REPORT);
            Reader rules = new InputStreamReader(new FileInputStream(synonymFile), decoder);
            try {
                parser.add(rules);
            } finally {
                // Closing here releases the file handle whether or not the parser closes its input;
                // closing an already-closed reader has no effect.
                rules.close();
            }
        }
        return parser.build();
    }
}