// Java tutorial
/** * Copyright 2010 Bernhard Haslhofer * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. */ package org.xbib.elasticsearch.index.analysis.skos; import java.io.IOException; import java.io.Reader; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.core.KeywordTokenizer; import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.StopAnalyzer; import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.xbib.elasticsearch.index.analysis.skos.engine.SKOSEngine; import org.xbib.elasticsearch.index.analysis.skos.engine.SKOSEngineFactory; import org.xbib.elasticsearch.index.analysis.skos.tokenattributes.SKOSTypeAttribute; import org.xbib.elasticsearch.plugin.analysis.SKOSAnalysisPlugin; /** * An analyzer for expanding fields that contain either (i) URI references to * SKOS concepts OR (ii) SKOS concept prefLabels as values. 
*/ public class SKOSAnalyzer extends StopwordAnalyzerBase { /** * The supported expansion types */ public enum ExpansionType { URI, LABEL } /** * Default expansion type */ public static final ExpansionType DEFAULT_EXPANSION_TYPE = ExpansionType.LABEL; private ExpansionType expansionType = DEFAULT_EXPANSION_TYPE; /** * Default skos types to expand to */ public static final SKOSTypeAttribute.SKOSType[] DEFAULT_SKOS_TYPES = new SKOSTypeAttribute.SKOSType[] { SKOSTypeAttribute.SKOSType.PREF, SKOSTypeAttribute.SKOSType.ALT, SKOSTypeAttribute.SKOSType.BROADER, SKOSTypeAttribute.SKOSType.BROADERTRANSITIVE, SKOSTypeAttribute.SKOSType.NARROWER, SKOSTypeAttribute.SKOSType.NARROWERTRANSITIVE }; private SKOSTypeAttribute.SKOSType[] types = DEFAULT_SKOS_TYPES; /** * A SKOS Engine instance */ private SKOSEngine skosEngine; /** * The size of the buffer used for multi-term prediction */ private int bufferSize = SKOSLabelFilter.DEFAULT_BUFFER_SIZE; /** * Default maximum allowed token length */ public static final int DEFAULT_MAX_TOKEN_LENGTH = 255; private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH; /** * An unmodifiable set containing some common English words that are usually * not useful for searching. */ public static final CharArraySet STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET; public SKOSAnalyzer(CharArraySet stopWords, SKOSEngine skosEngine, ExpansionType expansionType) { super(SKOSAnalysisPlugin.getLuceneVersion(), stopWords); this.skosEngine = skosEngine; this.expansionType = expansionType; } public SKOSAnalyzer(SKOSEngine skosEngine, ExpansionType expansionType) { this(STOP_WORDS_SET, skosEngine, expansionType); } public SKOSAnalyzer(Reader stopwords, SKOSEngine skosEngine, ExpansionType expansionType) throws IOException { this(loadStopwordSet(stopwords, SKOSAnalysisPlugin.getLuceneVersion()), skosEngine, expansionType); } public SKOSAnalyzer(CharArraySet stopWords, String indexPath, String skosFile, ExpansionType expansionType, int bufferSize, String... 
languages) throws IOException { super(SKOSAnalysisPlugin.getLuceneVersion(), stopWords); this.skosEngine = SKOSEngineFactory.getSKOSEngine(indexPath, skosFile, languages); this.expansionType = expansionType; this.bufferSize = bufferSize; } public SKOSAnalyzer(String indexPath, String skosFile, ExpansionType expansionType, int bufferSize, String... languages) throws IOException { this(STOP_WORDS_SET, indexPath, skosFile, expansionType, bufferSize, languages); } public SKOSAnalyzer(String indexPath, String skosFile, ExpansionType expansionType, int bufferSize) throws IOException { this(indexPath, skosFile, expansionType, bufferSize, (String[]) null); } public SKOSAnalyzer(String indexPath, String skosFile, ExpansionType expansionType) throws IOException { this(indexPath, skosFile, expansionType, SKOSLabelFilter.DEFAULT_BUFFER_SIZE); } public SKOSAnalyzer(Reader stopwords, String indexPath, String skosFile, ExpansionType expansionType, int bufferSize, String... languages) throws IOException { this(loadStopwordSet(stopwords, SKOSAnalysisPlugin.getLuceneVersion()), indexPath, skosFile, expansionType, bufferSize, languages); } public SKOSTypeAttribute.SKOSType[] getTypes() { return types; } public void setTypes(SKOSTypeAttribute.SKOSType... types) { this.types = types; } /** * Set maximum allowed token length. If a token is seen that exceeds this * length then it is discarded. This setting only takes effect the next time * tokenStream or tokenStream is called. 
*/ public void setMaxTokenLength(int length) { maxTokenLength = length; } /** * @see #setMaxTokenLength */ public int getMaxTokenLength() { return maxTokenLength; } @Override protected TokenStreamComponents createComponents(String fileName, Reader reader) { if (expansionType.equals(ExpansionType.URI)) { final KeywordTokenizer src = new KeywordTokenizer(reader); TokenStream tok = new SKOSURIFilter(src, skosEngine, new StandardAnalyzer(), types); tok = new LowerCaseFilter(tok); return new TokenStreamComponents(src, tok); } else { final StandardTokenizer src = new StandardTokenizer(reader); src.setMaxTokenLength(maxTokenLength); TokenStream tok = new StandardFilter(src); // prior to this we get the classic behavior, standardfilter does it for // us. tok = new SKOSLabelFilter(tok, skosEngine, new StandardAnalyzer(), bufferSize, types); tok = new LowerCaseFilter(tok); tok = new StopFilter(tok, stopwords); tok = new RemoveDuplicatesTokenFilter(tok); return new TokenStreamComponents(src, tok) { @Override protected void setReader(final Reader reader) throws IOException { src.setMaxTokenLength(maxTokenLength); super.setReader(reader); } }; } } }