Usage examples for org.apache.lucene.analysis.bg.BulgarianAnalyzer.getDefaultStopSet()
public static CharArraySet getDefaultStopSet()
From source file:it.unipd.dei.ims.lucene.clef.AnalyzerFactory.java
License:Apache License
/**
 * Builds the stop-word set to use for the given language.
 *
 * <ul>
 *   <li>{@code stopsetType} equal to "CUSTOM" (ignoring case): every line of the file at
 *       {@code stopsetPath} is trimmed and added to a freshly created set.</li>
 *   <li>{@code stopsetType} equal to "DEFAULT" (ignoring case): the default stop set of the
 *       Lucene analyzer matching {@code language} is returned.</li>
 *   <li>Any other {@code stopsetType}: {@link CharArraySet#EMPTY_SET} is returned.</li>
 * </ul>
 *
 * @param language    language code; one of bg, de, es, fa, fi, fr, hu, it, nl, pt, ru, sv
 *                    (only consulted for the "DEFAULT" type)
 * @param stopsetType "CUSTOM" or "DEFAULT", compared case-insensitively
 * @param stopsetPath path of the stop-word file, one word per line (only consulted for the
 *                    "CUSTOM" type)
 * @return the selected stop-word set
 * @throws Exception if the custom stop-word file cannot be found; the original
 *                   {@link FileNotFoundException} is attached as the cause
 * @throws UnsupportedOperationException if the "DEFAULT" type is requested for a language
 *                                       that is not listed above
 */
public static CharArraySet createStopset(String language, String stopsetType, String stopsetPath) throws Exception {
    CharArraySet stopset = CharArraySet.EMPTY_SET;

    if (stopsetType.equalsIgnoreCase("CUSTOM")) {
        File f = new File(stopsetPath);
        stopset = new CharArraySet(0, true);
        // try-with-resources closes the scanner (and the underlying file handle) even when
        // reading or logging fails part-way through the file.
        // NOTE(review): the file is decoded with the platform default charset - confirm that
        // this matches the encoding of the stop-word files in use.
        try (Scanner sc = new Scanner(f)) {
            logger.debug("STOPLIST:");
            while (sc.hasNextLine()) {
                String stopword = sc.nextLine().trim();
                logger.debug("=> " + stopword);
                stopset.add(stopword);
            }
            logger.debug("");
        } catch (FileNotFoundException e) {
            // Keep the original exception as the cause so the missing path stays visible
            // in the caller's stack trace instead of being dumped to stderr and dropped.
            throw new Exception("FileNotFoundException when loading stopset", e);
        }
    } else if (stopsetType.equalsIgnoreCase("DEFAULT")) {
        switch (language) {
            case "bg":
                stopset = BulgarianAnalyzer.getDefaultStopSet();
                break;
            case "de":
                stopset = GermanAnalyzer.getDefaultStopSet();
                break;
            case "es":
                stopset = SpanishAnalyzer.getDefaultStopSet();
                break;
            case "fa":
                stopset = PersianAnalyzer.getDefaultStopSet();
                break;
            case "fi":
                stopset = FinnishAnalyzer.getDefaultStopSet();
                break;
            case "fr":
                stopset = FrenchAnalyzer.getDefaultStopSet();
                break;
            case "hu":
                stopset = HungarianAnalyzer.getDefaultStopSet();
                break;
            case "it":
                stopset = ItalianAnalyzer.getDefaultStopSet();
                break;
            case "nl":
                stopset = DutchAnalyzer.getDefaultStopSet();
                break;
            case "pt":
                stopset = PortugueseAnalyzer.getDefaultStopSet();
                break;
            case "ru":
                stopset = RussianAnalyzer.getDefaultStopSet();
                break;
            case "sv":
                stopset = SwedishAnalyzer.getDefaultStopSet();
                break;
            default:
                throw new UnsupportedOperationException("Language not supported yet");
        }
    }

    return stopset;
}
From source file:org.elasticsearch.analysis.common.BulgarianAnalyzerProvider.java
License:Apache License
BulgarianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, name, settings);
analyzer = new BulgarianAnalyzer(
Analysis.parseStopWords(env, settings, BulgarianAnalyzer.getDefaultStopSet()),
Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
analyzer.setVersion(version);/*from w ww . jav a 2s. c o m*/
}
From source file:org.elasticsearch.index.analysis.BulgarianAnalyzerProvider.java
License:Apache License
@Inject public BulgarianAnalyzerProvider(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) { super(index, indexSettings, name, settings); analyzer = new BulgarianAnalyzer(version, Analysis.parseStopWords(env, settings, BulgarianAnalyzer.getDefaultStopSet(), version), Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET, version)); }
From source file:org.omegat.tokenizer.LuceneBulgarianTokenizer.java
License:Open Source License
@Override protected TokenStream getTokenStream(final String strOrig, final boolean stemsAllowed, final boolean stopWordsAllowed) { if (stemsAllowed) { Set<?> stopWords = stopWordsAllowed ? BulgarianAnalyzer.getDefaultStopSet() : Collections.EMPTY_SET; return new BulgarianAnalyzer(getBehavior(), stopWords).tokenStream("", new StringReader(strOrig)); } else {//from w ww. jav a 2 s.c om return new StandardTokenizer(getBehavior(), new StringReader(strOrig)); } }