List of usage examples for the RussianAnalyzer constructor of org.apache.lucene.analysis.ru.RussianAnalyzer
public RussianAnalyzer(CharArraySet stopwords)
From source file:com.bigdata.search.DefaultAnalyzerFactory.java
License:Open Source License
/** * Initializes the various kinds of analyzers that we know about. * <p>//w ww.j a v a 2 s .c o m * Note: Each {@link Analyzer} is registered under both the 3 letter and the * 2 letter language codes. See <a * href="http://www.loc.gov/standards/iso639-2/php/code_list.php">ISO 639-2</a>. * * @todo get some informed advice on which {@link Analyzer}s map onto which * language codes. * * @todo thread safety? Analyzers produce token processors so maybe there is * no problem here once things are initialized. If so, maybe this * could be static. * * @todo configuration. Could be configured by a file containing a class * name and a list of codes that are handled by that class. * * @todo strip language code down to 2/3 characters during lookup. * * @todo There are a lot of pidgins based on french, english, and other * languages that are not being assigned here. */ synchronized private Map<String, AnalyzerConstructor> getAnalyzers() { if (analyzers != null) { return analyzers; } analyzers = new HashMap<String, AnalyzerConstructor>(); final Set<?> emptyStopwords = Collections.EMPTY_SET; { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new BrazilianAnalyzer(Version.LUCENE_CURRENT) : new BrazilianAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("por", a); analyzers.put("pt", a); } /* * Claims to handle Chinese. Does single character extraction. Claims to * produce smaller indices as a result. * * Note: you can not tokenize with the Chinese analyzer and the do * search using the CJK analyzer and visa versa. * * Note: I have no idea whether this would work for Japanese and Korean * as well. I expect so, but no real clue. 
*/ { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return new ChineseAnalyzer(); } }; analyzers.put("zho", a); analyzers.put("chi", a); analyzers.put("zh", a); } /* * Claims to handle Chinese, Japanese, Korean. Does double character * extraction with overlap. */ { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new CJKAnalyzer(Version.LUCENE_CURRENT) : new CJKAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; // analyzers.put("zho", a); // analyzers.put("chi", a); // analyzers.put("zh", a); analyzers.put("jpn", a); analyzers.put("ja", a); analyzers.put("jpn", a); analyzers.put("kor", a); analyzers.put("ko", a); } { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new CzechAnalyzer(Version.LUCENE_CURRENT) : new CzechAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("ces", a); analyzers.put("cze", a); analyzers.put("cs", a); } { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new DutchAnalyzer(Version.LUCENE_CURRENT) : new DutchAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("dut", a); analyzers.put("nld", a); analyzers.put("nl", a); } { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new FrenchAnalyzer(Version.LUCENE_CURRENT) : new FrenchAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("fra", a); analyzers.put("fre", a); analyzers.put("fr", a); } /* * Note: There are a lot of language codes for German variants that * might be useful here. */ { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? 
new GermanAnalyzer(Version.LUCENE_CURRENT) : new GermanAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("deu", a); analyzers.put("ger", a); analyzers.put("de", a); } // Note: ancient greek has a different code (grc). { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new GreekAnalyzer(Version.LUCENE_CURRENT) : new GreekAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("gre", a); analyzers.put("ell", a); analyzers.put("el", a); } // @todo what about other Cyrillic scripts? { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new RussianAnalyzer(Version.LUCENE_CURRENT) : new RussianAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("rus", a); analyzers.put("ru", a); } { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return new ThaiAnalyzer(Version.LUCENE_CURRENT); } }; analyzers.put("tha", a); analyzers.put("th", a); } // English { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new StandardAnalyzer(Version.LUCENE_CURRENT) : new StandardAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("eng", a); analyzers.put("en", a); /* * Note: There MUST be an entry under the empty string (""). This * entry will be requested when there is no entry for the specified * language code. */ analyzers.put("", a); } return analyzers; }
From source file:com.bizosys.hsearch.inpipe.TokenizeNonEnglish.java
License:Apache License
/**
 * Populates {@code languageMap} with one analyzer per supported language
 * key, each constructed with {@code LuceneConstants.version}.
 *
 * @param conf the configuration handed to this step; it is not read here.
 * @throws ApplicationFault declared by the signature; not thrown by this body.
 * @throws SystemFault declared by the signature; not thrown by this body.
 */
public void init(Configuration conf) throws ApplicationFault, SystemFault {
    // Keys are the language identifiers the rest of the pipeline looks up;
    // they are kept exactly as-is (note "br" and "cz" rather than "pt"/"cs").
    languageMap.put("br", new BrazilianAnalyzer(LuceneConstants.version));
    languageMap.put("cz", new CzechAnalyzer(LuceneConstants.version));
    languageMap.put("nl", new DutchAnalyzer(LuceneConstants.version));
    languageMap.put("fr", new FrenchAnalyzer(LuceneConstants.version));
    languageMap.put("de", new GermanAnalyzer(LuceneConstants.version));
    languageMap.put("el", new GreekAnalyzer(LuceneConstants.version));
    languageMap.put("ru", new RussianAnalyzer(LuceneConstants.version));
    languageMap.put("th", new ThaiAnalyzer(LuceneConstants.version));
}
From source file:it.unipd.dei.ims.lucene.clef.AnalyzerFactory.java
License:Apache License
public static Analyzer createAnalyzer(String language, String stemmer, CharArraySet stopset) { Analyzer analyzer;/*ww w . j av a2 s . co m*/ if (stemmer.equalsIgnoreCase("NONE")) { analyzer = new StandardAnalyzer(stopset); } else { // otherwise use language-specific analyzer switch (language) { case "bg": analyzer = new BulgarianAnalyzer(stopset); break; case "de": analyzer = new GermanAnalyzer(stopset); break; case "es": analyzer = new SpanishAnalyzer(stopset); break; case "fa": analyzer = new PersianAnalyzer(stopset); break; case "fi": analyzer = new FinnishAnalyzer(stopset); break; case "fr": analyzer = new FrenchAnalyzer(stopset); break; case "hu": analyzer = new HungarianAnalyzer(stopset); break; case "it": analyzer = new ItalianAnalyzer(stopset); break; case "nl": analyzer = new DutchAnalyzer(stopset); break; case "pt": analyzer = new PortugueseAnalyzer(stopset); break; case "ru": analyzer = new RussianAnalyzer(stopset); break; case "sv": analyzer = new SwedishAnalyzer(stopset); break; default: throw new UnsupportedOperationException("Language not supported yet"); } } return analyzer; }
From source file:lucene.Creator.java
public void create(String _filesPath) throws IOException { filesPath = _filesPath;/* w w w . j a v a2 s. c o m*/ Analyzer analyzer = new RussianAnalyzer(Version.LUCENE_48); IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_48, analyzer); writer = new IndexWriter(FSDirectory.open(new File(INDEX_PATH)), config); File folder = new File(filesPath); if (!folder.isDirectory()) { return; } int i = 1; Stream<Path> st = Files.walk(folder.toPath(), FileVisitOption.FOLLOW_LINKS); // final long count = st.count(); st.forEach((Path f) -> { File file = f.toFile(); // System.out.println("/"+count+" Adding " + file); System.out.println("Adding " + file); try { Quote q = new Quote(file); Document doc = new Document(); doc.add(new StringField(Lucene.FIELD_NAME, q.rawName, Field.Store.YES)); doc.add(new StringField(Lucene.FIELD_RATING, q.rawRating, Field.Store.YES)); doc.add(new StringField(Lucene.FIELD_DATE, q.rawDate, Field.Store.YES)); doc.add(new StringField(Lucene.FIELD_TEXT_STRING, q.quote, Field.Store.YES)); doc.add(new TextField(Lucene.FIELD_TEXT_TEXT, q.quote, Field.Store.YES)); try { writer.addDocument(doc); } catch (IOException ex) { System.err.println("IndexCreator::indexFolder failed to store " + file.getPath()); } } catch (Exception e) { System.out.println(" failed"); } }); writer.close(); }
From source file:org.apache.jackrabbit.core.query.lucene.LanguageCustomizingAnalyzerRegistry.java
License:Open Source License
public LanguageCustomizingAnalyzerRegistry(IndexingConfiguration configuration) { this.configuration = configuration; languageToAnalyzer.put("ar", new AnalyzerWrapper(new ArabicAnalyzer(Version.LUCENE_30), true)); languageToAnalyzer.put("br", new AnalyzerWrapper(new BrazilianAnalyzer(Version.LUCENE_30), true)); languageToAnalyzer.put("cjk", new AnalyzerWrapper(new CJKAnalyzer(Version.LUCENE_30), true)); languageToAnalyzer.put("cn", new AnalyzerWrapper(new ChineseAnalyzer(), true)); languageToAnalyzer.put("cz", new AnalyzerWrapper(new CzechAnalyzer(Version.LUCENE_30), true)); languageToAnalyzer.put("de", new AnalyzerWrapper(new GermanAnalyzer(Version.LUCENE_30), true)); languageToAnalyzer.put("el", new AnalyzerWrapper(new GreekAnalyzer(Version.LUCENE_30), true)); languageToAnalyzer.put("en", new AnalyzerWrapper( new SnowballAnalyzer(Version.LUCENE_30, "English", StopAnalyzer.ENGLISH_STOP_WORDS_SET), true)); languageToAnalyzer.put("fa", new AnalyzerWrapper(new PersianAnalyzer(Version.LUCENE_30), true)); languageToAnalyzer.put("fr", new AnalyzerWrapper(new FrenchAnalyzer(Version.LUCENE_30), true)); languageToAnalyzer.put("nl", new AnalyzerWrapper(new DutchAnalyzer(Version.LUCENE_30), true)); languageToAnalyzer.put("ru", new AnalyzerWrapper(new RussianAnalyzer(Version.LUCENE_30), true)); languageToAnalyzer.put("th", new AnalyzerWrapper(new ThaiAnalyzer(Version.LUCENE_30), true)); }
From source file:org.eclipse.help.internal.search.AnalyzerFactory.java
License:Open Source License
public Analyzer create() { if (locale == null) return null; Version version = Version.LUCENE_35; if ("pt".equals(locale)) //$NON-NLS-1$ return new BrazilianAnalyzer(version); if ("ja".equals(locale)) //$NON-NLS-1$ return new CJKAnalyzer(version); if ("ko".equals(locale)) //$NON-NLS-1$ return new CJKAnalyzer(version); if ("pt".equals(locale)) //$NON-NLS-1$ return new BrazilianAnalyzer(version); if ("cs".equals(locale)) //$NON-NLS-1$ return new CzechAnalyzer(version); if ("de".equals(locale)) //$NON-NLS-1$ return new GermanAnalyzer(version); if ("el".equals(locale)) //$NON-NLS-1$ return new GreekAnalyzer(version); if ("fr".equals(locale)) //$NON-NLS-1$ return new FrenchAnalyzer(version); if ("nl".equals(locale)) //$NON-NLS-1$ return new DutchAnalyzer(version); if ("ru".equals(locale)) //$NON-NLS-1$ return new RussianAnalyzer(version); //unknown language return null;//from ww w. j av a 2 s . c o m }
From source file:org.omegat.tokenizer.LuceneRussianTokenizer.java
License:Open Source License
@Override protected TokenStream getTokenStream(final String strOrig, final boolean stemsAllowed, final boolean stopWordsAllowed) { if (stemsAllowed) { RussianAnalyzer an;// ww w .j a va 2s .co m if (stopWordsAllowed) { an = new RussianAnalyzer(getBehavior()); } else { an = new RussianAnalyzer(getBehavior(), EMPTY_STRING_LIST); } return an.tokenStream("", new StringReader(strOrig)); } else { return new StandardTokenizer(getBehavior(), new StringReader(strOrig)); } }
From source file:org.schors.evlampia.search.LuceneIndexer.java
License:Open Source License
/**
 * Opens the index writer on the given directory using a Russian analyzer.
 *
 * @param directory the Lucene directory holding the index.
 * @param recreate  {@code true} to open in {@code CREATE} mode,
 *                  {@code false} to open in {@code APPEND} mode.
 * @throws IOException if the index writer cannot be opened.
 */
public void open(Directory directory, boolean recreate) throws IOException {
    final RussianAnalyzer analyzer = new RussianAnalyzer(Version.LUCENE_36);
    config = new IndexWriterConfig(Version.LUCENE_36, analyzer);

    final IndexWriterConfig.OpenMode mode;
    if (recreate) {
        mode = IndexWriterConfig.OpenMode.CREATE;
    } else {
        mode = IndexWriterConfig.OpenMode.APPEND;
    }
    config.setOpenMode(mode);

    indexWriter = new IndexWriter(directory, config);
}
From source file:perLucene.Server.java
License:Open Source License
/**
 * Replaces {@code ha} with a fresh map holding one analyzer per supported
 * language key, all constructed for the same Lucene version.
 */
private static void initAnalyzers() {
    // Single compatibility version shared by every analyzer registered below.
    final Version matchVersion = Version.LUCENE_41;

    ha = new HashMap<String, Analyzer>();

    ha.put("ar", new ArabicAnalyzer(matchVersion));
    ha.put("el", new GreekAnalyzer(matchVersion));
    ha.put("bg", new BulgarianAnalyzer(matchVersion));
    ha.put("br", new BrazilianAnalyzer(matchVersion));
    ha.put("ca", new CatalanAnalyzer(matchVersion));
    ha.put("cz", new CzechAnalyzer(matchVersion));
    ha.put("da", new DanishAnalyzer(matchVersion));
    ha.put("de", new GermanAnalyzer(matchVersion));
    ha.put("en", new EnglishAnalyzer(matchVersion));
    ha.put("es", new SpanishAnalyzer(matchVersion));
    ha.put("eu", new BasqueAnalyzer(matchVersion));
    ha.put("fa", new PersianAnalyzer(matchVersion));
    ha.put("fi", new FinnishAnalyzer(matchVersion));
    ha.put("fr", new FrenchAnalyzer(matchVersion));
    ha.put("ga", new IrishAnalyzer(matchVersion));
    ha.put("gl", new GalicianAnalyzer(matchVersion));
    ha.put("hi", new HindiAnalyzer(matchVersion));
    ha.put("hu", new HungarianAnalyzer(matchVersion));
    ha.put("hy", new ArmenianAnalyzer(matchVersion));
    ha.put("id", new IndonesianAnalyzer(matchVersion));
    ha.put("it", new ItalianAnalyzer(matchVersion));
    ha.put("lv", new LatvianAnalyzer(matchVersion));
    ha.put("nl", new DutchAnalyzer(matchVersion));
    ha.put("no", new NorwegianAnalyzer(matchVersion));
    ha.put("pt", new PortugueseAnalyzer(matchVersion));
    ha.put("ro", new RomanianAnalyzer(matchVersion));
    ha.put("ru", new RussianAnalyzer(matchVersion));
    ha.put("sv", new SwedishAnalyzer(matchVersion));
    ha.put("th", new ThaiAnalyzer(matchVersion));
    ha.put("tr", new TurkishAnalyzer(matchVersion));
    ha.put("cn", new SmartChineseAnalyzer(matchVersion));
}