List of usage examples for the CJKAnalyzer constructor of class org.apache.lucene.analysis.cjk.CJKAnalyzer
public CJKAnalyzer(CharArraySet stopwords)
From source file:com.bigdata.search.DefaultAnalyzerFactory.java
License:Open Source License
/** * Initializes the various kinds of analyzers that we know about. * <p>/*from ww w .ja v a 2 s . com*/ * Note: Each {@link Analyzer} is registered under both the 3 letter and the * 2 letter language codes. See <a * href="http://www.loc.gov/standards/iso639-2/php/code_list.php">ISO 639-2</a>. * * @todo get some informed advice on which {@link Analyzer}s map onto which * language codes. * * @todo thread safety? Analyzers produce token processors so maybe there is * no problem here once things are initialized. If so, maybe this * could be static. * * @todo configuration. Could be configured by a file containing a class * name and a list of codes that are handled by that class. * * @todo strip language code down to 2/3 characters during lookup. * * @todo There are a lot of pidgins based on french, english, and other * languages that are not being assigned here. */ synchronized private Map<String, AnalyzerConstructor> getAnalyzers() { if (analyzers != null) { return analyzers; } analyzers = new HashMap<String, AnalyzerConstructor>(); final Set<?> emptyStopwords = Collections.EMPTY_SET; { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new BrazilianAnalyzer(Version.LUCENE_CURRENT) : new BrazilianAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("por", a); analyzers.put("pt", a); } /* * Claims to handle Chinese. Does single character extraction. Claims to * produce smaller indices as a result. * * Note: you can not tokenize with the Chinese analyzer and the do * search using the CJK analyzer and visa versa. * * Note: I have no idea whether this would work for Japanese and Korean * as well. I expect so, but no real clue. 
*/ { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return new ChineseAnalyzer(); } }; analyzers.put("zho", a); analyzers.put("chi", a); analyzers.put("zh", a); } /* * Claims to handle Chinese, Japanese, Korean. Does double character * extraction with overlap. */ { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new CJKAnalyzer(Version.LUCENE_CURRENT) : new CJKAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; // analyzers.put("zho", a); // analyzers.put("chi", a); // analyzers.put("zh", a); analyzers.put("jpn", a); analyzers.put("ja", a); analyzers.put("jpn", a); analyzers.put("kor", a); analyzers.put("ko", a); } { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new CzechAnalyzer(Version.LUCENE_CURRENT) : new CzechAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("ces", a); analyzers.put("cze", a); analyzers.put("cs", a); } { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new DutchAnalyzer(Version.LUCENE_CURRENT) : new DutchAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("dut", a); analyzers.put("nld", a); analyzers.put("nl", a); } { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new FrenchAnalyzer(Version.LUCENE_CURRENT) : new FrenchAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("fra", a); analyzers.put("fre", a); analyzers.put("fr", a); } /* * Note: There are a lot of language codes for German variants that * might be useful here. */ { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? 
new GermanAnalyzer(Version.LUCENE_CURRENT) : new GermanAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("deu", a); analyzers.put("ger", a); analyzers.put("de", a); } // Note: ancient greek has a different code (grc). { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new GreekAnalyzer(Version.LUCENE_CURRENT) : new GreekAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("gre", a); analyzers.put("ell", a); analyzers.put("el", a); } // @todo what about other Cyrillic scripts? { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new RussianAnalyzer(Version.LUCENE_CURRENT) : new RussianAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("rus", a); analyzers.put("ru", a); } { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return new ThaiAnalyzer(Version.LUCENE_CURRENT); } }; analyzers.put("tha", a); analyzers.put("th", a); } // English { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new StandardAnalyzer(Version.LUCENE_CURRENT) : new StandardAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("eng", a); analyzers.put("en", a); /* * Note: There MUST be an entry under the empty string (""). This * entry will be requested when there is no entry for the specified * language code. */ analyzers.put("", a); } return analyzers; }
From source file:cz.muni.fi.japanesedictionary.engine.CharacterLoader.java
License:Open Source License
/** * Searchs for japanese characters in KanjiDict2. * //from w w w.jav a2s . co m * @param params string which contains characters * @return Map<String, JapaneseCharacter> if some characters were found returns map else null */ @SuppressWarnings("MalformedRegex") @Override protected Map<String, JapaneseCharacter> doInBackground(String... params) { String characterList = params[0]; if (characterList == null || characterList.length() < 1) { return null; } SharedPreferences settings = mContext.getSharedPreferences(ParserService.DICTIONARY_PREFERENCES, 0); String pathToDictionary = settings.getString(Const.PREF_KANJIDIC_PATH, null); if (pathToDictionary == null) { Log.e(LOG_TAG, "No path to kanjidict2 dictionary"); return null; } File file = new File(pathToDictionary); if (file == null || !file.exists() || !file.canRead()) { Log.e(LOG_TAG, "Can't read dictionary directory"); return null; } StringBuilder searchBuilder = new StringBuilder(); final int characterListSize = characterList.length(); // search string for (int i = 0; i < characterListSize; i++) { String character = String.valueOf(characterList.charAt(i)); if (Pattern.matches("\\p{Han}", character)) { if (i > 0) { //searchBuilder.length() > 0 searchBuilder.append(' '); // in lucene space serve as OR } searchBuilder.append('"').append(character).append('"'); } } String search = searchBuilder.toString(); if (search.length() == 0) { return null; } Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_36); try { QueryParser query = new QueryParser(Version.LUCENE_36, "literal", analyzer); query.setPhraseSlop(0); Query q = query.parse(search); if (mSearcher == null) { Directory dir = FSDirectory.open(file); IndexReader reader = IndexReader.open(dir); mSearcher = new IndexSearcher(reader); } TopScoreDocCollector collector = TopScoreDocCollector.create(100, true); mSearcher.search(q, collector); ScoreDoc[] hits = collector.topDocs().scoreDocs; Map<String, JapaneseCharacter> result = new HashMap<>(); for (ScoreDoc document 
: hits) { int docId = document.doc; Document d = mSearcher.doc(docId); JapaneseCharacter japanCharacter = new JapaneseCharacter(); String literal = d.get("literal"); if (literal != null && literal.length() > 0) { japanCharacter.setLiteral(literal); } String radicalClassic = d.get("radicalClassic"); if (radicalClassic != null && radicalClassic.length() > 0) { try { int radicalClassicInt = Integer.parseInt(radicalClassic); if (radicalClassicInt > 0) { japanCharacter.setRadicalClassic(radicalClassicInt); } } catch (NumberFormatException ex) { Log.w(LOG_TAG, "Couldn't parse radical-classical: " + radicalClassic); } } String grade = d.get("grade"); if (grade != null && grade.length() > 0) { try { int gradeInt = Integer.parseInt(grade); if (gradeInt > 0) { japanCharacter.setGrade(gradeInt); } } catch (NumberFormatException ex) { Log.w(LOG_TAG, "Couldn't parse grade: " + grade); } } String strokeCount = d.get("strokeCount"); if (strokeCount != null && strokeCount.length() > 0) { try { int strokeCountInt = Integer.parseInt(strokeCount); if (strokeCountInt > 0) { japanCharacter.setStrokeCount(strokeCountInt); } } catch (NumberFormatException ex) { Log.w(LOG_TAG, "Couldn't parse strokeCount: " + strokeCount); } } String skip = d.get("queryCodeSkip"); if (skip != null && skip.length() > 0) { japanCharacter.setSkip(skip); } String dicRef = d.get("dicRef"); if (dicRef != null && dicRef.length() > 0) { japanCharacter.parseDicRef(dicRef); } String rmGroupJaOn = d.get("rmGroupJaOn"); if (rmGroupJaOn != null && rmGroupJaOn.length() > 0) { japanCharacter.parseRmGroupJaOn(rmGroupJaOn); } String rmGroupJaKun = d.get("rmGroupJaKun"); if (rmGroupJaKun != null && rmGroupJaKun.length() > 0) { japanCharacter.parseRmGroupJaKun(rmGroupJaKun); } String meaningEnglish = d.get("meaningEnglish"); if (meaningEnglish != null && meaningEnglish.length() > 0) { japanCharacter.parseMeaningEnglish(meaningEnglish); } String meaningFrench = d.get("meaningFrench"); if (meaningFrench != null && 
meaningFrench.length() > 0) { japanCharacter.parseMeaningFrench(meaningFrench); } String meaningDutch = d.get("meaningDutch"); if (meaningDutch != null && meaningDutch.length() > 0) { japanCharacter.parseMeaningDutch(meaningDutch); } String meaningGerman = d.get("meaningGerman"); if (meaningGerman != null && meaningGerman.length() > 0) { japanCharacter.parseMeaningGerman(meaningGerman); } String meaningRussian = d.get("meaningRussian"); if (meaningRussian != null && meaningRussian.length() > 0) { japanCharacter.parseMeaningRussian(meaningRussian); } String nanori = d.get("nanori"); if (nanori != null && nanori.length() > 0) { japanCharacter.parseNanori(nanori); } if (japanCharacter.getLiteral() != null && japanCharacter.getLiteral().length() > 0) { result.put(japanCharacter.getLiteral(), japanCharacter); } } return result.size() > 0 ? result : null; } catch (ParseException ex) { Log.e(LOG_TAG, "Searching for charaters ParseException caught: " + ex); } catch (IOException ex) { Log.e(LOG_TAG, "Searching for charaters IOException caught: " + ex); } catch (Exception ex) { Log.e(LOG_TAG, "Searching for charaters Exception caught: " + ex); } return null; }
From source file:cz.muni.fi.japanesedictionary.engine.FragmentListAsyncTask.java
License:Open Source License
/** * Loads translation using Lucene/*from www . j av a 2s . c om*/ */ @Override protected List<Translation> doInBackground(String... params) { String expression = params[0]; String part = params[1]; SharedPreferences settings = mContext.getSharedPreferences(ParserService.DICTIONARY_PREFERENCES, 0); String pathToDictionary = settings.getString(Const.PREF_JMDICT_PATH, null); SharedPreferences sharedPrefs = PreferenceManager.getDefaultSharedPreferences(mContext); final boolean englishBool = sharedPrefs.getBoolean("language_english", false); final boolean frenchBool = sharedPrefs.getBoolean("language_french", false); final boolean dutchBool = sharedPrefs.getBoolean("language_dutch", false); final boolean germanBool = sharedPrefs.getBoolean("language_german", false); final boolean russianBool = sharedPrefs.getBoolean("language_russian", false); final boolean searchOnlyFavorised = sharedPrefs.getBoolean("search_only_favorite", false); final boolean searchDeinflected = sharedPrefs.getBoolean("search_deinflected", false); final List<Translation> translations = new ArrayList<>(); if (expression == null) { // first run Log.i(LOG_TAG, "First run - last 10 translations "); GlossaryReaderContract database = new GlossaryReaderContract(mContext); List<Translation> translationsTemp = database.getLastTranslations(10); database.close(); return translationsTemp; } if (pathToDictionary == null) { Log.e(LOG_TAG, "No path to jmdict dictionary"); return null; } File file = new File(pathToDictionary); if (!file.exists() || !file.canRead()) { Log.e(LOG_TAG, "Can't read jmdict dictionary directory"); return null; } if (expression.length() < 1) { Log.w(LOG_TAG, "No expression to translate"); return null; } Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_36); IndexReader reader; try { final String search; final String hiragana; boolean onlyReb = false; if (Pattern.matches("\\p{Latin}*", expression)) { // only romaji onlyReb = true; Log.i(LOG_TAG, "Only latin letters, converting to 
hiragana. "); expression = TranscriptionConverter.kunreiToHepburn(expression); expression = RomanizationEnum.Hepburn.toHiragana(expression); } hiragana = expression; expression = insertSpaces(expression); switch (part) { case "end": search = "\"" + expression + "lucenematch\""; break; case "beginning": search = "\"lucenematch " + expression + "\""; break; case "middle": search = "\"" + expression + "\""; break; default: if (searchDeinflected) { StringBuilder sb = new StringBuilder("\"lucenematch " + expression + "lucenematch\""); for (Predicate predicate : Deconjugator.deconjugate(hiragana)) { if (predicate.isSuru()) { sb.append(" OR ").append("(\"lucenematch ") .append(insertSpaces(predicate.getPredicate())) .append("lucenematch\" AND (pos:vs OR pos:vs-c OR pos:vs-s OR pos:vs-i))"); } else if (predicate.isKuru()) { sb.append(" OR ").append("(\"lucenematch ") .append(insertSpaces(predicate.getPredicate())) .append("lucenematch\" AND pos:vk)"); } else if (predicate.isIku()) { sb.append(" OR ").append("(\"lucenematch ") .append(insertSpaces(predicate.getPredicate())) .append("lucenematch\" AND pos:v5k-s)"); } else if (predicate.isIAdjective()) { sb.append(" OR ").append("(\"lucenematch ") .append(insertSpaces(predicate.getPredicate())) .append("lucenematch\" AND pos:adj-i)"); } else sb.append(" OR ").append("(\"lucenematch ") .append(insertSpaces(predicate.getPredicate())) .append("lucenematch\" AND (pos:v1 OR pos:v2 OR pos:v5 OR pos:vz OR pos:vi OR pos:vn OR pos:vr))"); } search = sb.toString(); } else { search = "\"lucenematch " + expression + "lucenematch\""; } } Log.i(LOG_TAG, " Searching for: " + search); Query q; if (onlyReb) { q = (new QueryParser(Version.LUCENE_36, "index_japanese_reb", analyzer)).parse(search); } else { StandardQueryParser parser = new StandardQueryParser(analyzer); q = parser.parse(search, "japanese"); } Directory dir = FSDirectory.open(file); reader = IndexReader.open(dir); final IndexSearcher searcher = new IndexSearcher(reader); 
Collector collector = new Collector() { int max = 1000; int count = 0; private int docBase; @Override public boolean acceptsDocsOutOfOrder() { return true; } @Override public void collect(int docID) throws IOException { Document d = searcher.doc(docID + docBase); Translation translation = new Translation(); String prioritized = d.get("prioritized"); if (searchOnlyFavorised && prioritized == null) { return; } if (prioritized != null) { //is prioritized translation.setPrioritized(true); } String ruby = d.get("ruby"); if (ruby != null && ruby.length() > 0) { translation.setRuby(ruby); } String japanese_keb = d.get("japanese_keb"); if (japanese_keb != null && japanese_keb.length() != 0) { translation.parseJapaneseKeb(japanese_keb); } String japanese_reb = d.get("japanese_reb"); if (japanese_reb != null && japanese_reb.length() != 0) { translation.parseJapaneseReb(japanese_reb); } String english = d.get("english"); if (english != null && english.length() != 0) { translation.parseEnglish(english); } String french = d.get("french"); if (french != null && french.length() != 0) { translation.parseFrench(french); } String dutch = d.get("dutch"); if (dutch != null && dutch.length() != 0) { translation.parseDutch(dutch); } String german = d.get("german"); if (german != null && german.length() != 0) { translation.parseGerman(german); } String russian = d.get("russian"); if (russian != null && russian.length() != 0) { translation.parseRussian(russian); } if ((englishBool && translation.getEnglishSense() != null) || (dutchBool && translation.getDutchSense() != null) || (germanBool && translation.getGermanSense() != null) || (frenchBool && translation.getFrenchSense() != null) || (russianBool && translation.getRussianSense() != null)) { count++; if (count < max) { if (!FragmentListAsyncTask.this.isCancelled()) { FragmentListAsyncTask.this.publishProgress(translation); translations.add(translation); } else { translations.clear(); throw new IOException("Loader canceled"); } } else { 
throw new IOException("Max exceeded"); } } } @Override public void setNextReader(IndexReader reader, int docBas) throws IOException { docBase = docBas; } @Override public void setScorer(Scorer arg0) throws IOException { } }; searcher.search(q, collector); reader.close(); } catch (IOException ex) { Log.e(LOG_TAG, "IO Exception: " + ex.toString()); return translations; } catch (Exception ex) { Log.e(LOG_TAG, "Exception: " + ex.toString()); return null; } return translations.isEmpty() ? null : translations; }
From source file:cz.muni.fi.japanesejmdictsaxparser.saxholder.SaxDataHolder.java
License:Open Source License
/** * SaxDataHolder constructor/*from ww w . j a v a2 s . co m*/ * * @param androidOutputFolder lucene dictionary for saving documents * @throws IOException * @throws IllegalArgumentException if directory doesn't exist */ public SaxDataHolder(File androidOutputFolder) throws IOException, IllegalArgumentException { if (androidOutputFolder == null) { log.debug(LOG_TAG + "SaxDataHolder - android dictionary directory is null"); throw new IllegalArgumentException("SaxDataHolder: android dictionary directory is null"); } Directory dir = FSDirectory.open(androidOutputFolder); Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_36); IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36, analyzer); mWriter = new IndexWriter(dir, config); log.debug(LOG_TAG + "SaxDataHolder created"); }
From source file:cz.muni.fi.japanesejmdictsaxparser.saxholder.SaxKanjidic2Holder.java
License:Open Source License
/** * SaxDataHolderKanjiDict constructor/*from w w w . jav a2s .c o m*/ * * @param androidOutputFolder lucene dictionary for saving documents * @throws IOException * @throws IllegalArgumentException if directory doesn't exist */ public SaxKanjidic2Holder(File androidOutputFolder) throws IOException, IllegalArgumentException { if (androidOutputFolder == null) { log.debug(LOG_TAG + "SaxDataHolderKanjiDict - dictionary directory is null"); throw new IllegalArgumentException("SaxDataHolderKanjiDict: dictionary directory is null"); } Directory dir = FSDirectory.open(androidOutputFolder); Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_36); IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36, analyzer); mWriter = new IndexWriter(dir, config); // windows log.debug(LOG_TAG + "SaxDataHolderKanjiDict created"); }
From source file:framework.retrieval.engine.analyzer.impl.CJKAnalyzerBuilder.java
License:Apache License
/**
 * Creates the analyzer for the index side (going by the method name).
 *
 * @return a new {@link CJKAnalyzer} built with {@code luceneVersion}
 */
public Analyzer createIndexAnalyzer() {
    final Analyzer indexAnalyzer = new CJKAnalyzer(luceneVersion);
    return indexAnalyzer;
}
From source file:framework.retrieval.engine.analyzer.impl.CJKAnalyzerBuilder.java
License:Apache License
/**
 * Creates the analyzer for the query side (going by the method name).
 *
 * @return a new {@link CJKAnalyzer} built with {@code luceneVersion}
 */
public Analyzer createQueryAnalyzer() {
    final Analyzer queryAnalyzer = new CJKAnalyzer(luceneVersion);
    return queryAnalyzer;
}
From source file:jp.mwsoft.cjkanalyzers.CJKAnalyzerNoSplitKatakana.java
License:Apache License
/**
 * Small demo: tokenizes a sample string with {@link CJKAnalyzer} and prints
 * the token stream's state after every token.
 * <p>
 * The stream is consumed following the TokenStream contract (reset,
 * incrementToken until it returns false, end, close) instead of calling
 * incrementToken a fixed number of times regardless of its result.
 *
 * @param args unused
 * @throws Exception if tokenizing fails
 */
public static void main(String[] args) throws Exception {
    java.io.StringReader reader = new java.io.StringReader("??????");
    CJKAnalyzer analyzer = new CJKAnalyzer(Version.LUCENE_35);
    TokenStream stream = analyzer.tokenStream("test", reader);
    try {
        stream.reset();
        // One line per token; stops as soon as the input is exhausted.
        while (stream.incrementToken()) {
            System.out.println(stream);
        }
        stream.end();
    } finally {
        stream.close();
        analyzer.close();
    }
}
From source file:luceneexamples.JapaneseSearch.java
License:Apache License
@Test public void index() throws Exception { Directory directory = new RAMDirectory(); // Directory directory = FSDirectory.open(new File("cjkindex")); Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_31); IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_31, analyzer); IndexWriter writer = new IndexWriter(directory, iwc); Document doc = new Document(); doc.add(new Field("str_field", "quick brown fox jumped over the lazy dog.", Field.Store.YES, Field.Index.ANALYZED)); writer.addDocument(doc);/*w ww. j a v a 2 s. c o m*/ Document doc2 = new Document(); doc2.add(new Field("str_field", "?????", Field.Store.YES, Field.Index.ANALYZED)); writer.addDocument(doc2); writer.close(); IndexSearcher searcher = new IndexSearcher(directory, true); QueryParser parser = new QueryParser(Version.LUCENE_31, "str_field", analyzer); TopDocs td = searcher.search(parser.parse(""), 1000); assertThat(td.totalHits, is(1)); searcher.close(); directory.close(); }
From source file:org.apache.jackrabbit.core.query.lucene.LanguageCustomizingAnalyzerRegistry.java
License:Open Source License
public LanguageCustomizingAnalyzerRegistry(IndexingConfiguration configuration) { this.configuration = configuration; languageToAnalyzer.put("ar", new AnalyzerWrapper(new ArabicAnalyzer(Version.LUCENE_30), true)); languageToAnalyzer.put("br", new AnalyzerWrapper(new BrazilianAnalyzer(Version.LUCENE_30), true)); languageToAnalyzer.put("cjk", new AnalyzerWrapper(new CJKAnalyzer(Version.LUCENE_30), true)); languageToAnalyzer.put("cn", new AnalyzerWrapper(new ChineseAnalyzer(), true)); languageToAnalyzer.put("cz", new AnalyzerWrapper(new CzechAnalyzer(Version.LUCENE_30), true)); languageToAnalyzer.put("de", new AnalyzerWrapper(new GermanAnalyzer(Version.LUCENE_30), true)); languageToAnalyzer.put("el", new AnalyzerWrapper(new GreekAnalyzer(Version.LUCENE_30), true)); languageToAnalyzer.put("en", new AnalyzerWrapper( new SnowballAnalyzer(Version.LUCENE_30, "English", StopAnalyzer.ENGLISH_STOP_WORDS_SET), true)); languageToAnalyzer.put("fa", new AnalyzerWrapper(new PersianAnalyzer(Version.LUCENE_30), true)); languageToAnalyzer.put("fr", new AnalyzerWrapper(new FrenchAnalyzer(Version.LUCENE_30), true)); languageToAnalyzer.put("nl", new AnalyzerWrapper(new DutchAnalyzer(Version.LUCENE_30), true)); languageToAnalyzer.put("ru", new AnalyzerWrapper(new RussianAnalyzer(Version.LUCENE_30), true)); languageToAnalyzer.put("th", new AnalyzerWrapper(new ThaiAnalyzer(Version.LUCENE_30), true)); }