Example usage for org.apache.lucene.analysis.cjk CJKAnalyzer CJKAnalyzer

List of usage examples for org.apache.lucene.analysis.cjk CJKAnalyzer CJKAnalyzer

Introduction

On this page you can find example usage of org.apache.lucene.analysis.cjk CJKAnalyzer CJKAnalyzer.

Prototype

public CJKAnalyzer(CharArraySet stopwords) 

Source Link

Document

Builds an analyzer with the given stop words

Usage

From source file:com.bigdata.search.DefaultAnalyzerFactory.java

License:Open Source License

/**
 * Initializes the various kinds of analyzers that we know about.
 * <p>/*from   ww  w .ja  v a  2 s .  com*/
 * Note: Each {@link Analyzer} is registered under both the 3 letter and the
 * 2 letter language codes. See <a
 * href="http://www.loc.gov/standards/iso639-2/php/code_list.php">ISO 639-2</a>.
 * 
 * @todo get some informed advice on which {@link Analyzer}s map onto which
 *       language codes.
 * 
 * @todo thread safety? Analyzers produce token processors so maybe there is
 *       no problem here once things are initialized. If so, maybe this
 *       could be static.
 * 
 * @todo configuration. Could be configured by a file containing a class
 *       name and a list of codes that are handled by that class.
 * 
 * @todo strip language code down to 2/3 characters during lookup.
 * 
 * @todo There are a lot of pidgins based on french, english, and other
 *       languages that are not being assigned here.
 */
synchronized private Map<String, AnalyzerConstructor> getAnalyzers() {

    if (analyzers != null) {

        return analyzers;

    }

    analyzers = new HashMap<String, AnalyzerConstructor>();

    final Set<?> emptyStopwords = Collections.EMPTY_SET;

    {
        AnalyzerConstructor a = new AnalyzerConstructor() {
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords ? new BrazilianAnalyzer(Version.LUCENE_CURRENT)
                        : new BrazilianAnalyzer(Version.LUCENE_CURRENT, emptyStopwords);
            }
        };
        analyzers.put("por", a);
        analyzers.put("pt", a);
    }

    /*
     * Claims to handle Chinese. Does single character extraction. Claims to
     * produce smaller indices as a result.
     * 
     * Note: you can not tokenize with the Chinese analyzer and the do
     * search using the CJK analyzer and visa versa.
     * 
     * Note: I have no idea whether this would work for Japanese and Korean
     * as well. I expect so, but no real clue.
     */
    {
        AnalyzerConstructor a = new AnalyzerConstructor() {
            public Analyzer newInstance(final boolean filterStopwords) {
                return new ChineseAnalyzer();
            }
        };
        analyzers.put("zho", a);
        analyzers.put("chi", a);
        analyzers.put("zh", a);
    }

    /*
     * Claims to handle Chinese, Japanese, Korean. Does double character
     * extraction with overlap.
     */
    {
        AnalyzerConstructor a = new AnalyzerConstructor() {
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords ? new CJKAnalyzer(Version.LUCENE_CURRENT)
                        : new CJKAnalyzer(Version.LUCENE_CURRENT, emptyStopwords);
            }
        };
        //            analyzers.put("zho", a);
        //            analyzers.put("chi", a);
        //            analyzers.put("zh", a);
        analyzers.put("jpn", a);
        analyzers.put("ja", a);
        analyzers.put("jpn", a);
        analyzers.put("kor", a);
        analyzers.put("ko", a);
    }

    {
        AnalyzerConstructor a = new AnalyzerConstructor() {
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords ? new CzechAnalyzer(Version.LUCENE_CURRENT)
                        : new CzechAnalyzer(Version.LUCENE_CURRENT, emptyStopwords);
            }
        };
        analyzers.put("ces", a);
        analyzers.put("cze", a);
        analyzers.put("cs", a);
    }

    {
        AnalyzerConstructor a = new AnalyzerConstructor() {
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords ? new DutchAnalyzer(Version.LUCENE_CURRENT)
                        : new DutchAnalyzer(Version.LUCENE_CURRENT, emptyStopwords);
            }
        };
        analyzers.put("dut", a);
        analyzers.put("nld", a);
        analyzers.put("nl", a);
    }

    {
        AnalyzerConstructor a = new AnalyzerConstructor() {
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords ? new FrenchAnalyzer(Version.LUCENE_CURRENT)
                        : new FrenchAnalyzer(Version.LUCENE_CURRENT, emptyStopwords);
            }
        };
        analyzers.put("fra", a);
        analyzers.put("fre", a);
        analyzers.put("fr", a);
    }

    /*
     * Note: There are a lot of language codes for German variants that
     * might be useful here.
     */
    {
        AnalyzerConstructor a = new AnalyzerConstructor() {
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords ? new GermanAnalyzer(Version.LUCENE_CURRENT)
                        : new GermanAnalyzer(Version.LUCENE_CURRENT, emptyStopwords);
            }
        };
        analyzers.put("deu", a);
        analyzers.put("ger", a);
        analyzers.put("de", a);
    }

    // Note: ancient greek has a different code (grc).
    {
        AnalyzerConstructor a = new AnalyzerConstructor() {
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords ? new GreekAnalyzer(Version.LUCENE_CURRENT)
                        : new GreekAnalyzer(Version.LUCENE_CURRENT, emptyStopwords);
            }
        };
        analyzers.put("gre", a);
        analyzers.put("ell", a);
        analyzers.put("el", a);
    }

    // @todo what about other Cyrillic scripts?
    {
        AnalyzerConstructor a = new AnalyzerConstructor() {
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords ? new RussianAnalyzer(Version.LUCENE_CURRENT)
                        : new RussianAnalyzer(Version.LUCENE_CURRENT, emptyStopwords);
            }
        };
        analyzers.put("rus", a);
        analyzers.put("ru", a);
    }

    {
        AnalyzerConstructor a = new AnalyzerConstructor() {
            public Analyzer newInstance(final boolean filterStopwords) {
                return new ThaiAnalyzer(Version.LUCENE_CURRENT);
            }
        };
        analyzers.put("tha", a);
        analyzers.put("th", a);
    }

    // English
    {
        AnalyzerConstructor a = new AnalyzerConstructor() {
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords ? new StandardAnalyzer(Version.LUCENE_CURRENT)
                        : new StandardAnalyzer(Version.LUCENE_CURRENT, emptyStopwords);
            }
        };
        analyzers.put("eng", a);
        analyzers.put("en", a);
        /*
         * Note: There MUST be an entry under the empty string (""). This
         * entry will be requested when there is no entry for the specified
         * language code.
         */
        analyzers.put("", a);
    }

    return analyzers;

}

From source file:cz.muni.fi.japanesedictionary.engine.CharacterLoader.java

License:Open Source License

/**
 * Searches for Japanese characters in KanjiDict2.
 * <p>
 * Builds a phrase-per-character Lucene query over the "literal" field and
 * converts each hit into a {@link JapaneseCharacter}.
 * 
 * @param params string which contains characters (only params[0] is used)
 * @return Map&lt;String, JapaneseCharacter&gt; if some characters were found returns map else null
 */
@SuppressWarnings("MalformedRegex")
@Override
protected Map<String, JapaneseCharacter> doInBackground(String... params) {
    String characterList = params[0];
    if (characterList == null || characterList.length() < 1) {
        return null;
    }

    // Resolve the on-disk kanjidict2 Lucene index path from preferences.
    SharedPreferences settings = mContext.getSharedPreferences(ParserService.DICTIONARY_PREFERENCES, 0);
    String pathToDictionary = settings.getString(Const.PREF_KANJIDIC_PATH, null);
    if (pathToDictionary == null) {
        Log.e(LOG_TAG, "No path to kanjidict2 dictionary");
        return null;
    }
    File file = new File(pathToDictionary);
    // Fix: new File(...) can never return null, so the old "file == null"
    // guard was dead code and has been removed.
    if (!file.exists() || !file.canRead()) {
        Log.e(LOG_TAG, "Can't read dictionary directory");
        return null;
    }
    StringBuilder searchBuilder = new StringBuilder();
    final int characterListSize = characterList.length();
    // Fix: compile the Han-script pattern once instead of recompiling it on
    // every iteration via Pattern.matches().
    final Pattern hanPattern = Pattern.compile("\\p{Han}");
    // Build the search string: each Han character becomes a quoted phrase.
    for (int i = 0; i < characterListSize; i++) {
        String character = String.valueOf(characterList.charAt(i));
        if (hanPattern.matcher(character).matches()) {
            // Fix: separate on builder content rather than on the index, so
            // a leading non-Han character no longer yields a leading space.
            if (searchBuilder.length() > 0) {
                searchBuilder.append(' '); // in lucene space serve as OR
            }
            searchBuilder.append('"').append(character).append('"');
        }
    }
    String search = searchBuilder.toString();
    if (search.length() == 0) {
        return null;
    }

    // CJK analyzer must match the one used when the index was written.
    Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_36);
    try {
        QueryParser query = new QueryParser(Version.LUCENE_36, "literal", analyzer);
        query.setPhraseSlop(0);

        Query q = query.parse(search);
        // The searcher is cached across invocations; open it on first use.
        if (mSearcher == null) {
            Directory dir = FSDirectory.open(file);
            IndexReader reader = IndexReader.open(dir);
            mSearcher = new IndexSearcher(reader);
        }
        TopScoreDocCollector collector = TopScoreDocCollector.create(100, true);
        mSearcher.search(q, collector);
        ScoreDoc[] hits = collector.topDocs().scoreDocs;

        // Convert every hit document into a JapaneseCharacter, keyed by its
        // literal. Numeric fields are parsed defensively and skipped on error.
        Map<String, JapaneseCharacter> result = new HashMap<>();
        for (ScoreDoc document : hits) {
            int docId = document.doc;
            Document d = mSearcher.doc(docId);

            JapaneseCharacter japanCharacter = new JapaneseCharacter();
            String literal = d.get("literal");
            if (literal != null && literal.length() > 0) {
                japanCharacter.setLiteral(literal);
            }
            String radicalClassic = d.get("radicalClassic");
            if (radicalClassic != null && radicalClassic.length() > 0) {
                try {
                    int radicalClassicInt = Integer.parseInt(radicalClassic);
                    if (radicalClassicInt > 0) {
                        japanCharacter.setRadicalClassic(radicalClassicInt);
                    }
                } catch (NumberFormatException ex) {
                    Log.w(LOG_TAG, "Couldn't parse radical-classical: " + radicalClassic);
                }
            }
            String grade = d.get("grade");
            if (grade != null && grade.length() > 0) {
                try {
                    int gradeInt = Integer.parseInt(grade);
                    if (gradeInt > 0) {
                        japanCharacter.setGrade(gradeInt);
                    }
                } catch (NumberFormatException ex) {
                    Log.w(LOG_TAG, "Couldn't parse grade: " + grade);
                }
            }
            String strokeCount = d.get("strokeCount");
            if (strokeCount != null && strokeCount.length() > 0) {
                try {
                    int strokeCountInt = Integer.parseInt(strokeCount);
                    if (strokeCountInt > 0) {
                        japanCharacter.setStrokeCount(strokeCountInt);
                    }
                } catch (NumberFormatException ex) {
                    Log.w(LOG_TAG, "Couldn't parse strokeCount: " + strokeCount);
                }
            }

            String skip = d.get("queryCodeSkip");
            if (skip != null && skip.length() > 0) {
                japanCharacter.setSkip(skip);
            }

            String dicRef = d.get("dicRef");
            if (dicRef != null && dicRef.length() > 0) {
                japanCharacter.parseDicRef(dicRef);
            }

            String rmGroupJaOn = d.get("rmGroupJaOn");
            if (rmGroupJaOn != null && rmGroupJaOn.length() > 0) {
                japanCharacter.parseRmGroupJaOn(rmGroupJaOn);
            }

            String rmGroupJaKun = d.get("rmGroupJaKun");
            if (rmGroupJaKun != null && rmGroupJaKun.length() > 0) {
                japanCharacter.parseRmGroupJaKun(rmGroupJaKun);
            }

            String meaningEnglish = d.get("meaningEnglish");
            if (meaningEnglish != null && meaningEnglish.length() > 0) {
                japanCharacter.parseMeaningEnglish(meaningEnglish);
            }

            String meaningFrench = d.get("meaningFrench");
            if (meaningFrench != null && meaningFrench.length() > 0) {
                japanCharacter.parseMeaningFrench(meaningFrench);
            }

            String meaningDutch = d.get("meaningDutch");
            if (meaningDutch != null && meaningDutch.length() > 0) {
                japanCharacter.parseMeaningDutch(meaningDutch);
            }

            String meaningGerman = d.get("meaningGerman");
            if (meaningGerman != null && meaningGerman.length() > 0) {
                japanCharacter.parseMeaningGerman(meaningGerman);
            }

            String meaningRussian = d.get("meaningRussian");
            if (meaningRussian != null && meaningRussian.length() > 0) {
                japanCharacter.parseMeaningRussian(meaningRussian);
            }

            String nanori = d.get("nanori");
            if (nanori != null && nanori.length() > 0) {
                japanCharacter.parseNanori(nanori);
            }
            // Only characters with a literal can be keyed into the result.
            if (japanCharacter.getLiteral() != null && japanCharacter.getLiteral().length() > 0) {
                result.put(japanCharacter.getLiteral(), japanCharacter);
            }
        }
        return result.size() > 0 ? result : null;

    } catch (ParseException ex) {
        Log.e(LOG_TAG, "Searching for charaters ParseException caught: " + ex);
    } catch (IOException ex) {
        Log.e(LOG_TAG, "Searching for charaters IOException caught: " + ex);
    } catch (Exception ex) {
        Log.e(LOG_TAG, "Searching for charaters Exception caught: " + ex);
    }

    return null;
}

From source file:cz.muni.fi.japanesedictionary.engine.FragmentListAsyncTask.java

License:Open Source License

/**
 * Loads translations from the JMdict Lucene index for the given expression.
 * <p>
 * Romaji input is first converted to hiragana; the search string is then
 * wrapped in "lucenematch" sentinel tokens so that exact / prefix / suffix /
 * substring matching can be expressed as Lucene phrase queries. Results are
 * streamed to the UI via {@code publishProgress} as they are collected.
 *
 * @param params params[0] = expression to translate (null on first run),
 *               params[1] = match mode: "end", "beginning", "middle", or
 *               anything else for an exact match (optionally deinflected)
 * @return list of translations, or null when nothing matched / on fatal error
 */
@Override
protected List<Translation> doInBackground(String... params) {
    String expression = params[0];
    String part = params[1];

    // User preferences: dictionary location, which sense languages to show,
    // and search behavior flags.
    SharedPreferences settings = mContext.getSharedPreferences(ParserService.DICTIONARY_PREFERENCES, 0);
    String pathToDictionary = settings.getString(Const.PREF_JMDICT_PATH, null);
    SharedPreferences sharedPrefs = PreferenceManager.getDefaultSharedPreferences(mContext);
    final boolean englishBool = sharedPrefs.getBoolean("language_english", false);
    final boolean frenchBool = sharedPrefs.getBoolean("language_french", false);
    final boolean dutchBool = sharedPrefs.getBoolean("language_dutch", false);
    final boolean germanBool = sharedPrefs.getBoolean("language_german", false);
    final boolean russianBool = sharedPrefs.getBoolean("language_russian", false);
    final boolean searchOnlyFavorised = sharedPrefs.getBoolean("search_only_favorite", false);
    final boolean searchDeinflected = sharedPrefs.getBoolean("search_deinflected", false);

    final List<Translation> translations = new ArrayList<>();

    if (expression == null) {
        // First run: no query yet, show the 10 most recent translations.
        Log.i(LOG_TAG, "First run - last 10 translations ");
        GlossaryReaderContract database = new GlossaryReaderContract(mContext);
        List<Translation> translationsTemp = database.getLastTranslations(10);
        database.close();
        return translationsTemp;
    }

    if (pathToDictionary == null) {
        Log.e(LOG_TAG, "No path to jmdict dictionary");
        return null;
    }
    File file = new File(pathToDictionary);
    if (!file.exists() || !file.canRead()) {
        Log.e(LOG_TAG, "Can't read jmdict dictionary directory");
        return null;
    }

    if (expression.length() < 1) {
        Log.w(LOG_TAG, "No expression to translate");
        return null;
    }
    // Must match the analyzer the index was built with (CJK bigrams).
    Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_36);

    IndexReader reader;
    try {
        final String search;
        final String hiragana;
        boolean onlyReb = false;

        if (Pattern.matches("\\p{Latin}*", expression)) {
            // Pure romaji input: normalize kunrei -> Hepburn, then to
            // hiragana, and restrict the search to the reading field.
            onlyReb = true;
            Log.i(LOG_TAG, "Only latin letters, converting to hiragana. ");
            expression = TranscriptionConverter.kunreiToHepburn(expression);
            expression = RomanizationEnum.Hepburn.toHiragana(expression);
        }
        hiragana = expression;

        // Space-separate characters so the CJK analyzer's tokens line up
        // with the phrase query terms.
        expression = insertSpaces(expression);

        // Build the query string; "lucenematch" sentinels anchor the start
        // and/or end of the indexed value.
        switch (part) {
        case "end":
            search = "\"" + expression + "lucenematch\"";
            break;
        case "beginning":
            search = "\"lucenematch " + expression + "\"";
            break;
        case "middle":
            search = "\"" + expression + "\"";
            break;
        default:
            if (searchDeinflected) {
                // Exact match OR any deconjugated candidate, each constrained
                // to part-of-speech tags consistent with that deconjugation.
                StringBuilder sb = new StringBuilder("\"lucenematch " + expression + "lucenematch\"");
                for (Predicate predicate : Deconjugator.deconjugate(hiragana)) {
                    if (predicate.isSuru()) {
                        sb.append(" OR ").append("(\"lucenematch ")
                                .append(insertSpaces(predicate.getPredicate()))
                                .append("lucenematch\" AND (pos:vs OR pos:vs-c OR pos:vs-s OR pos:vs-i))");
                    } else if (predicate.isKuru()) {
                        sb.append(" OR ").append("(\"lucenematch ")
                                .append(insertSpaces(predicate.getPredicate()))
                                .append("lucenematch\" AND pos:vk)");
                    } else if (predicate.isIku()) {
                        sb.append(" OR ").append("(\"lucenematch ")
                                .append(insertSpaces(predicate.getPredicate()))
                                .append("lucenematch\" AND pos:v5k-s)");
                    } else if (predicate.isIAdjective()) {
                        sb.append(" OR ").append("(\"lucenematch ")
                                .append(insertSpaces(predicate.getPredicate()))
                                .append("lucenematch\" AND pos:adj-i)");
                    } else
                        sb.append(" OR ").append("(\"lucenematch ")
                                .append(insertSpaces(predicate.getPredicate()))
                                .append("lucenematch\" AND (pos:v1 OR pos:v2 OR pos:v5 OR pos:vz OR pos:vi OR pos:vn OR pos:vr))");
                }
                search = sb.toString();
            } else {
                search = "\"lucenematch " + expression + "lucenematch\"";
            }
        }
        Log.i(LOG_TAG, " Searching for: " + search);

        Query q;
        if (onlyReb) {
            // Romaji input: search readings only.
            q = (new QueryParser(Version.LUCENE_36, "index_japanese_reb", analyzer)).parse(search);
        } else {
            q = parser.parse(search, "japanese");
        }

        Directory dir = FSDirectory.open(file);
        reader = IndexReader.open(dir);
        final IndexSearcher searcher = new IndexSearcher(reader);
        // Custom collector: converts each hit to a Translation, filters by
        // preference flags, and streams results incrementally.
        // NOTE(review): it deliberately uses IOException as control flow to
        // abort collection on cancellation or when the 1000-hit cap is hit.
        Collector collector = new Collector() {
            int max = 1000; // hard cap on collected hits
            int count = 0;
            private int docBase; // offset of the current index segment

            @Override
            public boolean acceptsDocsOutOfOrder() {
                return true;
            }

            @Override
            public void collect(int docID) throws IOException {
                // docID is segment-relative; add docBase for the global id.
                Document d = searcher.doc(docID + docBase);
                Translation translation = new Translation();
                String prioritized = d.get("prioritized");
                if (searchOnlyFavorised && prioritized == null) {
                    return;
                }
                if (prioritized != null) {
                    // is prioritized (favorite) entry
                    translation.setPrioritized(true);
                }

                String ruby = d.get("ruby");

                if (ruby != null && ruby.length() > 0) {
                    translation.setRuby(ruby);
                }

                String japanese_keb = d.get("japanese_keb");
                if (japanese_keb != null && japanese_keb.length() != 0) {
                    translation.parseJapaneseKeb(japanese_keb);
                }

                String japanese_reb = d.get("japanese_reb");
                if (japanese_reb != null && japanese_reb.length() != 0) {
                    translation.parseJapaneseReb(japanese_reb);
                }

                String english = d.get("english");
                if (english != null && english.length() != 0) {
                    translation.parseEnglish(english);
                }

                String french = d.get("french");
                if (french != null && french.length() != 0) {
                    translation.parseFrench(french);
                }

                String dutch = d.get("dutch");
                if (dutch != null && dutch.length() != 0) {
                    translation.parseDutch(dutch);
                }

                String german = d.get("german");
                if (german != null && german.length() != 0) {
                    translation.parseGerman(german);
                }

                String russian = d.get("russian");
                if (russian != null && russian.length() != 0) {
                    translation.parseRussian(russian);
                }

                // Keep the hit only if it has a sense in at least one of the
                // languages the user enabled.
                if ((englishBool && translation.getEnglishSense() != null)
                        || (dutchBool && translation.getDutchSense() != null)
                        || (germanBool && translation.getGermanSense() != null)
                        || (frenchBool && translation.getFrenchSense() != null)
                        || (russianBool && translation.getRussianSense() != null)) {

                    count++;
                    if (count < max) {
                        if (!FragmentListAsyncTask.this.isCancelled()) {
                            FragmentListAsyncTask.this.publishProgress(translation);
                            translations.add(translation);
                        } else {
                            translations.clear();
                            throw new IOException("Loader canceled");
                        }
                    } else {
                        throw new IOException("Max exceeded");
                    }
                }
            }

            @Override
            public void setNextReader(IndexReader reader, int docBas) throws IOException {
                docBase = docBas;
            }

            @Override
            public void setScorer(Scorer arg0) throws IOException {
                // Scores are not used; nothing to do.
            }

        };

        searcher.search(q, collector);
        reader.close();
    } catch (IOException ex) {
        // IOException is also the deliberate early-exit path (cancel / cap);
        // whatever was collected so far is returned.
        Log.e(LOG_TAG, "IO Exception:  " + ex.toString());
        return translations;
    } catch (Exception ex) {
        Log.e(LOG_TAG, "Exception: " + ex.toString());
        return null;
    }

    return translations.isEmpty() ? null : translations;
}

From source file:cz.muni.fi.japanesejmdictsaxparser.saxholder.SaxDataHolder.java

License:Open Source License

/**
 * Creates a holder that writes parsed dictionary entries into a Lucene index.
 * 
 * @param androidOutputFolder lucene dictionary for saving documents
 * @throws IOException if the index writer cannot be opened
 * @throws IllegalArgumentException if directory doesn't exist
 */
public SaxDataHolder(File androidOutputFolder) throws IOException, IllegalArgumentException {
    if (androidOutputFolder == null) {
        log.debug(LOG_TAG + "SaxDataHolder - android dictionary directory is null");
        throw new IllegalArgumentException("SaxDataHolder: android dictionary directory is null");
    }
    // The CJK analyzer must match the one used later when querying the index.
    final Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_36);
    final IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36, analyzer);
    mWriter = new IndexWriter(FSDirectory.open(androidOutputFolder), config);
    log.debug(LOG_TAG + "SaxDataHolder created");

}

From source file:cz.muni.fi.japanesejmdictsaxparser.saxholder.SaxKanjidic2Holder.java

License:Open Source License

/**
 * Creates a holder that writes parsed Kanjidic2 entries into a Lucene index.
 * 
 * @param androidOutputFolder lucene dictionary for saving documents
 * @throws IOException if the index writer cannot be opened
 * @throws IllegalArgumentException if directory doesn't exist
 */
public SaxKanjidic2Holder(File androidOutputFolder) throws IOException, IllegalArgumentException {
    if (androidOutputFolder == null) {
        log.debug(LOG_TAG + "SaxDataHolderKanjiDict - dictionary directory is null");
        throw new IllegalArgumentException("SaxDataHolderKanjiDict: dictionary directory is null");
    }
    // The CJK analyzer must match the one used later when querying the index.
    final Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_36);
    final IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36, analyzer);
    mWriter = new IndexWriter(FSDirectory.open(androidOutputFolder), config);
    log.debug(LOG_TAG + "SaxDataHolderKanjiDict created");
}

From source file:framework.retrieval.engine.analyzer.impl.CJKAnalyzerBuilder.java

License:Apache License

/**
 * Creates the analyzer used when building the index.
 * 
 * @return a {@link CJKAnalyzer} for the configured Lucene version
 */
public Analyzer createIndexAnalyzer() {
    return new CJKAnalyzer(luceneVersion);
}

From source file:framework.retrieval.engine.analyzer.impl.CJKAnalyzerBuilder.java

License:Apache License

/**
 * Creates the analyzer used when parsing queries. Uses the same analyzer type
 * as {@code createIndexAnalyzer()} so query tokens match indexed tokens.
 * 
 * @return a {@link CJKAnalyzer} for the configured Lucene version
 */
public Analyzer createQueryAnalyzer() {
    return new CJKAnalyzer(luceneVersion);
}

From source file:jp.mwsoft.cjkanalyzers.CJKAnalyzerNoSplitKatakana.java

License:Apache License

/**
 * Demonstrates CJK tokenization: prints each token produced for the sample
 * input string.
 * <p>
 * NOTE(review): the original built a stop-word set here but never passed it
 * to the analyzer; that dead code has been removed. The sample string
 * literals appear mojibake-mangled in this copy of the source.
 */
public static void main(String[] args) throws Exception {

    java.io.StringReader reader = new java.io.StringReader("??????");

    CJKAnalyzer analyzer = new CJKAnalyzer(Version.LUCENE_35);
    TokenStream stream = analyzer.tokenStream("test", reader);

    // Fix: iterate until the stream is exhausted instead of a fixed 10
    // passes -- incrementToken() returns false when no tokens remain, and
    // the original ignored that return value (printing stale state).
    while (stream.incrementToken()) {
        System.out.println(stream);
    }
}

From source file:luceneexamples.JapaneseSearch.java

License:Apache License

@Test
public void index() throws Exception {
    // In-memory index for the test; the commented line shows how to use an
    // on-disk index instead.
    Directory directory = new RAMDirectory();
    //        Directory directory = FSDirectory.open(new File("cjkindex"));
    // CJK analyzer produces overlapping bigrams for CJK text.
    Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_31);

    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_31, analyzer);
    IndexWriter writer = new IndexWriter(directory, iwc);

    // Doc 1: plain English text.
    Document doc = new Document();
    doc.add(new Field("str_field", "quick brown fox jumped over the lazy dog.", Field.Store.YES,
            Field.Index.ANALYZED));
    writer.addDocument(doc);
    // Doc 2: CJK text. NOTE(review): this literal (and the query below)
    // appear mojibake-mangled in this copy of the source -- presumably
    // originally Japanese text that the query was expected to match.
    Document doc2 = new Document();
    doc2.add(new Field("str_field", "?????", Field.Store.YES,
            Field.Index.ANALYZED));
    writer.addDocument(doc2);
    writer.close();
    IndexSearcher searcher = new IndexSearcher(directory, true);
    QueryParser parser = new QueryParser(Version.LUCENE_31, "str_field", analyzer);
    // Expect exactly one hit: the query term should match only doc2.
    TopDocs td = searcher.search(parser.parse(""), 1000);
    assertThat(td.totalHits, is(1));
    searcher.close();
    directory.close();
}

From source file:org.apache.jackrabbit.core.query.lucene.LanguageCustomizingAnalyzerRegistry.java

License:Open Source License

public LanguageCustomizingAnalyzerRegistry(IndexingConfiguration configuration) {
    this.configuration = configuration;

    languageToAnalyzer.put("ar", new AnalyzerWrapper(new ArabicAnalyzer(Version.LUCENE_30), true));
    languageToAnalyzer.put("br", new AnalyzerWrapper(new BrazilianAnalyzer(Version.LUCENE_30), true));
    languageToAnalyzer.put("cjk", new AnalyzerWrapper(new CJKAnalyzer(Version.LUCENE_30), true));
    languageToAnalyzer.put("cn", new AnalyzerWrapper(new ChineseAnalyzer(), true));
    languageToAnalyzer.put("cz", new AnalyzerWrapper(new CzechAnalyzer(Version.LUCENE_30), true));
    languageToAnalyzer.put("de", new AnalyzerWrapper(new GermanAnalyzer(Version.LUCENE_30), true));
    languageToAnalyzer.put("el", new AnalyzerWrapper(new GreekAnalyzer(Version.LUCENE_30), true));
    languageToAnalyzer.put("en", new AnalyzerWrapper(
            new SnowballAnalyzer(Version.LUCENE_30, "English", StopAnalyzer.ENGLISH_STOP_WORDS_SET), true));
    languageToAnalyzer.put("fa", new AnalyzerWrapper(new PersianAnalyzer(Version.LUCENE_30), true));
    languageToAnalyzer.put("fr", new AnalyzerWrapper(new FrenchAnalyzer(Version.LUCENE_30), true));
    languageToAnalyzer.put("nl", new AnalyzerWrapper(new DutchAnalyzer(Version.LUCENE_30), true));
    languageToAnalyzer.put("ru", new AnalyzerWrapper(new RussianAnalyzer(Version.LUCENE_30), true));
    languageToAnalyzer.put("th", new AnalyzerWrapper(new ThaiAnalyzer(Version.LUCENE_30), true));
}