Example usage for org.apache.lucene.analysis.el GreekAnalyzer GreekAnalyzer

List of usage examples for org.apache.lucene.analysis.el GreekAnalyzer GreekAnalyzer

Introduction

On this page you can find example usage for org.apache.lucene.analysis.el GreekAnalyzer GreekAnalyzer.

Prototype

public GreekAnalyzer(CharArraySet stopwords) 

Source Link

Document

Builds an analyzer with the given stop words.

Usage

From source file:com.bigdata.search.DefaultAnalyzerFactory.java

License:Open Source License

/**
 * Initializes the various kinds of analyzers that we know about.
 * <p>/*w  w  w .  j a  v a 2  s . c o m*/
 * Note: Each {@link Analyzer} is registered under both the 3 letter and the
 * 2 letter language codes. See <a
 * href="http://www.loc.gov/standards/iso639-2/php/code_list.php">ISO 639-2</a>.
 * 
 * @todo get some informed advice on which {@link Analyzer}s map onto which
 *       language codes.
 * 
 * @todo thread safety? Analyzers produce token processors so maybe there is
 *       no problem here once things are initialized. If so, maybe this
 *       could be static.
 * 
 * @todo configuration. Could be configured by a file containing a class
 *       name and a list of codes that are handled by that class.
 * 
 * @todo strip language code down to 2/3 characters during lookup.
 * 
 * @todo There are a lot of pidgins based on french, english, and other
 *       languages that are not being assigned here.
 */
synchronized private Map<String, AnalyzerConstructor> getAnalyzers() {

    if (analyzers != null) {

        return analyzers;

    }

    analyzers = new HashMap<String, AnalyzerConstructor>();

    final Set<?> emptyStopwords = Collections.EMPTY_SET;

    {
        AnalyzerConstructor a = new AnalyzerConstructor() {
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords ? new BrazilianAnalyzer(Version.LUCENE_CURRENT)
                        : new BrazilianAnalyzer(Version.LUCENE_CURRENT, emptyStopwords);
            }
        };
        analyzers.put("por", a);
        analyzers.put("pt", a);
    }

    /*
     * Claims to handle Chinese. Does single character extraction. Claims to
     * produce smaller indices as a result.
     * 
     * Note: you can not tokenize with the Chinese analyzer and the do
     * search using the CJK analyzer and visa versa.
     * 
     * Note: I have no idea whether this would work for Japanese and Korean
     * as well. I expect so, but no real clue.
     */
    {
        AnalyzerConstructor a = new AnalyzerConstructor() {
            public Analyzer newInstance(final boolean filterStopwords) {
                return new ChineseAnalyzer();
            }
        };
        analyzers.put("zho", a);
        analyzers.put("chi", a);
        analyzers.put("zh", a);
    }

    /*
     * Claims to handle Chinese, Japanese, Korean. Does double character
     * extraction with overlap.
     */
    {
        AnalyzerConstructor a = new AnalyzerConstructor() {
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords ? new CJKAnalyzer(Version.LUCENE_CURRENT)
                        : new CJKAnalyzer(Version.LUCENE_CURRENT, emptyStopwords);
            }
        };
        //            analyzers.put("zho", a);
        //            analyzers.put("chi", a);
        //            analyzers.put("zh", a);
        analyzers.put("jpn", a);
        analyzers.put("ja", a);
        analyzers.put("jpn", a);
        analyzers.put("kor", a);
        analyzers.put("ko", a);
    }

    {
        AnalyzerConstructor a = new AnalyzerConstructor() {
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords ? new CzechAnalyzer(Version.LUCENE_CURRENT)
                        : new CzechAnalyzer(Version.LUCENE_CURRENT, emptyStopwords);
            }
        };
        analyzers.put("ces", a);
        analyzers.put("cze", a);
        analyzers.put("cs", a);
    }

    {
        AnalyzerConstructor a = new AnalyzerConstructor() {
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords ? new DutchAnalyzer(Version.LUCENE_CURRENT)
                        : new DutchAnalyzer(Version.LUCENE_CURRENT, emptyStopwords);
            }
        };
        analyzers.put("dut", a);
        analyzers.put("nld", a);
        analyzers.put("nl", a);
    }

    {
        AnalyzerConstructor a = new AnalyzerConstructor() {
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords ? new FrenchAnalyzer(Version.LUCENE_CURRENT)
                        : new FrenchAnalyzer(Version.LUCENE_CURRENT, emptyStopwords);
            }
        };
        analyzers.put("fra", a);
        analyzers.put("fre", a);
        analyzers.put("fr", a);
    }

    /*
     * Note: There are a lot of language codes for German variants that
     * might be useful here.
     */
    {
        AnalyzerConstructor a = new AnalyzerConstructor() {
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords ? new GermanAnalyzer(Version.LUCENE_CURRENT)
                        : new GermanAnalyzer(Version.LUCENE_CURRENT, emptyStopwords);
            }
        };
        analyzers.put("deu", a);
        analyzers.put("ger", a);
        analyzers.put("de", a);
    }

    // Note: ancient greek has a different code (grc).
    {
        AnalyzerConstructor a = new AnalyzerConstructor() {
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords ? new GreekAnalyzer(Version.LUCENE_CURRENT)
                        : new GreekAnalyzer(Version.LUCENE_CURRENT, emptyStopwords);
            }
        };
        analyzers.put("gre", a);
        analyzers.put("ell", a);
        analyzers.put("el", a);
    }

    // @todo what about other Cyrillic scripts?
    {
        AnalyzerConstructor a = new AnalyzerConstructor() {
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords ? new RussianAnalyzer(Version.LUCENE_CURRENT)
                        : new RussianAnalyzer(Version.LUCENE_CURRENT, emptyStopwords);
            }
        };
        analyzers.put("rus", a);
        analyzers.put("ru", a);
    }

    {
        AnalyzerConstructor a = new AnalyzerConstructor() {
            public Analyzer newInstance(final boolean filterStopwords) {
                return new ThaiAnalyzer(Version.LUCENE_CURRENT);
            }
        };
        analyzers.put("tha", a);
        analyzers.put("th", a);
    }

    // English
    {
        AnalyzerConstructor a = new AnalyzerConstructor() {
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords ? new StandardAnalyzer(Version.LUCENE_CURRENT)
                        : new StandardAnalyzer(Version.LUCENE_CURRENT, emptyStopwords);
            }
        };
        analyzers.put("eng", a);
        analyzers.put("en", a);
        /*
         * Note: There MUST be an entry under the empty string (""). This
         * entry will be requested when there is no entry for the specified
         * language code.
         */
        analyzers.put("", a);
    }

    return analyzers;

}

From source file:com.bizosys.hsearch.inpipe.TokenizeNonEnglish.java

License:Apache License

public void init(Configuration conf) throws ApplicationFault, SystemFault {
    // Register one language-specific Lucene analyzer per supported code.
    // Entries are listed alphabetically by code for easy scanning.
    languageMap.put("br", new BrazilianAnalyzer(LuceneConstants.version));
    languageMap.put("cz", new CzechAnalyzer(LuceneConstants.version));
    languageMap.put("de", new GermanAnalyzer(LuceneConstants.version));
    languageMap.put("el", new GreekAnalyzer(LuceneConstants.version));
    languageMap.put("fr", new FrenchAnalyzer(LuceneConstants.version));
    languageMap.put("nl", new DutchAnalyzer(LuceneConstants.version));
    languageMap.put("ru", new RussianAnalyzer(LuceneConstants.version));
    languageMap.put("th", new ThaiAnalyzer(LuceneConstants.version));
}

From source file:org.apache.jackrabbit.core.query.lucene.LanguageCustomizingAnalyzerRegistry.java

License:Open Source License

public LanguageCustomizingAnalyzerRegistry(IndexingConfiguration configuration) {
    this.configuration = configuration;

    languageToAnalyzer.put("ar", new AnalyzerWrapper(new ArabicAnalyzer(Version.LUCENE_30), true));
    languageToAnalyzer.put("br", new AnalyzerWrapper(new BrazilianAnalyzer(Version.LUCENE_30), true));
    languageToAnalyzer.put("cjk", new AnalyzerWrapper(new CJKAnalyzer(Version.LUCENE_30), true));
    languageToAnalyzer.put("cn", new AnalyzerWrapper(new ChineseAnalyzer(), true));
    languageToAnalyzer.put("cz", new AnalyzerWrapper(new CzechAnalyzer(Version.LUCENE_30), true));
    languageToAnalyzer.put("de", new AnalyzerWrapper(new GermanAnalyzer(Version.LUCENE_30), true));
    languageToAnalyzer.put("el", new AnalyzerWrapper(new GreekAnalyzer(Version.LUCENE_30), true));
    languageToAnalyzer.put("en", new AnalyzerWrapper(
            new SnowballAnalyzer(Version.LUCENE_30, "English", StopAnalyzer.ENGLISH_STOP_WORDS_SET), true));
    languageToAnalyzer.put("fa", new AnalyzerWrapper(new PersianAnalyzer(Version.LUCENE_30), true));
    languageToAnalyzer.put("fr", new AnalyzerWrapper(new FrenchAnalyzer(Version.LUCENE_30), true));
    languageToAnalyzer.put("nl", new AnalyzerWrapper(new DutchAnalyzer(Version.LUCENE_30), true));
    languageToAnalyzer.put("ru", new AnalyzerWrapper(new RussianAnalyzer(Version.LUCENE_30), true));
    languageToAnalyzer.put("th", new AnalyzerWrapper(new ThaiAnalyzer(Version.LUCENE_30), true));
}

From source file:org.eclipse.help.internal.search.AnalyzerFactory.java

License:Open Source License

/**
 * Creates a Lucene analyzer matching the configured locale.
 *
 * @return an analyzer for the locale's language, or <code>null</code> when
 *         the locale is unset or the language is not supported
 */
public Analyzer create() {
    if (locale == null)
        return null;
    Version version = Version.LUCENE_35;
    if ("pt".equals(locale)) //$NON-NLS-1$
        return new BrazilianAnalyzer(version);
    if ("ja".equals(locale)) //$NON-NLS-1$
        return new CJKAnalyzer(version);
    if ("ko".equals(locale)) //$NON-NLS-1$
        return new CJKAnalyzer(version);
    // Fix: removed a second, unreachable "pt" check that duplicated the
    // BrazilianAnalyzer branch above.
    if ("cs".equals(locale)) //$NON-NLS-1$
        return new CzechAnalyzer(version);
    if ("de".equals(locale)) //$NON-NLS-1$
        return new GermanAnalyzer(version);
    if ("el".equals(locale)) //$NON-NLS-1$
        return new GreekAnalyzer(version);
    if ("fr".equals(locale)) //$NON-NLS-1$
        return new FrenchAnalyzer(version);
    if ("nl".equals(locale)) //$NON-NLS-1$
        return new DutchAnalyzer(version);
    if ("ru".equals(locale)) //$NON-NLS-1$
        return new RussianAnalyzer(version);
    // unknown language
    return null;

}

From source file:org.elasticsearch.analysis.common.GreekAnalyzerProvider.java

License:Apache License

/**
 * Builds a {@link GreekAnalyzer} whose stop words are parsed from the index
 * settings, falling back to the Greek default stop set when none are
 * configured.
 */
GreekAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    analyzer = new GreekAnalyzer(Analysis.parseStopWords(env, settings, GreekAnalyzer.getDefaultStopSet()));
    // NOTE(review): 'version' is a field not declared in this snippet --
    // presumably the index's Lucene version from the base provider; confirm.
    analyzer.setVersion(version);
}

From source file:org.omegat.tokenizer.LuceneGreekTokenizer.java

License:Open Source License

/**
 * Produces a token stream for the given text: a Greek-stemming stream when
 * stemming is allowed (with or without stop-word filtering), otherwise a
 * plain standard tokenization.
 */
@Override
protected TokenStream getTokenStream(final String strOrig, final boolean stemsAllowed,
        final boolean stopWordsAllowed) {
    if (!stemsAllowed) {
        // No stemming requested: plain tokenization only.
        return new StandardTokenizer(getBehavior(), new StringReader(strOrig));
    }
    final GreekAnalyzer analyzer = stopWordsAllowed
            ? new GreekAnalyzer(getBehavior())
            : new GreekAnalyzer(getBehavior(), EMPTY_STRING_LIST);
    return analyzer.tokenStream("", new StringReader(strOrig));
}

From source file:org.scify.NewSumServer.Server.Searching.Indexer.java

License:Apache License

/**
 * The Main method of the Indexer Class.
 * Traverses a directory and creates the index files needed for the package to
 * operate.
 * @throws CorruptIndexException
 * @throws LockObtainFailedException
 * @throws IOException
 */
public void createIndex() throws CorruptIndexException, LockObtainFailedException, IOException {
    // The dir containing the Files to Index
    File docDir = new File(this.sFilesPath);
    Directory FSDir = FSDirectory.open(indexDir);
    // Init the Analyzer, according to locale.
    // NOTE(review): for locales other than "el"/"en" the 'anal' field keeps
    // its previous value (possibly null, which would break IndexWriterConfig)
    // -- confirm callers only use these two locales.
    if (lLoc.toString().equals("el")) {
        anal = new GreekAnalyzer(Version.LUCENE_36);
    } else if (lLoc.toString().equals("en")) {
        // The standard analyzer
        Analyzer stdAnal = new StandardAnalyzer(Version.LUCENE_36);
        // In order to index all the text in a field,
        // however long that field may be
        anal = new LimitTokenCountAnalyzer(stdAnal, Integer.MAX_VALUE);
    }
    // The configuration for the Index Writer
    IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_36, anal);
    conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    // The Index Writer
    IndexWriter indexWriter = new IndexWriter(FSDir, conf);
    try {
        // For each File in the dir, create a lucene Document holding the
        // filename (stored, not analyzed) and the file text (stored, analyzed).
        for (File file : getFilesFromFirstLeverSubdirs(docDir)) {
            String filename = file.getName();
            String fullFileName = file.getAbsolutePath();
            String tmpText = Utilities.readFromFile(fullFileName, " ");
            Document d = new Document(); //lucene Document
            d.add(new Field(FILE_FIELD, filename, Field.Store.YES, Field.Index.NOT_ANALYZED));
            d.add(new Field(TEXT_FIELD, tmpText, Field.Store.YES, Field.Index.ANALYZED));
            // Add the Document to the Writer
            indexWriter.addDocument(d);
        }
        // The index will be merged down into a single segment, resulting in
        // a smaller index with better search performance. Costly Operation,
        // DO NOT USE on large dirs or when low disk space (needs (2-3)*DirSize)
        indexWriter.forceMerge(1);
        // Syncs All referenced Index Files.
        // At this point old indexes will be deleted, freeing up space
        indexWriter.commit();
    } finally {
        // Fix: always close the writer (releases the write lock even when
        // indexing fails part-way). Also removed an unused numDocs local.
        indexWriter.close();
    }
}

From source file:org.scify.NewSumServer.Server.Searching.Searcher.java

License:Apache License

/**
 * Searches the index directory for the specified query.
 * @param fIndexDir The directory where the indexed files are stored
 * @param lLoc The locale that the index is created in
 * @param sQuery The search term
 * @param iMaxHits The max number of results to be returned.
 * @return A list of scoredocs which correspond to the search entry, or
 *         <code>null</code> if the query could not be parsed or the index
 *         directory could not be opened
 */
public List<ScoreDoc> searchIndex(File fIndexDir, Locale lLoc, String sQuery, int iMaxHits) {
    try {
        // Open the Directory of the Indexed Files, using
        // the FSDirectory class
        Directory FSDir = FSDirectory.open(fIndexDir);
        // Create the reader class on the Dir
        IndexReader reader = IndexReader.open(FSDir);
        IndexSearcher searcher = new IndexSearcher(reader);
        try {
            // NOTE(review): field names "text"/"file" are hard-coded here
            // while the Indexer class uses TEXT_FIELD/FILE_FIELD constants --
            // verify they agree, or pass them from the Indexer class.
            String dField = "text";
            // Must use the same Analyzer as the index Class, otherwise
            // results will be awkward.
            if (lLoc.toString().equals("el")) {
                anal = new GreekAnalyzer(Version.LUCENE_36);
            } else if (lLoc.toString().equals("en")) {
                // The standard analyzer
                Analyzer stdAnal = new StandardAnalyzer(Version.LUCENE_36);
                anal = new LimitTokenCountAnalyzer(stdAnal, Integer.MAX_VALUE);
            }
            // Create the query Parser on the Field that we want to parse
            QueryParser parser = new QueryParser(Version.LUCENE_36, dField, anal);
            try {
                Query q = parser.parse(sQuery);
                // Search the Index with the Query
                TopDocs hits = searcher.search(q, iMaxHits);
                ScoreDoc[] scoreDocs = hits.scoreDocs;
                //debug start
                System.out.println("files found: " + scoreDocs.length);
                //debug end
                // Iterate over the scoredocs, recording <docID, filename>
                for (int n = 0; n < scoreDocs.length; n++) {
                    ScoreDoc sd = scoreDocs[n];
                    int docId = sd.doc;
                    Document d = searcher.doc(docId);
                    String filename = d.get("file");
                    this.docFiles.put(docId, filename);
                }
                // Sort the Docs according to their scores
                List<ScoreDoc> returnList = sortScoreDocs(scoreDocs);
                Collections.reverse(returnList);
                return returnList;
            } catch (ParseException ex) {
                LOGGER.log(Level.SEVERE, "Could not parse query {0}", sQuery);
            } catch (NullPointerException ex) {
                LOGGER.log(Level.WARNING, ex.getMessage());
                return null;
            }
        } finally {
            // Fix: release the searcher and reader so index files and file
            // handles are not leaked on every search.
            searcher.close();
            reader.close();
        }
    } catch (IOException ex) {
        LOGGER.log(Level.SEVERE, "Could not open Directory {0}", fIndexDir.getPath());
    }
    return null;
}

From source file:perLucene.Server.java

License:Open Source License

private static void initAnalyzers() {

    // Populate the code-to-analyzer map with every language analyzer
    // bundled with Lucene 4.1, keyed by code and listed alphabetically.
    ha = new HashMap<String, Analyzer>();

    final Version v = Version.LUCENE_41;
    ha.put("ar", new ArabicAnalyzer(v));
    ha.put("bg", new BulgarianAnalyzer(v));
    ha.put("br", new BrazilianAnalyzer(v));
    ha.put("ca", new CatalanAnalyzer(v));
    ha.put("cn", new SmartChineseAnalyzer(v));
    ha.put("cz", new CzechAnalyzer(v));
    ha.put("da", new DanishAnalyzer(v));
    ha.put("de", new GermanAnalyzer(v));
    ha.put("el", new GreekAnalyzer(v));
    ha.put("en", new EnglishAnalyzer(v));
    ha.put("es", new SpanishAnalyzer(v));
    ha.put("eu", new BasqueAnalyzer(v));
    ha.put("fa", new PersianAnalyzer(v));
    ha.put("fi", new FinnishAnalyzer(v));
    ha.put("fr", new FrenchAnalyzer(v));
    ha.put("ga", new IrishAnalyzer(v));
    ha.put("gl", new GalicianAnalyzer(v));
    ha.put("hi", new HindiAnalyzer(v));
    ha.put("hu", new HungarianAnalyzer(v));
    ha.put("hy", new ArmenianAnalyzer(v));
    ha.put("id", new IndonesianAnalyzer(v));
    ha.put("it", new ItalianAnalyzer(v));
    ha.put("lv", new LatvianAnalyzer(v));
    ha.put("nl", new DutchAnalyzer(v));
    ha.put("no", new NorwegianAnalyzer(v));
    ha.put("pt", new PortugueseAnalyzer(v));
    ha.put("ro", new RomanianAnalyzer(v));
    ha.put("ru", new RussianAnalyzer(v));
    ha.put("sv", new SwedishAnalyzer(v));
    ha.put("th", new ThaiAnalyzer(v));
    ha.put("tr", new TurkishAnalyzer(v));

}