Usage examples for the org.apache.lucene.analysis.el.GreekAnalyzer constructor:
public GreekAnalyzer(CharArraySet stopwords)
From source file:com.bigdata.search.DefaultAnalyzerFactory.java
License:Open Source License
/** * Initializes the various kinds of analyzers that we know about. * <p>/*w w w . j a v a 2 s . c o m*/ * Note: Each {@link Analyzer} is registered under both the 3 letter and the * 2 letter language codes. See <a * href="http://www.loc.gov/standards/iso639-2/php/code_list.php">ISO 639-2</a>. * * @todo get some informed advice on which {@link Analyzer}s map onto which * language codes. * * @todo thread safety? Analyzers produce token processors so maybe there is * no problem here once things are initialized. If so, maybe this * could be static. * * @todo configuration. Could be configured by a file containing a class * name and a list of codes that are handled by that class. * * @todo strip language code down to 2/3 characters during lookup. * * @todo There are a lot of pidgins based on french, english, and other * languages that are not being assigned here. */ synchronized private Map<String, AnalyzerConstructor> getAnalyzers() { if (analyzers != null) { return analyzers; } analyzers = new HashMap<String, AnalyzerConstructor>(); final Set<?> emptyStopwords = Collections.EMPTY_SET; { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new BrazilianAnalyzer(Version.LUCENE_CURRENT) : new BrazilianAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("por", a); analyzers.put("pt", a); } /* * Claims to handle Chinese. Does single character extraction. Claims to * produce smaller indices as a result. * * Note: you can not tokenize with the Chinese analyzer and the do * search using the CJK analyzer and visa versa. * * Note: I have no idea whether this would work for Japanese and Korean * as well. I expect so, but no real clue. 
*/ { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return new ChineseAnalyzer(); } }; analyzers.put("zho", a); analyzers.put("chi", a); analyzers.put("zh", a); } /* * Claims to handle Chinese, Japanese, Korean. Does double character * extraction with overlap. */ { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new CJKAnalyzer(Version.LUCENE_CURRENT) : new CJKAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; // analyzers.put("zho", a); // analyzers.put("chi", a); // analyzers.put("zh", a); analyzers.put("jpn", a); analyzers.put("ja", a); analyzers.put("jpn", a); analyzers.put("kor", a); analyzers.put("ko", a); } { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new CzechAnalyzer(Version.LUCENE_CURRENT) : new CzechAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("ces", a); analyzers.put("cze", a); analyzers.put("cs", a); } { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new DutchAnalyzer(Version.LUCENE_CURRENT) : new DutchAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("dut", a); analyzers.put("nld", a); analyzers.put("nl", a); } { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new FrenchAnalyzer(Version.LUCENE_CURRENT) : new FrenchAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("fra", a); analyzers.put("fre", a); analyzers.put("fr", a); } /* * Note: There are a lot of language codes for German variants that * might be useful here. */ { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? 
new GermanAnalyzer(Version.LUCENE_CURRENT) : new GermanAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("deu", a); analyzers.put("ger", a); analyzers.put("de", a); } // Note: ancient greek has a different code (grc). { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new GreekAnalyzer(Version.LUCENE_CURRENT) : new GreekAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("gre", a); analyzers.put("ell", a); analyzers.put("el", a); } // @todo what about other Cyrillic scripts? { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new RussianAnalyzer(Version.LUCENE_CURRENT) : new RussianAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("rus", a); analyzers.put("ru", a); } { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return new ThaiAnalyzer(Version.LUCENE_CURRENT); } }; analyzers.put("tha", a); analyzers.put("th", a); } // English { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new StandardAnalyzer(Version.LUCENE_CURRENT) : new StandardAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("eng", a); analyzers.put("en", a); /* * Note: There MUST be an entry under the empty string (""). This * entry will be requested when there is no entry for the specified * language code. */ analyzers.put("", a); } return analyzers; }
From source file:com.bizosys.hsearch.inpipe.TokenizeNonEnglish.java
License:Apache License
/**
 * Populates the language-to-analyzer map with one Lucene analyzer per
 * supported ISO 639-1 language key, all built against the same Lucene
 * version constant.
 *
 * @param conf configuration handle (not consulted here).
 * @throws ApplicationFault declared by the pipe contract.
 * @throws SystemFault declared by the pipe contract.
 */
public void init(Configuration conf) throws ApplicationFault, SystemFault {
    languageMap.put("br", new BrazilianAnalyzer(LuceneConstants.version));
    languageMap.put("cz", new CzechAnalyzer(LuceneConstants.version));
    languageMap.put("nl", new DutchAnalyzer(LuceneConstants.version));
    languageMap.put("fr", new FrenchAnalyzer(LuceneConstants.version));
    languageMap.put("de", new GermanAnalyzer(LuceneConstants.version));
    languageMap.put("el", new GreekAnalyzer(LuceneConstants.version));
    languageMap.put("ru", new RussianAnalyzer(LuceneConstants.version));
    languageMap.put("th", new ThaiAnalyzer(LuceneConstants.version));
}
From source file:org.apache.jackrabbit.core.query.lucene.LanguageCustomizingAnalyzerRegistry.java
License:Open Source License
public LanguageCustomizingAnalyzerRegistry(IndexingConfiguration configuration) { this.configuration = configuration; languageToAnalyzer.put("ar", new AnalyzerWrapper(new ArabicAnalyzer(Version.LUCENE_30), true)); languageToAnalyzer.put("br", new AnalyzerWrapper(new BrazilianAnalyzer(Version.LUCENE_30), true)); languageToAnalyzer.put("cjk", new AnalyzerWrapper(new CJKAnalyzer(Version.LUCENE_30), true)); languageToAnalyzer.put("cn", new AnalyzerWrapper(new ChineseAnalyzer(), true)); languageToAnalyzer.put("cz", new AnalyzerWrapper(new CzechAnalyzer(Version.LUCENE_30), true)); languageToAnalyzer.put("de", new AnalyzerWrapper(new GermanAnalyzer(Version.LUCENE_30), true)); languageToAnalyzer.put("el", new AnalyzerWrapper(new GreekAnalyzer(Version.LUCENE_30), true)); languageToAnalyzer.put("en", new AnalyzerWrapper( new SnowballAnalyzer(Version.LUCENE_30, "English", StopAnalyzer.ENGLISH_STOP_WORDS_SET), true)); languageToAnalyzer.put("fa", new AnalyzerWrapper(new PersianAnalyzer(Version.LUCENE_30), true)); languageToAnalyzer.put("fr", new AnalyzerWrapper(new FrenchAnalyzer(Version.LUCENE_30), true)); languageToAnalyzer.put("nl", new AnalyzerWrapper(new DutchAnalyzer(Version.LUCENE_30), true)); languageToAnalyzer.put("ru", new AnalyzerWrapper(new RussianAnalyzer(Version.LUCENE_30), true)); languageToAnalyzer.put("th", new AnalyzerWrapper(new ThaiAnalyzer(Version.LUCENE_30), true)); }
From source file:org.eclipse.help.internal.search.AnalyzerFactory.java
License:Open Source License
public Analyzer create() { if (locale == null) return null; Version version = Version.LUCENE_35; if ("pt".equals(locale)) //$NON-NLS-1$ return new BrazilianAnalyzer(version); if ("ja".equals(locale)) //$NON-NLS-1$ return new CJKAnalyzer(version); if ("ko".equals(locale)) //$NON-NLS-1$ return new CJKAnalyzer(version); if ("pt".equals(locale)) //$NON-NLS-1$ return new BrazilianAnalyzer(version); if ("cs".equals(locale)) //$NON-NLS-1$ return new CzechAnalyzer(version); if ("de".equals(locale)) //$NON-NLS-1$ return new GermanAnalyzer(version); if ("el".equals(locale)) //$NON-NLS-1$ return new GreekAnalyzer(version); if ("fr".equals(locale)) //$NON-NLS-1$ return new FrenchAnalyzer(version); if ("nl".equals(locale)) //$NON-NLS-1$ return new DutchAnalyzer(version); if ("ru".equals(locale)) //$NON-NLS-1$ return new RussianAnalyzer(version); //unknown language return null;//from w w w. j av a 2s . co m }
From source file:org.elasticsearch.analysis.common.GreekAnalyzerProvider.java
License:Apache License
/**
 * Builds the {@code greek} analyzer for an index.
 * <p>
 * Stop words are taken from the index {@code settings} when configured,
 * falling back to Lucene's default Greek stop word set.
 */
GreekAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, name, settings);
analyzer = new GreekAnalyzer(Analysis.parseStopWords(env, settings, GreekAnalyzer.getDefaultStopSet()));
// Pin the analyzer to the Lucene version configured for this index.
analyzer.setVersion(version);
}
From source file:org.omegat.tokenizer.LuceneGreekTokenizer.java
License:Open Source License
@Override protected TokenStream getTokenStream(final String strOrig, final boolean stemsAllowed, final boolean stopWordsAllowed) { if (stemsAllowed) { GreekAnalyzer an;// w w w. j a v a 2 s .co m if (stopWordsAllowed) { an = new GreekAnalyzer(getBehavior()); } else { an = new GreekAnalyzer(getBehavior(), EMPTY_STRING_LIST); } return an.tokenStream("", new StringReader(strOrig)); } else { return new StandardTokenizer(getBehavior(), new StringReader(strOrig)); } }
From source file:org.scify.NewSumServer.Server.Searching.Indexer.java
License:Apache License
/** * The Main method of the Indexer Class. * Traverses a directory and creates the index files needed for the package to * operate.//from w w w. j av a 2 s . c om * @throws CorruptIndexException * @throws LockObtainFailedException * @throws IOException */ public void createIndex() throws CorruptIndexException, LockObtainFailedException, IOException { // The dir containing the Files to Index File docDir = new File(this.sFilesPath); Directory FSDir = FSDirectory.open(indexDir); //init the Analyzer, according to locale if (lLoc.toString().equals("el")) { anal = new GreekAnalyzer(Version.LUCENE_36); } else if (lLoc.toString().equals("en")) { // The standard analyzer Analyzer stdAnal = new StandardAnalyzer(Version.LUCENE_36); // In order to index all the text in a field, // however long that field may be anal = new LimitTokenCountAnalyzer(stdAnal, Integer.MAX_VALUE); } // The configuration for the Index Writer IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_36, anal); conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE); // The Index Writer IndexWriter indexWriter = new IndexWriter(FSDir, conf); // For each File in the dir, create a Document for (File file : getFilesFromFirstLeverSubdirs(docDir)) { String filename = file.getName(); String fullFileName = file.getAbsolutePath(); String tmpText = Utilities.readFromFile(fullFileName, " "); Document d = new Document(); //lucene Document // Add the "filename" field d.add(new Field(FILE_FIELD, filename, Field.Store.YES, Field.Index.NOT_ANALYZED)); // Add The "Text" Field d.add(new Field(TEXT_FIELD, tmpText, Field.Store.YES, Field.Index.ANALYZED)); // Add the Document to the Writer indexWriter.addDocument(d); } int numDocs = indexWriter.numDocs(); // the index will be merged down into a single segment, resulting in // a smaller index with better search performance. 
Costly Operation, // DO NOT USE on large dirs or when low disk space (needs (2-3)*DirSize) indexWriter.forceMerge(1); // Syncs All referenced Index Files. // At this point old indexes will be deleted, freeing up space indexWriter.commit(); // Terminate the Writer appropriately indexWriter.close(); // LOGGER.log(Level.INFO, "Succesfully closed indexWriter with {0}", anal.toString()); }
From source file:org.scify.NewSumServer.Server.Searching.Searcher.java
License:Apache License
/** * Searches the index directory for the specified query. * @param fIndexDir The directory where the indexed files are stored * @param lLoc The locale that the indexed is created in * @param sQuery The search term//w ww . j a v a 2s . c o m * @param iMaxHits The max number of results to be returned. * @return A list of scoredocs which correspond to the search entry */ public List<ScoreDoc> searchIndex(File fIndexDir, Locale lLoc, String sQuery, int iMaxHits) { try { // Open the Directory of the Indexed Files, using // the FSDirectory class Directory FSDir = FSDirectory.open(fIndexDir); // Create the reader class on the Dir IndexReader reader = IndexReader.open(FSDir); IndexSearcher searcher = new IndexSearcher(reader); String dField = "text"; // Pass this from the Indexer Class? // Must Use the Same Analyzer as the index Class, otherwise // results will be awkward. So it get's analyzer from Indexer class // Create the query Parser on the Field that we want to parse if (lLoc.toString().equals("el")) { anal = new GreekAnalyzer(Version.LUCENE_36); } else if (lLoc.toString().equals("en")) { // The standard analyzer Analyzer stdAnal = new StandardAnalyzer(Version.LUCENE_36); anal = new LimitTokenCountAnalyzer(stdAnal, Integer.MAX_VALUE); } QueryParser parser = new QueryParser(Version.LUCENE_36, dField, anal); try { Query q = parser.parse(sQuery); // Search the Index with the Query TopDocs hits = searcher.search(q, iMaxHits); ScoreDoc[] scoreDocs = hits.scoreDocs; //debug start System.out.println("files found: " + scoreDocs.length); //debug end // Iterate over the scoredocs for (int n = 0; n < scoreDocs.length; n++) { ScoreDoc sd = scoreDocs[n]; float score = sd.score; int docId = sd.doc; Document d = searcher.doc(docId); String filename = d.get("file"); // System.out.println //debug // (filename+": "+"Score: "+score+" - "+ "DocID: "+ docId); //Save the <docID, filename> data to the map this.docFiles.put(docId, filename); } // Sort the Docs according to their scores 
List<ScoreDoc> returnList = sortScoreDocs(scoreDocs); Collections.reverse(returnList); return returnList; } catch (ParseException ex) { LOGGER.log(Level.SEVERE, "Could not parse query {0}", sQuery); } catch (NullPointerException ex) { LOGGER.log(Level.WARNING, ex.getMessage()); return null; } } catch (IOException ex) { LOGGER.log(Level.SEVERE, "Could not open Directory {0}", fIndexDir.getPath()); } return null; }
From source file:perLucene.Server.java
License:Open Source License
/**
 * Initializes the shared language-to-analyzer table, one Lucene 4.1
 * analyzer per ISO 639-1 key (entries listed alphabetically; insertion
 * order is irrelevant for a HashMap).
 */
private static void initAnalyzers() {
    ha = new HashMap<String, Analyzer>();
    ha.put("ar", new ArabicAnalyzer(Version.LUCENE_41));
    ha.put("bg", new BulgarianAnalyzer(Version.LUCENE_41));
    ha.put("br", new BrazilianAnalyzer(Version.LUCENE_41));
    ha.put("ca", new CatalanAnalyzer(Version.LUCENE_41));
    ha.put("cn", new SmartChineseAnalyzer(Version.LUCENE_41));
    ha.put("cz", new CzechAnalyzer(Version.LUCENE_41));
    ha.put("da", new DanishAnalyzer(Version.LUCENE_41));
    ha.put("de", new GermanAnalyzer(Version.LUCENE_41));
    ha.put("el", new GreekAnalyzer(Version.LUCENE_41));
    ha.put("en", new EnglishAnalyzer(Version.LUCENE_41));
    ha.put("es", new SpanishAnalyzer(Version.LUCENE_41));
    ha.put("eu", new BasqueAnalyzer(Version.LUCENE_41));
    ha.put("fa", new PersianAnalyzer(Version.LUCENE_41));
    ha.put("fi", new FinnishAnalyzer(Version.LUCENE_41));
    ha.put("fr", new FrenchAnalyzer(Version.LUCENE_41));
    ha.put("ga", new IrishAnalyzer(Version.LUCENE_41));
    ha.put("gl", new GalicianAnalyzer(Version.LUCENE_41));
    ha.put("hi", new HindiAnalyzer(Version.LUCENE_41));
    ha.put("hu", new HungarianAnalyzer(Version.LUCENE_41));
    ha.put("hy", new ArmenianAnalyzer(Version.LUCENE_41));
    ha.put("id", new IndonesianAnalyzer(Version.LUCENE_41));
    ha.put("it", new ItalianAnalyzer(Version.LUCENE_41));
    ha.put("lv", new LatvianAnalyzer(Version.LUCENE_41));
    ha.put("nl", new DutchAnalyzer(Version.LUCENE_41));
    ha.put("no", new NorwegianAnalyzer(Version.LUCENE_41));
    ha.put("pt", new PortugueseAnalyzer(Version.LUCENE_41));
    ha.put("ro", new RomanianAnalyzer(Version.LUCENE_41));
    ha.put("ru", new RussianAnalyzer(Version.LUCENE_41));
    ha.put("sv", new SwedishAnalyzer(Version.LUCENE_41));
    ha.put("th", new ThaiAnalyzer(Version.LUCENE_41));
    ha.put("tr", new TurkishAnalyzer(Version.LUCENE_41));
}