List of usage examples for the GermanAnalyzer constructor of class org.apache.lucene.analysis.de.GermanAnalyzer
public GermanAnalyzer(CharArraySet stopwords)
From source file:ch.admin.isb.hermes5.business.search.AnalyserRepository.java
License:Apache License
/**
 * Returns the analyzer to use for the given language code.
 * <p>
 * "fr", "it" and "en" select the French, Italian and English analyzers.
 * Every other value, including {@code null}, selects the German analyzer.
 *
 * @param lang the language code; may be {@code null}
 * @return a newly created analyzer, never {@code null}
 */
public Analyzer getAnalyzer(String lang) {
    // Constant-first comparisons: a null code must take the German fallback
    // like any other unrecognised code instead of throwing.
    if ("fr".equals(lang)) {
        return new FrenchAnalyzer(Version.LUCENE_47);
    }
    if ("it".equals(lang)) {
        return new ItalianAnalyzer(Version.LUCENE_47);
    }
    if ("en".equals(lang)) {
        return new EnglishAnalyzer(Version.LUCENE_47);
    }
    return new GermanAnalyzer(Version.LUCENE_47);
}
From source file:com.bigdata.search.DefaultAnalyzerFactory.java
License:Open Source License
/** * Initializes the various kinds of analyzers that we know about. * <p>//from w ww.ja v a2s. co m * Note: Each {@link Analyzer} is registered under both the 3 letter and the * 2 letter language codes. See <a * href="http://www.loc.gov/standards/iso639-2/php/code_list.php">ISO 639-2</a>. * * @todo get some informed advice on which {@link Analyzer}s map onto which * language codes. * * @todo thread safety? Analyzers produce token processors so maybe there is * no problem here once things are initialized. If so, maybe this * could be static. * * @todo configuration. Could be configured by a file containing a class * name and a list of codes that are handled by that class. * * @todo strip language code down to 2/3 characters during lookup. * * @todo There are a lot of pidgins based on french, english, and other * languages that are not being assigned here. */ synchronized private Map<String, AnalyzerConstructor> getAnalyzers() { if (analyzers != null) { return analyzers; } analyzers = new HashMap<String, AnalyzerConstructor>(); final Set<?> emptyStopwords = Collections.EMPTY_SET; { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new BrazilianAnalyzer(Version.LUCENE_CURRENT) : new BrazilianAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("por", a); analyzers.put("pt", a); } /* * Claims to handle Chinese. Does single character extraction. Claims to * produce smaller indices as a result. * * Note: you can not tokenize with the Chinese analyzer and the do * search using the CJK analyzer and visa versa. * * Note: I have no idea whether this would work for Japanese and Korean * as well. I expect so, but no real clue. 
*/ { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return new ChineseAnalyzer(); } }; analyzers.put("zho", a); analyzers.put("chi", a); analyzers.put("zh", a); } /* * Claims to handle Chinese, Japanese, Korean. Does double character * extraction with overlap. */ { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new CJKAnalyzer(Version.LUCENE_CURRENT) : new CJKAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; // analyzers.put("zho", a); // analyzers.put("chi", a); // analyzers.put("zh", a); analyzers.put("jpn", a); analyzers.put("ja", a); analyzers.put("jpn", a); analyzers.put("kor", a); analyzers.put("ko", a); } { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new CzechAnalyzer(Version.LUCENE_CURRENT) : new CzechAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("ces", a); analyzers.put("cze", a); analyzers.put("cs", a); } { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new DutchAnalyzer(Version.LUCENE_CURRENT) : new DutchAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("dut", a); analyzers.put("nld", a); analyzers.put("nl", a); } { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new FrenchAnalyzer(Version.LUCENE_CURRENT) : new FrenchAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("fra", a); analyzers.put("fre", a); analyzers.put("fr", a); } /* * Note: There are a lot of language codes for German variants that * might be useful here. */ { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? 
new GermanAnalyzer(Version.LUCENE_CURRENT) : new GermanAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("deu", a); analyzers.put("ger", a); analyzers.put("de", a); } // Note: ancient greek has a different code (grc). { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new GreekAnalyzer(Version.LUCENE_CURRENT) : new GreekAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("gre", a); analyzers.put("ell", a); analyzers.put("el", a); } // @todo what about other Cyrillic scripts? { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new RussianAnalyzer(Version.LUCENE_CURRENT) : new RussianAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("rus", a); analyzers.put("ru", a); } { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return new ThaiAnalyzer(Version.LUCENE_CURRENT); } }; analyzers.put("tha", a); analyzers.put("th", a); } // English { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new StandardAnalyzer(Version.LUCENE_CURRENT) : new StandardAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("eng", a); analyzers.put("en", a); /* * Note: There MUST be an entry under the empty string (""). This * entry will be requested when there is no entry for the specified * language code. */ analyzers.put("", a); } return analyzers; }
From source file:com.bizosys.hsearch.inpipe.TokenizeNonEnglish.java
License:Apache License
/**
 * Registers one analyzer per supported non-English language in
 * {@code languageMap}, keyed by a two-letter code. All analyzers are created
 * with {@code LuceneConstants.version}.
 * <p>
 * The {@code conf} argument is not read by this method.
 * <p>
 * NOTE(review): the keys "br" and "cz" are not the ISO 639-1 codes for
 * Portuguese ("pt") and Czech ("cs") - confirm that callers look analyzers
 * up with these same keys.
 *
 * @param conf the configuration; unused here
 * @throws ApplicationFault declared by the signature; not thrown directly by this body
 * @throws SystemFault declared by the signature; not thrown directly by this body
 */
public void init(Configuration conf) throws ApplicationFault, SystemFault {
    languageMap.put("br", new BrazilianAnalyzer(LuceneConstants.version));
    languageMap.put("cz", new CzechAnalyzer(LuceneConstants.version));
    languageMap.put("nl", new DutchAnalyzer(LuceneConstants.version));
    languageMap.put("fr", new FrenchAnalyzer(LuceneConstants.version));
    languageMap.put("de", new GermanAnalyzer(LuceneConstants.version));
    languageMap.put("el", new GreekAnalyzer(LuceneConstants.version));
    languageMap.put("ru", new RussianAnalyzer(LuceneConstants.version));
    languageMap.put("th", new ThaiAnalyzer(LuceneConstants.version));
}
From source file:de.berlinbuzzwords.AnalyzerExampleTest.java
License:Apache License
/**
 * Prints the terms that {@link GermanAnalyzer} produces for a German sample
 * sentence.
 *
 * @throws IOException declared by the signature; propagated from the printer call
 */
@Test
public void testGermanAnalysis() throws IOException {
    // The sample must contain the real German words (o-umlaut, sharp s, u-umlaut).
    // They are written as Unicode escapes so that the source-file encoding
    // cannot drop them again.
    String german = "Das Oktoberfest ist das gr\u00f6\u00dfte Volksfest der Welt und es findet in der bayerischen Landeshauptstadt M\u00fcnchen.";
    printer.printTerms(new GermanAnalyzer(Version.LUCENE_43), german);
}
From source file:de.innovationgate.wgpublisher.lucene.LuceneManager.java
License:Open Source License
/** * wga-configuration has changed, read new configuration and do necessary index updates *//*from ww w . j av a2 s . c o m*/ public synchronized void configurationHasChanged(Set newConnectedDBKeys) { if (!_started) { // skip config update if lucene manager has not been started (method startup called) yet // this happens on an initial WGA startup return; } if (_core.getWgaConfiguration().getLuceneManagerConfiguration().isUseLanguageAnalyzers()) { _core.addAnalyzerMapping("de", new GermanAnalyzer(org.apache.lucene.util.Version.LUCENE_35)); _core.addAnalyzerMapping("en", new EnglishAnalyzer(org.apache.lucene.util.Version.LUCENE_35)); _core.addAnalyzerMapping("it", new ItalianAnalyzer(org.apache.lucene.util.Version.LUCENE_35)); _core.addAnalyzerMapping("fr", new FrenchAnalyzer(org.apache.lucene.util.Version.LUCENE_35)); _core.addAnalyzerMapping("es", new SpanishAnalyzer(org.apache.lucene.util.Version.LUCENE_35)); } else { _core.removeAllAnalyzerMappings(); } _indexReleasedOnly = _core.getWgaConfiguration().getLuceneManagerConfiguration() .isIndexReleasedContentsOnly(); // check if each DB in _indexedDBKeys is in configfile and enabled by wga // if not create dropRequest Iterator itIndexedDbKeys = _indexedDbs.keySet().iterator(); while (itIndexedDbKeys.hasNext()) { String dbKey = (String) itIndexedDbKeys.next(); ContentStore dbConfig = _core.getWgaConfiguration().getContentStore(dbKey); if (dbConfig == null) { // indexed db not found in config, remove db and drop from index removeDatabase(dbKey, true); // remove from indexed dbs, cannot be done in removeDatabase() because of current iteration over indexedDbKeys //itIndexedDbKeys.remove(); // now done in removeDatabase via copy-replace } else if (!dbConfig.isEnabled()) { // if db was disabled, only remove from indexedDbs - do not drop index //itIndexedDbKeys.remove(); removeDatabase(dbKey, false); } } // get all active databases from core Iterator contentDbs = _core.getContentdbs().values().iterator(); while 
(contentDbs.hasNext()) { WGDatabase db = (WGDatabase) contentDbs.next(); // use db only if it is a real contentStore (has feature FULLCONTENTFEATURES) if ((db != null) && (db.hasFeature(WGDatabase.FEATURE_FULLCONTENTFEATURES))) { // WGA Plugins are not fulltext indexed if (db.getDbReference().startsWith(PluginConfig.PLUGIN_DBKEY_PREFIX)) { continue; } // If db not yet connected, listen for connect event and execute this method again when it happens if (!db.isConnected()) { db.addDatabaseConnectListener(this); continue; } createOrUpdateDBIndex(db, newConnectedDBKeys); } } }
From source file:fr.lipn.yasemir.Yasemir.java
License:Open Source License
/** * Initialisation method to be called before every action * @param configFile//from www . ja v a 2 s. c o m */ public static void init(String configFile) { System.err.println("Reading config file..."); ConfigurationHandler.init(configFile); //setting paths YASEMIR_HOME = ConfigurationHandler.YASEMIR_HOME; INDEX_DIR = YASEMIR_HOME + System.getProperty("file.separator") + ConfigurationHandler.INDEXDIR; TERM_DIR = YASEMIR_HOME + System.getProperty("file.separator") + ConfigurationHandler.TERMIDXDIR; //TERM_DIR=INDEX_DIR+System.getProperty("file.separator")+ConfigurationHandler.TERMIDXDIR; COLLECTION_DIR = ConfigurationHandler.CORPUSDIR; idField = ConfigurationHandler.DOCIDFIELD; ID_ASATTR = ConfigurationHandler.IDFIELD_ASATTR; DOC_DELIM = ConfigurationHandler.DOC_DELIM; COLLECTION_LANG = ConfigurationHandler.CORPUSLANG; if (COLLECTION_LANG.equals("fr")) analyzer = new FrenchAnalyzer(Version.LUCENE_44); else if (COLLECTION_LANG.equals("it")) analyzer = new ItalianAnalyzer(Version.LUCENE_44); else if (COLLECTION_LANG.equals("es")) analyzer = new SpanishAnalyzer(Version.LUCENE_44); else if (COLLECTION_LANG.equals("de")) analyzer = new GermanAnalyzer(Version.LUCENE_44); else if (COLLECTION_LANG.equals("pt")) analyzer = new PortugueseAnalyzer(Version.LUCENE_44); else if (COLLECTION_LANG.equals("ca")) analyzer = new CatalanAnalyzer(Version.LUCENE_44); else if (COLLECTION_LANG.equals("nl")) analyzer = new DutchAnalyzer(Version.LUCENE_44); else if (COLLECTION_LANG.equals("ar")) analyzer = new ArabicAnalyzer(Version.LUCENE_44); else analyzer = new EnglishAnalyzer(Version.LUCENE_44); //setting search mode String sm = ConfigurationHandler.SEARCH_MODE; if (sm != null) { if (sm.equalsIgnoreCase("semantic")) MODE = SEMANTIC; else if (sm.equalsIgnoreCase("hybrid")) MODE = HYBRID; else MODE = CLASSIC; } //setting concept similarity measure String smm = ConfigurationHandler.SIM_MEASURE; if (smm != null) { if (smm.equalsIgnoreCase("pg1")) SIM_MEASURE = ConceptSimilarity.PROXYGENEA1; 
else if (smm.equalsIgnoreCase("pg2")) SIM_MEASURE = ConceptSimilarity.PROXYGENEA2; else if (smm.equalsIgnoreCase("pg3")) SIM_MEASURE = ConceptSimilarity.PROXYGENEA3; else SIM_MEASURE = ConceptSimilarity.WU; } //setting concept weights String cw = ConfigurationHandler.CONCEPTWEIGHT; if (cw != null) { if (cw.equalsIgnoreCase("fixed")) CONCEPT_WEIGHTS = ClassWeightHandler.FIXED; else if (cw.equalsIgnoreCase("idf")) CONCEPT_WEIGHTS = ClassWeightHandler.IDF; else if (cw.equalsIgnoreCase("prob")) CONCEPT_WEIGHTS = ClassWeightHandler.PROB; else if (cw.equalsIgnoreCase("gauss")) CONCEPT_WEIGHTS = ClassWeightHandler.GAUSSPROB; } //setting annotator ANNOTATOR = ConfigurationHandler.ANNOTENGINE; annotator = new SentenceBasedAnnotator(TERM_DIR); //annotator=new KNNAnnotator(TERM_DIR); //TODO: not finished (select annotator depending on configuration file) try { Class<?> cls = Class.forName(ANNOTATOR); Constructor<?> constructor = cls.getConstructor(String.class); annotator = (SemanticAnnotator) constructor.newInstance(TERM_DIR); //Object instance = constructor.newInstance("stringparam"); } catch (Exception e) { e.printStackTrace(); System.err.println( "[YaSemIR]: failed to load the specified annotator, falling back to IndexBasedAnnotator"); annotator = annotator = new SentenceBasedAnnotator(TERM_DIR); } //setting ngrams enabled or not CKPD_ENABLED = ConfigurationHandler.NGRAMS_ENABLED; //setting semantic fields semBalises = new HashSet<String>(); semBalises.addAll(ConfigurationHandler.getSemanticFields()); //setting classic fields clsBalises = new HashSet<String>(); clsBalises.addAll(ConfigurationHandler.getClassicFields()); //setting score type SCORE = ConfigurationHandler.SCORE; //setting ontologies and terminologies System.err.println("[YaSemIR]: Loading Knowledge Battery..."); HashMap<String, String> ontoSKOSconf = ConfigurationHandler.getOntologySKOSMap(); HashMap<String, String> ontoRootconf = ConfigurationHandler.getOntologyRootMap(); for (String ontoLoc : 
ontoSKOSconf.keySet()) { String ontoRoot = ontoRootconf.get(ontoLoc); Ontology o = null; if (ontoRoot.trim().isEmpty()) o = new Ontology(ontoLoc); else o = new Ontology(ontoLoc, ontoRoot); System.err.println("[YaSemIR]: loaded ontology: " + o.getBaseAddr() + " at " + ontoLoc); String termPath = ontoSKOSconf.get(ontoLoc); SKOSTerminology t = null; if (!termPath.trim().isEmpty()) { System.err.println("[YaSemIR]: loading terminology from " + termPath); t = new SKOSTerminology(o.getOntologyID(), termPath); } else { System.err.println("[YaSemIR]: no terminology provided: generating trivial terminology from " + o.getBaseAddr() + "..."); t = o.generateTerminology(); } System.err.println("[YaSemIR]: loaded terminology: " + t.getTerminologyID()); KnowledgeBattery.addOntology(o, t); } if (INDEXING_MODE) KnowledgeBattery.createTermIndex(); System.err.println("[YaSemIR]: Done."); }
From source file:it.unipd.dei.ims.lucene.clef.AnalyzerFactory.java
License:Apache License
public static Analyzer createAnalyzer(String language, String stemmer, CharArraySet stopset) { Analyzer analyzer;/* w w w. jav a 2 s . co m*/ if (stemmer.equalsIgnoreCase("NONE")) { analyzer = new StandardAnalyzer(stopset); } else { // otherwise use language-specific analyzer switch (language) { case "bg": analyzer = new BulgarianAnalyzer(stopset); break; case "de": analyzer = new GermanAnalyzer(stopset); break; case "es": analyzer = new SpanishAnalyzer(stopset); break; case "fa": analyzer = new PersianAnalyzer(stopset); break; case "fi": analyzer = new FinnishAnalyzer(stopset); break; case "fr": analyzer = new FrenchAnalyzer(stopset); break; case "hu": analyzer = new HungarianAnalyzer(stopset); break; case "it": analyzer = new ItalianAnalyzer(stopset); break; case "nl": analyzer = new DutchAnalyzer(stopset); break; case "pt": analyzer = new PortugueseAnalyzer(stopset); break; case "ru": analyzer = new RussianAnalyzer(stopset); break; case "sv": analyzer = new SwedishAnalyzer(stopset); break; default: throw new UnsupportedOperationException("Language not supported yet"); } } return analyzer; }
From source file:net.mad.ads.server.utils.http.KeywordUtils.java
License:Open Source License
public static List<String> getTokens(String queryString) { try {//from w ww.j a v a 2 s . c o m GermanAnalyzer a = new GermanAnalyzer(Version.LUCENE_33); TokenStream ts = a.tokenStream("", new StringReader(queryString)); List<String> tokens = new ArrayList<String>(); CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class); ts.reset(); while (ts.incrementToken()) { String token = termAtt.toString(); tokens.add(token); } ts.end(); ts.close(); return tokens; } catch (IOException e) { logger.error("", e); } return null; }
From source file:org.apache.jackrabbit.core.query.lucene.LanguageCustomizingAnalyzerRegistry.java
License:Open Source License
public LanguageCustomizingAnalyzerRegistry(IndexingConfiguration configuration) { this.configuration = configuration; languageToAnalyzer.put("ar", new AnalyzerWrapper(new ArabicAnalyzer(Version.LUCENE_30), true)); languageToAnalyzer.put("br", new AnalyzerWrapper(new BrazilianAnalyzer(Version.LUCENE_30), true)); languageToAnalyzer.put("cjk", new AnalyzerWrapper(new CJKAnalyzer(Version.LUCENE_30), true)); languageToAnalyzer.put("cn", new AnalyzerWrapper(new ChineseAnalyzer(), true)); languageToAnalyzer.put("cz", new AnalyzerWrapper(new CzechAnalyzer(Version.LUCENE_30), true)); languageToAnalyzer.put("de", new AnalyzerWrapper(new GermanAnalyzer(Version.LUCENE_30), true)); languageToAnalyzer.put("el", new AnalyzerWrapper(new GreekAnalyzer(Version.LUCENE_30), true)); languageToAnalyzer.put("en", new AnalyzerWrapper( new SnowballAnalyzer(Version.LUCENE_30, "English", StopAnalyzer.ENGLISH_STOP_WORDS_SET), true)); languageToAnalyzer.put("fa", new AnalyzerWrapper(new PersianAnalyzer(Version.LUCENE_30), true)); languageToAnalyzer.put("fr", new AnalyzerWrapper(new FrenchAnalyzer(Version.LUCENE_30), true)); languageToAnalyzer.put("nl", new AnalyzerWrapper(new DutchAnalyzer(Version.LUCENE_30), true)); languageToAnalyzer.put("ru", new AnalyzerWrapper(new RussianAnalyzer(Version.LUCENE_30), true)); languageToAnalyzer.put("th", new AnalyzerWrapper(new ThaiAnalyzer(Version.LUCENE_30), true)); }
From source file:org.compass.core.lucene.engine.analyzer.ExtendedAnalyzerBuilderDelegate.java
License:Apache License
/**
 * Builds the analyzer for one of the extended (language-specific) analyzer
 * types.
 * <p>
 * The type is read from the {@code LuceneEnvironment.Analyzer.TYPE} setting
 * and compared case-insensitively against the extended types. Where the
 * analyzer takes stopwords, they are obtained from
 * {@code analyzerFactory.parseStopWords} using the analyzer's own stopword
 * list.
 *
 * @param analyzerName the analyzer name, passed on to the stopword parsing
 * @param settings the settings holding the analyzer type and stopwords
 * @param analyzerFactory the factory used to parse the stopwords
 * @return the analyzer for the configured type, or {@code null} if the type
 *         is none of the extended types handled here
 * @throws SearchEngineException declared by the signature; not thrown directly by this body
 */
public Analyzer buildAnalyzer(String analyzerName, CompassSettings settings,
        DefaultLuceneAnalyzerFactory analyzerFactory) throws SearchEngineException {

    // Presumably the second argument is the value used when the type
    // setting is absent - confirm against CompassSettings.getSetting.
    final String requestedType = settings.getSetting(LuceneEnvironment.Analyzer.TYPE,
            LuceneEnvironment.Analyzer.CoreTypes.STANDARD);

    if (LuceneEnvironment.Analyzer.ExtendedTypes.BRAZILIAN.equalsIgnoreCase(requestedType)) {
        return new BrazilianAnalyzer(
                analyzerFactory.parseStopWords(analyzerName, settings, BrazilianAnalyzer.BRAZILIAN_STOP_WORDS));
    }
    if (LuceneEnvironment.Analyzer.ExtendedTypes.CJK.equalsIgnoreCase(requestedType)) {
        return new CJKAnalyzer(
                analyzerFactory.parseStopWords(analyzerName, settings, CJKAnalyzer.STOP_WORDS));
    }
    if (LuceneEnvironment.Analyzer.ExtendedTypes.CHINESE.equalsIgnoreCase(requestedType)) {
        return new ChineseAnalyzer();
    }
    if (LuceneEnvironment.Analyzer.ExtendedTypes.CZECH.equalsIgnoreCase(requestedType)) {
        return new CzechAnalyzer(
                analyzerFactory.parseStopWords(analyzerName, settings, CzechAnalyzer.CZECH_STOP_WORDS));
    }
    if (LuceneEnvironment.Analyzer.ExtendedTypes.GERMAN.equalsIgnoreCase(requestedType)) {
        return new GermanAnalyzer(
                analyzerFactory.parseStopWords(analyzerName, settings, GermanAnalyzer.GERMAN_STOP_WORDS));
    }
    if (LuceneEnvironment.Analyzer.ExtendedTypes.GREEK.equalsIgnoreCase(requestedType)) {
        return new GreekAnalyzer();
    }
    if (LuceneEnvironment.Analyzer.ExtendedTypes.FRENCH.equalsIgnoreCase(requestedType)) {
        return new FrenchAnalyzer(
                analyzerFactory.parseStopWords(analyzerName, settings, FrenchAnalyzer.FRENCH_STOP_WORDS));
    }
    if (LuceneEnvironment.Analyzer.ExtendedTypes.DUTCH.equalsIgnoreCase(requestedType)) {
        return new DutchAnalyzer(
                analyzerFactory.parseStopWords(analyzerName, settings, DutchAnalyzer.DUTCH_STOP_WORDS));
    }
    if (LuceneEnvironment.Analyzer.ExtendedTypes.RUSSIAN.equalsIgnoreCase(requestedType)) {
        return new RussianAnalyzer();
    }

    // Not an extended type.
    return null;
}