List of usage examples for org.apache.lucene.analysis.it.ItalianAnalyzer
public ItalianAnalyzer(CharArraySet stopwords)
From source file:calliope.search.AeseSearch.java
License:Open Source License
/** * Create a language-specific analyzer//from w w w . j a v a2s . c om * @param langCode the language code * @return an analyzer for that language */ private static Analyzer createAnalyzer(String langCode) { if (langCode.equals("it")) return new ItalianAnalyzer(Version.LUCENE_45); else // add other analyzers here return new StandardAnalyzer(Version.LUCENE_45); }
From source file:ch.admin.isb.hermes5.business.search.AnalyserRepository.java
License:Apache License
/**
 * Returns the Lucene analyzer for the given language code.
 * Falls back to German for any unrecognized code.
 *
 * @param lang language code ("fr", "it", "en"; anything else maps to German)
 * @return the matching language-specific analyzer
 */
public Analyzer getAnalyzer(String lang) {
    switch (lang) {
    case "fr":
        return new FrenchAnalyzer(Version.LUCENE_47);
    case "it":
        return new ItalianAnalyzer(Version.LUCENE_47);
    case "en":
        return new EnglishAnalyzer(Version.LUCENE_47);
    default:
        return new GermanAnalyzer(Version.LUCENE_47);
    }
}
From source file:com.github.tteofili.looseen.TestWikipediaClassification.java
License:Apache License
@Test public void testItalianWikipedia() throws Exception { String indexProperty = System.getProperty("index"); if (indexProperty != null) { try {/*from ww w. j ava2s. c o m*/ index = Boolean.valueOf(indexProperty); } catch (Exception e) { // ignore } } String splitProperty = System.getProperty("split"); if (splitProperty != null) { try { split = Boolean.valueOf(splitProperty); } catch (Exception e) { // ignore } } Path mainIndexPath = Paths.get(INDEX + "/original"); Directory directory = FSDirectory.open(mainIndexPath); Path trainPath = Paths.get(INDEX + "/train"); Path testPath = Paths.get(INDEX + "/test"); Path cvPath = Paths.get(INDEX + "/cv"); FSDirectory cv = null; FSDirectory test = null; FSDirectory train = null; DirectoryReader testReader = null; if (split) { cv = FSDirectory.open(cvPath); test = FSDirectory.open(testPath); train = FSDirectory.open(trainPath); } if (index) { delete(mainIndexPath); if (split) { delete(trainPath, testPath, cvPath); } } IndexReader reader = null; try { Collection<String> stopWordsList = Arrays.asList("di", "a", "da", "in", "per", "tra", "fra", "il", "lo", "la", "i", "gli", "le"); CharArraySet stopWords = new CharArraySet(stopWordsList, true); Analyzer analyzer = new ItalianAnalyzer(stopWords); if (index) { System.out.format("Indexing Italian Wikipedia...%n"); long startIndex = System.currentTimeMillis(); IndexWriter indexWriter = new IndexWriter(directory, new IndexWriterConfig(analyzer)); importWikipedia(new File(PREFIX + "/itwiki/itwiki-20150405-pages-meta-current1.xml"), indexWriter); importWikipedia(new File(PREFIX + "/itwiki/itwiki-20150405-pages-meta-current2.xml"), indexWriter); importWikipedia(new File(PREFIX + "/itwiki/itwiki-20150405-pages-meta-current3.xml"), indexWriter); importWikipedia(new File(PREFIX + "/itwiki/itwiki-20150405-pages-meta-current4.xml"), indexWriter); long endIndex = System.currentTimeMillis(); System.out.format("Indexed %d pages in %ds %n", indexWriter.maxDoc(), (endIndex - startIndex) / 1000); 
indexWriter.close(); } if (split && !index) { reader = DirectoryReader.open(train); } else { reader = DirectoryReader.open(directory); } if (index && split) { // split the index System.out.format("Splitting the index...%n"); long startSplit = System.currentTimeMillis(); DatasetSplitter datasetSplitter = new DatasetSplitter(0.1, 0); for (LeafReaderContext context : reader.leaves()) { datasetSplitter.split(context.reader(), train, test, cv, analyzer, false, CATEGORY_FIELD, TEXT_FIELD, CATEGORY_FIELD); } reader.close(); reader = DirectoryReader.open(train); // using the train index from now on long endSplit = System.currentTimeMillis(); System.out.format("Splitting done in %ds %n", (endSplit - startSplit) / 1000); } final long startTime = System.currentTimeMillis(); List<Classifier<BytesRef>> classifiers = new LinkedList<>(); classifiers.add(new KNearestNeighborClassifier(reader, new ClassicSimilarity(), analyzer, null, 1, 0, 0, CATEGORY_FIELD, TEXT_FIELD)); classifiers.add(new KNearestNeighborClassifier(reader, new BM25Similarity(), analyzer, null, 1, 0, 0, CATEGORY_FIELD, TEXT_FIELD)); classifiers.add(new KNearestNeighborClassifier(reader, null, analyzer, null, 1, 0, 0, CATEGORY_FIELD, TEXT_FIELD)); classifiers.add(new KNearestNeighborClassifier(reader, new LMDirichletSimilarity(), analyzer, null, 3, 1, 1, CATEGORY_FIELD, TEXT_FIELD)); classifiers.add(new KNearestNeighborClassifier(reader, new LMJelinekMercerSimilarity(0.3f), analyzer, null, 3, 1, 1, CATEGORY_FIELD, TEXT_FIELD)); classifiers.add(new KNearestNeighborClassifier(reader, new ClassicSimilarity(), analyzer, null, 3, 0, 0, CATEGORY_FIELD, TEXT_FIELD)); classifiers.add(new KNearestNeighborClassifier(reader, new ClassicSimilarity(), analyzer, null, 3, 1, 1, CATEGORY_FIELD, TEXT_FIELD)); classifiers.add(new KNearestNeighborClassifier(reader, new DFRSimilarity(new BasicModelG(), new AfterEffectB(), new NormalizationH1()), analyzer, null, 3, 1, 1, CATEGORY_FIELD, TEXT_FIELD)); classifiers.add(new 
KNearestNeighborClassifier(reader, new DFRSimilarity(new BasicModelP(), new AfterEffectL(), new NormalizationH3()), analyzer, null, 3, 1, 1, CATEGORY_FIELD, TEXT_FIELD)); classifiers.add(new KNearestNeighborClassifier(reader, new IBSimilarity(new DistributionSPL(), new LambdaDF(), new Normalization.NoNormalization()), analyzer, null, 3, 1, 1, CATEGORY_FIELD, TEXT_FIELD)); classifiers.add(new KNearestNeighborClassifier(reader, new IBSimilarity(new DistributionLL(), new LambdaTTF(), new NormalizationH1()), analyzer, null, 3, 1, 1, CATEGORY_FIELD, TEXT_FIELD)); classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 5, 1, 100)); classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 10, 1, 100)); classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 15, 1, 100)); classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 15, 3, 100)); classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 15, 3, 300)); classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 5, 3, 100)); classifiers.add(new KNearestFuzzyClassifier(reader, new ClassicSimilarity(), analyzer, null, 3, CATEGORY_FIELD, TEXT_FIELD)); classifiers.add(new KNearestFuzzyClassifier(reader, new ClassicSimilarity(), analyzer, null, 1, CATEGORY_FIELD, TEXT_FIELD)); classifiers.add(new KNearestFuzzyClassifier(reader, new BM25Similarity(), analyzer, null, 3, CATEGORY_FIELD, TEXT_FIELD)); classifiers.add(new KNearestFuzzyClassifier(reader, new BM25Similarity(), analyzer, null, 1, CATEGORY_FIELD, TEXT_FIELD)); classifiers.add(new BM25NBClassifier(reader, analyzer, null, CATEGORY_FIELD, TEXT_FIELD)); classifiers.add(new CachingNaiveBayesClassifier(reader, analyzer, null, CATEGORY_FIELD, TEXT_FIELD)); classifiers.add(new SimpleNaiveBayesClassifier(reader, analyzer, null, CATEGORY_FIELD, TEXT_FIELD)); int maxdoc; if (split) { testReader = DirectoryReader.open(test); maxdoc = testReader.maxDoc(); } else { maxdoc = 
reader.maxDoc(); } System.out.format("Starting evaluation on %d docs...%n", maxdoc); ExecutorService service = Executors.newCachedThreadPool(); List<Future<String>> futures = new LinkedList<>(); for (Classifier<BytesRef> classifier : classifiers) { final IndexReader finalReader = reader; final DirectoryReader finalTestReader = testReader; futures.add(service.submit(() -> { ConfusionMatrixGenerator.ConfusionMatrix confusionMatrix; if (split) { confusionMatrix = ConfusionMatrixGenerator.getConfusionMatrix(finalTestReader, classifier, CATEGORY_FIELD, TEXT_FIELD, 60000 * 30); } else { confusionMatrix = ConfusionMatrixGenerator.getConfusionMatrix(finalReader, classifier, CATEGORY_FIELD, TEXT_FIELD, 60000 * 30); } final long endTime = System.currentTimeMillis(); final int elapse = (int) (endTime - startTime) / 1000; return " * " + classifier + " \n * accuracy = " + confusionMatrix.getAccuracy() + "\n * precision = " + confusionMatrix.getPrecision() + "\n * recall = " + confusionMatrix.getRecall() + "\n * f1-measure = " + confusionMatrix.getF1Measure() + "\n * avgClassificationTime = " + confusionMatrix.getAvgClassificationTime() + "\n * time = " + elapse + " (sec)\n "; })); } for (Future<String> f : futures) { System.out.println(f.get()); } Thread.sleep(10000); service.shutdown(); } finally { try { if (reader != null) { reader.close(); } if (directory != null) { directory.close(); } if (test != null) { test.close(); } if (train != null) { train.close(); } if (cv != null) { cv.close(); } if (testReader != null) { testReader.close(); } } catch (Throwable e) { e.printStackTrace(); } } }
From source file:de.innovationgate.wgpublisher.lucene.LuceneManager.java
License:Open Source License
/** * wga-configuration has changed, read new configuration and do necessary index updates *///ww w . j av a 2 s . c om public synchronized void configurationHasChanged(Set newConnectedDBKeys) { if (!_started) { // skip config update if lucene manager has not been started (method startup called) yet // this happens on an initial WGA startup return; } if (_core.getWgaConfiguration().getLuceneManagerConfiguration().isUseLanguageAnalyzers()) { _core.addAnalyzerMapping("de", new GermanAnalyzer(org.apache.lucene.util.Version.LUCENE_35)); _core.addAnalyzerMapping("en", new EnglishAnalyzer(org.apache.lucene.util.Version.LUCENE_35)); _core.addAnalyzerMapping("it", new ItalianAnalyzer(org.apache.lucene.util.Version.LUCENE_35)); _core.addAnalyzerMapping("fr", new FrenchAnalyzer(org.apache.lucene.util.Version.LUCENE_35)); _core.addAnalyzerMapping("es", new SpanishAnalyzer(org.apache.lucene.util.Version.LUCENE_35)); } else { _core.removeAllAnalyzerMappings(); } _indexReleasedOnly = _core.getWgaConfiguration().getLuceneManagerConfiguration() .isIndexReleasedContentsOnly(); // check if each DB in _indexedDBKeys is in configfile and enabled by wga // if not create dropRequest Iterator itIndexedDbKeys = _indexedDbs.keySet().iterator(); while (itIndexedDbKeys.hasNext()) { String dbKey = (String) itIndexedDbKeys.next(); ContentStore dbConfig = _core.getWgaConfiguration().getContentStore(dbKey); if (dbConfig == null) { // indexed db not found in config, remove db and drop from index removeDatabase(dbKey, true); // remove from indexed dbs, cannot be done in removeDatabase() because of current iteration over indexedDbKeys //itIndexedDbKeys.remove(); // now done in removeDatabase via copy-replace } else if (!dbConfig.isEnabled()) { // if db was disabled, only remove from indexedDbs - do not drop index //itIndexedDbKeys.remove(); removeDatabase(dbKey, false); } } // get all active databases from core Iterator contentDbs = _core.getContentdbs().values().iterator(); while 
(contentDbs.hasNext()) { WGDatabase db = (WGDatabase) contentDbs.next(); // use db only if it is a real contentStore (has feature FULLCONTENTFEATURES) if ((db != null) && (db.hasFeature(WGDatabase.FEATURE_FULLCONTENTFEATURES))) { // WGA Plugins are not fulltext indexed if (db.getDbReference().startsWith(PluginConfig.PLUGIN_DBKEY_PREFIX)) { continue; } // If db not yet connected, listen for connect event and execute this method again when it happens if (!db.isConnected()) { db.addDatabaseConnectListener(this); continue; } createOrUpdateDBIndex(db, newConnectedDBKeys); } } }
From source file:fr.lipn.yasemir.Yasemir.java
License:Open Source License
/** * Initialisation method to be called before every action * @param configFile//w ww. ja v a 2 s . c o m */ public static void init(String configFile) { System.err.println("Reading config file..."); ConfigurationHandler.init(configFile); //setting paths YASEMIR_HOME = ConfigurationHandler.YASEMIR_HOME; INDEX_DIR = YASEMIR_HOME + System.getProperty("file.separator") + ConfigurationHandler.INDEXDIR; TERM_DIR = YASEMIR_HOME + System.getProperty("file.separator") + ConfigurationHandler.TERMIDXDIR; //TERM_DIR=INDEX_DIR+System.getProperty("file.separator")+ConfigurationHandler.TERMIDXDIR; COLLECTION_DIR = ConfigurationHandler.CORPUSDIR; idField = ConfigurationHandler.DOCIDFIELD; ID_ASATTR = ConfigurationHandler.IDFIELD_ASATTR; DOC_DELIM = ConfigurationHandler.DOC_DELIM; COLLECTION_LANG = ConfigurationHandler.CORPUSLANG; if (COLLECTION_LANG.equals("fr")) analyzer = new FrenchAnalyzer(Version.LUCENE_44); else if (COLLECTION_LANG.equals("it")) analyzer = new ItalianAnalyzer(Version.LUCENE_44); else if (COLLECTION_LANG.equals("es")) analyzer = new SpanishAnalyzer(Version.LUCENE_44); else if (COLLECTION_LANG.equals("de")) analyzer = new GermanAnalyzer(Version.LUCENE_44); else if (COLLECTION_LANG.equals("pt")) analyzer = new PortugueseAnalyzer(Version.LUCENE_44); else if (COLLECTION_LANG.equals("ca")) analyzer = new CatalanAnalyzer(Version.LUCENE_44); else if (COLLECTION_LANG.equals("nl")) analyzer = new DutchAnalyzer(Version.LUCENE_44); else if (COLLECTION_LANG.equals("ar")) analyzer = new ArabicAnalyzer(Version.LUCENE_44); else analyzer = new EnglishAnalyzer(Version.LUCENE_44); //setting search mode String sm = ConfigurationHandler.SEARCH_MODE; if (sm != null) { if (sm.equalsIgnoreCase("semantic")) MODE = SEMANTIC; else if (sm.equalsIgnoreCase("hybrid")) MODE = HYBRID; else MODE = CLASSIC; } //setting concept similarity measure String smm = ConfigurationHandler.SIM_MEASURE; if (smm != null) { if (smm.equalsIgnoreCase("pg1")) SIM_MEASURE = ConceptSimilarity.PROXYGENEA1; 
else if (smm.equalsIgnoreCase("pg2")) SIM_MEASURE = ConceptSimilarity.PROXYGENEA2; else if (smm.equalsIgnoreCase("pg3")) SIM_MEASURE = ConceptSimilarity.PROXYGENEA3; else SIM_MEASURE = ConceptSimilarity.WU; } //setting concept weights String cw = ConfigurationHandler.CONCEPTWEIGHT; if (cw != null) { if (cw.equalsIgnoreCase("fixed")) CONCEPT_WEIGHTS = ClassWeightHandler.FIXED; else if (cw.equalsIgnoreCase("idf")) CONCEPT_WEIGHTS = ClassWeightHandler.IDF; else if (cw.equalsIgnoreCase("prob")) CONCEPT_WEIGHTS = ClassWeightHandler.PROB; else if (cw.equalsIgnoreCase("gauss")) CONCEPT_WEIGHTS = ClassWeightHandler.GAUSSPROB; } //setting annotator ANNOTATOR = ConfigurationHandler.ANNOTENGINE; annotator = new SentenceBasedAnnotator(TERM_DIR); //annotator=new KNNAnnotator(TERM_DIR); //TODO: not finished (select annotator depending on configuration file) try { Class<?> cls = Class.forName(ANNOTATOR); Constructor<?> constructor = cls.getConstructor(String.class); annotator = (SemanticAnnotator) constructor.newInstance(TERM_DIR); //Object instance = constructor.newInstance("stringparam"); } catch (Exception e) { e.printStackTrace(); System.err.println( "[YaSemIR]: failed to load the specified annotator, falling back to IndexBasedAnnotator"); annotator = annotator = new SentenceBasedAnnotator(TERM_DIR); } //setting ngrams enabled or not CKPD_ENABLED = ConfigurationHandler.NGRAMS_ENABLED; //setting semantic fields semBalises = new HashSet<String>(); semBalises.addAll(ConfigurationHandler.getSemanticFields()); //setting classic fields clsBalises = new HashSet<String>(); clsBalises.addAll(ConfigurationHandler.getClassicFields()); //setting score type SCORE = ConfigurationHandler.SCORE; //setting ontologies and terminologies System.err.println("[YaSemIR]: Loading Knowledge Battery..."); HashMap<String, String> ontoSKOSconf = ConfigurationHandler.getOntologySKOSMap(); HashMap<String, String> ontoRootconf = ConfigurationHandler.getOntologyRootMap(); for (String ontoLoc : 
ontoSKOSconf.keySet()) { String ontoRoot = ontoRootconf.get(ontoLoc); Ontology o = null; if (ontoRoot.trim().isEmpty()) o = new Ontology(ontoLoc); else o = new Ontology(ontoLoc, ontoRoot); System.err.println("[YaSemIR]: loaded ontology: " + o.getBaseAddr() + " at " + ontoLoc); String termPath = ontoSKOSconf.get(ontoLoc); SKOSTerminology t = null; if (!termPath.trim().isEmpty()) { System.err.println("[YaSemIR]: loading terminology from " + termPath); t = new SKOSTerminology(o.getOntologyID(), termPath); } else { System.err.println("[YaSemIR]: no terminology provided: generating trivial terminology from " + o.getBaseAddr() + "..."); t = o.generateTerminology(); } System.err.println("[YaSemIR]: loaded terminology: " + t.getTerminologyID()); KnowledgeBattery.addOntology(o, t); } if (INDEXING_MODE) KnowledgeBattery.createTermIndex(); System.err.println("[YaSemIR]: Done."); }
From source file:it.unipd.dei.ims.lucene.clef.AnalyzerFactory.java
License:Apache License
public static Analyzer createAnalyzer(String language, String stemmer, CharArraySet stopset) { Analyzer analyzer;// w ww .jav a2 s.co m if (stemmer.equalsIgnoreCase("NONE")) { analyzer = new StandardAnalyzer(stopset); } else { // otherwise use language-specific analyzer switch (language) { case "bg": analyzer = new BulgarianAnalyzer(stopset); break; case "de": analyzer = new GermanAnalyzer(stopset); break; case "es": analyzer = new SpanishAnalyzer(stopset); break; case "fa": analyzer = new PersianAnalyzer(stopset); break; case "fi": analyzer = new FinnishAnalyzer(stopset); break; case "fr": analyzer = new FrenchAnalyzer(stopset); break; case "hu": analyzer = new HungarianAnalyzer(stopset); break; case "it": analyzer = new ItalianAnalyzer(stopset); break; case "nl": analyzer = new DutchAnalyzer(stopset); break; case "pt": analyzer = new PortugueseAnalyzer(stopset); break; case "ru": analyzer = new RussianAnalyzer(stopset); break; case "sv": analyzer = new SwedishAnalyzer(stopset); break; default: throw new UnsupportedOperationException("Language not supported yet"); } } return analyzer; }
From source file:perLucene.Server.java
License:Open Source License
/**
 * Populates the {@code ha} map with one language-specific analyzer per
 * supported ISO code (Lucene 4.1 compatibility version).
 */
private static void initAnalyzers() {
    final Version v = Version.LUCENE_41;
    ha = new HashMap<String, Analyzer>();
    ha.put("ar", new ArabicAnalyzer(v));
    ha.put("el", new GreekAnalyzer(v));
    ha.put("bg", new BulgarianAnalyzer(v));
    ha.put("br", new BrazilianAnalyzer(v));
    ha.put("ca", new CatalanAnalyzer(v));
    ha.put("cz", new CzechAnalyzer(v));
    ha.put("da", new DanishAnalyzer(v));
    ha.put("de", new GermanAnalyzer(v));
    ha.put("en", new EnglishAnalyzer(v));
    ha.put("es", new SpanishAnalyzer(v));
    ha.put("eu", new BasqueAnalyzer(v));
    ha.put("fa", new PersianAnalyzer(v));
    ha.put("fi", new FinnishAnalyzer(v));
    ha.put("fr", new FrenchAnalyzer(v));
    ha.put("ga", new IrishAnalyzer(v));
    ha.put("gl", new GalicianAnalyzer(v));
    ha.put("hi", new HindiAnalyzer(v));
    ha.put("hu", new HungarianAnalyzer(v));
    ha.put("hy", new ArmenianAnalyzer(v));
    ha.put("id", new IndonesianAnalyzer(v));
    ha.put("it", new ItalianAnalyzer(v));
    ha.put("lv", new LatvianAnalyzer(v));
    ha.put("nl", new DutchAnalyzer(v));
    ha.put("no", new NorwegianAnalyzer(v));
    ha.put("pt", new PortugueseAnalyzer(v));
    ha.put("ro", new RomanianAnalyzer(v));
    ha.put("ru", new RussianAnalyzer(v));
    ha.put("sv", new SwedishAnalyzer(v));
    ha.put("th", new ThaiAnalyzer(v));
    ha.put("tr", new TurkishAnalyzer(v));
    ha.put("cn", new SmartChineseAnalyzer(v));
}