Usage examples for org.apache.lucene.index.IndexWriterConfig.setSimilarity
public IndexWriterConfig setSimilarity(Similarity similarity)
From source file:it.unipd.dei.ims.lucene.clef.applications.BuildIndex.java
License:Apache License
public static void main(String[] args) { Properties properties = new Properties(); InputStream input = null;/*from w w w . j ava 2 s. co m*/ try { if (System.getProperty("properties.path") != null) { input = new FileInputStream(System.getProperty("properties.path")); properties.load(input); } else { logger.info("Loading default property file [resources/lucene-clef.properties]"); ClassLoader loader = Thread.currentThread().getContextClassLoader(); input = loader.getResourceAsStream("lucene-clef.properties"); properties.load(input); } } catch (IOException ex) { ex.printStackTrace(); } finally { if (input != null) { try { input.close(); } catch (IOException e) { e.printStackTrace(); } } } properties.putAll(System.getProperties()); String language = properties.getProperty("language"); String stemmer = properties.getProperty("stemmer"); String stopsetType = properties.getProperty("stopset.type"); String stopsetPath = null; if (stopsetType.equalsIgnoreCase("CUSTOM")) { stopsetPath = properties.getProperty("stopset.path"); } String corporaRootPath = properties.getProperty("corpora.path"); int corpusSize = Integer.parseInt(properties.getProperty(language + ".corpus.size")); String[] corpora = properties.getProperty(language + ".corpora").split(";"); TrecContentSource trecContentSource = new TrecContentSource(); try { Properties configProps = new Properties(); configProps.setProperty("trec.doc.parser", "it.unipd.dei.ims.lucene.clef.parser.ClefDocParser"); configProps.setProperty("content.source.verbose", "false"); configProps.setProperty("content.source.forever", "false"); configProps.setProperty("content.source.excludeIteration", "true"); configProps.setProperty("work.dir", new File(".").getAbsolutePath()); configProps.setProperty("language", language); configProps.setProperty("stemmer", stemmer); configProps.setProperty("stopset_type", stopsetType); configProps.setProperty("stopset_path", stopsetPath); // set lucene index directory Path indexPath = new 
File(properties.getProperty("index.path")).toPath(); Directory directory = new SimpleFSDirectory(indexPath); // indexing configuration CharArraySet stopset = AnalyzerFactory.createStopset(language, stopsetType, stopsetPath); Analyzer analyzer = AnalyzerFactory.createAnalyzer(language, stemmer, stopset); IndexWriterConfig conf = new IndexWriterConfig(analyzer); conf.setSimilarity(new BM25Similarity()); conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE); IndexWriter indexWriter = new IndexWriter(directory, conf); boolean storePositions = true; FieldType bodyFieldType = new FieldType(); if (storePositions) { bodyFieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); } else { bodyFieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS); } for (String corpus : corpora) { int docCount = 0; logger.info("... indexing corpus " + corpus); try { configProps.setProperty("docs.dir", corporaRootPath + "/" + corpus); configProps.setProperty("content.source.encoding", properties.getProperty(corpus + ".encoding", "UTF-8")); trecContentSource.setConfig(new Config(configProps)); DocData docData = new DocData(); while ((docData = trecContentSource.getNextDocData(docData)) != null) { docCount++; // System.out.println("ID: "+docData.getName()); // System.out.println("BODY: "+docData.getBody()); Document doc = getDocumentFromDocData(docData, bodyFieldType); indexWriter.addDocument(doc); } } catch (NoMoreDataException e) { logger.info("... " + docCount + " documents indexed for corpus " + corpus + "\n"); } } indexWriter.close(); DirectoryReader ireader = DirectoryReader.open(directory); if (corpusSize != ireader.numDocs()) { throw new Exception("The number of documents indexed is " + ireader.numDocs() + ", but should be " + corpusSize); } logger.info("Number of documents: " + ireader.numDocs()); } catch (IOException e) { e.printStackTrace(); } catch (Exception e) { e.printStackTrace(); } }
From source file:nl.uva.DataHandler.Indexing.java
@Override public void Indexer(String indexPathString) throws Exception, Throwable { try {//from w ww.j a v a2 s . c o m log.info( "----------------------- INDEXING - Override Version: for BM25 :) --------------------------"); Path ipath = FileSystems.getDefault().getPath(indexPathString); super.IndexesCleaner(indexPathString); MyAnalyzer myAnalyzer; if (!super.stopWordsRemoving) myAnalyzer = new MyAnalyzer(super.stemming); else myAnalyzer = new MyAnalyzer(super.stemming, super.LoadStopwords()); Analyzer analyzer = myAnalyzer.getAnalyzer(configFile.getProperty("CORPUS_LANGUAGE")); PerFieldAnalyzerWrapper prfWrapper = new PerFieldAnalyzerWrapper(analyzer, super.analyzerMap); IndexWriterConfig irc = new IndexWriterConfig(prfWrapper); irc.setSimilarity(new BM25Similarity(1.2F, 0.75F)); this.writer = new IndexWriter(new SimpleFSDirectory(ipath), irc); this.docIndexer(); this.writer.commit(); this.writer.close(); analyzer.close(); prfWrapper.close(); log.info("-------------------------------------------------"); log.info("Index is created successfully..."); log.info("-------------------------------------------------"); } catch (Exception ex) { log.error(ex); throw ex; } }
From source file:org.apache.jackrabbit.core.query.lucene.AbstractIndex.java
License:Apache License
/** * Returns an <code>IndexWriter</code> on this index. * @return an <code>IndexWriter</code> on this index. * @throws IOException if the writer cannot be obtained. *//*from www . ja v a2 s . c o m*/ protected synchronized IndexWriter getIndexWriter() throws IOException { if (indexReader != null) { indexReader.close(); log.debug("closing IndexReader."); indexReader = null; } if (indexWriter == null) { IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36, analyzer); config.setSimilarity(similarity); LogMergePolicy mergePolicy = new LogByteSizeMergePolicy(); mergePolicy.setUseCompoundFile(useCompoundFile); mergePolicy.setNoCFSRatio(1.0); config.setMergePolicy(mergePolicy); indexWriter = new IndexWriter(getDirectory(), config); indexWriter.setInfoStream(STREAM_LOGGER); } return indexWriter; }
From source file:org.apache.solr.update.SolrIndexConfig.java
License:Apache License
public IndexWriterConfig toIndexWriterConfig(IndexSchema schema) { // so that we can update the analyzer on core reload, we pass null // for the default analyzer, and explicitly pass an analyzer on // appropriate calls to IndexWriter IndexWriterConfig iwc = new IndexWriterConfig(luceneVersion, null); if (maxBufferedDocs != -1) iwc.setMaxBufferedDocs(maxBufferedDocs); if (ramBufferSizeMB != -1) iwc.setRAMBufferSizeMB(ramBufferSizeMB); if (termIndexInterval != -1) iwc.setTermIndexInterval(termIndexInterval); if (writeLockTimeout != -1) iwc.setWriteLockTimeout(writeLockTimeout); iwc.setSimilarity(schema.getSimilarity()); iwc.setMergePolicy(buildMergePolicy(schema)); iwc.setMergeScheduler(buildMergeScheduler(schema)); iwc.setInfoStream(infoStream);/*ww w.j a v a 2s . c om*/ // do this after buildMergePolicy since the backcompat logic // there may modify the effective useCompoundFile iwc.setUseCompoundFile(getUseCompoundFile()); if (maxIndexingThreads != -1) { iwc.setMaxThreadStates(maxIndexingThreads); } if (mergedSegmentWarmerInfo != null) { // TODO: add infostream -> normal logging system (there is an issue somewhere) IndexReaderWarmer warmer = schema.getResourceLoader().newInstance(mergedSegmentWarmerInfo.className, IndexReaderWarmer.class, null, new Class[] { InfoStream.class }, new Object[] { iwc.getInfoStream() }); iwc.setMergedSegmentWarmer(warmer); } return iwc; }
From source file:org.codice.ddf.spatial.geocoding.index.GeoNamesLuceneIndexer.java
License:Open Source License
IndexWriter createIndexWriter(final boolean create, final Directory directory) throws IOException { final IndexWriterConfig indexWriterConfig = new IndexWriterConfig(ANALYZER); // Set to CREATE mode if the index does not exist. if (!DirectoryReader.indexExists(directory)) { indexWriterConfig.setOpenMode(OpenMode.CREATE); } else {/*from w w w. j a v a 2 s . c om*/ indexWriterConfig.setOpenMode(create ? OpenMode.CREATE : OpenMode.APPEND); } indexWriterConfig.setSimilarity(SIMILARITY); return new IndexWriter(directory, indexWriterConfig); }
From source file:org.elasticsearch.index.engine.internal.AsynchronousEngine.java
License:Apache License
private IndexWriter createWriter() throws IOException { try {//from w ww . ja v a 2 s. c om boolean create = !Lucene.indexExists(store.directory()); IndexWriterConfig config = new IndexWriterConfig(Lucene.VERSION, analysisService.defaultIndexAnalyzer()); config.setOpenMode(create ? IndexWriterConfig.OpenMode.CREATE : IndexWriterConfig.OpenMode.APPEND); config.setIndexDeletionPolicy(deletionPolicy); config.setInfoStream(new LoggerInfoStream(indexSettings, shardId)); config.setMergeScheduler(mergeScheduler.newMergeScheduler()); MergePolicy mergePolicy = mergePolicyProvider.getMergePolicy(); // Give us the opportunity to upgrade old segments while performing // background merges mergePolicy = new ElasticsearchMergePolicy(mergePolicy); config.setMergePolicy(mergePolicy); config.setSimilarity(similarityService.similarity()); config.setRAMBufferSizeMB(indexingBufferSize.mbFrac()); config.setMaxThreadStates(indexConcurrency); config.setCodec(codecService.codec(codecName)); /* We set this timeout to a highish value to work around * the default poll interval in the Lucene lock that is * 1000ms by default. We might need to poll multiple times * here but with 1s poll this is only executed twice at most * in combination with the default writelock timeout*/ config.setWriteLockTimeout(5000); config.setUseCompoundFile(this.compoundOnFlush); config.setCheckIntegrityAtMerge(checksumOnMerge); // Warm-up hook for newly-merged segments. 
Warming up segments here is better since it will be performed at the end // of the merge operation and won't slow down _refresh config.setMergedSegmentWarmer(new IndexReaderWarmer() { @Override public void warm(AtomicReader reader) throws IOException { try { assert isMergedSegment(reader); if (warmer != null) { final Engine.Searcher searcher = new SimpleSearcher("warmer", new IndexSearcher(reader)); final IndicesWarmer.WarmerContext context = new IndicesWarmer.WarmerContext(shardId, searcher); warmer.warmNewReaders(context); } } catch (Throwable t) { // Don't fail a merge if the warm-up failed if (!closed) { logger.warn("Warm-up failed", t); } if (t instanceof Error) { // assertion/out-of-memory error, don't ignore those throw (Error) t; } } } }); return new IndexWriter(store.directory(), config); } catch (LockObtainFailedException ex) { boolean isLocked = IndexWriter.isLocked(store.directory()); logger.warn("Could not lock IndexWriter isLocked [{}]", ex, isLocked); throw ex; } }
From source file:org.elasticsearch.index.engine.internal.InternalEngine.java
License:Apache License
private IndexWriter createWriter() throws IOException { try {/*from ww w. j a va 2 s .c om*/ // release locks when started if (IndexWriter.isLocked(store.directory())) { logger.warn("shard is locked, releasing lock"); IndexWriter.unlock(store.directory()); } boolean create = !Lucene.indexExists(store.directory()); IndexWriterConfig config = new IndexWriterConfig(Lucene.VERSION, analysisService.defaultIndexAnalyzer()); config.setOpenMode(create ? IndexWriterConfig.OpenMode.CREATE : IndexWriterConfig.OpenMode.APPEND); config.setIndexDeletionPolicy(deletionPolicy); config.setMergeScheduler(mergeScheduler.newMergeScheduler()); MergePolicy mergePolicy = mergePolicyProvider.newMergePolicy(); // Give us the opportunity to upgrade old segments while performing // background merges mergePolicy = new IndexUpgraderMergePolicy(mergePolicy); config.setMergePolicy(mergePolicy); config.setSimilarity(similarityService.similarity()); config.setRAMBufferSizeMB(indexingBufferSize.mbFrac()); config.setMaxThreadStates(indexConcurrency); config.setCodec(codecService.codec(codecName)); /* We set this timeout to a highish value to work around * the default poll interval in the Lucene lock that is * 1000ms by default. We might need to poll multiple times * here but with 1s poll this is only executed twice at most * in combination with the default writelock timeout*/ config.setWriteLockTimeout(5000); config.setUseCompoundFile(this.compoundOnFlush); // Warm-up hook for newly-merged segments. 
Warming up segments here is better since it will be performed at the end // of the merge operation and won't slow down _refresh config.setMergedSegmentWarmer(new IndexReaderWarmer() { @Override public void warm(AtomicReader reader) throws IOException { try { assert isMergedSegment(reader); final Engine.Searcher searcher = new SimpleSearcher("warmer", new IndexSearcher(reader)); final IndicesWarmer.WarmerContext context = new IndicesWarmer.WarmerContext(shardId, searcher); if (warmer != null) warmer.warm(context); } catch (Throwable t) { // Don't fail a merge if the warm-up failed if (!closed) { logger.warn("Warm-up failed", t); } if (t instanceof Error) { // assertion/out-of-memory error, don't ignore those throw (Error) t; } } } }); return new IndexWriter(store.directory(), config); } catch (LockObtainFailedException ex) { boolean isLocked = IndexWriter.isLocked(store.directory()); logger.warn("Could not lock IndexWriter isLocked [{}]", ex, isLocked); throw ex; } }
From source file:org.elasticsearch.index.engine.InternalEngine.java
License:Apache License
private IndexWriter createWriter(boolean create) throws IOException { try {//from ww w. j a va2 s.com final IndexWriterConfig iwc = new IndexWriterConfig(engineConfig.getAnalyzer()); iwc.setCommitOnClose(false); // we by default don't commit on close iwc.setOpenMode(create ? IndexWriterConfig.OpenMode.CREATE : IndexWriterConfig.OpenMode.APPEND); iwc.setIndexDeletionPolicy(deletionPolicy); // with tests.verbose, lucene sets this up: plumb to align with filesystem stream boolean verbose = false; try { verbose = Boolean.parseBoolean(System.getProperty("tests.verbose")); } catch (Throwable ignore) { } iwc.setInfoStream(verbose ? InfoStream.getDefault() : new LoggerInfoStream(logger)); iwc.setMergeScheduler(mergeScheduler); MergePolicy mergePolicy = config().getMergePolicy(); // Give us the opportunity to upgrade old segments while performing // background merges mergePolicy = new ElasticsearchMergePolicy(mergePolicy); iwc.setMergePolicy(mergePolicy); iwc.setSimilarity(engineConfig.getSimilarity()); iwc.setRAMBufferSizeMB(engineConfig.getIndexingBufferSize().mbFrac()); iwc.setCodec(engineConfig.getCodec()); /* We set this timeout to a highish value to work around * the default poll interval in the Lucene lock that is * 1000ms by default. We might need to poll multiple times * here but with 1s poll this is only executed twice at most * in combination with the default writelock timeout*/ iwc.setWriteLockTimeout(5000); iwc.setUseCompoundFile(this.engineConfig.isCompoundOnFlush()); // Warm-up hook for newly-merged segments. 
Warming up segments here is better since it will be performed at the end // of the merge operation and won't slow down _refresh iwc.setMergedSegmentWarmer(new IndexReaderWarmer() { @Override public void warm(LeafReader reader) throws IOException { try { LeafReader esLeafReader = new ElasticsearchLeafReader(reader, shardId); assert isMergedSegment(esLeafReader); if (warmer != null) { final Engine.Searcher searcher = new Searcher("warmer", searcherFactory.newSearcher(esLeafReader, null)); final IndicesWarmer.WarmerContext context = new IndicesWarmer.WarmerContext(shardId, searcher); warmer.warmNewReaders(context); } } catch (Throwable t) { // Don't fail a merge if the warm-up failed if (isClosed.get() == false) { logger.warn("Warm-up failed", t); } if (t instanceof Error) { // assertion/out-of-memory error, don't ignore those throw (Error) t; } } } }); return new IndexWriter(store.directory(), iwc); } catch (LockObtainFailedException ex) { boolean isLocked = IndexWriter.isLocked(store.directory()); logger.warn("Could not lock IndexWriter isLocked [{}]", ex, isLocked); throw ex; } }
From source file:org.elasticsearch.index.engine.robin.RobinEngine.java
License:Apache License
private IndexWriter createWriter() throws IOException { IndexWriter indexWriter = null;// w w w . jav a2 s . c o m try { // release locks when started if (IndexWriter.isLocked(store.directory())) { logger.warn("shard is locked, releasing lock"); IndexWriter.unlock(store.directory()); } boolean create = !IndexReader.indexExists(store.directory()); IndexWriterConfig config = new IndexWriterConfig(Lucene.VERSION, analysisService.defaultIndexAnalyzer()); config.setOpenMode(create ? IndexWriterConfig.OpenMode.CREATE : IndexWriterConfig.OpenMode.APPEND); config.setIndexDeletionPolicy(deletionPolicy); config.setMergeScheduler(mergeScheduler.newMergeScheduler()); config.setMergePolicy(mergePolicyProvider.newMergePolicy()); config.setSimilarity(similarityService.defaultIndexSimilarity()); config.setRAMBufferSizeMB(indexingBufferSize.mbFrac()); config.setTermIndexInterval(termIndexInterval); config.setReaderTermsIndexDivisor(termIndexDivisor); config.setMaxThreadStates(indexConcurrency); indexWriter = new IndexWriter(store.directory(), config); } catch (IOException e) { safeClose(indexWriter); throw e; } return indexWriter; }
From source file:org.exoplatform.services.jcr.impl.core.query.lucene.AbstractIndex.java
License:Apache License
/** * Returns an <code>IndexWriter</code> on this index. * @return an <code>IndexWriter</code> on this index. * @throws IOException if the writer cannot be obtained. *//*from w w w .ja va 2 s .c o m*/ protected synchronized IndexWriter getIndexWriter() throws IOException { if (indexReader != null) { indexReader.close(); log.debug("closing IndexReader."); indexReader = null; } if (indexWriter == null) { IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_30, analyzer); config.setSimilarity(similarity); if (config.getMergePolicy() instanceof LogMergePolicy) { ((LogMergePolicy) config.getMergePolicy()).setUseCompoundFile(useCompoundFile); } else { log.error("Can't set \"UseCompoundFile\". Merge policy is not an instance of LogMergePolicy. "); } indexWriter = new IndexWriter(directory, config); setUseCompoundFile(useCompoundFile); indexWriter.setInfoStream(STREAM_LOGGER); } return indexWriter; }