Example usage of org.apache.lucene.index.IndexWriterConfig.setSimilarity

Introduction

This page collects example usages of the method org.apache.lucene.index.IndexWriterConfig.setSimilarity.

Prototype

public IndexWriterConfig setSimilarity(Similarity similarity)

Source Link

Document

Expert: set the Similarity implementation used by this IndexWriter.

Usage

From source file:it.unipd.dei.ims.lucene.clef.applications.BuildIndex.java

License:Apache License

/**
 * Builds a Lucene index, scored with {@link BM25Similarity}, over the corpora configured for one
 * language.
 *
 * <p>Configuration is read from the file named by the {@code properties.path} system property or,
 * when that property is unset, from {@code lucene-clef.properties} looked up through the context
 * class loader. All system properties are then copied on top, so they override file values.
 * After indexing, the number of documents in the index is compared with the configured
 * {@code <language>.corpus.size}; a mismatch is reported as an error.
 *
 * @param args ignored
 * @throws IllegalStateException if a required property is missing
 */
public static void main(String[] args) {

    Properties properties = new Properties();
    String propertiesPath = System.getProperty("properties.path");
    if (propertiesPath == null) {
        logger.info("Loading default property file [resources/lucene-clef.properties]");
    }
    // try-with-resources tolerates a null resource, which is what getResourceAsStream
    // returns when the default file is not on the classpath.
    try (InputStream input = propertiesPath != null ? new FileInputStream(propertiesPath)
            : Thread.currentThread().getContextClassLoader().getResourceAsStream("lucene-clef.properties")) {
        if (input != null) {
            properties.load(input);
        } else {
            logger.info("Property file [lucene-clef.properties] not found, relying on system properties only");
        }
    } catch (IOException ex) {
        // Best effort, as before: system properties may still supply every value,
        // and anything missing is reported by requireProperty below.
        ex.printStackTrace();
    }

    properties.putAll(System.getProperties());

    String language = requireProperty(properties, "language");
    String stemmer = requireProperty(properties, "stemmer");
    String stopsetType = requireProperty(properties, "stopset.type");

    // The stopset path is only meaningful (and only required) for a custom stopset.
    String stopsetPath = null;
    if (stopsetType.equalsIgnoreCase("CUSTOM")) {
        stopsetPath = requireProperty(properties, "stopset.path");
    }

    String corporaRootPath = requireProperty(properties, "corpora.path");
    int corpusSize = Integer.parseInt(requireProperty(properties, language + ".corpus.size"));
    String[] corpora = requireProperty(properties, language + ".corpora").split(";");
    Path indexPath = new File(requireProperty(properties, "index.path")).toPath();

    Properties configProps = new Properties();
    configProps.setProperty("trec.doc.parser", "it.unipd.dei.ims.lucene.clef.parser.ClefDocParser");
    configProps.setProperty("content.source.verbose", "false");
    configProps.setProperty("content.source.forever", "false");
    configProps.setProperty("content.source.excludeIteration", "true");
    configProps.setProperty("work.dir", new File(".").getAbsolutePath());
    configProps.setProperty("language", language);
    configProps.setProperty("stemmer", stemmer);
    configProps.setProperty("stopset_type", stopsetType);
    if (stopsetPath != null) {
        // Properties rejects null values with a NullPointerException, so the key is
        // simply left out when no custom stopset is configured.
        configProps.setProperty("stopset_path", stopsetPath);
    }

    TrecContentSource trecContentSource = new TrecContentSource();

    // set lucene index directory; closed automatically, also on failure
    try (Directory directory = new SimpleFSDirectory(indexPath)) {

        // indexing configuration
        CharArraySet stopset = AnalyzerFactory.createStopset(language, stopsetType, stopsetPath);
        Analyzer analyzer = AnalyzerFactory.createAnalyzer(language, stemmer, stopset);

        IndexWriterConfig conf = new IndexWriterConfig(analyzer);
        conf.setSimilarity(new BM25Similarity());
        conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE);

        boolean storePositions = true;
        FieldType bodyFieldType = new FieldType();
        if (storePositions) {
            bodyFieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
        } else {
            bodyFieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
        }

        // The writer is closed (releasing its write lock) even when indexing fails.
        try (IndexWriter indexWriter = new IndexWriter(directory, conf)) {
            for (String corpus : corpora) {

                int docCount = 0;

                logger.info("... indexing corpus " + corpus);

                try {
                    configProps.setProperty("docs.dir", corporaRootPath + "/" + corpus);
                    configProps.setProperty("content.source.encoding",
                            properties.getProperty(corpus + ".encoding", "UTF-8"));

                    trecContentSource.setConfig(new Config(configProps));

                    DocData docData = new DocData();
                    while ((docData = trecContentSource.getNextDocData(docData)) != null) {
                        docCount++;
                        Document doc = getDocumentFromDocData(docData, bodyFieldType);
                        indexWriter.addDocument(doc);
                    }
                } catch (NoMoreDataException e) {
                    // The content source signals the end of a corpus with this exception.
                    logger.info("... " + docCount + " documents indexed for corpus " + corpus + "\n");
                }
            }
        }

        // Sanity check: the index must hold exactly the expected number of documents.
        try (DirectoryReader ireader = DirectoryReader.open(directory)) {
            int indexedDocs = ireader.numDocs();
            if (corpusSize != indexedDocs) {
                throw new IllegalStateException("The number of documents indexed is " + indexedDocs
                        + ", but should be " + corpusSize);
            }
            logger.info("Number of documents: " + indexedDocs);
        }

    } catch (Exception e) {
        // Top-level boundary of the command-line tool: report and exit normally, as before.
        e.printStackTrace();
    }

}

/**
 * Returns the value of a mandatory configuration property.
 *
 * @param properties the merged configuration
 * @param key        the property name
 * @return the non-null property value
 * @throws IllegalStateException if the property is not set
 */
private static String requireProperty(Properties properties, String key) {
    String value = properties.getProperty(key);
    if (value == null) {
        throw new IllegalStateException("Missing required property [" + key + "]");
    }
    return value;
}

From source file:nl.uva.DataHandler.Indexing.java

/**
 * Builds the index at {@code indexPathString} using BM25 similarity (constructed with 1.2 and 0.75).
 *
 * <p>Calls the superclass {@code IndexesCleaner} for the path, creates the analyzer (with or
 * without stop words depending on {@code stopWordsRemoving}), wraps it per field, then indexes
 * the documents via {@code docIndexer()}, commits and closes the writer and both analyzers.
 *
 * @param indexPathString file-system location of the index
 * @throws Exception any failure is logged and rethrown unchanged
 */
@Override
public void Indexer(String indexPathString) throws Exception, Throwable {
    try {
        log.info(
                "----------------------- INDEXING - Override Version: for BM25 :)  --------------------------");

        final Path indexLocation = FileSystems.getDefault().getPath(indexPathString);
        super.IndexesCleaner(indexPathString);

        // Stop words are only loaded when stop-word removal is switched on.
        final MyAnalyzer analyzerBuilder = super.stopWordsRemoving
                ? new MyAnalyzer(super.stemming, super.LoadStopwords())
                : new MyAnalyzer(super.stemming);
        final Analyzer baseAnalyzer = analyzerBuilder.getAnalyzer(configFile.getProperty("CORPUS_LANGUAGE"));
        final PerFieldAnalyzerWrapper perFieldAnalyzer = new PerFieldAnalyzerWrapper(baseAnalyzer,
                super.analyzerMap);

        final IndexWriterConfig writerConfig = new IndexWriterConfig(perFieldAnalyzer);
        writerConfig.setSimilarity(new BM25Similarity(1.2F, 0.75F));

        this.writer = new IndexWriter(new SimpleFSDirectory(indexLocation), writerConfig);
        this.docIndexer();
        this.writer.commit();
        this.writer.close();

        baseAnalyzer.close();
        perFieldAnalyzer.close();

        final String rule = "-------------------------------------------------";
        log.info(rule);
        log.info("Index is created successfully...");
        log.info(rule);
    } catch (Exception ex) {
        log.error(ex);
        throw ex;
    }
}

From source file:org.apache.jackrabbit.core.query.lucene.AbstractIndex.java

License:Apache License

/**
 * Provides the <code>IndexWriter</code> for this index, creating it on first use.
 * An open <code>IndexReader</code> is closed and discarded before the writer is handed out.
 *
 * @return the (possibly newly created) <code>IndexWriter</code> of this index.
 * @throws IOException if the reader cannot be closed or the writer cannot be obtained.
 */
protected synchronized IndexWriter getIndexWriter() throws IOException {
    if (indexReader != null) {
        indexReader.close();
        log.debug("closing IndexReader.");
        indexReader = null;
    }

    // Reuse the writer that is already open.
    if (indexWriter != null) {
        return indexWriter;
    }

    final IndexWriterConfig writerConfig = new IndexWriterConfig(Version.LUCENE_36, analyzer);
    writerConfig.setSimilarity(similarity);

    final LogMergePolicy byteSizePolicy = new LogByteSizeMergePolicy();
    byteSizePolicy.setUseCompoundFile(useCompoundFile);
    byteSizePolicy.setNoCFSRatio(1.0);
    writerConfig.setMergePolicy(byteSizePolicy);

    indexWriter = new IndexWriter(getDirectory(), writerConfig);
    indexWriter.setInfoStream(STREAM_LOGGER);
    return indexWriter;
}

From source file:org.apache.solr.update.SolrIndexConfig.java

License:Apache License

/**
 * Translates this index configuration into a Lucene {@link IndexWriterConfig}.
 *
 * <p>Numeric settings whose value is {@code -1} are not applied. Similarity, merge policy and
 * merge scheduler are taken from / built for the given schema.
 *
 * @param schema the schema supplying the similarity and the resource loader
 * @return the populated writer configuration
 */
public IndexWriterConfig toIndexWriterConfig(IndexSchema schema) {
    // The default analyzer is deliberately null: the analyzer is handed to the
    // relevant IndexWriter calls explicitly so it can be swapped on core reload.
    final IndexWriterConfig config = new IndexWriterConfig(luceneVersion, null);

    if (maxBufferedDocs != -1) {
        config.setMaxBufferedDocs(maxBufferedDocs);
    }
    if (ramBufferSizeMB != -1) {
        config.setRAMBufferSizeMB(ramBufferSizeMB);
    }
    if (termIndexInterval != -1) {
        config.setTermIndexInterval(termIndexInterval);
    }
    if (writeLockTimeout != -1) {
        config.setWriteLockTimeout(writeLockTimeout);
    }

    config.setSimilarity(schema.getSimilarity());
    config.setMergePolicy(buildMergePolicy(schema));
    config.setMergeScheduler(buildMergeScheduler(schema));
    config.setInfoStream(infoStream);

    // Must stay after buildMergePolicy: its back-compat logic may change the
    // effective useCompoundFile value.
    config.setUseCompoundFile(getUseCompoundFile());

    if (maxIndexingThreads != -1) {
        config.setMaxThreadStates(maxIndexingThreads);
    }

    if (mergedSegmentWarmerInfo != null) {
        // TODO: add infostream -> normal logging system (there is an issue somewhere)
        final IndexReaderWarmer segmentWarmer = schema.getResourceLoader().newInstance(
                mergedSegmentWarmerInfo.className, IndexReaderWarmer.class, null,
                new Class[] { InfoStream.class }, new Object[] { config.getInfoStream() });
        config.setMergedSegmentWarmer(segmentWarmer);
    }

    return config;
}

From source file:org.codice.ddf.spatial.geocoding.index.GeoNamesLuceneIndexer.java

License:Open Source License

/**
 * Opens an {@link IndexWriter} on the given directory.
 *
 * @param create    when {@code true} the writer is opened in CREATE mode; when {@code false} it
 *                  is opened in APPEND mode, unless no index exists yet
 * @param directory the directory holding (or about to hold) the index
 * @return a new writer configured with this indexer's analyzer and similarity
 * @throws IOException if the directory cannot be inspected or the writer cannot be opened
 */
IndexWriter createIndexWriter(final boolean create, final Directory directory) throws IOException {
    final IndexWriterConfig writerConfig = new IndexWriterConfig(ANALYZER);

    // A missing index always forces CREATE mode, whatever the caller asked for.
    final boolean indexPresent = DirectoryReader.indexExists(directory);
    final OpenMode mode = (create || !indexPresent) ? OpenMode.CREATE : OpenMode.APPEND;

    writerConfig.setOpenMode(mode);
    writerConfig.setSimilarity(SIMILARITY);
    return new IndexWriter(directory, writerConfig);
}

From source file:org.elasticsearch.index.engine.internal.AsynchronousEngine.java

License:Apache License

/**
 * Creates the {@link IndexWriter} for this engine's store directory.
 *
 * <p>The writer is opened in CREATE mode when no index exists yet and in APPEND mode otherwise,
 * and is configured from the engine's deletion policy, merge scheduler/policy, similarity,
 * indexing buffer, codec and a warmer for newly merged segments.
 *
 * @return the newly opened writer
 * @throws IOException if the writer cannot be opened; a lock failure is logged and rethrown
 */
private IndexWriter createWriter() throws IOException {
    try {
        final boolean indexMissing = !Lucene.indexExists(store.directory());
        final IndexWriterConfig writerConfig = new IndexWriterConfig(Lucene.VERSION,
                analysisService.defaultIndexAnalyzer());
        if (indexMissing) {
            writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        } else {
            writerConfig.setOpenMode(IndexWriterConfig.OpenMode.APPEND);
        }
        writerConfig.setIndexDeletionPolicy(deletionPolicy);
        writerConfig.setInfoStream(new LoggerInfoStream(indexSettings, shardId));
        writerConfig.setMergeScheduler(mergeScheduler.newMergeScheduler());
        // Wrapping the policy lets old segments be upgraded during background merges.
        writerConfig.setMergePolicy(new ElasticsearchMergePolicy(mergePolicyProvider.getMergePolicy()));
        writerConfig.setSimilarity(similarityService.similarity());
        writerConfig.setRAMBufferSizeMB(indexingBufferSize.mbFrac());
        writerConfig.setMaxThreadStates(indexConcurrency);
        writerConfig.setCodec(codecService.codec(codecName));
        // A fairly high timeout compensates for the 1000ms default poll interval of the
        // Lucene lock: several polls may be needed, but with 1s polling that is at most
        // two attempts when combined with the default write-lock timeout.
        writerConfig.setWriteLockTimeout(5000);
        writerConfig.setUseCompoundFile(this.compoundOnFlush);
        writerConfig.setCheckIntegrityAtMerge(checksumOnMerge);
        // Newly merged segments are warmed here, at the end of the merge, so that
        // _refresh is not slowed down by it.
        writerConfig.setMergedSegmentWarmer(new IndexReaderWarmer() {
            @Override
            public void warm(AtomicReader reader) throws IOException {
                try {
                    assert isMergedSegment(reader);
                    if (warmer == null) {
                        return;
                    }
                    final Engine.Searcher warmSearcher = new SimpleSearcher("warmer",
                            new IndexSearcher(reader));
                    final IndicesWarmer.WarmerContext warmContext = new IndicesWarmer.WarmerContext(
                            shardId, warmSearcher);
                    warmer.warmNewReaders(warmContext);
                } catch (Throwable failure) {
                    // A failed warm-up must not fail the merge itself.
                    if (!closed) {
                        logger.warn("Warm-up failed", failure);
                    }
                    // Errors (assertion, out-of-memory, ...) are never swallowed.
                    if (failure instanceof Error) {
                        throw (Error) failure;
                    }
                }
            }
        });
        return new IndexWriter(store.directory(), writerConfig);
    } catch (LockObtainFailedException lockFailure) {
        logger.warn("Could not lock IndexWriter isLocked [{}]", lockFailure,
                IndexWriter.isLocked(store.directory()));
        throw lockFailure;
    }
}

From source file:org.elasticsearch.index.engine.internal.InternalEngine.java

License:Apache License

/**
 * Creates the {@link IndexWriter} for this engine's store directory.
 *
 * <p>If the directory is currently locked the lock is released first. The writer is opened in
 * CREATE mode when no index exists yet and in APPEND mode otherwise.
 *
 * @return the newly opened writer
 * @throws IOException if the writer cannot be opened; a lock failure is logged and rethrown
 */
private IndexWriter createWriter() throws IOException {
    try {
        // Release a lock that is still held when the engine starts.
        if (IndexWriter.isLocked(store.directory())) {
            logger.warn("shard is locked, releasing lock");
            IndexWriter.unlock(store.directory());
        }
        final boolean indexMissing = !Lucene.indexExists(store.directory());
        final IndexWriterConfig writerConfig = new IndexWriterConfig(Lucene.VERSION,
                analysisService.defaultIndexAnalyzer());
        if (indexMissing) {
            writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        } else {
            writerConfig.setOpenMode(IndexWriterConfig.OpenMode.APPEND);
        }
        writerConfig.setIndexDeletionPolicy(deletionPolicy);
        writerConfig.setMergeScheduler(mergeScheduler.newMergeScheduler());
        // Wrapping the policy lets old segments be upgraded during background merges.
        writerConfig.setMergePolicy(new IndexUpgraderMergePolicy(mergePolicyProvider.newMergePolicy()));
        writerConfig.setSimilarity(similarityService.similarity());
        writerConfig.setRAMBufferSizeMB(indexingBufferSize.mbFrac());
        writerConfig.setMaxThreadStates(indexConcurrency);
        writerConfig.setCodec(codecService.codec(codecName));
        // A fairly high timeout compensates for the 1000ms default poll interval of the
        // Lucene lock: several polls may be needed, but with 1s polling that is at most
        // two attempts when combined with the default write-lock timeout.
        writerConfig.setWriteLockTimeout(5000);
        writerConfig.setUseCompoundFile(this.compoundOnFlush);
        // Newly merged segments are warmed here, at the end of the merge, so that
        // _refresh is not slowed down by it.
        writerConfig.setMergedSegmentWarmer(new IndexReaderWarmer() {
            @Override
            public void warm(AtomicReader reader) throws IOException {
                try {
                    assert isMergedSegment(reader);
                    // Searcher and context are built before the warmer is checked.
                    final Engine.Searcher warmSearcher = new SimpleSearcher("warmer",
                            new IndexSearcher(reader));
                    final IndicesWarmer.WarmerContext warmContext = new IndicesWarmer.WarmerContext(
                            shardId, warmSearcher);
                    if (warmer != null) {
                        warmer.warm(warmContext);
                    }
                } catch (Throwable failure) {
                    // A failed warm-up must not fail the merge itself.
                    if (!closed) {
                        logger.warn("Warm-up failed", failure);
                    }
                    // Errors (assertion, out-of-memory, ...) are never swallowed.
                    if (failure instanceof Error) {
                        throw (Error) failure;
                    }
                }
            }
        });
        return new IndexWriter(store.directory(), writerConfig);
    } catch (LockObtainFailedException lockFailure) {
        logger.warn("Could not lock IndexWriter isLocked [{}]", lockFailure,
                IndexWriter.isLocked(store.directory()));
        throw lockFailure;
    }
}

From source file:org.elasticsearch.index.engine.InternalEngine.java

License:Apache License

/**
 * Creates the {@link IndexWriter} for this engine's store directory from the engine config.
 *
 * @param create {@code true} to open the writer in CREATE mode, {@code false} for APPEND mode
 * @return the newly opened writer
 * @throws IOException if the writer cannot be opened; a lock failure is logged and rethrown
 */
private IndexWriter createWriter(boolean create) throws IOException {
    try {
        final IndexWriterConfig writerConfig = new IndexWriterConfig(engineConfig.getAnalyzer());
        writerConfig.setCommitOnClose(false); // we by default don't commit on close
        if (create) {
            writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        } else {
            writerConfig.setOpenMode(IndexWriterConfig.OpenMode.APPEND);
        }
        writerConfig.setIndexDeletionPolicy(deletionPolicy);

        // With tests.verbose Lucene sets up its default info stream; use it in that
        // case, otherwise route the info stream to the engine logger.
        boolean verboseTests = false;
        try {
            verboseTests = Boolean.parseBoolean(System.getProperty("tests.verbose"));
        } catch (Throwable ignore) {
            // reading the property is best effort; verbose output stays off
        }
        if (verboseTests) {
            writerConfig.setInfoStream(InfoStream.getDefault());
        } else {
            writerConfig.setInfoStream(new LoggerInfoStream(logger));
        }

        writerConfig.setMergeScheduler(mergeScheduler);
        // Wrapping the policy lets old segments be upgraded during background merges.
        writerConfig.setMergePolicy(new ElasticsearchMergePolicy(config().getMergePolicy()));
        writerConfig.setSimilarity(engineConfig.getSimilarity());
        writerConfig.setRAMBufferSizeMB(engineConfig.getIndexingBufferSize().mbFrac());
        writerConfig.setCodec(engineConfig.getCodec());
        // A fairly high timeout compensates for the 1000ms default poll interval of the
        // Lucene lock: several polls may be needed, but with 1s polling that is at most
        // two attempts when combined with the default write-lock timeout.
        writerConfig.setWriteLockTimeout(5000);
        writerConfig.setUseCompoundFile(this.engineConfig.isCompoundOnFlush());
        // Newly merged segments are warmed here, at the end of the merge, so that
        // _refresh is not slowed down by it.
        writerConfig.setMergedSegmentWarmer(new IndexReaderWarmer() {
            @Override
            public void warm(LeafReader reader) throws IOException {
                try {
                    final LeafReader wrappedReader = new ElasticsearchLeafReader(reader, shardId);
                    assert isMergedSegment(wrappedReader);
                    if (warmer == null) {
                        return;
                    }
                    final Engine.Searcher warmSearcher = new Searcher("warmer",
                            searcherFactory.newSearcher(wrappedReader, null));
                    final IndicesWarmer.WarmerContext warmContext = new IndicesWarmer.WarmerContext(
                            shardId, warmSearcher);
                    warmer.warmNewReaders(warmContext);
                } catch (Throwable failure) {
                    // A failed warm-up must not fail the merge itself.
                    if (!isClosed.get()) {
                        logger.warn("Warm-up failed", failure);
                    }
                    // Errors (assertion, out-of-memory, ...) are never swallowed.
                    if (failure instanceof Error) {
                        throw (Error) failure;
                    }
                }
            }
        });
        return new IndexWriter(store.directory(), writerConfig);
    } catch (LockObtainFailedException lockFailure) {
        logger.warn("Could not lock IndexWriter isLocked [{}]", lockFailure,
                IndexWriter.isLocked(store.directory()));
        throw lockFailure;
    }
}

From source file:org.elasticsearch.index.engine.robin.RobinEngine.java

License:Apache License

/**
 * Creates the {@link IndexWriter} for this engine's store directory.
 *
 * <p>If the directory is currently locked the lock is released first. The writer is opened in
 * CREATE mode when no index exists yet and in APPEND mode otherwise.
 *
 * @return the newly opened writer
 * @throws IOException if the writer cannot be opened
 */
private IndexWriter createWriter() throws IOException {
    IndexWriter writer = null;
    try {
        // Release a lock that is still held when the engine starts.
        if (IndexWriter.isLocked(store.directory())) {
            logger.warn("shard is locked, releasing lock");
            IndexWriter.unlock(store.directory());
        }
        final boolean indexMissing = !IndexReader.indexExists(store.directory());
        final IndexWriterConfig writerConfig = new IndexWriterConfig(Lucene.VERSION,
                analysisService.defaultIndexAnalyzer());
        if (indexMissing) {
            writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        } else {
            writerConfig.setOpenMode(IndexWriterConfig.OpenMode.APPEND);
        }
        writerConfig.setIndexDeletionPolicy(deletionPolicy);
        writerConfig.setMergeScheduler(mergeScheduler.newMergeScheduler());
        writerConfig.setMergePolicy(mergePolicyProvider.newMergePolicy());
        writerConfig.setSimilarity(similarityService.defaultIndexSimilarity());
        writerConfig.setRAMBufferSizeMB(indexingBufferSize.mbFrac());
        writerConfig.setTermIndexInterval(termIndexInterval);
        writerConfig.setReaderTermsIndexDivisor(termIndexDivisor);
        writerConfig.setMaxThreadStates(indexConcurrency);

        writer = new IndexWriter(store.directory(), writerConfig);
    } catch (IOException failure) {
        // Hand whatever writer reference exists to safeClose before propagating.
        safeClose(writer);
        throw failure;
    }
    return writer;
}

From source file:org.exoplatform.services.jcr.impl.core.query.lucene.AbstractIndex.java

License:Apache License

/**
 * Provides the <code>IndexWriter</code> for this index, creating it on first use.
 * An open <code>IndexReader</code> is closed and discarded before the writer is handed out.
 *
 * @return the (possibly newly created) <code>IndexWriter</code> of this index.
 * @throws IOException if the reader cannot be closed or the writer cannot be obtained.
 */
protected synchronized IndexWriter getIndexWriter() throws IOException {
    if (indexReader != null) {
        indexReader.close();
        log.debug("closing IndexReader.");
        indexReader = null;
    }

    // Reuse the writer that is already open.
    if (indexWriter != null) {
        return indexWriter;
    }

    final IndexWriterConfig writerConfig = new IndexWriterConfig(Version.LUCENE_30, analyzer);
    writerConfig.setSimilarity(similarity);

    // The compound-file flag can only be applied to a LogMergePolicy.
    if (!(writerConfig.getMergePolicy() instanceof LogMergePolicy)) {
        log.error("Can't set \"UseCompoundFile\". Merge policy is not an instance of LogMergePolicy. ");
    } else {
        ((LogMergePolicy) writerConfig.getMergePolicy()).setUseCompoundFile(useCompoundFile);
    }

    indexWriter = new IndexWriter(directory, writerConfig);
    setUseCompoundFile(useCompoundFile);
    indexWriter.setInfoStream(STREAM_LOGGER);
    return indexWriter;
}