Example usage for org.apache.lucene.index IndexWriterConfig setSimilarity

List of usage examples for org.apache.lucene.index IndexWriterConfig setSimilarity

Introduction

On this page you can find example usages of org.apache.lucene.index.IndexWriterConfig.setSimilarity.

Prototype

public IndexWriterConfig setSimilarity(Similarity similarity) 

Source Link

Document

Expert: set the Similarity implementation used by this IndexWriter.

Usage

From source file:it.unipd.dei.ims.lucene.clef.applications.BuildIndex.java

License:Apache License

/**
 * Builds a Lucene index, scored with {@link BM25Similarity}, over the corpora named in the
 * configuration and then checks the number of indexed documents against the expected size.
 *
 * <p>Configuration is loaded from the file named by the {@code properties.path} system property
 * or, when that is not set, from {@code lucene-clef.properties} on the context class loader.
 * System properties are then copied on top, so they override values from the file.
 *
 * <p>Required keys: {@code language}, {@code stemmer}, {@code stopset.type},
 * {@code corpora.path}, {@code index.path}, {@code <language>.corpus.size} and
 * {@code <language>.corpora} (a {@code ;}-separated list). {@code stopset.path} is required only
 * when {@code stopset.type} is {@code CUSTOM}. {@code <corpus>.encoding} is optional and
 * defaults to {@code UTF-8}.
 *
 * @param args unused
 * @throws IllegalStateException if a required configuration key is missing
 * @throws NumberFormatException if {@code <language>.corpus.size} is not an integer
 */
public static void main(String[] args) {

    Properties properties = new Properties();
    String propertiesPath = System.getProperty("properties.path");
    try {
        InputStream source;
        if (propertiesPath != null) {
            source = new FileInputStream(propertiesPath);
        } else {
            logger.info("Loading default property file [resources/lucene-clef.properties]");
            ClassLoader loader = Thread.currentThread().getContextClassLoader();
            source = loader.getResourceAsStream("lucene-clef.properties");
        }
        if (source == null) {
            // getResourceAsStream signals a missing resource with null; loading from it would
            // throw a NullPointerException. Continue with system properties only and let the
            // required-key checks below report what is missing.
            logger.info("Default property file [lucene-clef.properties] was not found on the classpath");
        } else {
            try (InputStream input = source) {
                properties.load(input);
            }
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    }

    // System properties take precedence over the values read from the file.
    properties.putAll(System.getProperties());

    String language = requireProperty(properties, "language");
    String stemmer = requireProperty(properties, "stemmer");
    String stopsetType = requireProperty(properties, "stopset.type");

    // A stopset path is only meaningful (and only read) for a custom stopset.
    String stopsetPath = null;
    if (stopsetType.equalsIgnoreCase("CUSTOM")) {
        stopsetPath = requireProperty(properties, "stopset.path");
    }

    String corporaRootPath = requireProperty(properties, "corpora.path");
    String indexLocation = requireProperty(properties, "index.path");
    int corpusSize = Integer.parseInt(requireProperty(properties, language + ".corpus.size"));
    String[] corpora = requireProperty(properties, language + ".corpora").split(";");

    TrecContentSource trecContentSource = new TrecContentSource();

    try {

        Properties configProps = new Properties();
        configProps.setProperty("trec.doc.parser", "it.unipd.dei.ims.lucene.clef.parser.ClefDocParser");
        configProps.setProperty("content.source.verbose", "false");
        configProps.setProperty("content.source.forever", "false");
        configProps.setProperty("content.source.excludeIteration", "true");
        configProps.setProperty("work.dir", new File(".").getAbsolutePath());
        configProps.setProperty("language", language);
        configProps.setProperty("stemmer", stemmer);
        configProps.setProperty("stopset_type", stopsetType);
        // Properties rejects null values, so the path is only recorded when one was configured.
        if (stopsetPath != null) {
            configProps.setProperty("stopset_path", stopsetPath);
        }

        // set lucene index directory
        Path indexPath = new File(indexLocation).toPath();

        try (Directory directory = new SimpleFSDirectory(indexPath)) {

            // indexing configuration
            CharArraySet stopset = AnalyzerFactory.createStopset(language, stopsetType, stopsetPath);
            Analyzer analyzer = AnalyzerFactory.createAnalyzer(language, stemmer, stopset);

            IndexWriterConfig conf = new IndexWriterConfig(analyzer);
            conf.setSimilarity(new BM25Similarity());
            conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE);

            final boolean storePositions = true;
            FieldType bodyFieldType = new FieldType();
            bodyFieldType.setIndexOptions(storePositions
                    ? IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS
                    : IndexOptions.DOCS_AND_FREQS);

            // The writer is closed before the index is re-opened for the document count
            // check below, and is also released if indexing fails part-way through.
            try (IndexWriter indexWriter = new IndexWriter(directory, conf)) {

                for (String corpus : corpora) {

                    int docCount = 0;

                    logger.info("... indexing corpus " + corpus);

                    try {

                        configProps.setProperty("docs.dir", corporaRootPath + "/" + corpus);

                        configProps.setProperty("content.source.encoding",
                                properties.getProperty(corpus + ".encoding", "UTF-8"));

                        trecContentSource.setConfig(new Config(configProps));

                        DocData docData = new DocData();
                        while ((docData = trecContentSource.getNextDocData(docData)) != null) {
                            docCount++;
                            Document doc = getDocumentFromDocData(docData, bodyFieldType);
                            indexWriter.addDocument(doc);
                        }

                    } catch (NoMoreDataException e) {
                        // The content source reports the end of a corpus by throwing.
                        logger.info("... " + docCount + " documents indexed for corpus " + corpus + "\n");
                    }

                }
            }

            try (DirectoryReader ireader = DirectoryReader.open(directory)) {
                int indexedDocs = ireader.numDocs();
                if (corpusSize != indexedDocs) {
                    throw new Exception("The number of documents indexed is " + indexedDocs
                            + ", but should be " + corpusSize);
                }
                logger.info("Number of documents: " + indexedDocs);
            }
        }

    } catch (Exception e) {
        e.printStackTrace();
    }

}

/**
 * Looks up a mandatory configuration value.
 *
 * @param properties the merged configuration
 * @param key the key to look up
 * @return the configured value, never {@code null}
 * @throws IllegalStateException if {@code key} has no value
 */
private static String requireProperty(Properties properties, String key) {
    String value = properties.getProperty(key);
    if (value == null) {
        throw new IllegalStateException("Missing required property [" + key + "]");
    }
    return value;
}

From source file:nl.uva.DataHandler.Indexing.java

/**
 * Cleans the target location and builds the index there using BM25 similarity
 * ({@code k1 = 1.2}, {@code b = 0.75}).
 *
 * @param indexPathString file-system location of the index
 * @throws Exception any failure is logged and then rethrown unchanged
 */
@Override
public void Indexer(String indexPathString) throws Exception, Throwable {
    try {
        log.info(
                "----------------------- INDEXING - Override Version: for BM25 :)  --------------------------");

        final Path indexLocation = FileSystems.getDefault().getPath(indexPathString);
        super.IndexesCleaner(indexPathString);

        // The stop-word list is only loaded and handed over when stop-word removal is enabled.
        final MyAnalyzer analyzerSource = super.stopWordsRemoving
                ? new MyAnalyzer(super.stemming, super.LoadStopwords())
                : new MyAnalyzer(super.stemming);

        final Analyzer baseAnalyzer = analyzerSource.getAnalyzer(configFile.getProperty("CORPUS_LANGUAGE"));
        final PerFieldAnalyzerWrapper perFieldAnalyzer = new PerFieldAnalyzerWrapper(baseAnalyzer,
                super.analyzerMap);

        final IndexWriterConfig writerConfig = new IndexWriterConfig(perFieldAnalyzer);
        writerConfig.setSimilarity(new BM25Similarity(1.2F, 0.75F));

        this.writer = new IndexWriter(new SimpleFSDirectory(indexLocation), writerConfig);
        this.docIndexer();
        this.writer.commit();
        this.writer.close();

        baseAnalyzer.close();
        perFieldAnalyzer.close();

        log.info("-------------------------------------------------");
        log.info("Index is created successfully...");
        log.info("-------------------------------------------------");

    } catch (Exception failure) {
        log.error(failure);
        throw failure;
    }
}

From source file:org.apache.jackrabbit.core.query.lucene.AbstractIndex.java

License:Apache License

/**
 * Provides the <code>IndexWriter</code> of this index, creating it on first use.
 * An open <code>IndexReader</code> is closed and discarded before the writer is handed out.
 * @return the <code>IndexWriter</code> on this index.
 * @throws IOException if the reader cannot be closed or the writer cannot be obtained.
 */
protected synchronized IndexWriter getIndexWriter() throws IOException {
    if (indexReader != null) {
        indexReader.close();
        log.debug("closing IndexReader.");
        indexReader = null;
    }

    // Reuse the writer when one has already been created.
    if (indexWriter != null) {
        return indexWriter;
    }

    final IndexWriterConfig writerConfig = new IndexWriterConfig(Version.LUCENE_36, analyzer);
    writerConfig.setSimilarity(similarity);

    final LogMergePolicy byteSizePolicy = new LogByteSizeMergePolicy();
    byteSizePolicy.setUseCompoundFile(useCompoundFile);
    byteSizePolicy.setNoCFSRatio(1.0);
    writerConfig.setMergePolicy(byteSizePolicy);

    indexWriter = new IndexWriter(getDirectory(), writerConfig);
    indexWriter.setInfoStream(STREAM_LOGGER);
    return indexWriter;
}

From source file:org.apache.solr.update.SolrIndexConfig.java

License:Apache License

/**
 * Translates this index configuration into a Lucene {@code IndexWriterConfig}.
 *
 * @param schema supplies the similarity and the resource loader, and is passed to the
 *        merge policy and merge scheduler builders
 * @return a newly created writer configuration
 */
public IndexWriterConfig toIndexWriterConfig(IndexSchema schema) {
    // The default analyzer is deliberately null: an analyzer is passed explicitly on the
    // appropriate IndexWriter calls so that it can be updated on core reload.
    final IndexWriterConfig writerConfig = new IndexWriterConfig(luceneVersion, null);

    // For each of the following settings, a value of -1 means the setter is skipped.
    if (maxBufferedDocs != -1) {
        writerConfig.setMaxBufferedDocs(maxBufferedDocs);
    }
    if (ramBufferSizeMB != -1) {
        writerConfig.setRAMBufferSizeMB(ramBufferSizeMB);
    }
    if (termIndexInterval != -1) {
        writerConfig.setTermIndexInterval(termIndexInterval);
    }
    if (writeLockTimeout != -1) {
        writerConfig.setWriteLockTimeout(writeLockTimeout);
    }

    writerConfig.setSimilarity(schema.getSimilarity());
    writerConfig.setMergePolicy(buildMergePolicy(schema));
    writerConfig.setMergeScheduler(buildMergeScheduler(schema));
    writerConfig.setInfoStream(infoStream);

    // Must come after buildMergePolicy: the backcompat logic there may modify the
    // effective useCompoundFile.
    writerConfig.setUseCompoundFile(getUseCompoundFile());

    if (maxIndexingThreads != -1) {
        writerConfig.setMaxThreadStates(maxIndexingThreads);
    }

    if (mergedSegmentWarmerInfo != null) {
        // TODO: add infostream -> normal logging system (there is an issue somewhere)
        final Class[] constructorTypes = new Class[] { InfoStream.class };
        final Object[] constructorArgs = new Object[] { writerConfig.getInfoStream() };
        final IndexReaderWarmer segmentWarmer = schema.getResourceLoader().newInstance(
                mergedSegmentWarmerInfo.className, IndexReaderWarmer.class, null, constructorTypes,
                constructorArgs);
        writerConfig.setMergedSegmentWarmer(segmentWarmer);
    }

    return writerConfig;
}

From source file:org.codice.ddf.spatial.geocoding.index.GeoNamesLuceneIndexer.java

License:Open Source License

/**
 * Opens an {@code IndexWriter} on {@code directory}.
 *
 * @param create when an index already exists, {@code true} opens it in CREATE mode and
 *        {@code false} in APPEND mode; has no effect when no index exists yet
 * @param directory the directory holding (or about to hold) the index
 * @return a new writer configured with this class's analyzer and similarity
 * @throws IOException if the index check or opening the writer fails
 */
IndexWriter createIndexWriter(final boolean create, final Directory directory) throws IOException {

    final IndexWriterConfig writerConfig = new IndexWriterConfig(ANALYZER);

    // A missing index always forces CREATE mode; otherwise the caller's flag decides.
    final boolean startFresh = !DirectoryReader.indexExists(directory) || create;
    writerConfig.setOpenMode(startFresh ? OpenMode.CREATE : OpenMode.APPEND);

    writerConfig.setSimilarity(SIMILARITY);
    return new IndexWriter(directory, writerConfig);
}

From source file:org.elasticsearch.index.engine.internal.AsynchronousEngine.java

License:Apache License

/**
 * Creates the {@code IndexWriter} for this engine's store directory.
 *
 * @return a newly opened writer
 * @throws IOException if the writer cannot be opened; a {@code LockObtainFailedException}
 *         is logged with the current lock state before being rethrown
 */
private IndexWriter createWriter() throws IOException {
    try {
        final boolean indexMissing = !Lucene.indexExists(store.directory());
        final IndexWriterConfig writerConfig = new IndexWriterConfig(Lucene.VERSION,
                analysisService.defaultIndexAnalyzer());
        if (indexMissing) {
            writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        } else {
            writerConfig.setOpenMode(IndexWriterConfig.OpenMode.APPEND);
        }
        writerConfig.setIndexDeletionPolicy(deletionPolicy);
        writerConfig.setInfoStream(new LoggerInfoStream(indexSettings, shardId));
        writerConfig.setMergeScheduler(mergeScheduler.newMergeScheduler());
        // Wrapping the provided policy gives us the opportunity to upgrade old segments
        // while performing background merges.
        writerConfig.setMergePolicy(new ElasticsearchMergePolicy(mergePolicyProvider.getMergePolicy()));
        writerConfig.setSimilarity(similarityService.similarity());
        writerConfig.setRAMBufferSizeMB(indexingBufferSize.mbFrac());
        writerConfig.setMaxThreadStates(indexConcurrency);
        writerConfig.setCodec(codecService.codec(codecName));
        /* We set this timeout to a highish value to work around
         * the default poll interval in the Lucene lock that is
         * 1000ms by default. We might need to poll multiple times
         * here but with 1s poll this is only executed twice at most
         * in combination with the default writelock timeout*/
        writerConfig.setWriteLockTimeout(5000);
        writerConfig.setUseCompoundFile(this.compoundOnFlush);
        writerConfig.setCheckIntegrityAtMerge(checksumOnMerge);

        // Warm-up hook for newly-merged segments. Warming up segments here is better since
        // it will be performed at the end of the merge operation and won't slow down _refresh
        final IndexReaderWarmer mergedSegmentWarmer = new IndexReaderWarmer() {
            @Override
            public void warm(AtomicReader reader) throws IOException {
                try {
                    assert isMergedSegment(reader);
                    if (warmer == null) {
                        return;
                    }
                    final Engine.Searcher warmSearcher = new SimpleSearcher("warmer",
                            new IndexSearcher(reader));
                    warmer.warmNewReaders(new IndicesWarmer.WarmerContext(shardId, warmSearcher));
                } catch (Throwable failure) {
                    // Don't fail a merge if the warm-up failed
                    if (!closed) {
                        logger.warn("Warm-up failed", failure);
                    }
                    // assertion/out-of-memory error, don't ignore those
                    if (failure instanceof Error) {
                        throw (Error) failure;
                    }
                }
            }
        };
        writerConfig.setMergedSegmentWarmer(mergedSegmentWarmer);

        return new IndexWriter(store.directory(), writerConfig);
    } catch (LockObtainFailedException lockFailure) {
        final boolean stillLocked = IndexWriter.isLocked(store.directory());
        logger.warn("Could not lock IndexWriter isLocked [{}]", lockFailure, stillLocked);
        throw lockFailure;
    }
}

From source file:org.elasticsearch.index.engine.internal.InternalEngine.java

License:Apache License

/**
 * Creates the {@code IndexWriter} for this engine's store directory, first releasing a
 * write lock if the directory is reported as locked.
 *
 * @return a newly opened writer
 * @throws IOException if the writer cannot be opened; a {@code LockObtainFailedException}
 *         is logged with the current lock state before being rethrown
 */
private IndexWriter createWriter() throws IOException {
    try {
        // release locks when started
        if (IndexWriter.isLocked(store.directory())) {
            logger.warn("shard is locked, releasing lock");
            IndexWriter.unlock(store.directory());
        }

        final boolean indexMissing = !Lucene.indexExists(store.directory());
        final IndexWriterConfig writerConfig = new IndexWriterConfig(Lucene.VERSION,
                analysisService.defaultIndexAnalyzer());
        if (indexMissing) {
            writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        } else {
            writerConfig.setOpenMode(IndexWriterConfig.OpenMode.APPEND);
        }
        writerConfig.setIndexDeletionPolicy(deletionPolicy);
        writerConfig.setMergeScheduler(mergeScheduler.newMergeScheduler());
        // Wrapping the new policy gives us the opportunity to upgrade old segments while
        // performing background merges.
        writerConfig.setMergePolicy(new IndexUpgraderMergePolicy(mergePolicyProvider.newMergePolicy()));
        writerConfig.setSimilarity(similarityService.similarity());
        writerConfig.setRAMBufferSizeMB(indexingBufferSize.mbFrac());
        writerConfig.setMaxThreadStates(indexConcurrency);
        writerConfig.setCodec(codecService.codec(codecName));
        /* We set this timeout to a highish value to work around
         * the default poll interval in the Lucene lock that is
         * 1000ms by default. We might need to poll multiple times
         * here but with 1s poll this is only executed twice at most
         * in combination with the default writelock timeout*/
        writerConfig.setWriteLockTimeout(5000);
        writerConfig.setUseCompoundFile(this.compoundOnFlush);

        // Warm-up hook for newly-merged segments. Warming up segments here is better since
        // it will be performed at the end of the merge operation and won't slow down _refresh
        final IndexReaderWarmer mergedSegmentWarmer = new IndexReaderWarmer() {
            @Override
            public void warm(AtomicReader reader) throws IOException {
                try {
                    assert isMergedSegment(reader);
                    // The searcher and context are built before the null check on the warmer.
                    final Engine.Searcher warmSearcher = new SimpleSearcher("warmer",
                            new IndexSearcher(reader));
                    final IndicesWarmer.WarmerContext warmContext = new IndicesWarmer.WarmerContext(
                            shardId, warmSearcher);
                    if (warmer != null) {
                        warmer.warm(warmContext);
                    }
                } catch (Throwable failure) {
                    // Don't fail a merge if the warm-up failed
                    if (!closed) {
                        logger.warn("Warm-up failed", failure);
                    }
                    // assertion/out-of-memory error, don't ignore those
                    if (failure instanceof Error) {
                        throw (Error) failure;
                    }
                }
            }
        };
        writerConfig.setMergedSegmentWarmer(mergedSegmentWarmer);

        return new IndexWriter(store.directory(), writerConfig);
    } catch (LockObtainFailedException lockFailure) {
        final boolean stillLocked = IndexWriter.isLocked(store.directory());
        logger.warn("Could not lock IndexWriter isLocked [{}]", lockFailure, stillLocked);
        throw lockFailure;
    }
}

From source file:org.elasticsearch.index.engine.InternalEngine.java

License:Apache License

/**
 * Creates the {@code IndexWriter} for this engine's store directory from the engine
 * configuration.
 *
 * @param create {@code true} to open in CREATE mode, {@code false} to open in APPEND mode
 * @return a newly opened writer
 * @throws IOException if the writer cannot be opened; a {@code LockObtainFailedException}
 *         is logged with the current lock state before being rethrown
 */
private IndexWriter createWriter(boolean create) throws IOException {
    try {
        final IndexWriterConfig writerConfig = new IndexWriterConfig(engineConfig.getAnalyzer());
        writerConfig.setCommitOnClose(false); // we by default don't commit on close
        if (create) {
            writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        } else {
            writerConfig.setOpenMode(IndexWriterConfig.OpenMode.APPEND);
        }
        writerConfig.setIndexDeletionPolicy(deletionPolicy);

        // with tests.verbose, lucene sets this up: plumb to align with filesystem stream
        boolean verboseTests = false;
        try {
            verboseTests = Boolean.parseBoolean(System.getProperty("tests.verbose"));
        } catch (Throwable ignore) {
        }
        if (verboseTests) {
            writerConfig.setInfoStream(InfoStream.getDefault());
        } else {
            writerConfig.setInfoStream(new LoggerInfoStream(logger));
        }

        writerConfig.setMergeScheduler(mergeScheduler);
        // Wrapping the configured policy gives us the opportunity to upgrade old segments
        // while performing background merges.
        writerConfig.setMergePolicy(new ElasticsearchMergePolicy(config().getMergePolicy()));
        writerConfig.setSimilarity(engineConfig.getSimilarity());
        writerConfig.setRAMBufferSizeMB(engineConfig.getIndexingBufferSize().mbFrac());
        writerConfig.setCodec(engineConfig.getCodec());
        /* We set this timeout to a highish value to work around
         * the default poll interval in the Lucene lock that is
         * 1000ms by default. We might need to poll multiple times
         * here but with 1s poll this is only executed twice at most
         * in combination with the default writelock timeout*/
        writerConfig.setWriteLockTimeout(5000);
        writerConfig.setUseCompoundFile(this.engineConfig.isCompoundOnFlush());

        // Warm-up hook for newly-merged segments. Warming up segments here is better since
        // it will be performed at the end of the merge operation and won't slow down _refresh
        final IndexReaderWarmer mergedSegmentWarmer = new IndexReaderWarmer() {
            @Override
            public void warm(LeafReader reader) throws IOException {
                try {
                    final LeafReader wrappedReader = new ElasticsearchLeafReader(reader, shardId);
                    assert isMergedSegment(wrappedReader);
                    if (warmer == null) {
                        return;
                    }
                    final Engine.Searcher warmSearcher = new Searcher("warmer",
                            searcherFactory.newSearcher(wrappedReader, null));
                    warmer.warmNewReaders(new IndicesWarmer.WarmerContext(shardId, warmSearcher));
                } catch (Throwable failure) {
                    // Don't fail a merge if the warm-up failed
                    if (!isClosed.get()) {
                        logger.warn("Warm-up failed", failure);
                    }
                    // assertion/out-of-memory error, don't ignore those
                    if (failure instanceof Error) {
                        throw (Error) failure;
                    }
                }
            }
        };
        writerConfig.setMergedSegmentWarmer(mergedSegmentWarmer);

        return new IndexWriter(store.directory(), writerConfig);
    } catch (LockObtainFailedException lockFailure) {
        final boolean stillLocked = IndexWriter.isLocked(store.directory());
        logger.warn("Could not lock IndexWriter isLocked [{}]", lockFailure, stillLocked);
        throw lockFailure;
    }
}

From source file:org.elasticsearch.index.engine.robin.RobinEngine.java

License:Apache License

/**
 * Creates the {@code IndexWriter} for this engine's store directory, first releasing a
 * write lock if the directory is reported as locked.
 *
 * @return a newly opened writer
 * @throws IOException if any step fails; the exception is rethrown after
 *         {@code safeClose} has been invoked on the writer reference
 */
private IndexWriter createWriter() throws IOException {
    IndexWriter writer = null;
    try {
        // release locks when started
        if (IndexWriter.isLocked(store.directory())) {
            logger.warn("shard is locked, releasing lock");
            IndexWriter.unlock(store.directory());
        }

        final boolean indexMissing = !IndexReader.indexExists(store.directory());
        final IndexWriterConfig writerConfig = new IndexWriterConfig(Lucene.VERSION,
                analysisService.defaultIndexAnalyzer());
        if (indexMissing) {
            writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        } else {
            writerConfig.setOpenMode(IndexWriterConfig.OpenMode.APPEND);
        }
        writerConfig.setIndexDeletionPolicy(deletionPolicy);
        writerConfig.setMergeScheduler(mergeScheduler.newMergeScheduler());
        writerConfig.setMergePolicy(mergePolicyProvider.newMergePolicy());
        writerConfig.setSimilarity(similarityService.defaultIndexSimilarity());
        writerConfig.setRAMBufferSizeMB(indexingBufferSize.mbFrac());
        writerConfig.setTermIndexInterval(termIndexInterval);
        writerConfig.setReaderTermsIndexDivisor(termIndexDivisor);
        writerConfig.setMaxThreadStates(indexConcurrency);

        writer = new IndexWriter(store.directory(), writerConfig);
    } catch (IOException failure) {
        safeClose(writer);
        throw failure;
    }
    return writer;
}

From source file:org.exoplatform.services.jcr.impl.core.query.lucene.AbstractIndex.java

License:Apache License

/**
 * Provides the <code>IndexWriter</code> of this index, creating it on first use.
 * An open <code>IndexReader</code> is closed and discarded before the writer is handed out.
 * @return the <code>IndexWriter</code> on this index.
 * @throws IOException if the reader cannot be closed or the writer cannot be obtained.
 */
protected synchronized IndexWriter getIndexWriter() throws IOException {
    if (indexReader != null) {
        indexReader.close();
        log.debug("closing IndexReader.");
        indexReader = null;
    }

    // Reuse the writer when one has already been created.
    if (indexWriter != null) {
        return indexWriter;
    }

    final IndexWriterConfig writerConfig = new IndexWriterConfig(Version.LUCENE_30, analyzer);
    writerConfig.setSimilarity(similarity);

    // The compound-file flag can only be applied to a LogMergePolicy.
    if (!(writerConfig.getMergePolicy() instanceof LogMergePolicy)) {
        log.error("Can't set \"UseCompoundFile\". Merge policy is not an instance of LogMergePolicy. ");
    } else {
        ((LogMergePolicy) writerConfig.getMergePolicy()).setUseCompoundFile(useCompoundFile);
    }

    indexWriter = new IndexWriter(directory, writerConfig);
    setUseCompoundFile(useCompoundFile);
    indexWriter.setInfoStream(STREAM_LOGGER);
    return indexWriter;
}