Example usage for org.apache.lucene.index IndexWriterConfig getSimilarity

List of usage examples for org.apache.lucene.index IndexWriterConfig getSimilarity

Introduction

On this page you can find example usages of org.apache.lucene.index.IndexWriterConfig.getSimilarity().

Prototype

@Override
    public Similarity getSimilarity() 

Source Link

Usage

From source file:edu.umass.cs.ciir.IndexFromGalago.java

License:Open Source License

/**
 * Copies the documents of a Galago index into a newly created Lucene index.
 *
 * <p>The {@code galagoIndex} and {@code luceneIndex} arguments are read first; if either cannot
 * be read, the usage text is printed and the method returns. For every key of the Galago
 * {@code corpus} index part a Lucene document is written with three fields: {@code id} (the
 * document name), {@code body} (the raw text, stored only) and {@code tokens} (the text
 * extracted by jsoup, indexed with positions and term vectors). A failure on a single document
 * is logged to {@code indexing-errors.log}, echoed to standard error, and does not stop the run.
 *
 * @param args command-line arguments, parsed by {@code Parameters.parseArgs}
 * @throws Exception if argument parsing, opening the log file or either index, or closing the
 *         writer or directory fails
 */
public static void main(String[] args) throws Exception {
    Parameters argp = Parameters.parseArgs(args);
    String galagoIndexPath = null;
    String luceneIndexPath = null;
    try {
        galagoIndexPath = argp.getString("galagoIndex");
        luceneIndexPath = argp.getString("luceneIndex");
    } catch (Exception e) {
        // One of the two arguments could not be read: show the usage text and stop.
        System.out.println(getUsage());
        return;
    }

    // Per-document errors go to a dedicated log file instead of the parent handlers.
    logger.setUseParentHandlers(false);
    FileHandler lfh = new FileHandler("indexing-errors.log");
    try {
        lfh.setFormatter(new SimpleFormatter());
        logger.addHandler(lfh);

        // NOTE(review): "index" / "output" look like optional overrides that fall back to the
        // paths read above — confirm against Parameters.get.
        final DiskIndex index = new DiskIndex(argp.get("index", galagoIndexPath));
        final CorpusReader corpus = (CorpusReader) index.getIndexPart("corpus");
        // Key count recorded in the corpus manifest; read here but not used further below.
        long total = corpus.getManifest().getLong("keyCount");
        final CorpusReader.KeyIterator iterator = corpus.getIterator();

        final Document.DocumentComponents dcp = Document.DocumentComponents.JustText;
        // Analyzer includes options for text processing
        Analyzer analyzer = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                // Step 1: tokenization (Lucene's StandardTokenizer is suitable for most text retrieval occasions)
                TokenStreamComponents ts = new TokenStreamComponents(new StandardTokenizer());
                // Step 2: transforming all tokens into lowercased ones
                ts = new Analyzer.TokenStreamComponents(ts.getTokenizer(),
                        new LowerCaseFilter(ts.getTokenStream()));
                // Step 3: whether to remove stop words
                // Uncomment the following line to remove stop words
                // ts = new TokenStreamComponents( ts.getTokenizer(), new StopwordsFilter( ts.getTokenStream(), StandardAnalyzer.ENGLISH_STOP_WORDS_SET ) );
                // Step 4: whether to apply stemming
                // Uncomment the following line to apply Krovetz or Porter stemmer
                // ts = new TokenStreamComponents( ts.getTokenizer(), new KStemFilter( ts.getTokenStream() ) );
                // ts = new TokenStreamComponents( ts.getTokenizer(), new PorterStemFilter( ts.getTokenStream() ) );
                return ts;
            }
        };

        // Field type for the document tokens. It is identical for every document and frozen,
        // so it is built once here rather than once per document.
        final FieldType fieldTypeText = new FieldType();
        fieldTypeText.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
        fieldTypeText.setStoreTermVectors(true);
        fieldTypeText.setStoreTermVectorPositions(true);
        fieldTypeText.setTokenized(true);
        fieldTypeText.setStored(false);
        fieldTypeText.freeze();

        try (final FSDirectory dir = FSDirectory.open(Paths.get(argp.get("output", luceneIndexPath)))) {
            final IndexWriterConfig cfg = new IndexWriterConfig(analyzer);
            System.out.println("Similarity: " + cfg.getSimilarity());
            cfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
            try (IndexWriter writer = new IndexWriter(dir, cfg)) {
                iterator.forAllKeyStrings(docId -> {
                    try {
                        Document document = iterator.getDocument(dcp);

                        String text = document.text;
                        String id = document.name;
                        System.out.println("Processing document: " + id);
                        org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document();
                        doc.add(new StringField("id", id, Field.Store.YES));
                        // this stores the actual text with tags so formatting is preserved
                        doc.add(new StoredField("body", text));
                        // tokens of the document: the text content as extracted by jsoup
                        doc.add(new Field("tokens", Jsoup.parse(text).text(), fieldTypeText));

                        writer.addDocument(doc);
                        System.out.println("Doc count: " + writer.numDocs());
                    } catch (Exception e) {
                        // Covers both reading the Galago document and writing the Lucene one;
                        // the failure is recorded and the next key is processed.
                        logger.log(Level.WARNING, "Pull-Document-Exception", e);
                        System.err.println(e.toString());
                    }
                });

            }
        }

        System.out.println("Indexing Done. ");
    } finally {
        // Detach and close the handler so the log file (and its lock file) is released.
        logger.removeHandler(lfh);
        lfh.close();
    }
}

From source file:org.elasticsearch.index.engine.EngineTestCase.java

License:Apache License

/**
 * Assembles an {@code EngineConfig} for tests.
 *
 * <p>The analyzer and similarity are taken from a fresh {@code newIndexWriterConfig()}, and the
 * engine failure listener does nothing.
 *
 * @param indexSettings            settings passed to the translog config, translog handler and engine config
 * @param store                    store handed to the engine config
 * @param translogPath             location used for the translog config
 * @param mergePolicy              merge policy handed to the engine config
 * @param refreshListener          optional refresh listener; {@code null} yields an empty listener list
 * @param indexSort                index sort handed to the engine config
 * @param globalCheckpointSupplier optional supplier; when {@code null} a new {@code ReplicationTracker}
 *                                 starting at {@code SequenceNumbers.NO_OPS_PERFORMED} is used instead
 * @return the assembled engine configuration
 */
public EngineConfig config(IndexSettings indexSettings, Store store, Path translogPath, MergePolicy mergePolicy,
        ReferenceManager.RefreshListener refreshListener, Sort indexSort,
        LongSupplier globalCheckpointSupplier) {
    final IndexWriterConfig writerConfig = newIndexWriterConfig();
    final TranslogConfig translogCfg = new TranslogConfig(shardId, translogPath, indexSettings,
            BigArrays.NON_RECYCLING_INSTANCE);
    final Engine.EventListener silentListener = new Engine.EventListener() {
        @Override
        public void onFailedEngine(String reason, @Nullable Exception e) {
            // we don't need to notify anybody in this test
        }
    };
    final TranslogHandler translogHandler = new TranslogHandler(xContentRegistry(),
            IndexSettingsModule.newIndexSettings(shardId.getIndexName(), indexSettings.getSettings()));

    // Zero or one refresh listener, depending on whether the caller supplied one.
    final List<ReferenceManager.RefreshListener> refreshListeners;
    if (refreshListener != null) {
        refreshListeners = Collections.singletonList(refreshListener);
    } else {
        refreshListeners = emptyList();
    }

    return new EngineConfig(shardId, allocationId.getId(), threadPool, indexSettings, null,
            store, mergePolicy, writerConfig.getAnalyzer(), writerConfig.getSimilarity(),
            new CodecService(null, logger), silentListener, IndexSearcher.getDefaultQueryCache(),
            IndexSearcher.getDefaultQueryCachingPolicy(), translogCfg, TimeValue.timeValueMinutes(5),
            refreshListeners, Collections.emptyList(), indexSort, translogHandler,
            new NoneCircuitBreakerService(),
            globalCheckpointSupplier != null
                    ? globalCheckpointSupplier
                    : new ReplicationTracker(shardId, allocationId.getId(), indexSettings,
                            SequenceNumbers.NO_OPS_PERFORMED));
}

From source file:org.elasticsearch.index.engine.InternalEngineTests.java

License:Apache License

/**
 * Assembles an {@code EngineConfig} for tests.
 *
 * <p>The analyzer and similarity are taken from a fresh {@code newIndexWriterConfig()}, and the
 * failed-engine listener does nothing. The config's create flag is set to {@code true} exactly
 * when {@code Lucene.indexExists} reports no index in the store's directory.
 *
 * @param indexSettings        settings used for the translog config, indexing service and engine config
 * @param store                store handed to the engine config and probed for an existing index
 * @param translogPath         location used for the translog config
 * @param mergeSchedulerConfig merge scheduler configuration handed to the engine config
 * @param mergePolicy          merge policy handed to the engine config
 * @param wrappers             searcher wrappers registered with the wrapping service
 * @return the assembled engine configuration
 * @throws ElasticsearchException if probing the store for an existing index fails with an {@code IOException}
 */
public EngineConfig config(Settings indexSettings, Store store, Path translogPath,
        MergeSchedulerConfig mergeSchedulerConfig, MergePolicy mergePolicy, IndexSearcherWrapper... wrappers) {
    final IndexWriterConfig writerConfig = newIndexWriterConfig();
    final TranslogConfig translogCfg = new TranslogConfig(shardId, translogPath, indexSettings,
            Translog.Durabilty.REQUEST, BigArrays.NON_RECYCLING_INSTANCE, threadPool);
    final Engine.FailedEngineListener silentListener = new Engine.FailedEngineListener() {
        @Override
        public void onFailedEngine(ShardId shardId, String reason, @Nullable Throwable t) {
            // we don't need to notify anybody in this test
        }
    };

    final EngineConfig engineConfig = new EngineConfig(shardId, threadPool,
            new ShardIndexingService(shardId, indexSettings), indexSettings, null, store,
            createSnapshotDeletionPolicy(), mergePolicy, mergeSchedulerConfig, writerConfig.getAnalyzer(),
            writerConfig.getSimilarity(), new CodecService(shardId.index()), silentListener,
            new TranslogHandler(shardId.index().getName(), logger), IndexSearcher.getDefaultQueryCache(),
            IndexSearcher.getDefaultQueryCachingPolicy(),
            new IndexSearcherWrappingService(new HashSet<>(Arrays.asList(wrappers))), translogCfg);
    try {
        // Create a new index only if the store does not already contain one.
        engineConfig.setCreate(!Lucene.indexExists(store.directory()));
    } catch (IOException e) {
        throw new ElasticsearchException("can't find index?", e);
    }
    return engineConfig;
}

From source file:org.elasticsearch.index.engine.ShadowEngineTests.java

License:Apache License

/**
 * Assembles an {@code EngineConfig} for tests.
 *
 * <p>The analyzer and similarity come from {@code newIndexWriterConfig(Lucene.STANDARD_ANALYZER)};
 * the failed-engine listener does nothing.
 *
 * @param indexSettingsService   settings service passed to the slow-log service and the engine config
 * @param store                  store handed to the engine config
 * @param translog               translog handed to the engine config
 * @param mergeSchedulerProvider merge scheduler provider handed to the engine config
 * @return the assembled engine configuration
 */
public EngineConfig config(IndexSettingsService indexSettingsService, Store store, Translog translog,
        MergeSchedulerProvider mergeSchedulerProvider) {
    final IndexWriterConfig writerConfig = newIndexWriterConfig(Lucene.STANDARD_ANALYZER);
    final Engine.FailedEngineListener silentListener = new Engine.FailedEngineListener() {
        @Override
        public void onFailedEngine(ShardId shardId, String reason, @Nullable Throwable t) {
            // we don't need to notify anybody in this test
        }
    };
    final ShardSlowLogIndexingService slowLogService = new ShardSlowLogIndexingService(shardId,
            EMPTY_SETTINGS, indexSettingsService);
    final ShardIndexingService indexingService = new ShardIndexingService(shardId, EMPTY_SETTINGS,
            slowLogService);

    return new EngineConfig(shardId,
            false /* per default optimization for auto generated ids is disabled */, threadPool,
            indexingService, indexSettingsService, null, store, createSnapshotDeletionPolicy(), translog,
            createMergePolicy(), mergeSchedulerProvider, writerConfig.getAnalyzer(),
            writerConfig.getSimilarity(), new CodecService(shardId.index()), silentListener);
}

From source file:org.elasticsearch.index.shard.IndexSearcherWrapperTests.java

License:Apache License

/**
 * Indexes one document, wraps the searcher with a wrapper that returns a {@code FieldMaskingReader},
 * and stores a search result in a map keyed by the wrapped reader's core cache key.
 *
 * <p>Asserted sequence: after the wrapped searcher is closed the close counter is 1 and the map
 * still holds one entry; after the underlying reader, writer and directory are closed the map is
 * empty and the close counter is still 1.
 */
public void testIsCacheable() throws IOException {
    final Directory directory = newDirectory();
    final IndexWriterConfig writerConfig = newIndexWriterConfig();
    final IndexWriter indexWriter = new IndexWriter(directory, writerConfig);

    // Single document; whether each field is stored is chosen at random.
    final Document document = new Document();
    document.add(new StringField("id", "1", random().nextBoolean() ? Field.Store.YES : Field.Store.NO));
    document.add(new TextField("field", "doc", random().nextBoolean() ? Field.Store.YES : Field.Store.NO));
    indexWriter.addDocument(document);

    final DirectoryReader shardReader = ElasticsearchDirectoryReader
            .wrap(DirectoryReader.open(indexWriter, true), new ShardId("foo", 1));
    final IndexSearcher indexSearcher = new IndexSearcher(shardReader);
    assertEquals(1, indexSearcher.search(new TermQuery(new Term("field", "doc")), 1).totalHits);
    indexSearcher.setSimilarity(writerConfig.getSimilarity());

    final AtomicInteger closeCalls = new AtomicInteger(0);
    final IndexSearcherWrapper maskingWrapper = new IndexSearcherWrapper() {
        @Override
        public DirectoryReader wrap(DirectoryReader reader) throws IOException {
            return new FieldMaskingReader("field", reader, closeCalls);
        }

        @Override
        public IndexSearcher wrap(EngineConfig engineConfig, IndexSearcher searcher) throws EngineException {
            return searcher;
        }
    };

    final ConcurrentHashMap<Object, TopDocs> cache = new ConcurrentHashMap<>();
    // Drops the entry belonging to a reader once that reader reports it was closed.
    final IndexReader.ReaderClosedListener evictOnClose = new IndexReader.ReaderClosedListener() {
        @Override
        public void onClose(IndexReader reader) throws IOException {
            cache.remove(reader.getCoreCacheKey());
        }
    };

    // Resources are closed in reverse order: the wrapped searcher first, then the engine searcher.
    try (Engine.Searcher engineSearcher = new Engine.Searcher("foo", indexSearcher);
            Engine.Searcher wrapped = new IndexSearcherWrappingService(Collections.singleton(maskingWrapper))
                    .wrap(ENGINE_CONFIG, engineSearcher)) {
        ElasticsearchDirectoryReader.addReaderCloseListener(wrapped.getDirectoryReader(), evictOnClose);
        final TopDocs hits = wrapped.searcher().search(new TermQuery(new Term("field", "doc")), 1);
        cache.put(wrapped.reader().getCoreCacheKey(), hits);
    }
    assertEquals(1, closeCalls.get());

    assertEquals(1, cache.size());
    IOUtils.close(shardReader, indexWriter, directory);
    assertEquals(0, cache.size());
    assertEquals(1, closeCalls.get());
}

From source file:org.elasticsearch.index.shard.IndexSearcherWrapperTests.java

License:Apache License

/**
 * Checks that a wrapper which returns both the reader and the searcher unchanged causes the
 * wrapping service to hand back the very same {@code Engine.Searcher} instance.
 */
public void testNoWrap() throws IOException {
    final Directory directory = newDirectory();
    final IndexWriterConfig writerConfig = newIndexWriterConfig();
    final IndexWriter indexWriter = new IndexWriter(directory, writerConfig);

    // Single document; whether each field is stored is chosen at random.
    final Document document = new Document();
    document.add(new StringField("id", "1", random().nextBoolean() ? Field.Store.YES : Field.Store.NO));
    document.add(new TextField("field", "doc", random().nextBoolean() ? Field.Store.YES : Field.Store.NO));
    indexWriter.addDocument(document);

    final DirectoryReader shardReader = ElasticsearchDirectoryReader
            .wrap(DirectoryReader.open(indexWriter, true), new ShardId("foo", 1));
    final IndexSearcher indexSearcher = new IndexSearcher(shardReader);
    assertEquals(1, indexSearcher.search(new TermQuery(new Term("field", "doc")), 1).totalHits);
    indexSearcher.setSimilarity(writerConfig.getSimilarity());

    // Wrapper that passes both the reader and the searcher through untouched.
    final IndexSearcherWrapper identityWrapper = new IndexSearcherWrapper() {
        @Override
        public DirectoryReader wrap(DirectoryReader reader) throws IOException {
            return reader;
        }

        @Override
        public IndexSearcher wrap(EngineConfig engineConfig, IndexSearcher searcher) throws EngineException {
            return searcher;
        }
    };

    try (Engine.Searcher engineSearcher = new Engine.Searcher("foo", indexSearcher)) {
        final IndexSearcherWrappingService wrappingService = new IndexSearcherWrappingService(
                Collections.singleton(identityWrapper));
        final Engine.Searcher result = wrappingService.wrap(ENGINE_CONFIG, engineSearcher);
        assertSame(result, engineSearcher);
    }
    IOUtils.close(shardReader, indexWriter, directory);
}

From source file:org.elasticsearch.index.shard.IndexSearcherWrapperTests.java

License:Apache License

/**
 * Checks that wrapping with a reader wrapper that returns a {@code BrokenWrapper} is rejected
 * with an {@code IllegalStateException}, for both values of the wrapper's boolean argument.
 */
public void testWrappedReaderMustDelegateCoreCacheKey() throws IOException {
    final Directory directory = newDirectory();
    final IndexWriterConfig writerConfig = newIndexWriterConfig();
    final IndexWriter indexWriter = new IndexWriter(directory, writerConfig);

    // Single document; whether each field is stored is chosen at random.
    final Document document = new Document();
    document.add(new StringField("id", "1", random().nextBoolean() ? Field.Store.YES : Field.Store.NO));
    document.add(new TextField("field", "doc", random().nextBoolean() ? Field.Store.YES : Field.Store.NO));
    indexWriter.addDocument(document);

    final DirectoryReader shardReader = ElasticsearchDirectoryReader
            .wrap(DirectoryReader.open(indexWriter, true), new ShardId("foo", 1));
    final IndexSearcher indexSearcher = new IndexSearcher(shardReader);
    assertEquals(1, indexSearcher.search(new TermQuery(new Term("field", "doc")), 1).totalHits);
    indexSearcher.setSimilarity(writerConfig.getSimilarity());

    // Same scenario twice: first with BrokenWrapper(reader, false), then with BrokenWrapper(reader, true).
    for (final boolean brokenWrapperFlag : new boolean[] { false, true }) {
        final IndexSearcherWrapper brokenReaderWrapper = new IndexSearcherWrapper() {
            @Override
            public DirectoryReader wrap(DirectoryReader reader) throws IOException {
                return new BrokenWrapper(reader, brokenWrapperFlag);
            }

            @Override
            public IndexSearcher wrap(EngineConfig engineConfig, IndexSearcher searcher) throws EngineException {
                return searcher;
            }
        };
        try (Engine.Searcher engineSearcher = new Engine.Searcher("foo", indexSearcher)) {
            try {
                new IndexSearcherWrappingService(Collections.singleton(brokenReaderWrapper))
                        .wrap(ENGINE_CONFIG, engineSearcher);
                fail("reader must delegate cache key");
            } catch (IllegalStateException ex) {
                // all is well
            }
        }
    }
    IOUtils.close(shardReader, indexWriter, directory);
}

From source file:org.elasticsearch.index.shard.RefreshListenersTests.java

License:Apache License

/**
 * Prepares the fixtures for each test: the {@code RefreshListeners} under test, a thread pool,
 * a store whose directory service always returns one pre-created directory, and an
 * {@code InternalEngine} built from an {@code EngineConfig} that receives those listeners.
 */
@Before
public void setupListeners() throws Exception {
    // Dependencies of the listeners come first.
    maxListeners = randomIntBetween(1, 1000);
    listeners = new RefreshListeners(() -> maxListeners, () -> engine.refresh("too-many-listeners"),
            // Listeners run immediately on the calling thread instead of being handed to a
            // listener thread pool as IndexShard does; this keeps the test simple.
            Runnable::run, logger);

    // The InternalEngine needs considerably more setup because nothing is mocked.
    threadPool = new TestThreadPool(getTestName());
    final IndexSettings settings = IndexSettingsModule.newIndexSettings("index", Settings.EMPTY);
    final ShardId shard = new ShardId(new Index("index", "_na_"), 1);
    final Directory backingDirectory = newDirectory();
    final DirectoryService fixedDirectoryService = new DirectoryService(shard, settings) {
        @Override
        public Directory newDirectory() throws IOException {
            return backingDirectory;
        }

        @Override
        public long throttleTimeInNanos() {
            return 0;
        }
    };
    store = new Store(shard, settings, fixedDirectoryService, new DummyShardLock(shard));

    final IndexWriterConfig writerConfig = newIndexWriterConfig();
    final TranslogConfig translogCfg = new TranslogConfig(shard, createTempDir("translog"), settings,
            BigArrays.NON_RECYCLING_INSTANCE);
    final Engine.EventListener silentListener = new Engine.EventListener() {
        @Override
        public void onFailedEngine(String reason, @Nullable Exception e) {
            // we don't need to notify anybody in this test
        }
    };
    final EngineConfig engineConfig = new EngineConfig(EngineConfig.OpenMode.CREATE_INDEX_AND_TRANSLOG, shard,
            threadPool, settings, null, store,
            new SnapshotDeletionPolicy(new KeepOnlyLastCommitDeletionPolicy()), newMergePolicy(),
            writerConfig.getAnalyzer(), writerConfig.getSimilarity(), new CodecService(null, logger),
            silentListener, new TranslogHandler(shard.getIndexName(), logger),
            IndexSearcher.getDefaultQueryCache(), IndexSearcher.getDefaultQueryCachingPolicy(), translogCfg,
            TimeValue.timeValueMinutes(5), listeners);
    engine = new InternalEngine(engineConfig);
}