List of usage examples for org.apache.lucene.index.IndexWriterConfig#getSimilarity()
@Override
public Similarity getSimilarity()
From source file:edu.umass.cs.ciir.IndexFromGalago.java
License:Open Source License
public static void main(String[] args) throws Exception { Parameters argp = Parameters.parseArgs(args); String galagoIndexPath = null; String luceneIndexPath = null; try {/* w ww . j a v a 2 s .c om*/ galagoIndexPath = argp.getString("galagoIndex"); luceneIndexPath = argp.getString("luceneIndex"); } catch (Exception e) { System.out.println(getUsage()); return; } logger.setUseParentHandlers(false); FileHandler lfh = new FileHandler("indexing-errors.log"); SimpleFormatter formatter = new SimpleFormatter(); lfh.setFormatter(formatter); logger.addHandler(lfh); final DiskIndex index = new DiskIndex(argp.get("index", galagoIndexPath)); final CorpusReader corpus = (CorpusReader) index.getIndexPart("corpus"); long total = corpus.getManifest().getLong("keyCount"); final CorpusReader.KeyIterator iterator = corpus.getIterator(); final Document.DocumentComponents dcp = Document.DocumentComponents.JustText; // Analyzer includes options for text processing Analyzer analyzer = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName) { // Step 1: tokenization (Lucene's StandardTokenizer is suitable for most text retrieval occasions) TokenStreamComponents ts = new TokenStreamComponents(new StandardTokenizer()); // Step 2: transforming all tokens into lowercased ones ts = new Analyzer.TokenStreamComponents(ts.getTokenizer(), new LowerCaseFilter(ts.getTokenStream())); // Step 3: whether to remove stop words // Uncomment the following line to remove stop words // ts = new TokenStreamComponents( ts.getTokenizer(), new StopwordsFilter( ts.getTokenStream(), StandardAnalyzer.ENGLISH_STOP_WORDS_SET ) ); // Step 4: whether to apply stemming // Uncomment the following line to apply Krovetz or Porter stemmer // ts = new TokenStreamComponents( ts.getTokenizer(), new KStemFilter( ts.getTokenStream() ) ); // ts = new TokenStreamComponents( ts.getTokenizer(), new PorterStemFilter( ts.getTokenStream() ) ); return ts; } }; try (final FSDirectory dir = 
FSDirectory.open(Paths.get(argp.get("output", luceneIndexPath)))) { final IndexWriterConfig cfg = new IndexWriterConfig(analyzer); System.out.println("Similarity: " + cfg.getSimilarity()); cfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE); try (IndexWriter writer = new IndexWriter(dir, cfg)) { iterator.forAllKeyStrings(docId -> { try { Document document = iterator.getDocument(dcp); String text = document.text; String id = document.name; System.out.println("Processing document: " + id); org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document(); doc.add(new StringField("id", id, Field.Store.YES)); // this stores the actual text with tags so formatting is preserved doc.add(new StoredField("body", text)); org.jsoup.nodes.Document jsoup = Jsoup.parse(text); // tokens of the document FieldType fieldTypeText = new FieldType(); fieldTypeText.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); fieldTypeText.setStoreTermVectors(true); fieldTypeText.setStoreTermVectorPositions(true); fieldTypeText.setTokenized(true); fieldTypeText.setStored(false); fieldTypeText.freeze(); doc.add(new Field("tokens", jsoup.text(), fieldTypeText)); try { writer.addDocument(doc); System.out.println("Doc count: " + writer.numDocs()); } catch (IOException e) { logger.log(Level.WARNING, "Pull-Document-Exception", e); System.err.println(e.toString()); } } catch (Exception e) { logger.log(Level.WARNING, "Pull-Document-Exception", e); System.err.println(e.toString()); } }); } } System.out.println("Indexing Done. "); }
From source file:org.elasticsearch.index.engine.EngineTestCase.java
License:Apache License
public EngineConfig config(IndexSettings indexSettings, Store store, Path translogPath, MergePolicy mergePolicy, ReferenceManager.RefreshListener refreshListener, Sort indexSort, LongSupplier globalCheckpointSupplier) { IndexWriterConfig iwc = newIndexWriterConfig(); TranslogConfig translogConfig = new TranslogConfig(shardId, translogPath, indexSettings, BigArrays.NON_RECYCLING_INSTANCE); Engine.EventListener listener = new Engine.EventListener() { @Override//from www . j ava 2 s . c om public void onFailedEngine(String reason, @Nullable Exception e) { // we don't need to notify anybody in this test } }; final TranslogHandler handler = new TranslogHandler(xContentRegistry(), IndexSettingsModule.newIndexSettings(shardId.getIndexName(), indexSettings.getSettings())); final List<ReferenceManager.RefreshListener> refreshListenerList = refreshListener == null ? emptyList() : Collections.singletonList(refreshListener); EngineConfig config = new EngineConfig(shardId, allocationId.getId(), threadPool, indexSettings, null, store, mergePolicy, iwc.getAnalyzer(), iwc.getSimilarity(), new CodecService(null, logger), listener, IndexSearcher.getDefaultQueryCache(), IndexSearcher.getDefaultQueryCachingPolicy(), translogConfig, TimeValue.timeValueMinutes(5), refreshListenerList, Collections.emptyList(), indexSort, handler, new NoneCircuitBreakerService(), globalCheckpointSupplier == null ? new ReplicationTracker(shardId, allocationId.getId(), indexSettings, SequenceNumbers.NO_OPS_PERFORMED) : globalCheckpointSupplier); return config; }
From source file:org.elasticsearch.index.engine.InternalEngineTests.java
License:Apache License
public EngineConfig config(Settings indexSettings, Store store, Path translogPath, MergeSchedulerConfig mergeSchedulerConfig, MergePolicy mergePolicy, IndexSearcherWrapper... wrappers) { IndexWriterConfig iwc = newIndexWriterConfig(); TranslogConfig translogConfig = new TranslogConfig(shardId, translogPath, indexSettings, Translog.Durabilty.REQUEST, BigArrays.NON_RECYCLING_INSTANCE, threadPool); EngineConfig config = new EngineConfig(shardId, threadPool, new ShardIndexingService(shardId, indexSettings), indexSettings, null, store, createSnapshotDeletionPolicy(), mergePolicy, mergeSchedulerConfig, iwc.getAnalyzer(), iwc.getSimilarity(), new CodecService(shardId.index()), new Engine.FailedEngineListener() { @Override//from w ww.j a v a 2 s. c om public void onFailedEngine(ShardId shardId, String reason, @Nullable Throwable t) { // we don't need to notify anybody in this test } }, new TranslogHandler(shardId.index().getName(), logger), IndexSearcher.getDefaultQueryCache(), IndexSearcher.getDefaultQueryCachingPolicy(), new IndexSearcherWrappingService(new HashSet<>(Arrays.asList(wrappers))), translogConfig); try { config.setCreate(Lucene.indexExists(store.directory()) == false); } catch (IOException e) { throw new ElasticsearchException("can't find index?", e); } return config; }
From source file:org.elasticsearch.index.engine.ShadowEngineTests.java
License:Apache License
public EngineConfig config(IndexSettingsService indexSettingsService, Store store, Translog translog, MergeSchedulerProvider mergeSchedulerProvider) { IndexWriterConfig iwc = newIndexWriterConfig(Lucene.STANDARD_ANALYZER); EngineConfig config = new EngineConfig(shardId, false/*per default optimization for auto generated ids is disabled*/, threadPool, new ShardIndexingService(shardId, EMPTY_SETTINGS, new ShardSlowLogIndexingService(shardId, EMPTY_SETTINGS, indexSettingsService)), indexSettingsService, null, store, createSnapshotDeletionPolicy(), translog, createMergePolicy(), mergeSchedulerProvider, iwc.getAnalyzer(), iwc.getSimilarity(), new CodecService(shardId.index()), new Engine.FailedEngineListener() { @Override/*from ww w. j av a 2s .c o m*/ public void onFailedEngine(ShardId shardId, String reason, @Nullable Throwable t) { // we don't need to notify anybody in this test } }); return config; }
From source file:org.elasticsearch.index.shard.IndexSearcherWrapperTests.java
License:Apache License
public void testIsCacheable() throws IOException { Directory dir = newDirectory();// w ww. java 2 s . c om IndexWriterConfig iwc = newIndexWriterConfig(); IndexWriter writer = new IndexWriter(dir, iwc); Document doc = new Document(); doc.add(new StringField("id", "1", random().nextBoolean() ? Field.Store.YES : Field.Store.NO)); doc.add(new TextField("field", "doc", random().nextBoolean() ? Field.Store.YES : Field.Store.NO)); writer.addDocument(doc); DirectoryReader open = ElasticsearchDirectoryReader.wrap(DirectoryReader.open(writer, true), new ShardId("foo", 1)); IndexSearcher searcher = new IndexSearcher(open); assertEquals(1, searcher.search(new TermQuery(new Term("field", "doc")), 1).totalHits); searcher.setSimilarity(iwc.getSimilarity()); final AtomicInteger closeCalls = new AtomicInteger(0); IndexSearcherWrapper wrapper = new IndexSearcherWrapper() { @Override public DirectoryReader wrap(DirectoryReader reader) throws IOException { return new FieldMaskingReader("field", reader, closeCalls); } @Override public IndexSearcher wrap(EngineConfig engineConfig, IndexSearcher searcher) throws EngineException { return searcher; } }; final ConcurrentHashMap<Object, TopDocs> cache = new ConcurrentHashMap<>(); try (Engine.Searcher engineSearcher = new Engine.Searcher("foo", searcher)) { try (final Engine.Searcher wrap = new IndexSearcherWrappingService(Collections.singleton(wrapper)) .wrap(ENGINE_CONFIG, engineSearcher)) { ElasticsearchDirectoryReader.addReaderCloseListener(wrap.getDirectoryReader(), new IndexReader.ReaderClosedListener() { @Override public void onClose(IndexReader reader) throws IOException { cache.remove(reader.getCoreCacheKey()); } }); TopDocs search = wrap.searcher().search(new TermQuery(new Term("field", "doc")), 1); cache.put(wrap.reader().getCoreCacheKey(), search); } } assertEquals(1, closeCalls.get()); assertEquals(1, cache.size()); IOUtils.close(open, writer, dir); assertEquals(0, cache.size()); assertEquals(1, closeCalls.get()); }
From source file:org.elasticsearch.index.shard.IndexSearcherWrapperTests.java
License:Apache License
public void testNoWrap() throws IOException { Directory dir = newDirectory();// ww w . j a v a 2s . c o m IndexWriterConfig iwc = newIndexWriterConfig(); IndexWriter writer = new IndexWriter(dir, iwc); Document doc = new Document(); doc.add(new StringField("id", "1", random().nextBoolean() ? Field.Store.YES : Field.Store.NO)); doc.add(new TextField("field", "doc", random().nextBoolean() ? Field.Store.YES : Field.Store.NO)); writer.addDocument(doc); DirectoryReader open = ElasticsearchDirectoryReader.wrap(DirectoryReader.open(writer, true), new ShardId("foo", 1)); IndexSearcher searcher = new IndexSearcher(open); assertEquals(1, searcher.search(new TermQuery(new Term("field", "doc")), 1).totalHits); searcher.setSimilarity(iwc.getSimilarity()); IndexSearcherWrapper wrapper = new IndexSearcherWrapper() { @Override public DirectoryReader wrap(DirectoryReader reader) throws IOException { return reader; } @Override public IndexSearcher wrap(EngineConfig engineConfig, IndexSearcher searcher) throws EngineException { return searcher; } }; try (Engine.Searcher engineSearcher = new Engine.Searcher("foo", searcher)) { final Engine.Searcher wrap = new IndexSearcherWrappingService(Collections.singleton(wrapper)) .wrap(ENGINE_CONFIG, engineSearcher); assertSame(wrap, engineSearcher); } IOUtils.close(open, writer, dir); }
From source file:org.elasticsearch.index.shard.IndexSearcherWrapperTests.java
License:Apache License
public void testWrappedReaderMustDelegateCoreCacheKey() throws IOException { Directory dir = newDirectory();//from w w w . j a va 2 s. com IndexWriterConfig iwc = newIndexWriterConfig(); IndexWriter writer = new IndexWriter(dir, iwc); Document doc = new Document(); doc.add(new StringField("id", "1", random().nextBoolean() ? Field.Store.YES : Field.Store.NO)); doc.add(new TextField("field", "doc", random().nextBoolean() ? Field.Store.YES : Field.Store.NO)); writer.addDocument(doc); DirectoryReader open = ElasticsearchDirectoryReader.wrap(DirectoryReader.open(writer, true), new ShardId("foo", 1)); IndexSearcher searcher = new IndexSearcher(open); assertEquals(1, searcher.search(new TermQuery(new Term("field", "doc")), 1).totalHits); searcher.setSimilarity(iwc.getSimilarity()); IndexSearcherWrapper wrapper = new IndexSearcherWrapper() { @Override public DirectoryReader wrap(DirectoryReader reader) throws IOException { return new BrokenWrapper(reader, false); } @Override public IndexSearcher wrap(EngineConfig engineConfig, IndexSearcher searcher) throws EngineException { return searcher; } }; try (Engine.Searcher engineSearcher = new Engine.Searcher("foo", searcher)) { try { new IndexSearcherWrappingService(Collections.singleton(wrapper)).wrap(ENGINE_CONFIG, engineSearcher); fail("reader must delegate cache key"); } catch (IllegalStateException ex) { // all is well } } wrapper = new IndexSearcherWrapper() { @Override public DirectoryReader wrap(DirectoryReader reader) throws IOException { return new BrokenWrapper(reader, true); } @Override public IndexSearcher wrap(EngineConfig engineConfig, IndexSearcher searcher) throws EngineException { return searcher; } }; try (Engine.Searcher engineSearcher = new Engine.Searcher("foo", searcher)) { try { new IndexSearcherWrappingService(Collections.singleton(wrapper)).wrap(ENGINE_CONFIG, engineSearcher); fail("reader must delegate cache key"); } catch (IllegalStateException ex) { // all is well } } IOUtils.close(open, writer, 
dir); }
From source file:org.elasticsearch.index.shard.RefreshListenersTests.java
License:Apache License
@Before public void setupListeners() throws Exception { // Setup dependencies of the listeners maxListeners = randomIntBetween(1, 1000); listeners = new RefreshListeners(() -> maxListeners, () -> engine.refresh("too-many-listeners"), // Immediately run listeners rather than adding them to the listener thread pool like IndexShard does to simplify the test. Runnable::run, logger); // Now setup the InternalEngine which is much more complicated because we aren't mocking anything threadPool = new TestThreadPool(getTestName()); IndexSettings indexSettings = IndexSettingsModule.newIndexSettings("index", Settings.EMPTY); ShardId shardId = new ShardId(new Index("index", "_na_"), 1); Directory directory = newDirectory(); DirectoryService directoryService = new DirectoryService(shardId, indexSettings) { @Override/*from ww w .j a va2 s.c o m*/ public Directory newDirectory() throws IOException { return directory; } @Override public long throttleTimeInNanos() { return 0; } }; store = new Store(shardId, indexSettings, directoryService, new DummyShardLock(shardId)); IndexWriterConfig iwc = newIndexWriterConfig(); TranslogConfig translogConfig = new TranslogConfig(shardId, createTempDir("translog"), indexSettings, BigArrays.NON_RECYCLING_INSTANCE); Engine.EventListener eventListener = new Engine.EventListener() { @Override public void onFailedEngine(String reason, @Nullable Exception e) { // we don't need to notify anybody in this test } }; EngineConfig config = new EngineConfig(EngineConfig.OpenMode.CREATE_INDEX_AND_TRANSLOG, shardId, threadPool, indexSettings, null, store, new SnapshotDeletionPolicy(new KeepOnlyLastCommitDeletionPolicy()), newMergePolicy(), iwc.getAnalyzer(), iwc.getSimilarity(), new CodecService(null, logger), eventListener, new TranslogHandler(shardId.getIndexName(), logger), IndexSearcher.getDefaultQueryCache(), IndexSearcher.getDefaultQueryCachingPolicy(), translogConfig, TimeValue.timeValueMinutes(5), listeners); engine = new InternalEngine(config); }