Java tutorial
/* * Copyright 2013 The greplin-lucene-utils Authors. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.greplin.lucene.filter; import com.google.common.base.Joiner; import com.greplin.lucene.query.BooleanQueryBuilder; import com.greplin.lucene.util.FilterIntersectionProvider; import org.apache.lucene.analysis.WhitespaceAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.Term; import org.apache.lucene.search.FilteredQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.TermQuery; import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.util.Version; import java.io.IOException; import java.util.Random; /** * Random benchmark for the phrase filter. */ public class PhraseFilterBenchmark { private static final Random RANDOM = new Random(); private static final int TOTAL_DOCS = 10000; private static final int AVERAGE_WORDS_PER_DOC = 100; private static final int WORDS_PER_DOC_DEVIATION = 95; private static final int SECOND_FIELD_MATCH_PERCENTAGE = 50; private static final int ROUNDS = 4; private static final int TOTAL_QUERIES = 5000; private static final int[] WORDS_PER_QUERY = { 2, 2, 2, 2, 3, 3, 4, 5 }; private static final String FIELD = "f"; private static final String[] WORDS = ("the quick brown fox jumps over the lazy dog" + " a stitch in time saves nine" + " an apple a day keeps the doctor away" + " two wrongs do not make a right" + " the pen is mightier than the sword" + " the squeaky wheel gets the grease" + " no man is an island" + " fortune favors the bold" + " people who live in glass houses should not throw stones" + " better late than never" + " hope for the best but prepare for the worst" + " birds of a feather flock together" + " keep your friends close and your enemies closer" + " a picture is worth a thousand words" + " there is no such thing as a free lunch" + " there is no place like home" + " discretion is the greater part of valor" + " the early bird catches the worm" + " never look a gift horse in the mouth" + " you cannot make an omelet without breaking a few eggs" + " you cannot always get what you want" + " cleanliness is next to godliness" + " a watched pot never boils" + " beggars cannot be choosers" + " actions speak louder than words" + " if it is not broke, do not fix it" + " practice makes perfect" + " too many cooks spoil the broth" + " easy come easy go" + " do not bite the hand that feeds you" + " all good things must come to an end" + " if you cannot beat them, join them" + " there is no time like the present" + " beauty is in the eye of the beholder" + " necessity is the mother of invention" + " a penny saved is a penny earned" + " familiarity breeds contempt" + " you cannot judge a book by its cover" + " good things come to those who wait" + " do not put all your eggs in one basket" + " two heads are better than one" + " the grass is always greener on the other side of the hill" + " do unto others as you would have them do unto you" + " a chain is only as strong as its weakest link" + " honesty is the best policy" + " absence makes the heart grow fonder" + " you can lead a horse to water but you cannot make him drink" + " do not count your chickens before they hatch" + " if you want something done right you have to do it yourself").split(" "); private static final int NUMBER_OF_SEGMENTS = 4; private static String[] words(int count) { String[] words = new String[count]; for (int w = 0; w < count; w++) { words[w] = WORDS[RANDOM.nextInt(WORDS.length)]; } return words; } public static void main(String[] argv) { Directory directory = new RAMDirectory(); try { IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_32, new WhitespaceAnalyzer(Version.LUCENE_32))); int done = 0; for (int i = 0; i < NUMBER_OF_SEGMENTS; i++) { int remaining = NUMBER_OF_SEGMENTS - i; int numberOfDocs; if (remaining == 1) { numberOfDocs = TOTAL_DOCS - done; } else { numberOfDocs = RANDOM.nextInt(TOTAL_DOCS - done - remaining) + 1; } done += numberOfDocs; System.out.println("Segment #" + i + " has " + numberOfDocs + " docs"); for (int d = 0; d < numberOfDocs; d++) { int wordCount = RANDOM.nextInt(WORDS_PER_DOC_DEVIATION * 2) + AVERAGE_WORDS_PER_DOC - WORDS_PER_DOC_DEVIATION; Document doc = new Document(); doc.add(new Field("f", Joiner.on(' ').join(words(wordCount)), Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field("second", RANDOM.nextInt(100) < SECOND_FIELD_MATCH_PERCENTAGE ? "yes" : "no", Field.Store.NO, Field.Index.ANALYZED)); writer.addDocument(doc); } writer.commit(); } writer.close(); IndexReader reader = IndexReader.open(directory); IndexSearcher searcher = new IndexSearcher(reader); String[][] queries = new String[TOTAL_QUERIES][]; Term[][] terms = new Term[TOTAL_QUERIES][]; for (int q = 0; q < TOTAL_QUERIES; q++) { queries[q] = words(WORDS_PER_QUERY[RANDOM.nextInt(WORDS_PER_QUERY.length)]); terms[q] = new Term[queries[q].length]; for (int qw = 0; qw < queries[q].length; qw++) { terms[q][qw] = new Term(FIELD, queries[q][qw]); } } // Warm up. new PhraseFilter(FIELD, queries[0]).getDocIdSet(reader); for (int round = 0; round < ROUNDS; round++) { System.out.println(); String name1 = "filter"; String name2 = "query"; long ms1 = 0, ms2 = 0; for (int step = 0; step < 2; step++) { System.gc(); System.gc(); System.gc(); if (step == (round & 1)) { long millis = System.currentTimeMillis(); long hits = 0; for (String[] queryWords : queries) { PhraseFilter pf = new PhraseFilter( new FilterIntersectionProvider(TermsFilter.from(new Term("second", "yes"))), FIELD, queryWords); hits += searcher.search(new FilteredQuery(new MatchAllDocsQuery(), pf), 1).totalHits; } ms1 = System.currentTimeMillis() - millis; System.out.println("Finished " + name1 + " in " + ms1 + "ms with " + hits + " hits"); } else { long millis = System.currentTimeMillis(); long hits = 0; for (Term[] queryTerms : terms) { PhraseQuery pq = new PhraseQuery(); for (Term term : queryTerms) { pq.add(term); } Query query = BooleanQueryBuilder.builder() .must(new TermQuery(new Term("second", "yes"))).must(pq).build(); hits += searcher.search(query, 1).totalHits; } ms2 = System.currentTimeMillis() - millis; System.out.println("Finished " + name2 + " in " + ms2 + "ms with " + hits + " hits"); } } System.out.println(name1 + " took " + (int) ((100.0 * ms1) / ms2) + "% as much time as " + name2); } } catch (IOException e) { e.printStackTrace(); } } }