com.greplin.lucene.filter.PhraseFilterBenchmark.java Source code

Java tutorial

Introduction

Here is the source code for com.greplin.lucene.filter.PhraseFilterBenchmark.java:

Source

/*
 * Copyright 2013 The greplin-lucene-utils Authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.greplin.lucene.filter;

import com.google.common.base.Joiner;
import com.greplin.lucene.query.BooleanQueryBuilder;
import com.greplin.lucene.util.FilterIntersectionProvider;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.FilteredQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

import java.io.IOException;
import java.util.Random;

/**
 * Random benchmark for the phrase filter.
 *
 * <p>Builds an in-memory index of randomly generated documents, committing once per
 * intended segment, then times the same set of random phrases two ways in every round:
 * <ul>
 *   <li>"filter": a {@code PhraseFilter} combined with a {@code TermsFilter} on the
 *       second field, applied to a {@code MatchAllDocsQuery};</li>
 *   <li>"query": a {@code PhraseQuery} combined with a {@code TermQuery} on the second
 *       field through {@code BooleanQueryBuilder}.</li>
 * </ul>
 * The order in which the two variants run alternates between rounds. Results are printed
 * to standard output. The random source is unseeded, so runs are not reproducible.
 */
public class PhraseFilterBenchmark {

    /** Shared random source for document and query generation. */
    private static final Random RANDOM = new Random();

    /** Total number of documents indexed across all segments. */
    private static final int TOTAL_DOCS = 10000;

    /** Centre of the range from which each document's word count is drawn. */
    private static final int AVERAGE_WORDS_PER_DOC = 100;

    /** Half-width of the word count range around {@link #AVERAGE_WORDS_PER_DOC}. */
    private static final int WORDS_PER_DOC_DEVIATION = 95;

    /** Percentage of documents whose second field holds the matching value. */
    private static final int SECOND_FIELD_MATCH_PERCENTAGE = 50;

    /** Number of benchmark rounds; each round times both variants once. */
    private static final int ROUNDS = 4;

    /** Number of random phrases evaluated per variant per round. */
    private static final int TOTAL_QUERIES = 5000;

    /** Phrase lengths to pick from; repeated entries make short phrases more likely. */
    private static final int[] WORDS_PER_QUERY = { 2, 2, 2, 2, 3, 3, 4, 5 };

    /** Name of the field that holds the random text and is searched by phrase. */
    private static final String FIELD = "f";

    /** Name of the second field, used to restrict results. */
    private static final String SECOND_FIELD = "second";

    /** Second-field value that both benchmark variants require. */
    private static final String SECOND_FIELD_MATCH = "yes";

    /** Second-field value given to documents that should not match. */
    private static final String SECOND_FIELD_MISS = "no";

    /** Label printed for the filter-based variant. */
    private static final String FILTER_NAME = "filter";

    /** Label printed for the query-based variant. */
    private static final String QUERY_NAME = "query";

    /** Conversion factor used when printing nanosecond timings as milliseconds. */
    private static final long NANOS_PER_MILLI = 1000000L;

    /**
     * Vocabulary for documents and queries. Tokens are split on single spaces only, so a
     * few of them keep trailing punctuation (for example "broke,").
     */
    private static final String[] WORDS = ("the quick brown fox jumps over the lazy dog"
            + " a stitch in time saves nine" + " an apple a day keeps the doctor away"
            + " two wrongs do not make a right" + " the pen is mightier than the sword"
            + " the squeaky wheel gets the grease" + " no man is an island" + " fortune favors the bold"
            + " people who live in glass houses should not throw stones" + " better late than never"
            + " hope for the best but prepare for the worst" + " birds of a feather flock together"
            + " keep your friends close and your enemies closer" + " a picture is worth a thousand words"
            + " there is no such thing as a free lunch" + " there is no place like home"
            + " discretion is the greater part of valor" + " the early bird catches the worm"
            + " never look a gift horse in the mouth" + " you cannot make an omelet without breaking a few eggs"
            + " you cannot always get what you want" + " cleanliness is next to godliness"
            + " a watched pot never boils" + " beggars cannot be choosers" + " actions speak louder than words"
            + " if it is not broke, do not fix it" + " practice makes perfect" + " too many cooks spoil the broth"
            + " easy come easy go" + " do not bite the hand that feeds you" + " all good things must come to an end"
            + " if you cannot beat them, join them" + " there is no time like the present"
            + " beauty is in the eye of the beholder" + " necessity is the mother of invention"
            + " a penny saved is a penny earned" + " familiarity breeds contempt"
            + " you cannot judge a book by its cover" + " good things come to those who wait"
            + " do not put all your eggs in one basket" + " two heads are better than one"
            + " the grass is always greener on the other side of the hill"
            + " do unto others as you would have them do unto you"
            + " a chain is only as strong as its weakest link" + " honesty is the best policy"
            + " absence makes the heart grow fonder"
            + " you can lead a horse to water but you cannot make him drink"
            + " do not count your chickens before they hatch"
            + " if you want something done right you have to do it yourself").split(" ");

    /** Number of commits (intended segments) the documents are spread over. */
    private static final int NUMBER_OF_SEGMENTS = 4;

    /**
     * Picks {@code count} words uniformly at random, with replacement, from {@link #WORDS}.
     *
     * @param count number of words to pick
     * @return a new array of {@code count} randomly chosen words
     */
    private static String[] words(int count) {
        String[] words = new String[count];
        for (int w = 0; w < count; w++) {
            words[w] = WORDS[RANDOM.nextInt(WORDS.length)];
        }
        return words;
    }

    /**
     * Runs the benchmark and prints the results to standard output.
     *
     * <p>The directory and reader are closed in {@code finally} blocks so they are
     * released even when indexing or searching fails. An {@link IOException} is reported
     * with a stack trace rather than propagated.
     *
     * @param argv ignored
     */
    public static void main(String[] argv) {
        try {
            Directory directory = new RAMDirectory();
            try {
                buildIndex(directory);
                IndexReader reader = IndexReader.open(directory);
                try {
                    runBenchmark(reader);
                } finally {
                    reader.close();
                }
            } finally {
                directory.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Fills {@code directory} with {@link #TOTAL_DOCS} random documents, committing once
     * per intended segment.
     *
     * @param directory the directory to index into
     * @throws IOException if writing to the index fails
     */
    private static void buildIndex(Directory directory) throws IOException {
        IndexWriter writer = new IndexWriter(directory,
                new IndexWriterConfig(Version.LUCENE_32, new WhitespaceAnalyzer(Version.LUCENE_32)));
        try {
            int done = 0;
            for (int i = 0; i < NUMBER_OF_SEGMENTS; i++) {
                int remaining = NUMBER_OF_SEGMENTS - i;
                int numberOfDocs;
                if (remaining == 1) {
                    // The last segment takes whatever is left so the total is exact.
                    numberOfDocs = TOTAL_DOCS - done;
                } else {
                    // Random size of at least 1 that leaves documents for the later segments.
                    numberOfDocs = RANDOM.nextInt(TOTAL_DOCS - done - remaining) + 1;
                }
                done += numberOfDocs;
                System.out.println("Segment #" + i + " has " + numberOfDocs + " docs");

                for (int d = 0; d < numberOfDocs; d++) {
                    writer.addDocument(randomDocument());
                }
                writer.commit();
            }
        } finally {
            writer.close();
        }
    }

    /**
     * Creates one random document: random text in {@link #FIELD} and a matching or
     * non-matching value in {@link #SECOND_FIELD}.
     *
     * @return the new document
     */
    private static Document randomDocument() {
        // Word count lies in [AVERAGE - DEVIATION, AVERAGE + DEVIATION).
        int wordCount = RANDOM.nextInt(WORDS_PER_DOC_DEVIATION * 2) + AVERAGE_WORDS_PER_DOC
                - WORDS_PER_DOC_DEVIATION;
        Document doc = new Document();
        // Use the FIELD constant so the indexed field always matches the searched one.
        doc.add(new Field(FIELD, Joiner.on(' ').join(words(wordCount)), Field.Store.YES,
                Field.Index.ANALYZED));
        String second = RANDOM.nextInt(100) < SECOND_FIELD_MATCH_PERCENTAGE
                ? SECOND_FIELD_MATCH : SECOND_FIELD_MISS;
        doc.add(new Field(SECOND_FIELD, second, Field.Store.NO, Field.Index.ANALYZED));
        return doc;
    }

    /**
     * Generates the random phrases and runs all benchmark rounds against {@code reader}.
     * The caller owns {@code reader} and is responsible for closing it.
     *
     * @param reader reader over the benchmark index
     * @throws IOException if a search fails
     */
    private static void runBenchmark(IndexReader reader) throws IOException {
        IndexSearcher searcher = new IndexSearcher(reader);

        // The same phrases are kept in both forms: words for the filter, terms for the query.
        String[][] queries = new String[TOTAL_QUERIES][];
        Term[][] terms = new Term[TOTAL_QUERIES][];
        for (int q = 0; q < TOTAL_QUERIES; q++) {
            queries[q] = words(WORDS_PER_QUERY[RANDOM.nextInt(WORDS_PER_QUERY.length)]);
            terms[q] = new Term[queries[q].length];
            for (int qw = 0; qw < queries[q].length; qw++) {
                terms[q][qw] = new Term(FIELD, queries[q][qw]);
            }
        }

        // Warm up.
        new PhraseFilter(FIELD, queries[0]).getDocIdSet(reader);

        for (int round = 0; round < ROUNDS; round++) {
            System.out.println();

            long filterNanos = 0;
            long queryNanos = 0;
            for (int step = 0; step < 2; step++) {
                // Request garbage collection before each timed section to reduce noise.
                System.gc();
                System.gc();
                System.gc();

                // Alternate which variant goes first: filter first on even rounds.
                if (step == (round & 1)) {
                    filterNanos = timeFilter(searcher, queries);
                } else {
                    queryNanos = timeQuery(searcher, terms);
                }
            }

            if (queryNanos > 0) {
                System.out.println(FILTER_NAME + " took " + (int) ((100.0 * filterNanos) / queryNanos)
                        + "% as much time as " + QUERY_NAME);
            } else {
                // A zero denominator would otherwise print a meaningless percentage.
                System.out.println(QUERY_NAME + " took no measurable time; cannot compare with "
                        + FILTER_NAME);
            }
        }
    }

    /**
     * Times the filter-based variant over all phrases and prints the elapsed time and hit count.
     *
     * @param searcher searcher over the benchmark index
     * @param queries the phrases, one array of words per phrase
     * @return the elapsed time in nanoseconds
     * @throws IOException if a search fails
     */
    private static long timeFilter(IndexSearcher searcher, String[][] queries) throws IOException {
        long start = System.nanoTime();
        long hits = 0;
        for (String[] queryWords : queries) {
            // Filters are built inside the timed loop on purpose: construction is part of the cost.
            PhraseFilter pf = new PhraseFilter(
                    new FilterIntersectionProvider(
                            TermsFilter.from(new Term(SECOND_FIELD, SECOND_FIELD_MATCH))),
                    FIELD, queryWords);
            hits += searcher.search(new FilteredQuery(new MatchAllDocsQuery(), pf), 1).totalHits;
        }
        long elapsed = System.nanoTime() - start;
        System.out.println("Finished " + FILTER_NAME + " in " + (elapsed / NANOS_PER_MILLI)
                + "ms with " + hits + " hits");
        return elapsed;
    }

    /**
     * Times the query-based variant over all phrases and prints the elapsed time and hit count.
     *
     * @param searcher searcher over the benchmark index
     * @param terms the phrases, one array of terms per phrase
     * @return the elapsed time in nanoseconds
     * @throws IOException if a search fails
     */
    private static long timeQuery(IndexSearcher searcher, Term[][] terms) throws IOException {
        long start = System.nanoTime();
        long hits = 0;
        for (Term[] queryTerms : terms) {
            PhraseQuery pq = new PhraseQuery();
            for (Term term : queryTerms) {
                pq.add(term);
            }
            Query query = BooleanQueryBuilder.builder()
                    .must(new TermQuery(new Term(SECOND_FIELD, SECOND_FIELD_MATCH))).must(pq).build();
            hits += searcher.search(query, 1).totalHits;
        }
        long elapsed = System.nanoTime() - start;
        System.out.println("Finished " + QUERY_NAME + " in " + (elapsed / NANOS_PER_MILLI)
                + "ms with " + hits + " hits");
        return elapsed;
    }

}