edu.unika.aifb.graphindex.storage.lucene.LuceneTest.java Source code

Introduction

Here is the source code for edu.unika.aifb.graphindex.storage.lucene.LuceneTest.java
Source

package edu.unika.aifb.graphindex.storage.lucene;

/**
 * Copyright (C) 2009 Gnter Ladwig (gla at aifb.uni-karlsruhe.de)
 * 
 * This file is part of the graphindex project.
 *
 * graphindex is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2
 * as published by the Free Software Foundation.
 * 
 * graphindex is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with graphindex.  If not, see <http://www.gnu.org/licenses/>.
 */

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorCompletionService;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.LinkedBlockingQueue;

import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.HitCollector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.LockObtainFailedException;

import edu.unika.aifb.graphindex.data.Table;
import edu.unika.aifb.graphindex.storage.StorageException;
import edu.unika.aifb.graphindex.util.StringSplitter;
import edu.unika.aifb.graphindex.util.Util;

public class LuceneTest {

    private static IndexSearcher m_searcher;
    private static IndexReader m_reader;

    private interface DocumentIdCollector {
        public void collect(int docId);
    }

    private static void search(Query q, final DocumentIdCollector collector) throws StorageException {
        try {
            m_searcher.search(q, new HitCollector() {
                public void collect(int docId, float score) {
                    collector.collect(docId);
                }
            });
        } catch (IOException e) {
            throw new StorageException(e);
        }
    }

    public static List<Integer> getDocumentIds(Query q) throws StorageException {
        final List<Integer> docIds = new ArrayList<Integer>();
        try {
            m_searcher.search(q, new HitCollector() {
                public void collect(int docId, float score) {
                    docIds.add(docId);
                }
            });
        } catch (IOException e) {
            throw new StorageException(e);
        }
        //      log.debug("  " + docIds.size() + " docs");

        //      Collections.sort(docIds);

        return docIds;
    }

    private static void addToTable(Table<String> table, Document doc, String allowedSubject) {
        String[] subjectStrings;
        String subjectString;
        subjectString = doc.getField("subject").stringValue();

        StringSplitter splitter = new StringSplitter(subjectString, "\n");

        String object = doc.getField("object").stringValue();

        //      for (String s : subjectStrings) {
        //         String[] t = s.split("\t");
        //         table.addRow(new String [] { t[0], object });
        //      }
        //      log.debug(object);
        //      for (String s : subjectStrings) {
        String s;
        while ((s = splitter.next()) != null) {
            //         log.debug(s);
            //         if (allowedSubject != null)
            //            log.debug(s);
            if (allowedSubject == null || s.equals(allowedSubject))
                table.addRow(new String[] { s, object });
        }
    }

    private static Document getDocument(int docId) throws StorageException {
        try {
            Document doc = m_reader.document(docId);
            return doc;
        } catch (CorruptIndexException e) {
            throw new StorageException(e);
        } catch (IOException e) {
            throw new StorageException(e);
        }
    }

    public static void test1(List<Query> queries) throws StorageException {
        System.out.println("test1: retrieve doc ids and then load docs for each query");
        for (Query q : queries) {
            System.out.print(q + " ");
            Table<String> table = new Table<String>("source", "target");
            long start = System.currentTimeMillis();
            List<Integer> docIds = getDocumentIds(q);
            System.out.print(System.currentTimeMillis() - start + " ");

            start = System.currentTimeMillis();
            for (int id : docIds) {
                Document doc = getDocument(id);
                //            addToTable(table, doc, null);
            }
            System.out.print(System.currentTimeMillis() - start + " ");
            System.out.print(docIds.size() + " ");
            System.out.println(table.rowCount());
        }
    }

    private static class Test1a implements Callable<Table<String>> {
        private Query q;

        public Test1a(Query q) {
            this.q = q;
        }

        public Table<String> call() throws Exception {
            System.out.println(q + " ");
            Table<String> table = new Table<String>("source", "target");
            long start = System.currentTimeMillis();
            List<Integer> docIds = getDocumentIds(q);
            //         System.out.print(System.currentTimeMillis() - start + " ");

            start = System.currentTimeMillis();
            for (int id : docIds) {
                Document doc = getDocument(id);
                addToTable(table, doc, null);
            }
            //         System.out.print(System.currentTimeMillis() - start + " ");
            //         System.out.print(docIds.size() + " ");
            //         System.out.println(table.rowCount());
            return table;
        }
    }

    public static void test1a(List<Query> queries)
            throws StorageException, InterruptedException, ExecutionException {
        System.out.println("test1a: retrieve doc ids and then load docs for each query with multiple threads");
        ExecutorService executor = Executors.newFixedThreadPool(1);
        ExecutorCompletionService<Table<String>> cs = new ExecutorCompletionService<Table<String>>(executor);
        for (Query q : queries) {
            cs.submit(new Test1a(q));
        }

        for (int i = 0; i < queries.size(); i++) {
            Table<String> table = cs.take().get();
        }
        executor.shutdown();
    }

    public static void test2(List<Query> queries) throws StorageException {
        System.out.println("test2: retrieve doc ids, load each doc");
        for (Query q : queries) {
            System.out.print(q + " ");
            final Table<String> table = new Table<String>("source", "target");
            final long start = System.currentTimeMillis();
            search(q, new DocumentIdCollector() {
                public void collect(int docId) {
                    Document doc;
                    try {
                        doc = getDocument(docId);
                        addToTable(table, doc, null);
                    } catch (StorageException e) {
                    }
                }
            });
            System.out.println(" " + table.rowCount());
        }
    }

    private static class Test3 implements Callable<Table<String>> {

        private List<Integer> m_docIds;

        public Test3(List<Integer> docIds) {
            m_docIds = docIds;
        }

        public Table<String> call() throws Exception {
            Table<String> table = new Table<String>("source", "target");
            for (int id : m_docIds) {
                Document doc = getDocument(id);
                addToTable(table, doc, null);
            }
            System.out.println(table.rowCount());
            return table;
        }

    }

    public static void test3(List<Query> queries)
            throws StorageException, InterruptedException, ExecutionException {
        System.out.println("test3: retrieve doc ids, separate thread for loading docs for each query");
        ExecutorService executor = Executors.newFixedThreadPool(1);
        ExecutorCompletionService<Table<String>> cs = new ExecutorCompletionService<Table<String>>(executor);
        for (Query q : queries) {
            System.out.print(q + " ");
            long start = System.currentTimeMillis();
            List<Integer> docIds = getDocumentIds(q);
            System.out.print(docIds.size() + " ");
            System.out.println(System.currentTimeMillis() - start);

            cs.submit(new Test3(docIds));
        }

        for (int i = 0; i < queries.size(); i++) {
            Table<String> res = cs.take().get();
        }
        executor.shutdown();
    }

    public static void test4(List<Query> queries) throws StorageException {
        System.out.println("test4: retrieve doc ids for all queries, then load");
        List<Integer> docIds = new ArrayList<Integer>();
        for (Query q : queries) {
            System.out.println(q);
            docIds.addAll(getDocumentIds(q));

        }
        Collections.sort(docIds);
        System.out.println(docIds.size());
        for (int id : docIds) {
            Document doc = getDocument(id);
            //         addToTable(table, doc, null);
        }
    }

    public static class Test5 implements Runnable {
        private BlockingQueue<Integer> m_bc;
        private List<Document> m_docs = new LinkedList<Document>();
        private int m_size = -1;

        public Test5(BlockingQueue<Integer> bc) {
            m_bc = bc;
        }

        public List<Document> getDocuments() {
            return m_docs;
        }

        synchronized public void setSize(int size) {
            m_size = size;
        }

        public void run() {
            try {
                int x = 0;
                while (m_size == -1 || x < m_size) {
                    int docId = m_bc.take();
                    m_docs.add(getDocument(docId));
                    x++;
                }
            } catch (InterruptedException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            } catch (StorageException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
        }
    }

    public static void test5(List<Query> queries)
            throws StorageException, InterruptedException, ExecutionException {
        System.out.println("test5: separate thread for loading docs, using blockingqueue for doc ids");
        BlockingQueue<Integer> bc = new LinkedBlockingQueue<Integer>();
        Test5 t5 = new Test5(bc);

        Thread t = new Thread(t5);
        t.start();

        int x = 0;
        for (Query q : queries) {
            System.out.print(q + " ");
            long start = System.currentTimeMillis();
            List<Integer> docIds = getDocumentIds(q);
            x += docIds.size();
            System.out.print(docIds.size() + " ");
            System.out.println(System.currentTimeMillis() - start);

            bc.addAll(docIds);
        }
        t5.setSize(x);
        t.join();

        System.out.println(t5.getDocuments().size());
    }

    public static void test5a(List<Query> queries)
            throws StorageException, InterruptedException, ExecutionException {
        System.out.println("test5a: separate thread for loading docs, using blockingqueue for doc ids");
        final BlockingQueue<Integer> bc = new LinkedBlockingQueue<Integer>();
        Test5 t5 = new Test5(bc);

        Thread t = new Thread(t5);
        t.start();

        final Util.Counter c = new Util.Counter();
        for (Query q : queries) {
            System.out.println(q + " ");
            long start = System.currentTimeMillis();
            search(q, new DocumentIdCollector() {

                public void collect(int docId) {
                    bc.add(docId);
                    c.val++;
                }

            });
        }
        t5.setSize(c.val);
        t.join();

        System.out.println(t5.getDocuments().size());
    }

    public static void main(String[] args) throws CorruptIndexException, LockObtainFailedException, IOException,
            StorageException, InterruptedException, ExecutionException {
        m_reader = IndexReader.open("/Users/gl/Studium/diplomarbeit/workspace/graphindex/output/lubm50/index");
        m_searcher = new IndexSearcher(m_reader);
        BooleanQuery.setMaxClauseCount(2000000);
        List<Query> queries = new ArrayList<Query>();
        BufferedReader in = new BufferedReader(
                new FileReader("/Users/gl/Studium/diplomarbeit/graphindex evaluation/lq7storagequeries.txt"));
        String input;
        while ((input = in.readLine()) != null) {
            queries.add(input.indexOf("*") < 0 ? new TermQuery(new Term("ext", input))
                    : new PrefixQuery(new Term("ext", input.replaceAll("\\*", ""))));
        }

        long start = System.currentTimeMillis();
        //      test2(queries);
        //      System.out.println("t: " + (System.currentTimeMillis() - start));
        //
        //      start = System.currentTimeMillis();
        //      test3(queries);
        //      System.out.println("t: " + (System.currentTimeMillis() - start));
        //      
        //      start = System.currentTimeMillis();
        //      test4(queries);
        //      System.out.println("t: " + (System.currentTimeMillis() - start));

        start = System.currentTimeMillis();
        test1(queries);
        System.out.println("t: " + (System.currentTimeMillis() - start));

        start = System.currentTimeMillis();
        test1a(queries);
        System.out.println("t: " + (System.currentTimeMillis() - start));

        start = System.currentTimeMillis();
        test5(queries);
        System.out.println("t: " + (System.currentTimeMillis() - start));

        start = System.currentTimeMillis();
        test5a(queries);
        System.out.println("t: " + (System.currentTimeMillis() - start));
    }
}