org.meresco.lucene.numerate.UriEnumerate.java Source code

Java tutorial

Introduction

Here is the source code for org.meresco.lucene.numerate.UriEnumerate.java

Source

/* begin license *
 *
 * "Meresco Uri Enumerate" contains an http server which maps uris to integer numbers
 *
 * Copyright (C) 2016 Seecr (Seek You Too B.V.) http://seecr.nl
 *
 * This file is part of "Meresco Uri Enumerate"
 *
 * "Meresco Uri Enumerate" is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * "Meresco Uri Enumerate" is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with "Meresco Uri Enumerate"; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 *
 * end license */

package org.meresco.lucene.numerate;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.nio.file.FileSystems;
import java.nio.file.Path;

import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.bloom.BloomFilteringPostingsFormat;
import org.apache.lucene.codecs.lucene60.Lucene60Codec;
import org.apache.lucene.index.ConcurrentMergeScheduler;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LogDocMergePolicy;
import org.apache.lucene.store.FSDirectory;
import org.meresco.lucene.numerate.SimpleSearcher.Hit;

public class UriEnumerate {
    private static final int CACHE_SIZE = 1 * 1000;
    static final String URI_FIELD = "uri_index";
    static final String ORD_INDEX_FIELD = "ord_index";
    static final String ORD_VALUE_FIELD = "ord_value";

    private SimpleSearcher searcher;
    private IndexWriter writer;
    private UriMapDocument doc_proto = new UriMapDocument();
    Cache cache;
    private int next_ord;
    private TransactionLog transactionLog;

    /**
     *
     * @param path
     * @param max_cache_size
     * @param withTransactionLog allows for crash recovery, but slows down UriNumerate considerably because of file system flush.
     * @throws IOException
     */
    public UriEnumerate(String path, int max_cache_size, boolean withTransactionLog) throws IOException {
        IndexWriterConfig config = new IndexWriterConfig(null);
        ConcurrentMergeScheduler ms = (ConcurrentMergeScheduler) config.getMergeScheduler();
        ms.setDefaultMaxMergesAndThreads(/* spins= */false);
        LogDocMergePolicy mp = new LogDocMergePolicy();
        mp.setMergeFactor(2);
        mp.setMinMergeDocs(max_cache_size);
        config.setMergePolicy(mp);
        config.setCodec(new Lucene60Codec() {
            @Override
            public PostingsFormat getPostingsFormatForField(String field) {
                return new BloomFilteringPostingsFormat(super.getPostingsFormatForField(field));
            }
        });
        config.setUseCompoundFile(false);
        this.writer = new IndexWriter(FSDirectory.open(FileSystems.getDefault().getPath(path)), config);
        this.next_ord = writer.numDocs() + 1;
        this.searcher = new SimpleSearcher(this.writer);
        this.cache = new Cache(max_cache_size, () -> this.commit());
        this.transactionLog = new TransactionLog(withTransactionLog ? path + "/transactionLog" : null);
        this.transactionLog.maybeRecover();
    }

    public UriEnumerate(String path, int cache_size) throws IOException {
        this(path, cache_size, /* writeTransactionLog */ true);
    }

    public UriEnumerate(String path) throws IOException {
        this(path, CACHE_SIZE);
    }

    public UriEnumerate(Path path, int cache_size) throws IOException {
        this(path.toString(), cache_size);
    }

    public synchronized int put(String uri) throws Exception {
        int ord = this.get(uri);
        if (ord < 0) {
            ord = this.add(uri);
        }
        return ord;
    }

    public int get(String uri) throws Exception {
        Integer ord = this.cache.get(uri);
        if (ord != null) {
            return ord;
        }
        ord = this.searcher.search(uri);
        if (ord >= 0) {
            this.cache.put(uri, ord);
        }
        return ord;
    }

    public String get(int ord) throws Exception {
        // TODO there is no ord->uri cache yet, so we simple ALWAYS flush and
        // search with reOpen
        this.searcher.reOpen();
        Hit hit = this.searcher.search(ORD_INDEX_FIELD, OrdField.ord2bytes(ord));
        if (hit != null) {
            return hit.reader.document(hit.doc).get(URI_FIELD);
        }
        return null;
    }

    private int add(String uri) throws Exception {
        int ord = this.next_ord++;
        this.transactionLog.write(uri, ord);
        add(uri, ord);
        return ord;
    }

    void add(String uri, int ord) throws IOException {
        this.doc_proto.set(uri, ord);
        writer.addDocument(this.doc_proto.doc);
        if (this.cache.put(uri, ord) != null) {
            throw new RuntimeException("Why did this happen? No unit-test throws me!");
        }
    }

    public void close() throws IOException {
        this.commit();
        this.writer.close();
        this.writer = null;
        this.searcher.close();
        this.searcher = null;
        this.transactionLog.close();
        this.transactionLog = null;
    }

    public void commit() {
        try {
            this.writer.commit();
            this.transactionLog.erase();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        this.searcher.reOpen();
    }

    public int size() {
        return this.writer.numDocs();
    }

    public void updateNextOrd(int minimalNextOrd) {
        this.next_ord = Math.max(this.next_ord, minimalNextOrd);
    }

    class TransactionLog {
        String path;
        PrintWriter out;

        public TransactionLog(String transactionLogPath) throws IOException {
            this.path = transactionLogPath;
        }

        public void write(String uri, int ord) {
            if (this.path == null) {
                return;
            }
            if (this.out == null) {
                this.startNew();
            }
            this.out.println("" + ord + " " + uri);
            this.out.flush();
        }

        private void startNew() {
            try {
                this.out = new PrintWriter(
                        new BufferedWriter(new OutputStreamWriter(new FileOutputStream(this.path), "UTF-8")));
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }

        public void close() {
            if (this.out != null) {
                this.out.close();
                this.out = null;
            }
        }

        public void erase() {
            this.close();
            if (this.path != null && new File(this.path).exists()) {
                new File(this.path).delete();
            }
        }

        public void maybeRecover() throws IOException {
            if (this.path == null || !new File(this.path).exists()) {
                return;
            }
            int ord = 0;
            BufferedReader reader = new BufferedReader(
                    new InputStreamReader(new FileInputStream(this.path), "UTF-8"));
            String line;
            while ((line = reader.readLine()) != null) {
                String[] parts = line.split("\\s", 2);
                try {
                    String uri = parts[1];
                    ord = Integer.valueOf(parts[0]);
                    UriEnumerate.this.add(uri, ord);
                } catch (ArrayIndexOutOfBoundsException e) {
                    break;
                }
            }
            UriEnumerate.this.updateNextOrd(ord + 1);
            reader.close();
        }
    }
}