com.tinkerpop.graph.benchmark.index.LuceneKeyToNodeIdIndexImpl.java Source code

Java tutorial

Introduction

Here is the source code for com.tinkerpop.graph.benchmark.index.LuceneKeyToNodeIdIndexImpl.java

Source

package com.tinkerpop.graph.benchmark.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import java.io.File;
import java.util.HashMap;
import java.util.LinkedHashMap;

import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.index.LogByteSizeMergePolicy;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.OpenBitSet;

/**
 * Uses Lucene to store and retrieve vertex ids keyed on a user-defined key. Three layers sit in
 * front of Lucene: 1) a bloom filter "to know what we don't know", 2) an LRU cache that remembers
 * commonly accessed values we do know, and 3) a buffer that accumulates uncommitted state.
 * Lucene takes care of the rest.
 * @author Mark
 */
public class LuceneKeyToNodeIdIndexImpl implements SimpleKeyToNodeIdIndex {
    /** LRU cache of keys that were resolved from Lucene; created in {@link #init()}. */
    LRUCache<String, Long> hotCache;
    /** Keys added since the last commit; cleared every time {@link #put(String, long)} commits. */
    HashMap<String, Long> uncommittedKeyBuffer = new HashMap<String, Long>();
    private IndexWriter writer;
    private FSDirectory dir;
    /** Opened lazily by the first lookup that has to go to Lucene, so it may be {@code null}. */
    private IndexReader reader;
    /** Template term on the "udk" field; search terms are derived from it via createTerm(). */
    Term term = new Term("udk", "");
    /** Number of bits in the bloom filter and the modulus applied to key hash codes. */
    int bloomFilterSize = 50 * 1024 * 1024;
    /** Once the uncommitted buffer grows beyond this size the next put() commits first. */
    int maxNumRecordsBeforeCommit = 500000;
    private int lruCacheSize = 500000;
    OpenBitSet bloomFilter;
    //Avoid Lucene performing "mega merges" with a finite limit on segments sizes that can be merged
    private int maxMergeMB = 3000;
    //Stats for each batch of updates (reset on every commit performed by put())
    long bloomReadSaves = 0;
    long hotCacheHits = 0;
    long luceneAdds = 0;
    long failedLuceneReads = 0;
    long successfulLuceneReads = 0;
    long startTime = System.currentTimeMillis();
    /** Sub-readers of {@link #reader}; refreshed whenever the reader is opened or reopened. */
    private IndexReader[] subreaders;
    private boolean showDebug;
    private String path;
    private boolean useCompoundFile = false;
    private double ramBufferSizeMB = 300;
    private int termIndexIntervalSize = 512;

    /**
     * Records the configuration only; no files are touched until {@link #init()} is called.
     *
     * @param path directory in which the Lucene index is kept
     * @param showDebugInfo when true, a CSV header and one CSV statistics line per commit batch
     *        are written to standard output
     */
    public LuceneKeyToNodeIdIndexImpl(String path, boolean showDebugInfo) {
        this.showDebug = showDebugInfo;
        this.path = path;
    }

    /**
     * Allocates the LRU cache and bloom filter, deletes the regular files already present in the
     * index directory, then opens the directory and an index writer configured from the current
     * property values and performs an initial commit.
     *
     * @throws RuntimeException wrapping any failure raised while setting up the index
     */
    public void init() {
        try {
            hotCache = new LRUCache<String, Long>(getLruCacheSize());
            if (showDebug) {
                System.out.println(
                        "timeTaken,numDocs,bloomReadSaves,hotCacheHits,failedLuceneReads,successfulLuceneReads,luceneAdds");
            }
            bloomFilter = new OpenBitSet(bloomFilterSize);
            File indexDir = new File(path);
            deleteDirectoryContents(indexDir);
            dir = FSDirectory.open(indexDir);
            writer = new IndexWriter(dir, new WhitespaceAnalyzer(), MaxFieldLength.UNLIMITED);
            LogByteSizeMergePolicy mergePolicy = new LogByteSizeMergePolicy();
            mergePolicy.setMaxMergeMB(getMaxMergeMB());
            writer.setMergePolicy(mergePolicy);
            writer.setUseCompoundFile(useCompoundFile);
            writer.setRAMBufferSizeMB(ramBufferSizeMB);
            writer.setTermIndexInterval(termIndexIntervalSize);
            writer.commit();
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Deletes every non-directory entry directly inside {@code path}. Sub-directories are left
     * alone and failed deletions are ignored (best effort).
     *
     * <p>Does nothing when {@code path} does not exist, is not a directory, or cannot be listed.
     *
     * @param path directory whose files should be removed
     */
    static public void deleteDirectoryContents(File path) {
        // listFiles() returns null for a missing path, a regular file or an I/O error.
        File[] entries = path.listFiles();
        if (entries == null) {
            return;
        }
        for (File entry : entries) {
            if (!entry.isDirectory()) {
                entry.delete();
            }
        }
    }

    /**
     * Looks up the node id stored for a key. The lookup order is: LRU cache, bloom filter,
     * uncommitted buffer, and finally the Lucene segments. Only ids read from Lucene are added to
     * the LRU cache.
     *
     * @param udk the user-defined key
     * @return the stored node id, or -1 when the key is not known
     * @throws RuntimeException wrapping any failure raised while reading the index
     */
    @Override
    public long getGraphNodeId(String udk) {
        Long cached = hotCache.get(udk);
        if (cached != null) {
            hotCacheHits++;
            return cached;
        }
        //fail fast on bloom: a clear bit means the key was never put
        if (!bloomFilter.fastGet(bloomIndex(udk))) {
            bloomReadSaves++;
            return -1;
        }
        Long buffered = uncommittedKeyBuffer.get(udk);
        if (buffered != null) {
            return buffered;
        }
        openReaderIfNeeded();
        try {
            Term searchTerm = term.createTerm(udk);
            for (IndexReader segment : subreaders) {
                TermDocs matches = segment.termDocs(searchTerm);
                try {
                    if (matches.next()) {
                        Document doc = segment.document(matches.doc());
                        long id = Long.parseLong(doc.get("id"));
                        hotCache.put(udk, id);
                        successfulLuceneReads++;
                        return id;
                    }
                } finally {
                    // Release the enumeration on both the hit and the miss path.
                    matches.close();
                }
            }
            // Bloom filter false positive: the bit was set by a different key.
            failedLuceneReads++;
            return -1;
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Stores a key to node id mapping. When the uncommitted buffer has grown beyond
     * {@link #getMaxNumRecordsBeforeCommit()}, the writer is committed, the reader (if open) is
     * refreshed and the batch statistics are reset before the new entry is added.
     *
     * @param udk the user-defined key
     * @param graphNodeId the node id to associate with the key
     * @throws RuntimeException wrapping any failure raised while writing to the index
     */
    @Override
    public void put(String udk, long graphNodeId) {
        try {
            if (uncommittedKeyBuffer.size() > maxNumRecordsBeforeCommit) {
                commitAndRefresh();
            }
            bloomFilter.fastSet(bloomIndex(udk));
            uncommittedKeyBuffer.put(udk, graphNodeId);
            Document doc = new Document();
            Field udkF = new Field("udk", udk, Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS);
            udkF.setOmitTermFreqAndPositions(true);
            doc.add(udkF);
            doc.add(new Field("id", "" + graphNodeId, Field.Store.YES, Field.Index.NO));
            writer.addDocument(doc);
            luceneAdds++;
        } catch (Exception e) {
            throw new RuntimeException("Error adding key to index", e);
        }
    }

    /**
     * Closes the reader (if one was ever opened), the writer and the directory. Each resource is
     * closed even when closing an earlier one fails.
     *
     * @throws RuntimeException wrapping any failure raised while closing
     */
    @Override
    public void close() {
        try {
            try {
                // The reader is only created by a lookup that reached Lucene.
                if (reader != null) {
                    reader.close();
                }
            } finally {
                try {
                    if (writer != null) {
                        writer.close();
                    }
                } finally {
                    if (dir != null) {
                        dir.close();
                    }
                }
            }
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    /** Maps a key to its bit position in the bloom filter. */
    private int bloomIndex(String udk) {
        // abs() is applied after the modulo, so its argument can never be Integer.MIN_VALUE.
        return Math.abs(udk.hashCode() % bloomFilterSize);
    }

    /** Opens the read-only reader and caches its sub-readers on first use. */
    private void openReaderIfNeeded() {
        if (reader != null) {
            return;
        }
        try {
            reader = IndexReader.open(dir, true);
            subreaders = reader.getSequentialSubReaders();
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    /** Commits the writer, refreshes an open reader, empties the buffer and resets the stats. */
    private void commitAndRefresh() throws java.io.IOException {
        writer.commit();
        if (reader != null) {
            IndexReader newReader = reader.reopen(true);
            if (newReader != reader) {
                reader.close();
                reader = newReader;
            }
            subreaders = reader.getSequentialSubReaders();
        }
        uncommittedKeyBuffer.clear();
        if (showDebug) {
            long diff = System.currentTimeMillis() - startTime;
            // No reader exists until a lookup reaches Lucene; ask the writer in that case.
            int docCount = (reader != null) ? reader.maxDoc() : writer.maxDoc();
            System.out.println(diff + "," + docCount + "," + bloomReadSaves + "," + hotCacheHits
                    + "," + failedLuceneReads + "," + successfulLuceneReads + "," + luceneAdds);
        }
        bloomReadSaves = 0;
        hotCacheHits = 0;
        failedLuceneReads = 0;
        luceneAdds = 0;
        successfulLuceneReads = 0;
        startTime = System.currentTimeMillis();
    }

    /**
     * Access-ordered {@link LinkedHashMap} that evicts its eldest entry once it holds more than
     * {@code maxSize} entries.
     */
    @SuppressWarnings("serial")
    static class LRUCache<K, V> extends LinkedHashMap<K, V> {
        private final int maxSize;

        /**
         * @param maxSize maximum number of entries retained
         */
        public LRUCache(int maxSize) {
            super(initialCapacityFor(maxSize), 0.75f, true);
            this.maxSize = maxSize;
        }

        /** Sizes the table for maxSize entries at load factor 0.75 without int overflow. */
        private static int initialCapacityFor(int maxSize) {
            long wanted = ((long) maxSize * 4) / 3 + 1;
            return (int) Math.min(wanted, (long) Integer.MAX_VALUE);
        }

        @Override
        protected boolean removeEldestEntry(java.util.Map.Entry<K, V> eldest) {
            return size() > maxSize;
        }
    }

    /** @return whether the writer is told to use the compound file format in {@link #init()} */
    public boolean isUseCompoundFile() {
        return useCompoundFile;
    }

    /** @param useCompoundFile value handed to the writer by {@link #init()} */
    public void setUseCompoundFile(boolean useCompoundFile) {
        this.useCompoundFile = useCompoundFile;
    }

    /** @return the RAM buffer size, in MB, handed to the writer by {@link #init()} */
    public double getRamBufferSizeMB() {
        return ramBufferSizeMB;
    }

    /** @param ramBufferSizeMB RAM buffer size, in MB, handed to the writer by {@link #init()} */
    public void setRamBufferSizeMB(double ramBufferSizeMB) {
        this.ramBufferSizeMB = ramBufferSizeMB;
    }

    /** @return the term index interval handed to the writer by {@link #init()} */
    public int getTermIndexIntervalSize() {
        return termIndexIntervalSize;
    }

    /** @param termIndexIntervalSize term index interval handed to the writer by {@link #init()} */
    public void setTermIndexIntervalSize(int termIndexIntervalSize) {
        this.termIndexIntervalSize = termIndexIntervalSize;
    }

    /** @return the number of bits used for the bloom filter */
    public int getBloomFilterSize() {
        return bloomFilterSize;
    }

    /**
     * Sets the bloom filter size. The bit set is allocated from this value in {@link #init()}
     * while lookups and puts use the current value as modulus, so set it before {@link #init()}.
     *
     * @param bloomFilterSize number of bits
     */
    public void setBloomFilterSize(int bloomFilterSize) {
        this.bloomFilterSize = bloomFilterSize;
    }

    /** @return the buffer size above which {@link #put(String, long)} commits */
    public int getMaxNumRecordsBeforeCommit() {
        return maxNumRecordsBeforeCommit;
    }

    /** @param maxNumRecordsBeforeCommit buffer size above which put() commits */
    public void setMaxNumRecordsBeforeCommit(int maxNumRecordsBeforeCommit) {
        this.maxNumRecordsBeforeCommit = maxNumRecordsBeforeCommit;
    }

    /** @param lruCacheSize capacity of the LRU cache created by {@link #init()} */
    public void setLruCacheSize(int lruCacheSize) {
        this.lruCacheSize = lruCacheSize;
    }

    /** @return the capacity used for the LRU cache created by {@link #init()} */
    public int getLruCacheSize() {
        return lruCacheSize;
    }

    /** @param maxMergeMB merge policy limit, in MB, applied by {@link #init()} */
    public void setMaxMergeMB(int maxMergeMB) {
        this.maxMergeMB = maxMergeMB;
    }

    /** @return the merge policy limit, in MB, applied by {@link #init()} */
    public int getMaxMergeMB() {
        return maxMergeMB;
    }

    /** @return lookups answered by the bloom filter since the last commit batch */
    public long getBloomReadSaves() {
        return bloomReadSaves;
    }

    /** @return lookups answered by the LRU cache since the last commit batch */
    public long getHotCacheHits() {
        return hotCacheHits;
    }

    /** @return documents added to Lucene since the last commit batch */
    public long getLuceneAdds() {
        return luceneAdds;
    }

    /** @return Lucene lookups that found nothing since the last commit batch */
    public long getFailedLuceneReads() {
        return failedLuceneReads;
    }

    /** @return Lucene lookups that found the key since the last commit batch */
    public long getSuccessfulLuceneReads() {
        return successfulLuceneReads;
    }
}