package de.jetsli.lumeo;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
 * agreements. See the NOTICE file distributed with this work for additional information regarding
 * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License. You may obtain a
 * copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 */
import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.LongField;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.IndexReaderContext;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LogByteSizeMergePolicy;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.NRTManager;
import org.apache.lucene.search.NRTManager.TrackingIndexWriter;
import org.apache.lucene.search.NRTManagerReopenThread;
import org.apache.lucene.search.SearcherFactory;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import de.jetsli.lumeo.util.IndexOp;
import de.jetsli.lumeo.util.LuceneHelper;
import de.jetsli.lumeo.util.Mapping;
import de.jetsli.lumeo.util.SearchExecutor;
/**
 * Uses a buffer to accumulate uncommitted state. Should stay independent of the Blueprints API.
 *
 * Some ideas are taken from
 * http://code.google.com/p/graphdb-load-tester/source/browse/trunk/src/com/tinkerpop/graph/benchmark/index/LuceneKeyToNodeIdIndexImpl.java
 *
 * Design notes:
 * - still use a batch buffer, to support realtime get and, later, versioning
 * - use a near-realtime reader, so no commit is needed
 * - no bloom filter: without it, testIndexing is about 1 sec (>10%) faster and uses less memory.
 *   TODO check whether the traversal benchmark is also faster
 *
 * @author Peter Karich, info@jetsli.de
 */
public class RawLucene {

    // of type long, for more efficient storage of node references
    public static final String ID = "_id";
    // of type String, can be defined by the user
    public static final String UID = "_uid";
    // of type String
    public static final String TYPE = "_type";
    public static final String EDGE_OUT = "_eout";
    public static final String EDGE_IN = "_ein";
    public static final String EDGE_LABEL = "_elabel";
    public static final String VERTEX_OUT = "_vout";
    public static final String VERTEX_IN = "_vin";
    public static final Version VERSION = Version.LUCENE_40;
    private TrackingIndexWriter writer;
    private Directory dir;
    private NRTManager nrtManager;
    // avoid Lucene performing "mega merges" by putting a finite limit on the segment sizes that can be merged
    private int maxMergeMB = 3000;
    private volatile long luceneOperations = 0;
    private long failedLuceneReads = 0;
    private long successfulLuceneReads = 0;
    private double ramBufferSizeMB = 128;
    private int termIndexIntervalSize = 512;
    private final ReadWriteLock indexRWLock = new ReentrantReadWriteLock();
    // generation -> (id -> IndexOp (create, update, delete))
    // we could group IndexOps of the same type (same analyzer) to make indexing faster
    private final Map<Long, Map<Long, IndexOp>> realTimeCache = new ConcurrentHashMap<Long, Map<Long, IndexOp>>();
    private Logger logger = LoggerFactory.getLogger(getClass());
    private Map<String, Mapping> mappings = new ConcurrentHashMap<String, Mapping>(2);
    private Mapping defaultMapping = new Mapping("_default");
    private String name;
    private boolean closed = false;
    private FlushThread flushThread;
    private NRTManagerReopenThread reopenThread;
    private volatile long latestGen = -1;
    // if there are waiting searchers, how long may a reopen take?
    double incomingSearchesMaximumWaiting = 0.03;
    // if there are no waiting searchers, reopen less frequently.
    // This also controls how large the realtime cache can grow:
    // less frequent reopens => larger cache
    double ordinaryWaiting = 5.0;
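
    // Illustrative note (not in the original source): every indexing operation below returns a
    // generation from the TrackingIndexWriter. Pending ops are buffered in realTimeCache under
    // that generation and evicted by the flush thread once the NRT searcher has caught up.
    // Sketch of the lifecycle, with a hypothetical generation 7:
    //   long gen = writer.updateDocument(term, doc, analyzer); // gen = 7
    //   getCurrentRTCache(gen).put(id, op);                    // visible to findById immediately
    //   // later, once nrtManager.getCurrentSearchingGen() >= 7,
    //   // cleanUpCache drops the cache maps for generations < 7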

    public RawLucene(String path) {
        try {
            // if the indexing rate is lowish but the reopen rate is highish:
            // dir = new NRTCachingDirectory(FSDirectory.open(new File(path)), 5, 60);
            dir = FSDirectory.open(new File(path));
            name = "fs:" + path + " " + dir.toString();
        } catch (IOException ex) {
            throw new RuntimeException("cannot open lucene directory located at " + path + " error:" + ex.getMessage());
        }
    }

    public RawLucene(Directory directory) {
        dir = directory;
        name = "mem " + dir.toString();
    }

    public RawLucene init() {
        indexLock();
        try {
            if (closed)
                throw new IllegalStateException("Already closed");

            if (writer != null)
                throw new IllegalStateException("Already initialized");

            // release locks when started
            if (IndexWriter.isLocked(dir)) {
                logger.warn("index is locked " + name + " -> releasing lock");
                IndexWriter.unlock(dir);
            }
            IndexWriterConfig cfg = new IndexWriterConfig(VERSION, defaultMapping.getCombinedAnalyzer());
            LogByteSizeMergePolicy mp = new LogByteSizeMergePolicy();
            mp.setMaxMergeMB(getMaxMergeMB());
            cfg.setRAMBufferSizeMB(ramBufferSizeMB);
            cfg.setTermIndexInterval(termIndexIntervalSize);
            cfg.setMergePolicy(mp);

            // TODO specify different formats for id fields etc
            // -> this breaks 16 of our tests!? Lucene bug?
            // cfg.setCodec(new Lucene40Codec() {
            //
            //     @Override public PostingsFormat getPostingsFormatForField(String field) {
            //         return new Pulsing40PostingsFormat();
            //     }
            // });
            // cfg.setMaxThreadStates(8);

            boolean create = !DirectoryReader.indexExists(dir);
            cfg.setOpenMode(create ? IndexWriterConfig.OpenMode.CREATE : IndexWriterConfig.OpenMode.APPEND);

            // wrap the writer with a tracking index writer
            writer = new TrackingIndexWriter(new IndexWriter(dir, cfg));

            nrtManager = new NRTManager(writer, new SearcherFactory() {
                // @Override
                // public IndexSearcher newSearcher(IndexReader reader) throws IOException {
                //     // TODO do some kind of warming here?
                //     return new IndexSearcher(reader);
                // }
            });

            getCurrentRTCache(latestGen);
            int priority = Math.min(Thread.currentThread().getPriority() + 2, Thread.MAX_PRIORITY);
            flushThread = new FlushThread("flush-thread");
            flushThread.setPriority(priority);
            flushThread.setDaemon(true);
            flushThread.start();

            reopenThread = new NRTManagerReopenThread(nrtManager, ordinaryWaiting, incomingSearchesMaximumWaiting);
            reopenThread.setName("NRT Reopen Thread");
            reopenThread.setPriority(priority);
            reopenThread.setDaemon(true);
            reopenThread.start();
            return this;
        } catch (Exception e) {
            throw new RuntimeException(e);
        } finally {
            indexUnlock();
        }
    }

    long getId(Document doc) {
        return ((LongField) doc.getField(ID)).numericValue().longValue();
    }

    public Document findById(final long id) {
        // check the realtime cache first
        IndexOp result = getCurrentRTCache(latestGen).get(id);
        if (result != null) {
            if (result.type == IndexOp.Type.DELETE)
                return null;
            return result.document;
        }

        return searchSomething(new SearchExecutor<Document>() {

            @Override public Document execute(IndexSearcher searcher) throws Exception {
                // TODO optimize via indexReader.termDocsEnum !?
                final BytesRef idAsBytes = LuceneHelper.newRefFromLong(id);
                IndexReaderContext trc = searcher.getTopReaderContext();
                AtomicReaderContext[] arc = trc.leaves();
                for (int i = 0; i < arc.length; i++) {
                    AtomicReader subreader = arc[i].reader();
                    Terms terms = subreader.terms(ID);
                    if (terms == null)
                        continue;

                    // position the enum on the id term before pulling the postings
                    TermsEnum te = terms.iterator(null);
                    if (!te.seekExact(idAsBytes, true))
                        continue;

                    DocsEnum docs = te.docs(subreader.getLiveDocs(), null, false);
                    if (docs != null) {
                        int docID = docs.nextDoc();
                        if (docID != DocsEnum.NO_MORE_DOCS) {
                            return subreader.document(docID);
                        }
                    }
                }
                return null;
            }
        });
    }
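
    /*
     * Usage sketch (illustrative only, not part of the original class): a put followed by an
     * immediate findById is served from the realtime cache, i.e. no commit or reader reopen is
     * needed in between. The path and the id/uid values are hypothetical; Object.class merely
     * stands in for a real mapped domain class.
     *
     *   RawLucene index = new RawLucene("/tmp/lumeo").init();
     *   Document doc = index.createDocument("node-42", 42L, Object.class);
     *   index.fastPut(42L, doc);
     *   Document found = index.findById(42L); // hits the realtime cache
     *   index.close();
     */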

    public Document findByUserId(final String uId) {
        return searchSomething(new SearchExecutor<Document>() {

            @Override public Document execute(final IndexSearcher searcher) throws IOException {
                final BytesRef bytes = new BytesRef(uId);
                Document doc = null;
                // IndexReaderContext trc = searcher.getTopReaderContext();
                // trc.children();
                // TODO -MH search subreaders - share common subreader code with findById?

                // hopefully Lucene bails out after collecting our single result
                TopDocs results = searcher.search(new TermQuery(new Term(UID, bytes)), 1);
                if (results.totalHits > 1) {
                    throw new IllegalStateException("Document with " + UID + "=" + uId + " is not the only one");
                }
                if (results.totalHits == 1) {
                    doc = searcher.document(results.scoreDocs[0].doc, null);
                }
                // new MyGather(searcher.getIndexReader()) {
                //
                //     @Override protected boolean runLeaf(int base, AtomicReader leaf) throws IOException {
                //         DocsEnum docs = leaf.termDocsEnum(leaf.getLiveDocs(), UID, bytes, false);
                //         if (docs == null)
                //             return true;
                //
                //         int docID = docs.nextDoc();
                //         if (docID == DocsEnum.NO_MORE_DOCS)
                //             return true;
                //
                //         if (docs.nextDoc() != DocsEnum.NO_MORE_DOCS)
                //             throw new IllegalStateException("Document with " + UID + "=" + uId + " not the only one");
                //
                //         doc = searcher.doc(base + docID);
                //         return false;
                //     }
                // }.run();
                return doc;
            }
        });
    }

    public <T> T searchSomething(SearchExecutor<T> exec) {
        IndexSearcher searcher = nrtManager.acquire();
        try {
            return (T) exec.execute(searcher);
        } catch (Exception e) {
            throw new RuntimeException(e);
        } finally {
            try {
                nrtManager.release(searcher);
            } catch (IOException ex) {
                throw new RuntimeException(ex);
            }
        }
    }

    public boolean exists(long id) {
        return findById(id) != null;
    }

    public boolean existsUserId(String uId) {
        return findByUserId(uId) != null;
    }

    // not thread safe => only an estimation
    public int calcSize() {
        // TODO too many entries are reported
        int unflushedEntries = 0;
        for (Entry<Long, Map<Long, IndexOp>> e : realTimeCache.entrySet()) {
            if (latestGen >= e.getKey())
                // accumulate over all generations not yet flushed
                unflushedEntries += e.getValue().size();
        }
        return unflushedEntries;
    }

    public void close() {
        indexLock();
        try {
            flushThread.interrupt();
            reopenThread.close();
            flushThread.join();
            // force correct count of calcSize
            // waitUntilSearchable();
            // cleanUpCache(latestGen + 1, 0);
            closed = true;
            nrtManager.close();
            try {
                waitUntilSearchable();
                // writer.waitForMerges();
                // writer.commit();
            } catch (Exception ex) {
                logger.warn("Couldn't commit changes to writer", ex);
                writer.getIndexWriter().rollback();
            }
            writer.getIndexWriter().close();
            dir.close();
        } catch (Exception e) {
            throw new RuntimeException(e);
        } finally {
            indexUnlock();
        }
    }

    public Document createDocument(String uId, long id, Class cl) {
        Document doc = new Document();
        Mapping m = getMapping(cl.getSimpleName());
        doc.add(m.createField(RawLucene.TYPE, cl.getSimpleName()));
        doc.add(m.newUIdField(UID, uId));
        doc.add(m.newIdField(ID, id));
        return doc;
    }
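
    /*
     * Sketch of a custom SearchExecutor (illustrative, not in the original source):
     * searchSomething acquires and releases the NRT searcher around execute(), so callers only
     * implement the query itself. The field name "name" and the value are hypothetical.
     *
     *   long hits = index.searchSomething(new SearchExecutor<Long>() {
     *
     *       @Override public Long execute(IndexSearcher searcher) throws Exception {
     *           return (long) searcher.search(new TermQuery(new Term("name", "peter")), 1).totalHits;
     *       }
     *   });
     */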

    /**
     * Warning: counts only docs already indexed, i.e. excluding the realtime cache if not yet
     * committed.
     */
    long count(Class cl, final String fieldName, Object val) {
        Mapping m = getMapping(cl);
        final BytesRef bytes = m.toBytes(fieldName, val);
        return searchSomething(new SearchExecutor<Long>() {

            @Override public Long execute(IndexSearcher searcher) throws Exception {
                // TODO optimize via indexReader.termDocsEnum !?
                TopDocs td = searcher.search(new TermQuery(new Term(fieldName, bytes)), 1);
                return (long) td.totalHits;
            }
        });
    }

    long removeById(final long id) {
        try {
            latestGen = writer.deleteDocuments(new Term(ID, LuceneHelper.newRefFromLong(id)));
            getCurrentRTCache(latestGen).put(id, new IndexOp(IndexOp.Type.DELETE));
            return latestGen;
        } catch (Exception ex) {
            throw new RuntimeException(ex);
        }
    }

    public long fastPut(long id, Document newDoc) {
        try {
            String type = newDoc.get(TYPE);
            if (type == null)
                throw new UnsupportedOperationException("Document needs to have a type associated");

            Mapping m = getMapping(type);
            latestGen = writer.updateDocument(new Term(ID, LuceneHelper.newRefFromLong(id)), newDoc, m.getCombinedAnalyzer());
            getCurrentRTCache(latestGen).put(id, new IndexOp(newDoc, IndexOp.Type.UPDATE));
            return latestGen;
        } catch (Exception ex) {
            throw new RuntimeException(ex);
        }
    }

    public long put(String uId, long id, Document newDoc) {
        String type = newDoc.get(TYPE);
        if (type == null)
            throw new UnsupportedOperationException("Document needs to have a type associated");

        Mapping m = getMapping(type);
        if (newDoc.get(ID) == null)
            newDoc.add(m.newIdField(ID, id));
        if (newDoc.get(UID) == null)
            newDoc.add(m.newUIdField(UID, uId));
        return fastPut(id, newDoc);
    }

    void refresh() {
        try {
            // use waitForGeneration instead?
            // writer.commit();
            writer.getIndexWriter().commit();
            nrtManager.maybeRefreshBlocking();
            // nrtManager.waitForGeneration(latestGen, true);
        } catch (Exception ex) {
            throw new RuntimeException(ex);
        }
    }

    /**
     * You'll need to call releaseUnmanagedSearcher afterwards
     */
    IndexSearcher newUnmanagedSearcher() {
        return nrtManager.acquire();
    }

    void releaseUnmanagedSearcher(IndexSearcher searcher) {
        try {
            nrtManager.release(searcher);
        } catch (IOException ex) {
            throw new RuntimeException(ex);
        }
    }

    void removeDoc(Document doc) {
        removeById(getId(doc));
    }

    void indexLock() {
        indexRWLock.writeLock().lock();
    }

    void indexUnlock() {
        indexRWLock.writeLock().unlock();
    }

    @Override public String toString() {
        return name;
    }

    void initRelation(Document edgeDoc, Document vOut, Document vIn) {
        long oIndex = getId(vOut);
        edgeDoc.add(defaultMapping.newIdField(VERTEX_OUT, oIndex));
        long iIndex = getId(vIn);
        edgeDoc.add(defaultMapping.newIdField(VERTEX_IN, iIndex));

        long eId = getId(edgeDoc);
        vOut.add(defaultMapping.newIdField(EDGE_OUT, eId));
        vIn.add(defaultMapping.newIdField(EDGE_IN, eId));
        fastPut(oIndex, vOut);
        fastPut(iIndex, vIn);
    }

    static String getVertexFieldForEdgeType(String edgeType) {
        if (EDGE_IN.equals(edgeType))
            return VERTEX_IN;
        else if (EDGE_OUT.equals(edgeType))
            return VERTEX_OUT;
        else
            throw new UnsupportedOperationException("Edge type not supported:" + edgeType);
    }
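
    /*
     * Illustrative field layout after initRelation (ids are hypothetical), shown because the
     * wiring is easier to follow as data than as code:
     *
     *   edge doc:  _id=7, _vout=1, _vin=2   (the edge points to both vertices)
     *   vOut doc:  _id=1, _eout=7           (the out-vertex points back to the edge)
     *   vIn doc:   _id=2, _ein=7            (the in-vertex points back to the edge)
     */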

    /**
     * @return never null. Automatically creates a mapping if it does not exist.
     */
    public Mapping getMapping(Class cl) {
        return getMapping(cl.getSimpleName());
    }

    public Mapping getMapping(String type) {
        if (type == null)
            throw new NullPointerException("Type mustn't be null!");

        Mapping m = mappings.get(type);
        if (m == null) {
            mappings.put(type, m = new Mapping(type));
            if (logger.isDebugEnabled())
                logger.debug("Created mapping for type " + type);
        }
        return m;
    }
    private Map<Long, IndexOp> tmpCache;
    private long tmpGen = -2;

    private Map<Long, IndexOp> getCurrentRTCache(long gen) {
        if (gen > tmpGen) {
            synchronized (realTimeCache) {
                // re-check under the lock; another thread may have created the cache meanwhile
                if (gen > tmpGen) {
                    tmpGen = gen;
                    tmpCache = new ConcurrentHashMap<Long, IndexOp>(100);
                    realTimeCache.put(gen, tmpCache);
                }
            }
        }
        return tmpCache;
    }

    private class FlushThread extends Thread {

        public FlushThread(String name) {
            super(name);
        }

        @Override public void run() {
            Throwable exception = null;
            while (!isInterrupted()) {
                try {
                    cleanUpCache(latestGen);
                } catch (InterruptedException ex) {
                    exception = ex;
                    break;
                } catch (AlreadyClosedException ex) {
                    exception = ex;
                    break;
                } catch (OutOfMemoryError er) {
                    logger.error("Now closing writer due to OOM", er);
                    try {
                        writer.getIndexWriter().close();
                    } catch (Exception ex) {
                        logger.error("Error while closing writer", ex);
                    }
                    exception = er;
                    break;
                } catch (Exception ex) {
                    logger.error("Problem while flushing", ex);
                }
            }
            logger.debug("flush-thread interrupted, " + ((exception == null) ? "" : exception.getMessage())
                    + ", buffer:" + calcSize());
        }
    }

    /**
     * Nearly always faster than flush but slightly more expensive, as it forces the nrtManager to
     * reopen a reader very quickly.
     */
    void waitUntilSearchable() {
        nrtManager.waitForGeneration(latestGen);
    }

    public void flush() {
        try {
            cleanUpCache(latestGen);
        } catch (InterruptedException ex) {
            throw new RuntimeException(ex);
        }
    }

    /**
     * Very slow compared to waitUntilSearchable but slightly more efficient (~3%) for indexing,
     * so it is suited for our background thread.
     */
    void cleanUpCache(long gen) throws InterruptedException {
        cleanUpCache(gen, Math.round(ordinaryWaiting * 1000));
    }

    void cleanUpCache(long gen, long waiting) throws InterruptedException {
        if (nrtManager.getCurrentSearchingGen() >= gen) {
            // do not max out the CPU if called in a loop
            Thread.sleep(20);
            return;
        }

        // avoid nrtManager.waitForGeneration as we would force the reader to reopen too fast
        Thread.sleep(waiting);
        // nrtManager.waitForGeneration(gen, true);
        int removed = 0;
        int removedItems = 0;
        Iterator<Entry<Long, Map<Long, IndexOp>>> iter = realTimeCache.entrySet().iterator();
        while (iter.hasNext()) {
            Entry<Long, Map<Long, IndexOp>> e = iter.next();
            if (e.getKey() < gen) {
                iter.remove();
                removed++;
                removedItems += e.getValue().size();
                e.getValue().clear();
            }
        }
        // if (removed > 0)
        //     logger.info("removed objects " + removedItems + ", removed maps:" + removed + " older than gen:" + gen);
    }

    public double getRamBufferSizeMB() {
        return ramBufferSizeMB;
    }

    public void setRamBufferSizeMB(double ramBufferSizeMB) {
        this.ramBufferSizeMB = ramBufferSizeMB;
    }

    public int getTermIndexIntervalSize() {
        return termIndexIntervalSize;
    }

    public void setTermIndexIntervalSize(int termIndexIntervalSize) {
        this.termIndexIntervalSize = termIndexIntervalSize;
    }

    public void setMaxMergeMB(int maxMergeMB) {
        this.maxMergeMB = maxMergeMB;
    }

    public int getMaxMergeMB() {
        return maxMergeMB;
    }

    public long getLuceneAdds() {
        return luceneOperations;
    }

    public long getFailedLuceneReads() {
        return failedLuceneReads;
    }

    public long getSuccessfulLuceneReads() {
        return successfulLuceneReads;
    }

    public NRTManager getNrtManager() {
        return nrtManager;
    }
}
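
/**
 * Minimal usage sketch (not part of the original source): exercises the put/find round trip
 * against an in-memory index. RAMDirectory is the standard Lucene in-memory Directory;
 * Object.class merely stands in for a real mapped domain class, and the id/uid values are
 * hypothetical.
 */
class RawLuceneUsageExample {

    public static void main(String[] args) {
        RawLucene index = new RawLucene(new org.apache.lucene.store.RAMDirectory()).init();
        try {
            // create a document carrying _id, _uid and _type, then index it
            org.apache.lucene.document.Document doc = index.createDocument("node-1", 1L, Object.class);
            index.put("node-1", 1L, doc);

            // realtime get: served from the cache, no commit or reader reopen required
            System.out.println("findById: " + (index.findById(1L) != null));

            // searches go through the NRT reader; wait until it has seen our generation
            index.waitUntilSearchable();
            System.out.println("findByUserId: " + (index.findByUserId("node-1") != null));
        } finally {
            index.close();
        }
    }
}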