lucandra.IndexReader.java Source code

Introduction

Here is the source code for lucandra.IndexReader.java, a Lucene IndexReader implementation backed by Apache Cassandra: instead of reading segment files from a filesystem Directory, it fetches term and document data over Cassandra's Thrift API.

Source

/**
 * Copyright 2009 T Jake Luciani
 * 
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package lucandra;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.UUID;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.cassandra.thrift.Cassandra;
import org.apache.cassandra.thrift.Column;
import org.apache.cassandra.thrift.ColumnOrSuperColumn;
import org.apache.cassandra.thrift.ColumnParent;
import org.apache.cassandra.thrift.ConsistencyLevel;
import org.apache.cassandra.thrift.SlicePredicate;
import org.apache.cassandra.thrift.SliceRange;
import org.apache.cassandra.thrift.SuperColumn;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldSelector;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.index.TermPositions;
import org.apache.lucene.index.TermVectorMapper;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;

import solandra.SolandraFieldSelector;

public class IndexReader extends org.apache.lucene.index.IndexReader {

    // fixed ceiling on the number of documents a single reader can address
    private final static int numDocs = 1000000;

    private final static Directory mockDirectory = new RAMDirectory();
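
    // The static initializer below creates an empty index in this RAM-backed
    // directory, so directory() has a Directory that Lucene will accept.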
    static {

        try {
            new IndexWriter(mockDirectory, new SimpleAnalyzer(), true, MaxFieldLength.LIMITED);
        } catch (CorruptIndexException e) {
            throw new RuntimeException(e);
        } catch (LockObtainFailedException e) {
            throw new RuntimeException(e);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    private final String indexName;
    private final Cassandra.Iface client;

    // Per-thread caches: each searcher thread keeps its own docId <-> doc
    // number maps, document cache, term enum cache, and field norms, so a
    // single reader can be shared across threads without locking.
    private final ThreadLocal<Map<String, Integer>> docIdToDocIndex = new ThreadLocal<Map<String, Integer>>();
    private final ThreadLocal<Map<Integer, String>> docIndexToDocId = new ThreadLocal<Map<Integer, String>>();
    private final ThreadLocal<Map<Integer, Document>> documentCache = new ThreadLocal<Map<Integer, Document>>();
    private final ThreadLocal<AtomicInteger> docCounter = new ThreadLocal<AtomicInteger>();
    private final ThreadLocal<Map<Term, LucandraTermEnum>> termEnumCache = new ThreadLocal<Map<Term, LucandraTermEnum>>();
    private final ThreadLocal<Map<String, byte[]>> fieldNorms = new ThreadLocal<Map<String, byte[]>>();
    private final static ThreadLocal<Object> fieldCacheRefs = new ThreadLocal<Object>();

    private static final Logger logger = Logger.getLogger(IndexReader.class);

    public IndexReader(String name, Cassandra.Iface client) {
        super();
        this.indexName = name;
        this.client = client;
    }

    public synchronized IndexReader reopen() throws CorruptIndexException, IOException {

        // the reader has no on-disk state; reopening just resets the
        // per-thread caches
        clearCache();

        return this;
    }

    // Lucene's FieldCache keys its entries on this object. Returning a
    // per-thread UUID, regenerated by clearCache(), invalidates cached field
    // data whenever the reader's view changes.
    @Override
    public Object getFieldCacheKey() {

        Object ref = fieldCacheRefs.get();

        if (ref == null) {
            ref = UUID.randomUUID();
            fieldCacheRefs.set(ref);
        }

        return ref;
    }

    public void clearCache() {

        if (docCounter.get() != null)
            docCounter.get().set(0);
        if (docIdToDocIndex.get() != null)
            docIdToDocIndex.get().clear();
        if (docIndexToDocId.get() != null)
            docIndexToDocId.get().clear();
        if (termEnumCache.get() != null)
            termEnumCache.get().clear();
        if (documentCache.get() != null)
            documentCache.get().clear();
        if (fieldNorms.get() != null)
            fieldNorms.get().clear();
        if (fieldCacheRefs.get() != null)
            fieldCacheRefs.set(UUID.randomUUID());
    }

    protected void doClose() throws IOException {
        clearCache();
    }

    protected void doCommit() throws IOException {
        clearCache();
    }

    protected void doDelete(int arg0) throws CorruptIndexException, IOException {
        // deletions are not supported; no-op
    }

    protected void doSetNorm(int arg0, String arg1, byte arg2) throws CorruptIndexException, IOException {
        // norms are not writable through this reader; no-op
    }

    protected void doUndeleteAll() throws CorruptIndexException, IOException {
        // deletions are not supported; no-op
    }

    public int docFreq(Term term) throws IOException {

        LucandraTermEnum termEnum = getTermEnumCache().get(term);
        if (termEnum == null) {

            long start = System.currentTimeMillis();

            termEnum = new LucandraTermEnum(this);
            termEnum.skipTo(term);

            long end = System.currentTimeMillis();

            logger.debug("docFreq() took: " + (end - start) + "ms");

            getTermEnumCache().put(term, termEnum);
        }

        return termEnum.docFreq();
    }

    public Document document(int docNum, FieldSelector selector) throws CorruptIndexException, IOException {

        Document doc = getDocumentCache().get(docNum);

        if (doc != null) {
            logger.debug("Found doc in cache");
            return doc;
        }

        String docId = getDocIndexToDocId().get(docNum);

        if (docId == null)
            return null;

        Map<Integer, String> keyMap = new HashMap<Integer, String>();

        keyMap.put(docNum, CassandraUtils.hashKey(indexName + CassandraUtils.delimeter + docId));

        List<byte[]> fieldNames = null;

        // Special field selector used to carry a list of other docIds to
        // cache in parallel, for Solr performance
        if (selector != null && selector instanceof SolandraFieldSelector) {

            List<Integer> otherDocIds = ((SolandraFieldSelector) selector).getOtherDocsToCache();
            fieldNames = ((SolandraFieldSelector) selector).getFieldNames();

            logger.debug("Going to bulk load " + otherDocIds.size() + " documents");

            for (Integer otherDocNum : otherDocIds) {
                if (otherDocNum == docNum)
                    continue;

                if (getDocumentCache().containsKey(otherDocNum))
                    continue;

                String docKey = getDocIndexToDocId().get(otherDocNum);

                if (docKey == null)
                    continue;

                keyMap.put(otherDocNum, CassandraUtils.hashKey(indexName + CassandraUtils.delimeter + docKey));
            }
        }

        ColumnParent columnParent = new ColumnParent();
        columnParent.setColumn_family(CassandraUtils.docColumnFamily);

        SlicePredicate slicePredicate = new SlicePredicate();

        if (fieldNames == null || fieldNames.size() == 0) {
            // get all field columns (up to 100; the bounded slice range is
            // meant to skip the meta column)
            slicePredicate.setSlice_range(
                    new SliceRange(new byte[] {}, CassandraUtils.finalToken.getBytes("UTF-8"), false, 100));
        } else {

            slicePredicate.setColumn_names(fieldNames);
        }

        long start = System.currentTimeMillis();

        try {
            Map<String, List<ColumnOrSuperColumn>> docMap = client.multiget_slice(CassandraUtils.keySpace,
                    Arrays.asList(keyMap.values().toArray(new String[] {})), columnParent, slicePredicate,
                    ConsistencyLevel.ONE);

            for (Map.Entry<Integer, String> key : keyMap.entrySet()) {

                List<ColumnOrSuperColumn> cols = docMap.get(key.getValue());

                if (cols == null) {
                    logger.warn("Missing document in multiget_slice for: " + key.getValue());
                    continue;
                }

                Document cacheDoc = new Document();

                for (ColumnOrSuperColumn col : cols) {

                    Field field = null;
                    String fieldName = new String(col.column.name);

                    // In case __META__ slips through
                    if (Arrays.equals(col.column.name, CassandraUtils.documentMetaField.getBytes())) {
                        logger.debug("Filtering out __META__ key");
                        continue;
                    }

                    byte[] value;

                    // The last byte of each stored value is a type marker:
                    // Byte.MAX_VALUE flags raw binary, Byte.MIN_VALUE a UTF-8
                    // string; anything else is treated as corruption.
                    if (col.column.value[col.column.value.length - 1] != Byte.MAX_VALUE
                            && col.column.value[col.column.value.length - 1] != Byte.MIN_VALUE) {
                        throw new CorruptIndexException(
                                "Lucandra field is not properly encoded: " + docId + "(" + fieldName + ")");

                    } else if (col.column.value[col.column.value.length - 1] == Byte.MAX_VALUE) { //Binary
                        value = new byte[col.column.value.length - 1];
                        System.arraycopy(col.column.value, 0, value, 0, col.column.value.length - 1);

                        field = new Field(fieldName, value, Store.YES);
                        cacheDoc.add(field);
                    } else if (col.column.value[col.column.value.length - 1] == Byte.MIN_VALUE) { //String
                        value = new byte[col.column.value.length - 1];
                        System.arraycopy(col.column.value, 0, value, 0, col.column.value.length - 1);

                        //Check for multi-fields
                        String fieldString = new String(value, "UTF-8");

                        if (fieldString.indexOf(CassandraUtils.delimeter) >= 0) {
                            StringTokenizer tok = new StringTokenizer(fieldString, CassandraUtils.delimeter);
                            while (tok.hasMoreTokens()) {
                                field = new Field(fieldName, tok.nextToken(), Store.YES, Index.ANALYZED);
                                cacheDoc.add(field);
                            }
                        } else {

                            field = new Field(fieldName, fieldString, Store.YES, Index.ANALYZED);
                            cacheDoc.add(field);
                        }
                    }

                }

                //Mark the required doc
                if (key.getKey().equals(docNum))
                    doc = cacheDoc;

                getDocumentCache().put(key.getKey(), cacheDoc);
            }

            long end = System.currentTimeMillis();

            logger.debug("Document read took: " + (end - start) + "ms");

            return doc;

        } catch (Exception e) {
            throw new IOException(e.getLocalizedMessage());
        }

    }

    @Override
    public Collection getFieldNames(FieldOption fieldOption) {
        return Arrays.asList(new String[] {});
    }

    @Override
    public TermFreqVector getTermFreqVector(int docNum, String field) throws IOException {

        String docId = getDocIndexToDocId().get(docNum);

        TermFreqVector termVector = new lucandra.TermFreqVector(indexName, field, docId, client);

        return termVector;
    }

    @Override
    public void getTermFreqVector(int arg0, TermVectorMapper arg1) throws IOException {
        throw new RuntimeException("This operation is not supported");
    }

    @Override
    public void getTermFreqVector(int arg0, String arg1, TermVectorMapper arg2) throws IOException {
        throw new RuntimeException("This operation is not supported");
    }

    @Override
    public TermFreqVector[] getTermFreqVectors(int arg0) throws IOException {
        throw new RuntimeException("This operation is not supported");
    }

    @Override
    public boolean hasDeletions() {

        return false;
    }

    @Override
    public boolean isDeleted(int arg0) {

        return false;
    }

    @Override
    public int maxDoc() {
        // one past the fixed numDocs ceiling
        return numDocs + 1;
    }

    @Override
    public byte[] norms(String field) throws IOException {
        return getFieldNorms().get(field);
    }

    @Override
    public void norms(String arg0, byte[] arg1, int arg2) throws IOException {

        throw new RuntimeException("This operation is not supported");

    }

    @Override
    public int numDocs() {

        return numDocs;
    }

    @Override
    public TermDocs termDocs() throws IOException {
        return new LucandraTermDocs(this);
    }

    @Override
    public TermPositions termPositions() throws IOException {
        return new LucandraTermDocs(this);
    }

    @Override
    public TermEnum terms() throws IOException {
        return new LucandraTermEnum(this);
    }

    @Override
    public TermEnum terms(Term term) throws IOException {

        LucandraTermEnum termEnum = getTermEnumCache().get(term);

        if (termEnum == null)
            termEnum = new LucandraTermEnum(this);

        if (!termEnum.skipTo(term)) // position at the term; null if it does not exist
            termEnum = null;

        return termEnum;
    }

    public int addDocument(SuperColumn docInfo, String field) {

        String id;
        try {
            id = new String(docInfo.name, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            throw new IllegalStateException("Cant make docId a string");
        }

        Integer idx = getDocIdToDocIndex().get(id);

        if (idx == null) {
            idx = getDocCounter().incrementAndGet();

            if (idx > numDocs)
                throw new IllegalStateException("numDocs reached");

            getDocIdToDocIndex().put(id, idx);
            getDocIndexToDocId().put(idx, id);

            Byte norm = null;
            for (Column c : docInfo.columns) {
                if (Arrays.equals(c.name, CassandraUtils.normsKey.getBytes())) {
                    if (c.value.length != 1)
                        throw new IllegalStateException("Norm for field " + field + " must be a single byte");

                    norm = c.value[0];
                }
            }

            // default: a boost of 1.0f encoded into Lucene's one-byte norm format
            if (norm == null)
                norm = Similarity.encodeNorm(1.0f);

            byte[] norms = getFieldNorms().get(field);

            if (norms == null)
                norms = new byte[1024];

            // grow the norms array geometrically until it can hold idx,
            // capping its size at numDocs + 1
            while (norms.length <= idx && norms.length < numDocs) {
                byte[] _norms = new byte[(norms.length * 2) < numDocs ? (norms.length * 2) : (numDocs + 1)];
                System.arraycopy(norms, 0, _norms, 0, norms.length);
                norms = _norms;
            }

            // store this document's norm at its index position
            norms[idx] = norm;

            getFieldNorms().put(field, norms);
        }

        return idx;
    }

    public int getDocumentNumber(byte[] docId) {
        String id;
        try {
            id = new String(docId, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            throw new IllegalStateException("Cant make docId a string");
        }

        // assumes the docId was previously registered via addDocument();
        // unboxing throws a NullPointerException otherwise
        return getDocIdToDocIndex().get(id);
    }

    public String getDocumentId(int docNum) {
        return getDocIndexToDocId().get(docNum);
    }

    public String getIndexName() {
        return indexName;
    }

    public Cassandra.Iface getClient() {
        return client;
    }

    public LucandraTermEnum checkTermCache(Term term) {
        return getTermEnumCache().get(term);
    }

    public void addTermEnumCache(Term term, LucandraTermEnum termEnum) {
        getTermEnumCache().put(term, termEnum);
    }

    @Override
    public Directory directory() {
        clearCache();

        return mockDirectory;
    }

    @Override
    public long getVersion() {
        return 1;
    }

    @Override
    public boolean isOptimized() {
        return true;
    }

    @Override
    public boolean isCurrent() {
        return true;
    }

    public Map<Integer, String> getDocIndexToDocId() {
        Map<Integer, String> c = docIndexToDocId.get();

        if (c == null) {
            c = new HashMap<Integer, String>();
            docIndexToDocId.set(c);
        }

        return c;
    }

    private Map<String, Integer> getDocIdToDocIndex() {
        Map<String, Integer> c = docIdToDocIndex.get();

        if (c == null) {
            c = new HashMap<String, Integer>();
            docIdToDocIndex.set(c);
        }

        return c;
    }

    private AtomicInteger getDocCounter() {
        AtomicInteger c = docCounter.get();

        if (c == null) {
            c = new AtomicInteger(0);
            docCounter.set(c);
        }

        return c;
    }

    private Map<Term, LucandraTermEnum> getTermEnumCache() {
        Map<Term, LucandraTermEnum> c = termEnumCache.get();

        if (c == null) {
            c = new HashMap<Term, LucandraTermEnum>();
            termEnumCache.set(c);
        }

        return c;
    }

    private Map<Integer, Document> getDocumentCache() {
        Map<Integer, Document> c = documentCache.get();

        if (c == null) {
            c = new HashMap<Integer, Document>();
            documentCache.set(c);
        }

        return c;
    }

    private Map<String, byte[]> getFieldNorms() {
        Map<String, byte[]> c = fieldNorms.get();

        if (c == null) {
            c = new HashMap<String, byte[]>();
            fieldNorms.set(c);
        }

        return c;
    }

}
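
Example usage

The following sketch shows how a reader like this might be wired into a standard Lucene search. It is illustrative only: the host, port, index name ("bookmarks"), and the query field and term are assumptions, and the Thrift connection setup follows the Cassandra 0.6-era API that this class compiles against.

import org.apache.cassandra.thrift.Cassandra;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.thrift.protocol.TBinaryProtocol;
import org.apache.thrift.protocol.TProtocol;
import org.apache.thrift.transport.TSocket;
import org.apache.thrift.transport.TTransport;

public class LucandraSearchExample {

    public static void main(String[] args) throws Exception {
        // Open a raw Thrift connection to Cassandra (host and port are
        // assumptions for this sketch).
        TTransport transport = new TSocket("localhost", 9160);
        TProtocol protocol = new TBinaryProtocol(transport);
        Cassandra.Client client = new Cassandra.Client(protocol);
        transport.open();

        // Wrap the connection in Lucandra's IndexReader; "bookmarks" is a
        // hypothetical index name.
        lucandra.IndexReader reader = new lucandra.IndexReader("bookmarks", client);

        // From here on, the standard Lucene search API applies.
        IndexSearcher searcher = new IndexSearcher(reader);
        TopDocs results = searcher.search(new TermQuery(new Term("title", "cassandra")), 10);

        for (ScoreDoc hit : results.scoreDocs) {
            System.out.println(searcher.doc(hit.doc).get("title") + " score=" + hit.score);
        }

        transport.close();
    }
}

Because the reader keeps its caches in ThreadLocals, each thread that shares a reader sees its own document-number mapping; calling reopen() simply clears those per-thread caches before the reader is reused.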