org.apache.gora.lucene.store.LuceneStore.java Source code

Introduction

Here is the source code for org.apache.gora.lucene.store.LuceneStore.java, Apache Gora's DataStore implementation backed by a local Lucene index. The class maps Avro-described persistent beans to Lucene documents and supports the standard Gora CRUD and query operations.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.gora.lucene.store;

import org.apache.avro.Schema;
import org.apache.avro.Schema.Type;
import org.apache.avro.specific.SpecificDatumReader;
import org.apache.avro.specific.SpecificDatumWriter;
import org.apache.avro.util.Utf8;
import org.apache.gora.lucene.query.LuceneQuery;
import org.apache.gora.lucene.query.LuceneResult;
import org.apache.gora.persistency.impl.PersistentBase;
import org.apache.gora.query.PartitionQuery;
import org.apache.gora.query.Query;
import org.apache.gora.query.Result;
import org.apache.gora.query.impl.FileSplitPartitionQuery;
import org.apache.gora.store.DataStore;
import org.apache.gora.store.DataStoreFactory;
import org.apache.gora.store.impl.FileBackedDataStoreBase;
import org.apache.gora.util.AvroUtils;
import org.apache.gora.util.GoraException;
import org.apache.gora.util.IOUtils;
import org.apache.gora.util.OperationNotSupportedException;
import org.apache.hadoop.conf.Configurable;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.SearcherFactory;
import org.apache.lucene.search.SearcherManager;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.file.FileSystems;

import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Properties;
import java.util.Set;

/**
 * {@link org.apache.gora.lucene.store.LuceneStore} is the primary class
 * responsible for Gora CRUD operations on a Lucene index.
 */
public class LuceneStore<K, T extends PersistentBase> extends FileBackedDataStoreBase<K, T>
        implements Configurable {

    private static final Logger LOG = LoggerFactory.getLogger(LuceneStore.class);

    private static final String DEFAULT_MAPPING_FILE = "gora-lucene-mapping.xml";
    private static final String LUCENE_VERSION_KEY = "gora.lucene.index.version";
    private static final String DEFAULT_LUCENE_VERSION = "LATEST";
    private static final String LUCENE_RAM_BUFFER_KEY = "gora.lucene.index.writer.rambuffer";
    private static final String DEFAULT_LUCENE_RAMBUFFER = "16";

    private LuceneMapping mapping;
    private IndexWriter writer;
    private SearcherManager searcherManager;
    private Directory dir;

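    /**
     * Initializes the store: loads the XML mapping, opens (or creates) a
     * Lucene index directory derived from the configured output path and the
     * persistent class name, and wires up an {@link IndexWriter} plus a
     * {@link SearcherManager} for near-real-time reads.
     */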
    @Override
    public void initialize(Class<K> keyClass, Class<T> persistentClass, Properties properties)
            throws GoraException {
        try {
            super.initialize(keyClass, persistentClass, properties);
        } catch (GoraException ge) {
            LOG.error(ge.getMessage(), ge);
            throw new GoraException(ge);
        }

        String mappingFile = null;
        try {
            mappingFile = DataStoreFactory.getMappingFile(properties, (DataStore<?, ?>) this, DEFAULT_MAPPING_FILE);
        } catch (IOException ioe) {
            LOG.error(ioe.getMessage(), ioe);
            throw new GoraException(ioe);
        }
        String luceneVersion = properties.getProperty(LUCENE_VERSION_KEY, DEFAULT_LUCENE_VERSION);
        String ramBuffer = properties.getProperty(LUCENE_RAM_BUFFER_KEY, DEFAULT_LUCENE_RAMBUFFER);

        LOG.debug("Lucene index version: {}", luceneVersion);
        LOG.debug("Lucene index writer RAM buffer size: {}", ramBuffer);

        try {
            mapping = readMapping(mappingFile);
        } catch (IOException ioe) {
            LOG.error(ioe.getMessage(), ioe);
            throw new GoraException(ioe);
        }
        String persistentClassObject = persistentClass.getCanonicalName();
        String dataStoreOutputPath = outputPath + "_" + persistentClassObject
                .substring(persistentClassObject.lastIndexOf('.') + 1).toLowerCase(Locale.getDefault());
        try {
            dir = FSDirectory.open(FileSystems.getDefault().getPath(dataStoreOutputPath));

            Analyzer analyzer = new StandardAnalyzer();
            IndexWriterConfig iwc = new IndexWriterConfig(analyzer);

            iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
            iwc.setRAMBufferSizeMB(Double.parseDouble(ramBuffer));

            writer = new IndexWriter(dir, iwc);
            // TODO: do we definitely want all past deletions to be applied?
            searcherManager = new SearcherManager(writer, true, true, new SearcherFactory());
        } catch (IOException e) {
            LOG.error("Error opening {} with Lucene FSDirectory.", dataStoreOutputPath, e);
            // Fail initialization rather than leaving the writer and
            // searcherManager null for later calls.
            throw new GoraException(e);
        }
    }

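    /*
     * Parses the XML mapping file from the classpath. A minimal sketch of the
     * expected shape, inferred from the parsing below (the root element name
     * is not validated here; <gora-otd> and the class and field names shown
     * are illustrative, following the usual Gora convention):
     *
     *   <gora-otd>
     *     <class name="com.example.WebPage" keyClass="java.lang.String">
     *       <primarykey column="id"/>
     *       <field name="url" column="url"/>
     *     </class>
     *   </gora-otd>
     */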
    private LuceneMapping readMapping(String filename) throws IOException {
        try {

            LuceneMapping mapping = new LuceneMapping();

            DocumentBuilder db = DocumentBuilderFactory.newInstance().newDocumentBuilder();
            InputStream mappingStream = getClass().getClassLoader().getResourceAsStream(filename);
            if (mappingStream == null) {
                throw new IOException("Mapping file '" + filename + "' not found on the classpath");
            }
            org.w3c.dom.Document dom = db.parse(mappingStream);

            Element root = dom.getDocumentElement();

            NodeList nl = root.getElementsByTagName("class");
            for (int i = 0; i < nl.getLength(); i++) {

                Element classElement = (Element) nl.item(i);
                if (classElement.getAttribute("keyClass").equals(keyClass.getCanonicalName())
                        && classElement.getAttribute("name").equals(persistentClass.getCanonicalName())) {

                    NodeList fields;
                    Element fe;

                    fields = classElement.getElementsByTagName("primarykey");
                    for (int j = 0; j < fields.getLength(); j++) {
                        fe = (Element) fields.item(j);
                        mapping.setPrimaryKey(fe.getAttribute("column"));
                    }

                    fields = classElement.getElementsByTagName("field");
                    for (int j = 0; j < fields.getLength(); j++) {
                        fe = (Element) fields.item(j);

                        String name = fe.getAttribute("name");
                        String column = fe.getAttribute("column");

                        mapping.addField(name, column);
                    }
                }
            }
            return mapping;
        } catch (Exception ex) {
            throw new IOException("Unable to read " + filename, ex);
        }
    }

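    /**
     * Deletes the single document whose primary-key field matches the given
     * key, then refreshes the searcher manager so readers observe the change.
     */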
    @Override
    public boolean delete(K key) {
        try {
            writer.deleteDocuments(new Term(mapping.getPrimaryKey(), key.toString()));
            searcherManager.maybeRefresh();
            return true;
        } catch (IOException e) {
            LOG.error("Unable to delete key: {}", key.toString(), e);
        }
        return false;
    }

    private boolean isPrimaryKeyIncluded(String[] fields) {
        HashSet<String> luceneFields = new HashSet<>();
        if (fields.length > 0) {
            for (String field : fields) {
                luceneFields.add(getMapping().getLuceneField(field));
            }
        }
        return luceneFields.contains(getMapping().getPrimaryKey());
    }

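    /*
     * Deletes by query. When no field projection is given, every field is
     * selected, or the projection includes the primary key, matching
     * documents are removed outright. Otherwise only the projected fields
     * are stripped from each matching document, which is then rewritten in
     * place via updateDocument. Returns the drop in hit count.
     */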
    @Override
    public long deleteByQuery(Query<K, T> query) {
        try {
            // Figure out how many were there before
            LuceneQuery<K, T> q = (LuceneQuery<K, T>) query;
            LuceneResult<K, T> r = (LuceneResult<K, T>) q.execute();
            int before = r.getScoreDocs().length;

            if (query.getFields() == null || (query.getFields().length == getFields().length)
                    || isPrimaryKeyIncluded(query.getFields())) {
                // Delete them
                writer.deleteDocuments(q.toLuceneQuery());
                searcherManager.maybeRefresh();
            } else {
                Query<K, T> selectQuery = this.newQuery();
                selectQuery.setStartKey(q.getStartKey());
                selectQuery.setEndKey(q.getEndKey());
                LuceneResult<K, T> selectResult = (LuceneResult<K, T>) selectQuery.execute();
                ScoreDoc[] scoreDocs = selectResult.getScoreDocs();
                HashSet<String> fields = new HashSet<>();
                fields.addAll(mapping.getLuceneFields());
                IndexSearcher searcher = selectResult.getSearcher();
                if (scoreDocs.length > 0) {
                    for (ScoreDoc scoreDoc : scoreDocs) {
                        Document doc = searcher.doc(scoreDoc.doc, fields);
                        for (String avroField : query.getFields()) {
                            String docField = mapping.getLuceneField(avroField);
                            if (doc.getField(docField) != null) {
                                doc.removeField(docField);
                            }
                        }
                        String key = doc.get(getMapping().getPrimaryKey());
                        doc.add(new StringField(mapping.getPrimaryKey(), key, Store.YES));
                        writer.updateDocument(new Term(mapping.getPrimaryKey(), key), doc);
                        searcherManager.maybeRefresh();
                    }
                }
                selectResult.close();
            }

            // Figure out how many there are after
            r = (LuceneResult<K, T>) q.execute();
            int after = r.getScoreDocs().length;

            return before - after;
        } catch (IOException e) {
            LOG.error("Unable to deleteByQuery: {}", query.toString(), e);
        }
        return 0;
    }

    @Override
    public void deleteSchema() {
        try {
            writer.deleteAll();
            searcherManager.maybeRefresh();
        } catch (IOException e) {
            LOG.error("Unable to deleteAll: {}", e);
        }
    }

    @Override
    public T get(K key, String[] fieldsToLoad) {

        Set<String> fields;
        if (fieldsToLoad != null) {
            fields = new HashSet<>(Arrays.asList(fieldsToLoad));
        } else {
            fields = new HashSet<>(mapping.getLuceneFields());
        }
        try {
            final IndexSearcher s = searcherManager.acquire();
            try {
                TermQuery q = new TermQuery(new Term(mapping.getPrimaryKey(), key.toString()));
                ScoreDoc[] hits = s.search(q, 2).scoreDocs;
                if (hits.length > 0) {
                    Document doc = s.doc(hits[0].doc, fields);
                    LOG.debug("get:Document: {}", doc);
                    return newInstance(doc, fields.toArray(new String[0]));
                }
            } finally {
                // Release the acquired searcher on every path, including the
                // early return above.
                searcherManager.release(s);
            }
        } catch (IOException e) {
            LOG.error("Error in get", e);
        }
        return null;
    }

    private Object convertDocFieldToAvroUnion(final Schema fieldSchema, final Schema.Field field, final String sf,
            final Document doc) throws IOException {
        Object result;
        Schema.Type type0 = fieldSchema.getTypes().get(0).getType();
        Schema.Type type1 = fieldSchema.getTypes().get(1).getType();

        if (!type0.equals(type1) && (type0.equals(Schema.Type.NULL) || type1.equals(Schema.Type.NULL))) {
            Schema innerSchema = null;
            if (type0.equals(Schema.Type.NULL)) {
                innerSchema = fieldSchema.getTypes().get(1);
            } else {
                innerSchema = fieldSchema.getTypes().get(0);
            }

            result = convertToIndexableFieldToAvroField(doc, field, innerSchema, sf);
        } else {
            throw new GoraException("LuceneStore only supports a union of two types where one is null.");
        }
        return result;
    }

    private SpecificDatumReader getDatumReader(Schema fieldSchema) {
        // TODO: consider caching and reusing readers per schema
        return new SpecificDatumReader(fieldSchema);
    }

    private Object convertToIndexableFieldToAvroField(final Document doc, final Schema.Field field,
            final Schema fieldSchema, final String sf) throws IOException {
        Object result = null;
        T persistent = newPersistent();
        Object sv;
        switch (fieldSchema.getType()) {
        case MAP:
        case ARRAY:
        case RECORD:
            sv = doc.getBinaryValue(sf);
            if (sv == null) {
                break;
            }
            BytesRef b = (BytesRef) sv;
            SpecificDatumReader reader = getDatumReader(fieldSchema);
            result = IOUtils.deserialize(b.bytes, reader, persistent.get(field.pos()));
            break;
        case UNION:
            result = convertDocFieldToAvroUnion(fieldSchema, field, sf, doc);
            break;
        case ENUM:
            sv = doc.get(sf);
            if (sv == null) {
                break;
            }
            result = AvroUtils.getEnumValue(fieldSchema, (String) sv);
            break;
        case BYTES:
            sv = doc.getBinaryValue(sf);
            if (sv == null) {
                break;
            }
            result = ByteBuffer.wrap(((BytesRef) sv).bytes);
            break;
        default:
            sv = doc.get(sf);
            if (sv == null) {
                break;
            }
            result = convertLuceneFieldToAvroField(fieldSchema.getType(), sv);
        }
        return result;
    }

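    /**
     * Rebuilds a persistent bean from a Lucene {@link Document}, converting
     * each requested Lucene field back to its Avro type. Dirty flags are
     * cleared before returning so the instance reads as freshly loaded.
     */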
    public T newInstance(Document doc, String[] fields) throws IOException {
        T persistent = newPersistent();
        if (fields == null) {
            fields = fieldMap.keySet().toArray(new String[fieldMap.size()]);
        }
        String pk = mapping.getPrimaryKey();

        for (String f : fields) {
            org.apache.avro.Schema.Field field = fieldMap.get(f);
            String sf;
            if (pk.equals(f)) {
                sf = f;
            } else {
                sf = mapping.getLuceneField(f);
            }
            Schema fieldSchema = field.schema();
            Object fieldValue = convertToIndexableFieldToAvroField(doc, field, fieldSchema, sf);
            if (fieldValue == null) {
                continue;
            }
            persistent.put(field.pos(), fieldValue);
            persistent.setDirty(field.pos());
        }
        persistent.clearDirty();
        return persistent;
    }

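    /*
     * Converts a stored Lucene string value back to the corresponding Avro
     * scalar; FIXED is not supported and unknown types are logged and skipped.
     */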
    private Object convertLuceneFieldToAvroField(Type t, Object o) {
        Object result = null;
        switch (t) {
        case FIXED:
            // Could we combine this with the BYTES case below and either fix
            // the size of the array or not, depending on the type? This might
            // require a buffer copy. Do we need to pad if the fixed-size data
            // is smaller than the type, and truncate if it is larger?
            LOG.error("Fixed-size fields are not supported yet");
            break;
        case BOOLEAN:
            result = Boolean.parseBoolean((String) o);
            break;
        case DOUBLE:
            result = Double.parseDouble((String) o);
            break;
        case FLOAT:
            result = Float.parseFloat((String) o);
            break;
        case INT:
            result = Integer.parseInt((String) o);
            break;
        case LONG:
            result = Long.parseLong((String) o);
            break;
        case STRING:
            result = new Utf8(o.toString());
            break;
        default:
            LOG.error("Unknown field type: {}", t);
        }
        return result;
    }

    @Override
    public String getSchemaName() {
        return "default";
    }

    @Override
    public Query<K, T> newQuery() {
        return new LuceneQuery<>(this);
    }

    private IndexableField convertAvroUnionToDocumentField(final String sf, final Schema fieldSchema,
            final Object value) {
        IndexableField result;
        Schema.Type type0 = fieldSchema.getTypes().get(0).getType();
        Schema.Type type1 = fieldSchema.getTypes().get(1).getType();

        if (!type0.equals(type1) && (type0.equals(Schema.Type.NULL) || type1.equals(Schema.Type.NULL))) {
            Schema innerSchema = null;
            if (type0.equals(Schema.Type.NULL)) {
                innerSchema = fieldSchema.getTypes().get(1);
            } else {
                innerSchema = fieldSchema.getTypes().get(0);
            }
            result = convertToIndexableField(sf, innerSchema, value);
        } else {
            throw new IllegalStateException("LuceneStore only supports a union of two types where one is null.");
        }
        return result;
    }

    private SpecificDatumWriter getDatumWriter(Schema fieldSchema) {
        return new SpecificDatumWriter(fieldSchema);
    }

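    /*
     * Maps a single Avro value onto a Lucene field: complex types (MAP,
     * ARRAY, RECORD) are Avro-serialized and kept as opaque stored bytes,
     * unions are unwrapped, and scalars become StoredField or StringField
     * instances.
     */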
    private IndexableField convertToIndexableField(String sf, Schema fieldSchema, Object o) {
        IndexableField result = null;
        switch (fieldSchema.getType()) {
        case MAP: //TODO: These should be handled better
        case ARRAY:
        case RECORD:
            // For now we'll just store the bytes
            byte[] data = new byte[0];
            try {
                SpecificDatumWriter writer = getDatumWriter(fieldSchema);
                data = IOUtils.serialize(writer, o);
            } catch (IOException e) {
                LOG.error("Error occurred while serializing record", e);
            }
            result = new StoredField(sf, data);
            break;
        case UNION:
            result = convertAvroUnionToDocumentField(sf, fieldSchema, o);
            break;
        case BYTES:
            result = new StoredField(sf, ((ByteBuffer) o).array());
            break;
        case ENUM:
        case STRING:
            //TODO make this Text based on a mapping.xml attribute
            result = new StringField(sf, o.toString(), Store.YES);
            break;
        case BOOLEAN:
            result = new StringField(sf, o.toString(), Store.YES);
            break;
        case DOUBLE:
            result = new StoredField(sf, (Double) o);
            break;
        case FLOAT:
            result = new StoredField(sf, (Float) o);
            break;
        case INT:
            result = new StoredField(sf, (Integer) o);
            break;
        case LONG:
            result = new StoredField(sf, (Long) o);
            break;
        default:
            LOG.error("Unknown field type: {}", fieldSchema.getType());
        }
        return result;
    }

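    /*
     * Writes a bean: every dirty, mapped, non-null field is converted to a
     * Lucene field, the primary key is stored alongside, and the document is
     * added or updated depending on whether the key already exists.
     */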
    @Override
    public void put(K key, T persistent) {
        Schema schema = persistent.getSchema();
        Document doc = new Document();

        // populate the doc
        List<org.apache.avro.Schema.Field> fields = schema.getFields();
        for (org.apache.avro.Schema.Field field : fields) {
            if (!persistent.isDirty(field.name())) {
                continue;
            }
            String sf = mapping.getLuceneField(field.name());
            if (sf == null) {
                continue;
            }
            Schema fieldSchema = field.schema();
            Object o = persistent.get(field.pos());
            if (o == null) {
                continue;
            }
            doc.add(convertToIndexableField(sf, fieldSchema, o));
        }
        LOG.info("DOCUMENT: {}", doc);
        try {
            doc.add(new StringField(mapping.getPrimaryKey(), key.toString(), Store.YES));
            LOG.info("DOCUMENT: {}", doc);
            if (get(key, null) == null) {
                writer.addDocument(doc);
            } else {
                writer.updateDocument(new Term(mapping.getPrimaryKey(), key.toString()), doc);
            }
            searcherManager.maybeRefresh();
        } catch (IOException e) {
            LOG.error("Error updating document: {}", e);
        }
    }

    @Override
    protected Result<K, T> executePartial(FileSplitPartitionQuery<K, T> arg0) throws IOException {
        throw new OperationNotSupportedException("executePartial is not supported for LuceneStore");
    }

    @Override
    protected Result<K, T> executeQuery(Query<K, T> query) throws IOException {
        try {
            return new LuceneResult<>(this, query, searcherManager);
        } catch (IOException e) {
            LOG.error(e.getMessage(), e);
        }
        return null;
    }

    @Override
    public List<PartitionQuery<K, T>> getPartitions(Query<K, T> query) {
        throw new OperationNotSupportedException("getPartitions is not supported for LuceneStore");
    }

    @Override
    public void flush() {
        try {
            writer.commit();
            searcherManager.maybeRefreshBlocking();
        } catch (IOException e) {
            LOG.error("Error in commit: {}", e);
        }
    }

    @Override
    public void close() {
        try {
            searcherManager.close();
            writer.close();
            dir.close();
        } catch (IOException e) {
            LOG.error("Error in close: {}", e);
        }
        super.close();
    }

    public LuceneMapping getMapping() {
        return mapping;
    }

}
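
Example Usage

A minimal sketch of driving this store through Gora's DataStoreFactory. WebPage is a hypothetical Avro-generated bean extending PersistentBase, and its builder and setter are illustrative; the factory call and the put, flush, get, and close operations are the standard Gora DataStore API exercised by the code above.

import org.apache.gora.lucene.store.LuceneStore;
import org.apache.gora.store.DataStore;
import org.apache.gora.store.DataStoreFactory;
import org.apache.hadoop.conf.Configuration;

public class LuceneStoreExample {
    public static void main(String[] args) throws Exception {
        // Obtain a LuceneStore; this triggers initialize(), which reads
        // gora-lucene-mapping.xml and opens the index directory.
        DataStore<String, WebPage> store = DataStoreFactory.getDataStore(
                LuceneStore.class, String.class, WebPage.class, new Configuration());

        // WebPage is a hypothetical Avro-generated persistent bean.
        WebPage page = WebPage.newBuilder().build();
        page.setUrl("http://example.com");

        store.put("http://example.com", page); // index dirty fields plus the key
        store.flush();                         // commit the writer, refresh searchers

        WebPage fetched = store.get("http://example.com");
        store.close();
    }
}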