Source file: cn.hbu.cs.esearch.index.LuceneIndexDataLoader

An abstract Lucene-backed data loader that consumes batches of indexing events
(adds, updates, deletes) and applies them to a managed search index. Concrete
subclasses supply the target index and the delete-propagation strategy.

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cn.hbu.cs.esearch.index;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.similarities.Similarity;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import cn.hbu.cs.esearch.consumer.DataConsumer;
import cn.hbu.cs.esearch.core.EsearchException;
import cn.hbu.cs.esearch.core.EsearchHealth;
import cn.hbu.cs.esearch.core.EsearchSegmentReader;
import cn.hbu.cs.esearch.document.AbstractEsearchIndexable;
import cn.hbu.cs.esearch.document.EsearchIndexable;
import it.unimi.dsi.fastutil.longs.Long2ObjectMap;
import it.unimi.dsi.fastutil.longs.Long2ObjectOpenHashMap;
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import it.unimi.dsi.fastutil.longs.LongSet;

/**
 * Base {@link DataConsumer} that applies batches of {@link EsearchIndexable} events
 * (adds, updates, deletes) to a Lucene-backed search index. Subclasses provide the
 * concrete target index via {@link #getSearchIndex()} and decide how deletes are
 * propagated to sibling indexes.
 *
 * @param <R> the concrete {@link IndexReader} type managed by the index
 */
public abstract class LuceneIndexDataLoader<R extends IndexReader> implements DataConsumer<EsearchIndexable> {
    public static final Logger LOGGER = LoggerFactory.getLogger(LuceneIndexDataLoader.class);
    protected final Analyzer _analyzer;
    protected final Similarity _similarity;
    protected final SearchIndexManager<R> _idxMgr;
    protected final Comparator<String> _versionComparator;
    // Optional filter matching documents to be physically removed before each batch;
    // null means purging is disabled.
    private Filter _purgeFilter;

    /**
     * @param analyzer          analyzer used when writing documents
     * @param similarity        similarity used when writing documents
     * @param idxMgr            manager coordinating the disk/RAM indexes; also used as
     *                          the lock object for refresh/commit critical sections
     * @param versionComparator total order over event version strings
     */
    protected LuceneIndexDataLoader(Analyzer analyzer, Similarity similarity, SearchIndexManager<R> idxMgr,
            Comparator<String> versionComparator) {
        _analyzer = analyzer;
        _similarity = similarity;
        _idxMgr = idxMgr;
        _versionComparator = versionComparator;
        _purgeFilter = null;
    }

    /**
     * Sets (or clears, with {@code null}) the filter whose matching documents are
     * deleted from the index before each consumed batch is applied.
     */
    public void setPurgeFilter(Filter purgeFilter) {
        _purgeFilter = purgeFilter;
    }

    /** @return the index this loader writes to; must not be null during {@link #consume}. */
    protected abstract BaseSearchIndex<R> getSearchIndex();

    /** Propagates the given UID delete set to any sibling indexes. */
    protected abstract void propagateDeletes(LongSet delDocs) throws IOException;

    /** Commits deletes previously propagated via {@link #propagateDeletes(LongSet)}. */
    protected abstract void commitPropagatedDeletes() throws IOException;

    /**
     * Deletes all documents matching {@code _purgeFilter} from the search index.
     * No-op when no purge filter is configured. Errors are logged, not rethrown,
     * so a failed purge never aborts the surrounding batch.
     */
    private final void purgeDocuments() {
        if (_purgeFilter != null) {
            BaseSearchIndex<R> idx = getSearchIndex();
            LOGGER.info("purging docs started...");
            long start = System.currentTimeMillis();

            try {
                // Fixed: previously a never-incremented counter was logged as
                // "total docs purged: 0" on every purge; Lucene's delete-by-query
                // does not report a count, so we no longer claim one.
                IndexWriter writer = idx.openIndexWriter(null, null);
                writer.deleteDocuments(new ConstantScoreQuery(_purgeFilter));
                writer.commit();
            } catch (Throwable th) {
                // Fixed: message previously said "problem creating purge filter",
                // but the failure here is deleting/committing the purged docs.
                LOGGER.error("problem purging documents: " + th.getMessage(), th);
            } finally {
                idx.closeIndexWriter();
            }

            LOGGER.info("purging docs completed in {}ms", System.currentTimeMillis() - start);
        }
    }

    /**
     * Applies a batch of events to the search index: collects the highest version,
     * builds per-UID add/delete sets (later events for a UID supersede earlier ones),
     * purges, updates the index, propagates deletes, and refreshes readers.
     *
     * @param events incoming events sorted by version number
     *               <br>every event in the events collection must be non-null
     *
     * @see cn.hbu.cs.esearch.consumer.DataConsumer#consume(java.util.Collection)
     */
    @Override
    public void consume(Collection<DataEvent<EsearchIndexable>> events) throws EsearchException {

        if (events == null) {
            return;
        }
        int eventCount = events.size();
        if (eventCount == 0) {
            return;
        }
        BaseSearchIndex<R> idx = getSearchIndex();

        if (idx == null) {
            throw new EsearchException("trying to consume to null index");
        }
        // UID -> indexing requests to add; a later delete/update for the same UID
        // removes any earlier pending adds.
        Long2ObjectMap<List<EsearchIndexable.IndexingReq>> addList = new Long2ObjectOpenHashMap<List<EsearchIndexable.IndexingReq>>();
        String version = idx.getVersion(); // current version

        LongSet delSet = new LongOpenHashSet();

        try {
            for (DataEvent<EsearchIndexable> evt : events) {
                if (evt == null) {
                    continue;
                }
                // Track the maximum version seen across the batch.
                version = version == null ? evt.getVersion()
                        : (_versionComparator.compare(version, evt.getVersion()) < 0 ? evt.getVersion() : version);

                // interpret and get the indexable instance
                EsearchIndexable indexable = evt.getData();
                if (indexable == null || indexable.isSkip()) {
                    continue;
                }

                // Every event first deletes the UID (updates are delete-then-add);
                // any adds queued by earlier events for this UID are discarded.
                long uid = indexable.getUID();
                delSet.add(uid);
                addList.remove(uid);
                if (!(indexable.isDeleted() || evt.isDelete())) // update event
                {
                    try {
                        EsearchIndexable.IndexingReq[] reqs = indexable.buildIndexingReqs();
                        for (EsearchIndexable.IndexingReq req : reqs) {
                            if (req != null) // if doc is provided, interpret as
                            // a delete, e.g. update with
                            // nothing
                            {
                                Document doc = req.getDocument();
                                if (doc != null) {
                                    EsearchSegmentReader.fillDocumentID(doc, uid);
                                    if (indexable.isStorable()) {
                                        byte[] bytes = indexable.getStoreValue();
                                        if (bytes != null) {
                                            doc.add(new StoredField(AbstractEsearchIndexable.DOCUMENT_STORE_FIELD,
                                                    bytes));
                                        }
                                    }
                                }
                                // add to the insert list
                                List<EsearchIndexable.IndexingReq> docList = addList.get(uid);
                                if (docList == null) {
                                    docList = new LinkedList<EsearchIndexable.IndexingReq>();
                                    addList.put(uid, docList);
                                }
                                docList.add(req);
                            }
                        }
                    } catch (Exception ex) {
                        // Best-effort: one bad event must not abort the batch.
                        LOGGER.error("Couldn't index the event with uid - " + uid, ex);
                    }
                }
            }

            // Flatten surviving per-UID requests into a single insert list.
            List<EsearchIndexable.IndexingReq> docList = new ArrayList<EsearchIndexable.IndexingReq>(
                    addList.size());
            for (List<EsearchIndexable.IndexingReq> tmpList : addList.values()) {
                docList.addAll(tmpList);
            }

            purgeDocuments();
            idx.updateIndex(delSet, docList, _analyzer, _similarity);
            propagateDeletes(delSet);
            synchronized (_idxMgr) {
                idx.refresh();
                commitPropagatedDeletes();
            }
        } catch (IOException ioe) {
            // NOTE(review): unlike loadFromIndex, this swallows the IOException after
            // marking health fatal, so the version still advances in the finally block
            // below. Confirm callers rely on consume() not throwing here.
            EsearchHealth.setFatal();
            LOGGER.error("Problem indexing batch: " + ioe.getMessage(), ioe);
        } finally {
            try {
                // idx is known non-null here: a null index throws before the try block.
                idx.setVersion(version);
                idx.incrementEventCount(eventCount);
            } catch (Exception e) // catch all exceptions, or it would screw
            // up jobs framework
            {
                // Fixed: pass the throwable so the stack trace is not lost.
                LOGGER.warn(e.getMessage(), e);
            } finally {
                if (idx instanceof DiskSearchIndex<?>) {
                    LOGGER.info("disk indexing requests flushed.");
                }
            }
        }
    }

    /**
     * Merges a read-only RAM index into this loader's (disk) index, advances the
     * version to the larger of the two, inherits the RAM index's deletes, and puts
     * the disk indexer to sleep.
     *
     * @throws EsearchException wrapping any {@link IOException} from the copy
     */
    public void loadFromIndex(RAMSearchIndex<R> ramIndex) throws EsearchException {
        try {
            // get disk search idx,
            BaseSearchIndex<R> idx = getSearchIndex();
            // merge the read-only ram idx with the disk idx
            idx.loadFromIndex(ramIndex);
            // set new version: the larger of the disk and RAM index versions
            String newVersion = idx.getVersion() == null ? ramIndex.getVersion()
                    : (_versionComparator.compare(idx.getVersion(), ramIndex.getVersion()) < 0
                            ? ramIndex.getVersion()
                            : idx.getVersion());
            idx.setVersion(newVersion);
            synchronized (_idxMgr) {
                // update the disk idx reader
                idx.refresh();
                purgeDocuments();
                // inherit deletes
                idx.markDeletes(ramIndex.getDelDocs());
                idx.commitDeletes();
                idx.incrementEventCount(ramIndex.getEventsHandled());
                _idxMgr.setDiskIndexerStatus(SearchIndexManager.Status.Sleep);
            }
        } catch (IOException ioe) {
            EsearchHealth.setFatal();
            LOGGER.error("Problem copying segments: " + ioe.getMessage(), ioe);
            throw new EsearchException(ioe);
        }
    }

    /**
     * @return the version number of the search index, or {@code null} if there is
     *         no index or it has no version yet.
     */
    @Override
    public String getVersion() {
        BaseSearchIndex<R> idx = getSearchIndex();
        String version = null;
        if (idx != null) {
            version = idx.getVersion();
        }
        return version;
    }

    /**
     * @return the version comparator.
     */
    @Override
    public Comparator<String> getVersionComparator() {
        return _versionComparator;
    }
}