org.waveprotocol.box.server.search.LuceneSearchImpl.java Source code

Java tutorial

Introduction

Here is the source code for org.waveprotocol.box.server.search.LuceneSearchImpl.java

Source

/**
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package org.waveprotocol.box.server.search;

import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.google.common.util.concurrent.ListenableFutureTask;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.google.inject.name.Named;
import com.google.wave.api.SearchResult;
import com.google.wave.api.SearchResult.Digest;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.search.FieldCache.LongParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.NRTManager;
import org.apache.lucene.search.NRTManagerReopenThread;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.SearcherManager;
import org.apache.lucene.search.SearcherWarmer;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.util.Version;

import org.waveprotocol.box.common.DeltaSequence;
import org.waveprotocol.box.search.query.QueryParser;
import org.waveprotocol.box.search.query.SearchQuery;
import org.waveprotocol.box.server.CoreSettings;
import org.waveprotocol.box.server.executor.ExecutorAnnotations;
import org.waveprotocol.box.server.persistence.lucene.IndexDirectory;
import org.waveprotocol.box.server.shutdown.ShutdownManager;
import org.waveprotocol.box.server.shutdown.ShutdownPriority;
import org.waveprotocol.box.server.shutdown.Shutdownable;
import org.waveprotocol.box.server.util.regexp.RegExpWrapFactoryImpl;
import org.waveprotocol.box.server.waveserver.PerUserWaveViewHandler;
import org.waveprotocol.box.server.waveserver.TextCollator;
import org.waveprotocol.box.server.waveserver.WaveDigester;
import org.waveprotocol.box.server.waveserver.WaveMap;
import org.waveprotocol.box.server.waveserver.WaveServerException;
import org.waveprotocol.box.server.waveletstate.WaveletStateException;
import org.waveprotocol.wave.model.id.IdUtil;
import org.waveprotocol.wave.model.id.WaveId;
import org.waveprotocol.wave.model.id.WaveletName;
import org.waveprotocol.wave.model.supplement.WaveDigestSupplement;
import org.waveprotocol.wave.model.supplement.WaveDigestWithSupplements;
import org.waveprotocol.wave.model.version.HashedVersion;
import org.waveprotocol.wave.model.wave.ParticipantId;
import org.waveprotocol.wave.model.wave.ParticipantIdUtil;
import org.waveprotocol.wave.model.wave.WaveDigest;
import org.waveprotocol.wave.model.wave.data.ObservableWaveletData;
import org.waveprotocol.wave.model.wave.data.ReadableWaveletData;
import org.waveprotocol.wave.model.wave.data.WaveViewData;

import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicReference;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.waveprotocol.box.server.waveletstate.IndexingInProcessException;
import org.waveprotocol.wave.model.operation.OperationException;

/**
 * Lucene based implementation of {@link PerUserWaveViewHandler}.
 *
 * @author yurize@apache.org (Yuri Zelikov)
 * @author akaplanov@gmail.com (Andrew Kaplanov)
 */
@Singleton
public class LuceneSearchImpl implements WaveIndexer, SearchProvider, SearchBusSubscriber, Closeable {

    private static final Logger LOG = Logger.getLogger(LuceneSearchImpl.class.getName());

    private static class WaveSearchWarmer implements SearcherWarmer {

        WaveSearchWarmer(String waveDomain) {
        }

        @Override
        public void warm(IndexSearcher searcher) throws IOException {
            // TODO (Yuri Z): Run some diverse searches, searching against all
            // fields.
        }
    }

    private LongParser longParser = new LongParser() {

        @Override
        public long parseLong(String value) {
            if (value == null || value.isEmpty()) {
                return 0;
            }
            return Long.parseLong(value);
        }
    };

    /** Current indexing waves **/
    private static ConcurrentHashMap<WaveId, ListenableFutureTask<Void>> indexingWaves = new ConcurrentHashMap<WaveId, ListenableFutureTask<Void>>();

    /** Commit task **/
    AtomicReference<ListenableFutureTask<Void>> commitTask = new AtomicReference<ListenableFutureTask<Void>>();

    /** Delay between wave indexing **/
    private static final long WAVE_INDEXING_DELAY_SEC = 10;

    /** Delay between commit **/
    private static final long WAVE_COMMIT_DELAY_SEC = 60;

    private static final Version LUCENE_VERSION = Version.LUCENE_35;

    /** Minimum time until a new reader can be opened. */
    private static final double MIN_STALE_SEC = 0.025;

    /** Maximum time until a new reader must be opened. */
    private static final double MAX_STALE_SEC = 1.0;

    private final QueryParser queryParser = new QueryParser(new RegExpWrapFactoryImpl());

    private final Analyzer analyzer;
    private final TextCollator textCollator;
    private final IndexWriter indexWriter;
    private final ScheduledExecutorService indexExecutor;
    private final Similarity similarity;
    private final NRTManager nrtManager;
    private final NRTManagerReopenThread nrtManagerReopenThread;
    private final WaveMap waveMap;
    private final WaveDigester digester;
    private final String waveDomain;
    private final String sharedDomainParticipant;
    private boolean isClosed = false;

    @Inject
    public LuceneSearchImpl(IndexDirectory directory, WaveMap waveMap, WaveDigester digester,
            TextCollator textCollator, @Named(CoreSettings.WAVE_SERVER_DOMAIN) final String waveDomain,
            @ExecutorAnnotations.IndexExecutor ScheduledExecutorService indexExecutor) {
        this.textCollator = textCollator;
        this.waveMap = waveMap;
        this.digester = digester;
        this.waveDomain = waveDomain;
        this.indexExecutor = indexExecutor;
        sharedDomainParticipant = ParticipantIdUtil.makeUnsafeSharedDomainParticipantId(waveDomain).getAddress();
        analyzer = new StandardAnalyzer(LUCENE_VERSION, StandardAnalyzer.STOP_WORDS_SET);
        similarity = new DefaultSimilarity() {
            @Override
            public float computeNorm(String string, FieldInvertState fis) {
                return fis.getBoost();
            }

            @Override
            public float tf(float freq) {
                return freq > 0 ? 1.0f : 0.0f;
            }

            @Override
            public float tf(int freq) {
                return freq > 0 ? 1 : 0;
            }

        };
        try {
            IndexWriterConfig config = new IndexWriterConfig(LUCENE_VERSION, analyzer);
            config.setOpenMode(OpenMode.CREATE_OR_APPEND);
            config.setSimilarity(similarity);
            indexWriter = new IndexWriter(directory.getDirectory(), config);
            nrtManager = new NRTManager(indexWriter, new WaveSearchWarmer(waveDomain));
        } catch (IOException ex) {
            throw new IndexException(ex);
        }

        nrtManagerReopenThread = new NRTManagerReopenThread(nrtManager, MAX_STALE_SEC, MIN_STALE_SEC);
        nrtManagerReopenThread.start();

        ShutdownManager.getInstance().register(new Shutdownable() {

            @Override
            public void shutdown() throws Exception {
                synchronized (LuceneSearchImpl.this) {
                    if (!isClosed) {
                        close();
                    }
                }
            }
        }, LuceneSearchImpl.class.getSimpleName(), ShutdownPriority.Storage);
    }

    /**
     * Closes the handler, releases resources and flushes the recent index changes
     * to persistent storage.
     */
    @Override
    public synchronized void close() {
        if (isClosed) {
            throw new AlreadyClosedException("Already closed");
        }
        isClosed = true;
        try {
            nrtManager.close();
            if (analyzer != null) {
                analyzer.close();
            }
            nrtManagerReopenThread.close();
            indexWriter.close();
        } catch (IOException ex) {
            LOG.log(Level.SEVERE, "Failed to close the Lucene index", ex);
        }
        LOG.info("Successfully closed the Lucene index...");
    }

    /**
     * Ensures that the index is up to date. Exits quickly if no changes were done
     * to the index.
     *
     * @throws IOException if something goes wrong.
     */
    public void forceReopen() throws IOException {
        nrtManager.maybeReopen(true);
    }

    @Override
    public synchronized void updateIndex(WaveId waveId) throws WaveletStateException, WaveServerException {
        Preconditions.checkNotNull(waveId);
        try {
            WaveViewData waveData = waveMap.getWaveViewData(waveId);
            // TODO (Yuri Z): Update documents instead of totally removing and adding.
            LOG.info("Updating index for wave " + waveId.serialise());
            removeIndex(waveId);
            addIndex(waveId, waveData);
            sheduleCommitIndex();
            LOG.info("Index for wave " + waveId.serialise() + " has been updated");
        } catch (CorruptIndexException e) {
            throw new IndexException(waveId.serialise(), e);
        } catch (IOException e) {
            throw new IndexException(waveId.serialise(), e);
        }
    }

    @Override
    public void waveletUpdate(WaveletName waveletName, DeltaSequence deltas) {
        sheduleUpdateIndex(waveletName.waveId);
    }

    @Override
    public void waveletCommitted(WaveletName waveletName, HashedVersion version) {
    }

    private ListenableFutureTask<Void> sheduleUpdateIndex(final WaveId waveId) {
        synchronized (indexingWaves) {
            ListenableFutureTask<Void> task = indexingWaves.get(waveId);
            if (task == null) {
                task = ListenableFutureTask.create(new Callable<Void>() {

                    @Override
                    public Void call() throws Exception {
                        indexingWaves.remove(waveId);
                        try {
                            updateIndex(waveId);
                        } catch (IndexingInProcessException e) {
                            sheduleUpdateIndex(waveId);
                        } catch (Throwable e) {
                            LOG.log(Level.SEVERE, "Failed to update index for " + waveId.serialise(), e);
                            throw e;
                        }
                        return null;
                    }
                });
                indexingWaves.put(waveId, task);
                indexExecutor.schedule(task, WAVE_INDEXING_DELAY_SEC, TimeUnit.SECONDS);
            }
            return task;
        }
    }

    private ListenableFutureTask<Void> sheduleCommitIndex() {
        ListenableFutureTask<Void> task = commitTask.get();
        if (task == null) {
            task = ListenableFutureTask.create(new Callable<Void>() {

                @Override
                public Void call() throws Exception {
                    commitTask.set(null);
                    try {
                        LOG.info("Commiting indexes...");
                        indexWriter.commit();
                        LOG.info("Commiting indexes is complete");
                    } catch (IndexException e) {
                        LOG.log(Level.SEVERE, "Index commit failed", e);
                        throw e;
                    }
                    return null;
                }
            });
            commitTask.set(task);
            indexExecutor.schedule(task, WAVE_COMMIT_DELAY_SEC, TimeUnit.SECONDS);
        }
        return task;
    }

    private void addIndex(WaveId waveId, WaveViewData waveData)
            throws CorruptIndexException, IOException, WaveletStateException {
        Document doc = new Document();
        addWaveFieldsToIndex(waveId, waveData, doc);
        nrtManager.addDocument(doc);
    }

    private void addWaveFieldsToIndex(WaveId waveId, WaveViewData waveData, Document doc)
            throws WaveletStateException {
        WaveDigestWithSupplements digestWithSupplements = digester.generateDigestWithSupplements(waveData);
        WaveDigest digest = digestWithSupplements.getDigest();
        Map<ParticipantId, WaveDigestSupplement> supplements = digestWithSupplements.getSupplements();
        if (digest.getParticipants().size() > 0) {
            addField(doc, IndexCondition.Field.WAVE_ID, digest.getWaveId());
            addField(doc, IndexCondition.Field.CREATOR, digest.getCreator());
            for (String participant : digest.getParticipants()) {
                doc.add(new Field(IndexCondition.Field.PARTICIPANTS.toString(), participant, Field.Store.YES,
                        Field.Index.NOT_ANALYZED_NO_NORMS));
            }
            addField(doc, IndexCondition.Field.TITLE, digest.getTitle());
            addField(doc, IndexCondition.Field.CONTENT, digest.getTitle());
            addField(doc, IndexCondition.Field.SNIPPET, digest.getSnippet());
            addField(doc, IndexCondition.Field.BLIP_COUNT, Integer.toString(digest.getBlipCount()));
            for (ReadableWaveletData wavelet : waveData.getWavelets()) {
                if (IdUtil.isConversationalId(wavelet.getWaveletId())
                        || IdUtil.isConversationalId(wavelet.getWaveletId())) {
                    for (String tag : wavelet.getTags()) {
                        addField(doc, IndexCondition.Field.TAG, tag);
                        addField(doc, IndexCondition.Field.CONTENT, tag);
                    }
                    addField(doc, IndexCondition.Field.CONTENT, textCollator.collateTextForWavelet(wavelet));
                }
            }
            addField(doc, IndexCondition.Field.CREATED, Long.toString(digest.getCreated()));
            addField(doc, IndexCondition.Field.LAST_MODIFIED, Long.toString(digest.getLastModified()));
            for (ParticipantId participantId : supplements.keySet()) {
                WaveDigestSupplement supplement = supplements.get(participantId);
                addField(doc, IndexCondition.Field.IN_,
                        IndexCondition.Field.IN_.toString() + participantId.getAddress(), supplement.getFolder());
                addField(doc, IndexCondition.Field.UNREAD_COUNT_,
                        IndexCondition.Field.UNREAD_COUNT_.toString() + participantId.getAddress(),
                        Integer.toString(supplement.getUnreadCount()));
            }
            LOG.fine("Write index for wave " + waveId.serialise());
            for (Fieldable field : doc.getFields()) {
                LOG.fine("  " + field.name() + " : " + field.stringValue());
            }
        }
    }

    private void addField(Document doc, IndexCondition.Field field, String value) {
        doc.add(new Field(field.toString(), value, field.isStored() ? Field.Store.YES : Field.Store.NO,
                field.isAnalyzed() ? Field.Index.ANALYZED_NO_NORMS : Field.Index.NOT_ANALYZED_NO_NORMS));
    }

    private void addField(Document doc, IndexCondition.Field field, String fieldName, String value) {
        doc.add(new Field(fieldName, value, field.isStored() ? Field.Store.YES : Field.Store.NO,
                field.isAnalyzed() ? Field.Index.ANALYZED_NO_NORMS : Field.Index.NOT_ANALYZED_NO_NORMS));
    }

    private void removeIndex(WaveId waveId) throws CorruptIndexException, IOException {
        nrtManager.deleteDocuments(new Term(IndexCondition.Field.WAVE_ID.toString(), waveId.serialise()));
    }

    @Override
    public SearchResult search(String query, int startAt, int numResults, ParticipantId viewer) {
        LOG.fine("Search query '" + query + "' from user: " + viewer + " [" + startAt + ", "
                + (startAt + numResults - 1) + "]");
        SearchResult result = new SearchResult(query);
        SearchQuery queryParams = queryParser.parseQuery(query);
        List<IndexCondition> indexConditions = SearchQueryHelper.convertToIndexQuery(queryParams, viewer,
                waveDomain);
        try {
            BooleanQuery allQuery = new BooleanQuery();
            String fromParticipant = viewer.getAddress();
            Query userQuery = null;
            if (!indexConditions.isEmpty()) {
                try {
                    userQuery = makeQuery(indexConditions);
                } catch (ParseException ex) {
                    LOG.log(Level.SEVERE, "Invalid query: " + query, ex);
                    return result;
                }
            }
            if (userQuery == null || !SearchQueryHelper.withParticipant(indexConditions, sharedDomainParticipant)) {
                TermQuery participantQuery = new TermQuery(
                        new Term(IndexCondition.Field.PARTICIPANTS.toString(), fromParticipant));
                participantQuery.setBoost(0);
                allQuery.add(participantQuery, Occur.MUST);
            }
            if (userQuery != null) {
                userQuery.setBoost(1);
                allQuery.add(userQuery, Occur.MUST);
                for (IndexCondition condition : indexConditions) {
                    if (condition.getField() == IndexCondition.Field.CONTENT) {
                        IndexCondition titleCondition = new IndexCondition(IndexCondition.Field.TITLE, null,
                                condition.getValue(), condition.isPhrase(), condition.isNot());
                        Query titleQuery = makeQuery(titleCondition);
                        titleQuery.setBoost(2);
                        allQuery.add(titleQuery, titleCondition.isNot() ? Occur.MUST_NOT : Occur.SHOULD);
                        IndexCondition tagCondition = new IndexCondition(IndexCondition.Field.TAG, null,
                                condition.getValue(), condition.isPhrase(), condition.isNot());
                        Query tagQuery = makeQuery(tagCondition);
                        tagQuery.setBoost(3);
                        allQuery.add(tagQuery, tagCondition.isNot() ? Occur.MUST_NOT : Occur.SHOULD);
                    }
                }
            }
            LOG.fine("Search query " + allQuery.toString());
            List<SortField> sortFields = new LinkedList<SortField>();
            sortFields.add(SortField.FIELD_SCORE);
            sortFields.add(new SortField(IndexCondition.Field.LAST_MODIFIED.toString(), longParser, true));
            SearcherManager searcherManager = nrtManager.getSearcherManager(true);
            IndexSearcher indexSearcher = searcherManager.acquire();
            try {
                indexSearcher.setSimilarity(similarity);
                TopDocs hints = indexSearcher.search(allQuery, startAt + numResults,
                        new Sort(sortFields.toArray(new SortField[sortFields.size()])));
                for (int i = startAt; i < hints.scoreDocs.length; i++) {
                    try {
                        ScoreDoc hint = hints.scoreDocs[i];
                        Document doc = indexSearcher.doc(hint.doc);
                        result.addDigest(parseDigest(doc, viewer));
                    } catch (IOException ex) {
                        LOG.log(Level.SEVERE, "Get digest from index", ex);
                    }
                }
            } finally {
                searcherManager.release(indexSearcher);
            }
        } catch (ParseException ex) {
            LOG.log(Level.SEVERE, "Search failed: " + query, ex);
        } catch (IOException ex) {
            LOG.log(Level.SEVERE, "Search failed: " + query, ex);
        }
        return result;
    }

    @Override
    public Digest findWave(WaveId waveId, ParticipantId viewer) {
        TermQuery query = new TermQuery(new Term(IndexCondition.Field.WAVE_ID.toString(), waveId.serialise()));
        SearcherManager searcherManager = nrtManager.getSearcherManager(true);
        IndexSearcher indexSearcher = searcherManager.acquire();
        try {
            TopDocs hints = indexSearcher.search(query, 1);
            if (hints.totalHits != 0) {
                ScoreDoc hint = hints.scoreDocs[0];
                return parseDigest(indexSearcher.doc(hint.doc), null);
            }
        } catch (IOException ex) {
            LOG.log(Level.SEVERE, "Search wave " + waveId.serialise() + " failed", ex);
        }
        return null;
    }

    private Digest parseDigest(Document doc, ParticipantId viewer) {
        String waveId = doc.get(IndexCondition.Field.WAVE_ID.toString());
        String title = doc.get(IndexCondition.Field.TITLE.toString());
        String snippet = doc.get(IndexCondition.Field.SNIPPET.toString());
        String creator = doc.get(IndexCondition.Field.CREATOR.toString());
        List<String> participants = new ArrayList<String>();
        for (String participant : doc.getValues(IndexCondition.Field.PARTICIPANTS.toString())) {
            participants.add(participant);
        }
        int blipCount = Integer.parseInt(doc.get(IndexCondition.Field.BLIP_COUNT.toString()));
        long created = Long.parseLong(doc.get(IndexCondition.Field.CREATED.toString()));
        long lastModified = Long.parseLong(doc.get(IndexCondition.Field.LAST_MODIFIED.toString()));
        WaveDigest digest = new WaveDigest(waveId, title, snippet, creator, participants, blipCount, created,
                lastModified);
        WaveDigestSupplement supplement = null;
        if (viewer != null) {
            String in = doc.get(IndexCondition.Field.IN_.toString() + viewer.getAddress());
            int unreadCount = blipCount;
            String unreadCountValue = doc.get(IndexCondition.Field.UNREAD_COUNT_.toString() + viewer.getAddress());
            // TODO remove after fix name
            if (unreadCountValue == null) {
                unreadCountValue = doc.get("readCount_" + viewer.getAddress());
            }
            if (unreadCountValue != null) {
                unreadCount = Integer.parseInt(unreadCountValue);
            }
            supplement = new WaveDigestSupplement(in, unreadCount);
        }
        return new Digest(digest, supplement);
    }

    private Query makeQuery(List<IndexCondition> conditions) throws ParseException {
        BooleanQuery query = new BooleanQuery();
        for (IndexCondition condition : conditions) {
            query.add(makeQuery(condition), condition.isNot() ? Occur.MUST_NOT : Occur.MUST);
        }
        return query;
    }

    private Query makeQuery(IndexCondition condition) throws ParseException {
        if (condition.getField().isAnalyzed()) {
            if (condition.isPhrase()) {
                Matcher m = Pattern.compile("\\S+").matcher(condition.getValue());
                PhraseQuery phraseQuery = new PhraseQuery();
                while (m.find()) {
                    for (String v : splitValue(m.group())) {
                        phraseQuery.add(makeTerm(condition, v));
                    }
                }
                return phraseQuery;
            } else {
                BooleanQuery booleanQuery = new BooleanQuery();
                for (String v : splitValue(condition.getValue())) {
                    TermQuery termQuery = new TermQuery(makeTerm(condition, v));
                    booleanQuery.add(termQuery, Occur.MUST);
                }
                return booleanQuery;
            }
        } else {
            return new TermQuery(makeTerm(condition, condition.getValue()));
        }
    }

    private Term makeTerm(IndexCondition condition, String value) {
        String fieldName = condition.getField().toString();
        if (condition.getFieldAddition() != null) {
            fieldName += condition.getFieldAddition();
        }
        return new Term(fieldName.toLowerCase(), value.toLowerCase());
    }

    private List<String> splitValue(String value) {
        List<String> values = Lists.newArrayList();
        StringBuilder v = new StringBuilder();
        for (Character c : value.toCharArray()) {
            if (Character.isLetter(c) || Character.isDigit(c)) {
                v.append(c);
            } else if (v.length() != 0) {
                values.add(v.toString());
                v = new StringBuilder();
            }
        }
        if (v.length() != 0) {
            values.add(v.toString());
        }
        return values;
    }
}