cc.twittertools.search.api.TrecSearchHandler.java Source code

Java tutorial

Introduction

Here is the source code for cc.twittertools.search.api.TrecSearchHandler.java

Source

/**
 * Twitter Tools
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cc.twittertools.search.api;

import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.Map;

import javax.annotation.Nullable;

import org.apache.log4j.Logger;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.NumericRangeFilter;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.similarities.LMDirichletSimilarity;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

import cc.twittertools.index.IndexStatuses;
import cc.twittertools.index.IndexStatuses.StatusField;
import cc.twittertools.thrift.gen.TQuery;
import cc.twittertools.thrift.gen.TResult;
import cc.twittertools.thrift.gen.TrecSearch;
import cc.twittertools.thrift.gen.TrecSearchException;

import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;

public class TrecSearchHandler implements TrecSearch.Iface {
    private static final Logger LOG = Logger.getLogger(TrecSearchHandler.class);

    private static QueryParser QUERY_PARSER = new QueryParser(Version.LUCENE_43, StatusField.TEXT.name,
            IndexStatuses.ANALYZER);

    private final IndexSearcher searcher;
    private final Map<String, String> credentials;

    public TrecSearchHandler(File indexPath, @Nullable Map<String, String> credentials) throws IOException {
        Preconditions.checkNotNull(indexPath);
        Preconditions.checkArgument(indexPath.exists());

        // Can be null, in which case we don't check for credentials.
        this.credentials = credentials;

        IndexReader reader = DirectoryReader.open(FSDirectory.open(indexPath));
        searcher = new IndexSearcher(reader);
        searcher.setSimilarity(new LMDirichletSimilarity(2500.0f));
    }

    public List<TResult> search(TQuery query) throws TrecSearchException {
        Preconditions.checkNotNull(query);

        // Verify credentials.
        if (credentials != null
                && (!credentials.containsKey(query.group) || !credentials.get(query.group).equals(query.token))) {
            LOG.info(String.format("Access denied for (%s, %s)", query.group, query.token));
            throw new TrecSearchException("Invalid credentials: access denied.");
        }

        List<TResult> results = Lists.newArrayList();
        long startTime = System.currentTimeMillis();

        try {
            Filter filter = NumericRangeFilter.newLongRange(StatusField.ID.name, 0L, query.max_id, true, true);

            Query q = QUERY_PARSER.parse(query.text);
            int num = query.num_results > 10000 ? 10000 : query.num_results;
            TopDocs rs = searcher.search(q, filter, num);
            for (ScoreDoc scoreDoc : rs.scoreDocs) {
                Document hit = searcher.doc(scoreDoc.doc);

                TResult p = new TResult();
                p.id = (Long) hit.getField(StatusField.ID.name).numericValue();
                p.screen_name = hit.get(StatusField.SCREEN_NAME.name);
                p.epoch = (Long) hit.getField(StatusField.EPOCH.name).numericValue();
                p.text = hit.get(StatusField.TEXT.name);
                p.rsv = scoreDoc.score;

                p.followers_count = (Integer) hit.getField(StatusField.FOLLOWERS_COUNT.name).numericValue();
                p.statuses_count = (Integer) hit.getField(StatusField.STATUSES_COUNT.name).numericValue();

                if (hit.get(StatusField.LANG.name) != null) {
                    p.lang = hit.get(StatusField.LANG.name);
                }

                if (hit.get(StatusField.IN_REPLY_TO_STATUS_ID.name) != null) {
                    p.in_reply_to_status_id = (Long) hit.getField(StatusField.IN_REPLY_TO_STATUS_ID.name)
                            .numericValue();
                }

                if (hit.get(StatusField.IN_REPLY_TO_USER_ID.name) != null) {
                    p.in_reply_to_user_id = (Long) hit.getField(StatusField.IN_REPLY_TO_USER_ID.name)
                            .numericValue();
                }

                if (hit.get(StatusField.RETWEETED_STATUS_ID.name) != null) {
                    p.retweeted_status_id = (Long) hit.getField(StatusField.RETWEETED_STATUS_ID.name)
                            .numericValue();
                }

                if (hit.get(StatusField.RETWEETED_USER_ID.name) != null) {
                    p.retweeted_user_id = (Long) hit.getField(StatusField.RETWEETED_USER_ID.name).numericValue();
                }

                if (hit.get(StatusField.RETWEET_COUNT.name) != null) {
                    p.retweeted_count = (Integer) hit.getField(StatusField.RETWEET_COUNT.name).numericValue();
                }

                results.add(p);
            }
        } catch (Exception e) {
            e.printStackTrace();
            throw new TrecSearchException(e.getMessage());
        }

        long endTime = System.currentTimeMillis();
        LOG.info(String.format("%4dms %s", (endTime - startTime), query.toString()));

        return results;
    }
}