rapture.elasticsearch.ElasticSearchSearchRepository.java Source code

Introduction

Here is the source code for rapture.elasticsearch.ElasticSearchSearchRepository.java
Source

/**
 * The MIT License (MIT)
 *
 * Copyright (c) 2011-2016 Incapture Technologies LLC
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
package rapture.elasticsearch;

import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.Logger;
import org.apache.tika.Tika;
import org.apache.tika.detect.EmptyDetector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.parser.pdf.PDFParser;
import org.elasticsearch.action.ActionFuture;
import org.elasticsearch.action.admin.indices.refresh.RefreshRequest;
import org.elasticsearch.action.admin.indices.refresh.RefreshResponse;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.InetSocketTransportAddress;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHit;

import com.google.common.net.MediaType;

import rapture.common.AbstractUpdateObject;
import rapture.common.BlobContainer;
import rapture.common.BlobUpdateObject;
import rapture.common.ConnectionInfo;
import rapture.common.DocUpdateObject;
import rapture.common.RaptureURI;
import rapture.common.Scheme;
import rapture.common.connection.ConnectionType;
import rapture.common.exception.ExceptionToString;
import rapture.common.exception.RaptNotSupportedException;
import rapture.common.exception.RaptureExceptionFactory;
import rapture.common.impl.jackson.JacksonUtil;
import rapture.common.model.DocumentWithMeta;
import rapture.common.series.SeriesUpdateObject;
import rapture.config.ConfigLoader;
import rapture.kernel.ContextFactory;
import rapture.kernel.Kernel;
import rapture.kernel.search.SearchRepository;
import rapture.search.SearchRepoType;

/**
 * ElasticSearch implementation of our search repository. All ElasticSearch specific stuff should be found in here.
 * 
 * @author dukenguyen
 *
 */
public class ElasticSearchSearchRepository implements SearchRepository {

    private static final Logger log = Logger.getLogger(ElasticSearchSearchRepository.class);

    /*
     * How long to keep the cursor alive between paginated search queries, in milliseconds
     */
    private static final long CURSOR_KEEPALIVE = 600000;

    /*
     * Default number of times to retry on version conflict for optimistic concurrency control, matches default number of pipeline threads
     */
    private static final int DEFAULT_RETRY_ON_CONFLICT = 50;

    /*
     * An ElasticSearch 'index' is akin to a database in SQL or a database in mongo
     */
    private String index;
    // Name of the connection instance to look up; getConnectionInfo() falls back to "default" when it is blank
    private String instanceName;
    // Connection details resolved by getConnectionInfo(); supplies the host, the port and the index name
    private ConnectionInfo connectionInfo;
    // Created by start() (lazily, through ensureClient()) or injected through setClient()
    private Client client = null;

    /**
     * Returns the ElasticSearch client, calling {@link #start()} first when no client has been created or injected
     * yet.
     * <p>
     * The method is synchronized so that concurrent first callers cannot both observe a null client and each build
     * (and then leak) their own transport client.
     *
     * @return the client held by this repository
     */
    private synchronized Client ensureClient() {
        if (client == null) {
            start();
        }
        return client;
    }

    // Tika facade built with an EmptyDetector and a PDFParser; used to extract the text of PDF blobs for indexing
    static Tika tikaPDF = new Tika(new EmptyDetector(), new PDFParser());

    /**
     * Indexes an update in ElasticSearch.
     * <p>
     * The URI itself is always indexed first (see {@link #putUriStore(RaptureURI)}). What happens next depends on
     * the concrete type of the update:
     * <ul>
     * <li>{@link DocUpdateObject}: the document content is indexed under the URI's scheme type and its metadata
     * under the meta type.</li>
     * <li>{@link SeriesUpdateObject}: the string-map form of the update is upserted into the entry for the URI;
     * an empty map is ignored.</li>
     * <li>{@link BlobUpdateObject}: the blob text is indexed under the blob type, and the blob headers plus the
     * mime type under the meta type.</li>
     * </ul>
     *
     * @param updateObject
     *            the update to index
     * @throws RaptNotSupportedException
     *             if the update is of any other type
     */
    @SuppressWarnings("rawtypes")
    @Override
    public void put(AbstractUpdateObject updateObject) {
        RaptureURI uri = updateObject.getUri();
        log.info("URI for indexing is " + uri.toString());
        putUriStore(uri);

        if (updateObject instanceof DocUpdateObject) {
            indexDocument(uri, ((DocUpdateObject) updateObject).getPayload());
        } else if (updateObject instanceof SeriesUpdateObject) {
            indexSeries(uri, ((SeriesUpdateObject) updateObject).asStringMap());
        } else if (updateObject instanceof BlobUpdateObject) {
            indexBlob(uri, (BlobUpdateObject) updateObject);
        } else {
            throw new RaptNotSupportedException(
                    String.format("Search update for uri [%s] not supported yet", uri.toString()));
        }
    }

    /**
     * Indexes a document's content under the URI's scheme type and its metadata, as JSON, under the meta type.
     */
    private void indexDocument(RaptureURI uri, DocumentWithMeta docMeta) {
        ensureClient().prepareIndex(index, uri.getScheme().toString(), uri.toString())
                .setSource(docMeta.getContent()).get();
        String meta = JacksonUtil.jsonFromObject(docMeta.getMetaData());
        ensureClient().prepareIndex(index, SearchRepoType.meta.toString(), uri.toString()).setSource(meta).get();
    }

    /**
     * Upserts the given series values into the entry for the URI; does nothing when the map is empty.
     */
    private void indexSeries(RaptureURI uri, Map<String, String> map) {
        if (map.isEmpty()) {
            return;
        }
        // Lock on the same reference the request is issued on, rather than on the mutable field, so the lock
        // and the call can never refer to different (or null) clients.
        Client esClient = ensureClient();
        synchronized (esClient) {
            esClient.prepareUpdate(index, uri.getScheme().toString(), uri.toString()).setDoc(map).setUpsert(map)
                    .setRetryOnConflict(DEFAULT_RETRY_ON_CONFLICT).get();
        }
    }

    /**
     * Indexes the text of a blob under the blob type, then its metadata under the meta type. PDF blobs are
     * converted to text by Tika on a background thread; every other blob is indexed as-is on the calling thread.
     */
    private void indexBlob(RaptureURI uri, BlobUpdateObject blobUpdate) {
        BlobContainer content = blobUpdate.getPayload();

        // ElasticSearch has to be given a JSON document, so the blob text is wrapped in a single "blob" field.
        // Tika can handle other types too, but at present only PDFs get text extraction.
        // The constant is the receiver of equals() so that a blob without a mime type is treated as non-PDF
        // instead of failing with a NullPointerException.
        if (MediaType.PDF.toString().equals(blobUpdate.getMimeType())) {
            // Tika can take a while so do it in the background
            new Thread() {
                @SuppressWarnings("synthetic-access")
                @Override
                public void run() {
                    try {
                        String contentStr = tikaPDF
                                .parseToString(new ByteArrayInputStream(content.getContent()));
                        XContentBuilder source = jsonBuilder().startObject().field("blob", contentStr)
                                .endObject();
                        ensureClient().prepareIndex(index, Scheme.BLOB.toString(), uri.toString())
                                .setSource(source).get();
                    } catch (IOException | TikaException e) {
                        log.error("Cannot index PDF " + e.getMessage());
                        log.debug(ExceptionToString.format(e));
                        // NOTE: this is thrown on the background thread, so it never reaches the caller of
                        // put(); it only ends up in the thread's uncaught-exception handler.
                        throw RaptureExceptionFactory.create("Cannot index PDF " + e.getMessage(), e);
                    }
                }
            }.start();
        } else {
            try {
                // Decode with an explicit charset so the indexed text does not depend on the platform default.
                // NOTE(review): assumes textual blobs are UTF-8 encoded - confirm against the blob writers.
                XContentBuilder source = jsonBuilder().startObject()
                        .field("blob", new String(content.getContent(), StandardCharsets.UTF_8)).endObject();
                ensureClient().prepareIndex(index, Scheme.BLOB.toString(), uri.toString()).setSource(source)
                        .get();
            } catch (IOException ioe) {
                log.error("Cannot index blob " + ioe.getMessage());
                log.debug(ExceptionToString.format(ioe));
                throw RaptureExceptionFactory.create("Cannot index blob " + ioe.getMessage(), ioe);
            }
        }

        indexBlobMetadata(uri, content, blobUpdate);
    }

    /**
     * Indexes the blob's headers (when present) together with a "mimetype" field under the meta type.
     */
    private void indexBlobMetadata(RaptureURI uri, BlobContainer content, BlobUpdateObject blobUpdate) {
        try {
            Map<String, String> headers = content.getHeaders();
            XContentBuilder source = jsonBuilder().startObject();
            if (headers != null) {
                for (Entry<String, String> entry : headers.entrySet()) {
                    source.field(entry.getKey(), entry.getValue());
                }
            }
            source.field("mimetype", blobUpdate.getMimeType()).endObject();
            ensureClient().prepareIndex(index, SearchRepoType.meta.toString(), uri.toString()).setSource(source)
                    .get();
        } catch (IOException ioe) {
            log.error("Cannot index blob metadata " + ioe.getMessage());
            log.debug(ExceptionToString.format(ioe));
            throw RaptureExceptionFactory.create("Cannot index blob " + ioe.getMessage(), ioe);
        }
    }

    /**
     * Indexes the URI itself under the uri type, stored as a {@code SimpleURI} holding the scheme, the repo (the
     * URI's authority) and the "/"-separated parts of the document path.
     *
     * @param uri
     *            the URI to record; its string form is used as the entry id
     */
    private void putUriStore(RaptureURI uri) {
        String id = uri.toString();

        SimpleURI entry = new SimpleURI();
        List<String> pathParts = Arrays.asList(uri.getDocPath().split("/"));
        entry.setParts(pathParts);
        entry.setRepo(uri.getAuthority());
        entry.setScheme(uri.getScheme().toString());

        ensureClient().prepareIndex(index, SearchRepoType.uri.toString(), id)
                .setSource(JacksonUtil.jsonFromObject(entry)).get();
    }

    /**
     * Deletes the entries stored for this URI: the one under the URI's scheme type, the one under the meta type
     * and the one under the uri type, in that order.
     *
     * @param uri
     *            the URI whose entries are removed; its string form is the entry id
     */
    @Override
    public void remove(RaptureURI uri) {
        String id = uri.toString();
        String[] indexTypes = { uri.getScheme().toString(), SearchRepoType.meta.toString(),
                SearchRepoType.uri.toString() };
        // delete from everywhere. it won't error if it's not there
        for (String indexType : indexTypes) {
            ensureClient().prepareDelete(index, indexType, id).get();
        }
    }

    /**
     * Intended to remove every index entry that belongs to the given repository.
     * <p>
     * NOT IMPLEMENTED: the body is empty, so calling this method currently has no effect. The comments inside
     * sketch the planned approach.
     *
     * @param repoName
     *            the repository whose entries should be dropped (currently unused)
     */
    @Override
    public void dropIndexForRepo(String repoName) {
        // The way we can find out the documents to delete is to do a search for "repo:repoName", which will return us the URI search hits
        // We delete those, and can replace URI with META and DOC to delete in the other tables too
        // ensureClient().prepare

        // So do a search with cursor, and page through it..., and do prepareDeletes (multideletes?) after extracting the displayName
        // ideally on a worker thread, but for now, right here (as we should be on a pipeline thread)

    }

    /**
     * Resolves the index types to search.
     *
     * @param types
     *            the requested types; may be null
     * @return {@code SearchRepoType.valueArray} when the list is null, empty, or holds exactly one null or empty
     *         String; otherwise the list's contents as an array
     */
    private static String[] allTypes(List<String> types) {
        boolean unspecified = (types == null) || types.isEmpty();
        if (!unspecified && (types.size() == 1)) {
            // a lone null/empty entry counts as "no type given"
            unspecified = StringUtils.isEmpty(types.get(0));
        }
        if (unspecified) {
            return SearchRepoType.valueArray;
        }
        return types.toArray(new String[types.size()]);
    }

    /**
     * Runs a query-string search against this repository's index.
     *
     * @param types
     *            the index types to search; resolved through {@code allTypes}
     * @param query
     *            the query, passed to ElasticSearch as a query-string query
     * @return the ElasticSearch response converted to the Rapture response type
     */
    @Override
    public rapture.common.SearchResponse search(List<String> types, String query) {
        SearchResponse esResponse = ensureClient()
                .prepareSearch()
                .setIndices(index)
                .setTypes(allTypes(types))
                .setQuery(QueryBuilders.queryStringQuery(query))
                .get();
        return convert(esResponse);
    }

    /**
     * Searches the uri type for entries matching the given scheme and repo, ten hits per page.
     *
     * @param scheme
     *            value matched against the {@code scheme} field
     * @param repo
     *            value matched against the {@code repo} field
     * @param cursorId
     *            cursor from a previous page, forwarded unchanged to
     *            {@link #searchWithCursor(List, String, int, String)}
     * @return the converted search response
     */
    @Override
    public rapture.common.SearchResponse searchForRepoUris(String scheme, String repo, String cursorId) {
        List<String> uriTypeOnly = Arrays.asList(SearchRepoType.uri.toString());
        String searchQuery = String.format("scheme:%s AND repo:%s", scheme, repo);
        return searchWithCursor(uriTypeOnly, cursorId, 10, searchQuery);
    }

    /**
     * Runs a paginated (scrolling) search.
     * <p>
     * With a blank cursor id a new scrolling search is started from {@code types}, {@code size} and
     * {@code query}. With a non-blank cursor id the existing scroll is continued and those three arguments are
     * not used. Either way the scroll keep-alive is set to {@code CURSOR_KEEPALIVE} milliseconds.
     *
     * @param types
     *            the index types to search; resolved through {@code allTypes}
     * @param cursorId
     *            scroll id of a previous page, or blank to start a new search
     * @param size
     *            size passed to the new search request
     * @param query
     *            the query, passed to ElasticSearch as a query-string query
     * @return the converted search response
     */
    @Override
    public rapture.common.SearchResponse searchWithCursor(List<String> types, String cursorId, int size,
            String query) {
        if (!StringUtils.isBlank(cursorId)) {
            // continue an existing scroll
            SearchResponse nextPage = ensureClient().prepareSearchScroll(cursorId)
                    .setScroll(new TimeValue(CURSOR_KEEPALIVE)).get();
            return convert(nextPage);
        }

        // no cursor yet: open a new scrolling search
        SearchResponse firstPage = ensureClient().prepareSearch()
                .setQuery(QueryBuilders.queryStringQuery(query))
                .setScroll(new TimeValue(CURSOR_KEEPALIVE))
                .setIndices(index)
                .setTypes(allTypes(types))
                .setSize(size)
                .get();
        return convert(firstPage);
    }

    /**
     * Resolves the connection info for this instance and creates the transport client pointing at the configured
     * host and port.
     * <p>
     * If the host name cannot be resolved the failure is logged (with the host and the stack trace) and the
     * client is left without any transport address; no exception is thrown from here for that case.
     */
    @Override
    public void start() {
        getConnectionInfo();
        Map<String, String> s = new HashMap<>();
        s.put("client.transport.ignore_cluster_name",
                ConfigLoader.getConf().FullTextSearchIgnoreClusterName.toString());
        // Keep a typed local so the transport address can be added without casting the field back
        TransportClient transportClient = TransportClient.builder().settings(Settings.builder().put(s)).build();
        client = transportClient;
        try {
            transportClient.addTransportAddress(new InetSocketTransportAddress(
                    InetAddress.getByName(connectionInfo.getHost()), connectionInfo.getPort()));
            log.info(String.format("ElasticSearch connection configured to [%s:%d]", connectionInfo.getHost(),
                    connectionInfo.getPort()));
        } catch (UnknownHostException e) {
            // Pass the exception as the throwable argument so the stack trace is logged, and say which host failed
            log.error(String.format("Cannot resolve ElasticSearch host [%s]; no transport address was added",
                    connectionInfo.getHost()), e);
        }
    }

    /**
     * Sets the name of the ElasticSearch index used by every request of this repository.
     *
     * @param index
     *            the index name
     */
    void setIndex(String index) {
        this.index = index;
    }

    /**
     * Sets the name of the connection instance whose connection info is looked up when the client is started.
     *
     * @param instanceName
     *            the instance name; a blank value is replaced by "default" at lookup time
     */
    @Override
    public void setInstanceName(String instanceName) {
        this.instanceName = instanceName;
    }

    /**
     * For unit testing: injects the client to use. Once a non-null client is set, {@code ensureClient()} returns
     * it as-is and does not call {@code start()}.
     * 
     * @param client
     *            the client to use for all subsequent requests
     */
    void setClient(Client client) {
        this.client = client;
    }

    /**
     * Converts an ElasticSearch search response into the Rapture response type: scroll id, max score, total hit
     * count and one Rapture hit per ElasticSearch hit.
     *
     * @param response
     *            the ElasticSearch response
     * @return the equivalent Rapture response
     */
    rapture.common.SearchResponse convert(SearchResponse response) {
        rapture.common.SearchResponse converted = new rapture.common.SearchResponse();
        converted.setCursorId(response.getScrollId());
        converted.setMaxScore(scoreAsDouble(response.getHits().getMaxScore()));
        converted.setTotal(response.getHits().getTotalHits());
        converted.setSearchHits(new ArrayList<>());
        for (SearchHit esHit : response.getHits().getHits()) {
            String hitId = esHit.getId();
            rapture.common.SearchHit raptureHit = new rapture.common.SearchHit();
            raptureHit.setScore(scoreAsDouble(esHit.getScore()));
            raptureHit.setSource(esHit.getSourceAsString());
            raptureHit.setIndexType(esHit.getType());
            // the entry id doubles as the URI
            raptureHit.setId(hitId);
            raptureHit.setUri(hitId);
            converted.getSearchHits().add(raptureHit);
        }
        return converted;
    }

    /**
     * Turns a float score into a double by going through the float's decimal string form.
     */
    private static double scoreAsDouble(float score) {
        return Double.parseDouble(Float.toString(score));
    }

    /**
     * Looks up the ES connection info for this instance (falling back to the instance name "default" when none is
     * set), stores it, and takes the index name from its database name.
     * <p>
     * Throws the exception built by {@code RaptureExceptionFactory} when no connection info is defined for the
     * instance.
     */
    private void getConnectionInfo() {
        if (StringUtils.isBlank(instanceName)) {
            instanceName = "default";
        }

        ConnectionInfo resolved = Kernel.getSys()
                .getConnectionInfo(ContextFactory.getKernelUser(), ConnectionType.ES.toString())
                .get(instanceName);
        // the field is updated even when nothing was found
        connectionInfo = resolved;
        if (resolved == null) {
            throw RaptureExceptionFactory
                    .create("Elastic search for instance " + instanceName + " is not defined.");
        }
        index = resolved.getDbName();
    }

    /**
     * Issues a refresh request for the index and waits up to 1000 ms for the response. Used for synchronous
     * unit-testing, not to be used for regular code.
     * <p>
     * Reads the client field directly, so a client must already have been set or started.
     *
     * @return the refresh response
     */
    RefreshResponse refresh() {
        RefreshRequest request = new RefreshRequest(index);
        ActionFuture<RefreshResponse> pending = client.admin().indices().refresh(request);
        return pending.actionGet(1000);
    }

    /**
     * Applies repository configuration: the value stored under the key "index" becomes the index name.
     *
     * @param config
     *            the configuration map; when it has no "index" entry the index name is set to null
     */
    @Override
    public void setConfig(Map<String, String> config) {
        setIndex(config.get("index"));
    }
}