org.lobid.lodmill.ElasticsearchIndexer.java Source code

Java tutorial

Introduction

Here is the source code for org.lobid.lodmill.ElasticsearchIndexer.java

Source

/* Copyright 2013-015 Fabian Steeg, Pascal Christoph, hbz. Licensed under the Eclipse Public License 1.0 */

package org.lobid.lodmill;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Set;
import java.util.SortedSet;
import java.util.concurrent.TimeUnit;

import org.culturegraph.mf.framework.DefaultObjectPipe;
import org.culturegraph.mf.framework.ObjectReceiver;
import org.culturegraph.mf.framework.annotations.In;
import org.culturegraph.mf.framework.annotations.Out;
import org.elasticsearch.action.admin.cluster.state.ClusterStateRequest;
import org.elasticsearch.action.admin.indices.delete.DeleteIndexRequest;
import org.elasticsearch.action.bulk.BulkRequestBuilder;
import org.elasticsearch.action.update.UpdateRequest;
import org.elasticsearch.client.Client;
import org.elasticsearch.client.IndicesAdminClient;
import org.elasticsearch.client.Requests;
import org.elasticsearch.client.transport.NoNodeAvailableException;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.ImmutableSettings.Builder;
import org.elasticsearch.common.transport.InetSocketTransportAddress;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.collect.Sets;
import com.google.common.collect.SortedSetMultimap;
import com.google.common.collect.TreeMultimap;
import com.google.common.io.CharStreams;

/**
 * Index JSON into elasticsearch.
 * 
 * @author Pascal Christoph (dr0i)
 * @author Fabian Steeg (fsteeg)
 */
@In(HashMap.class)
@Out(Void.class)
public class ElasticsearchIndexer extends DefaultObjectPipe<HashMap<String, String>, ObjectReceiver<Void>> {

    private static final Logger LOG = LoggerFactory.getLogger(ElasticsearchIndexer.class);
    private String hostname;
    private String clustername;
    private BulkRequestBuilder bulkRequest;
    private Builder CLIENT_SETTINGS;
    private InetSocketTransportAddress NODE;
    private TransportClient tc;
    private UpdateRequest updateRequest;
    private Client client;
    private int retries = 40;
    // collect so many documents before bulk indexing them all
    private int bulkSize = 5000;
    private int docs = 0;
    private String indexName;
    private boolean updateIndex;
    private String aliasSuffix;

    /**
     * Keys to get index properties and the json document ("graph")
     */
    @SuppressWarnings("javadoc")
    public static enum Properties {
        INDEX("_index"), TYPE("_type"), ID("_id"), PARENT("_parent"), GRAPH("graph");
        private final String name;

        Properties(final String name) {
            this.name = name;
        }

        public String getName() {
            return name;
        }
    }

    @Override
    protected void onCloseStream() {
        // feed the rest of the bulk
        bulkRequest.execute().actionGet();
        if (!aliasSuffix.equals("NOALIAS") && !updateIndex && !aliasSuffix.toLowerCase().contains("test"))
            updateAliases(indexName, aliasSuffix);
        bulkRequest.setRefresh(true).get();
    }

    // TODO use BulkProcessorbuilder by updating to ES 1.5
    @Override
    public void onSetReceiver() {
        if (client == null) {
            this.CLIENT_SETTINGS = ImmutableSettings.settingsBuilder().put("cluster.name", this.clustername);
            this.NODE = new InetSocketTransportAddress(this.hostname, 9300);
            this.tc = new TransportClient(this.CLIENT_SETTINGS.put("client.transport.sniff", false)
                    .put("client.transport.ping_timeout", 120, TimeUnit.SECONDS).build());
            this.client = this.tc.addTransportAddress(this.NODE);
        }
        bulkRequest = client.prepareBulk();
        if (updateIndex) {
            getNewestIndex();
        } else
            createIndex();
        bulkRequest.setRefresh(false);
    }

    @Override
    public void process(final HashMap<String, String> json) {
        updateRequest = new UpdateRequest(indexName, json.get(Properties.TYPE.getName()),
                json.get(Properties.ID.getName()));
        updateRequest.doc(json.get(Properties.GRAPH.getName()));
        updateRequest.docAsUpsert(true);
        if (json.containsKey(Properties.PARENT.getName())) {
            updateRequest.parent(json.get(Properties.PARENT.getName()));
        }
        bulkRequest.add(updateRequest);
        docs++;
        while (docs > bulkSize && retries > 0) {
            try {
                bulkRequest.execute().actionGet();
                docs = 0;
                bulkRequest = client.prepareBulk();
                bulkRequest.setRefresh(false);
                break; // stop retry-while
            } catch (final NoNodeAvailableException e) {
                retries--;
                try {
                    Thread.sleep(10000);
                } catch (final InterruptedException x) {
                    x.printStackTrace();
                }
                LOG.warn("Retry indexing record" + json.get(Properties.ID.getName()) + ":" + e.getMessage() + " ("
                        + retries + " more retries)");
            }
        }
    }

    /**
     * Sets the elasticsearch cluster name.
     * 
     * @param clustername the name of the cluster
     */
    public void setClustername(final String clustername) {
        this.clustername = clustername;
    }

    /**
     * Sets the elasticsearch hostname
     * 
     * @param hostname may be an IP or a domain name
     */
    public void setHostname(final String hostname) {
        this.hostname = hostname;
    }

    /**
     * Sets the elasticsearch index name
     * 
     * @param indexname name of the index
     */
    public void setIndexName(final String indexname) {
        this.indexName = indexname;
    }

    /**
     * Sets the suffix of elasticsearch index alias suffix
     * 
     * @param aliasSuffix may be an IP or a domain name
     */
    public void setIndexAliasSuffix(String aliasSuffix) {
        this.aliasSuffix = aliasSuffix;

    }

    /**
     * Sets the elasticsearch client.
     * 
     * @param client the elasticsearch client
     */
    public void setElasticsearchClient(Client client) {
        this.client = client;
    }

    /**
     * Sets the elasticsearch index name
     * 
     * @param updateIndex name of the index
     */
    public void setUpdateNewestIndex(final boolean updateIndex) {
        this.updateIndex = updateIndex;
    }

    private void getNewestIndex() {
        String indexNameWithoutTimestamp = indexName.replaceAll("20.*", "");
        final SortedSetMultimap<String, String> indices = groupByIndexCollection(indexName);
        for (String prefix : indices.keySet()) {
            final SortedSet<String> indicesForPrefix = indices.get(prefix);
            final String newestIndex = indicesForPrefix.last();
            if (newestIndex.startsWith(indexNameWithoutTimestamp))
                indexName = newestIndex;
        }
        LOG.info("Going to UPDATE existing index " + indexName);
    }

    private void createIndex() {
        IndicesAdminClient adminClient = client.admin().indices();
        if (!adminClient.prepareExists(indexName).execute().actionGet().isExists()) {
            LOG.info("Going to CREATE new index " + indexName);
            adminClient.prepareCreate(indexName).setSource(config()).execute()

                    .actionGet();
        } else
            LOG.info("Index already exists, going to UPDATE index " + indexName);
    }

    private static String config() {
        String res = null;
        try {
            final InputStream config = Thread.currentThread().getContextClassLoader()
                    .getResourceAsStream("index-config.json");
            try (InputStreamReader reader = new InputStreamReader(config, "UTF-8")) {
                res = CharStreams.toString(reader);
            }
        } catch (IOException e) {
            LOG.error(e.getMessage(), e);
        }
        return res;
    }

    private void updateAliases(final String name, final String suffix) {
        final SortedSetMultimap<String, String> indices = groupByIndexCollection(name);
        for (String prefix : indices.keySet()) {
            final SortedSet<String> indicesForPrefix = indices.get(prefix);
            final String newIndex = indicesForPrefix.last();
            final String newAlias = prefix + suffix;
            LOG.info("Prefix " + prefix + ", newest index: " + newIndex);
            removeOldAliases(indicesForPrefix, newAlias);
            if (!name.equals(newAlias) && !newIndex.equals(newAlias))
                createNewAlias(newIndex, newAlias);
            deleteOldIndices(name, indicesForPrefix);
        }
    }

    private SortedSetMultimap<String, String> groupByIndexCollection(final String name) {
        final SortedSetMultimap<String, String> indices = TreeMultimap.create();
        for (String index : client.admin().indices().prepareStats().execute().actionGet().getIndices().keySet()) {
            final String[] nameAndTimestamp = index.split("-(?=\\d)");
            if (name.startsWith(nameAndTimestamp[0]))
                indices.put(nameAndTimestamp[0], index);
        }
        return indices;
    }

    private void removeOldAliases(final SortedSet<String> indicesForPrefix, final String newAlias) {
        for (String name : indicesForPrefix) {
            final Set<String> aliases = aliases(name);
            for (String alias : aliases) {
                if (alias.equals(newAlias)) {
                    LOG.info("Delete alias index,alias: " + name + "," + alias);
                    client.admin().indices().prepareAliases().removeAlias(name, alias).execute().actionGet();
                }
            }
        }
    }

    private void createNewAlias(final String newIndex, final String newAlias) {
        LOG.info("Create alias index,alias: " + newIndex + "," + newAlias);
        client.admin().indices().prepareAliases().addAlias(newIndex, newAlias).execute().actionGet();
    }

    private void deleteOldIndices(final String name, final SortedSet<String> allIndices) {
        if (allIndices.size() >= 3) {
            final List<String> list = new ArrayList<>(allIndices);
            list.remove(name);
            for (String indexToDelete : list.subList(0, list.size() - 2)) {
                if (aliases(indexToDelete).isEmpty()) {
                    LOG.info("Deleting index: " + indexToDelete);
                    client.admin().indices().delete(new DeleteIndexRequest(indexToDelete)).actionGet();
                }
            }
        }
    }

    private Set<String> aliases(final String name) {
        final ClusterStateRequest clusterStateRequest = Requests.clusterStateRequest().nodes(true).indices(name);
        return Sets.newHashSet(client.admin().cluster().state(clusterStateRequest).actionGet().getState()
                .getMetaData().aliases().keysIt());
    }

}