org.apache.jena.query.text.es.TextIndexES.java Source code

Introduction

Here is the source code for org.apache.jena.query.text.es.TextIndexES.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.jena.query.text.es;

import org.apache.commons.lang3.exception.ExceptionUtils;
import org.apache.jena.graph.Node;
import org.apache.jena.graph.NodeFactory;
import org.apache.jena.query.text.*;
import org.apache.jena.sparql.util.NodeFactoryExtra;
import org.apache.lucene.queryparser.classic.QueryParserBase;
import org.elasticsearch.action.admin.indices.exists.indices.IndicesExistsRequest;
import org.elasticsearch.action.admin.indices.exists.indices.IndicesExistsResponse;
import org.elasticsearch.action.get.GetResponse;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.action.update.UpdateRequest;
import org.elasticsearch.action.update.UpdateResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.TransportAddress;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.index.engine.DocumentMissingException;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.script.Script;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.transport.client.PreBuiltTransportClient;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.net.InetAddress;
import java.util.*;

import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;

/**
 * Elasticsearch implementation of {@link TextIndex}.
 */
public class TextIndexES implements TextIndex {

    /**
     * The definition of the entity that we are indexing
     */
    private final EntityDefinition docDef;

    /**
     * Thread-safe Elasticsearch Java client used to perform index operations
     */
    private static Client client;

    /**
     * The name of the index. Defaults to 'jena-text'
     */
    private final String indexName;

    /**
     * The parameter representing the cluster name key
     */
    static final String CLUSTER_NAME_PARAM = "cluster.name";

    /**
     * The parameter representing the number of shards key
     */
    static final String NUM_OF_SHARDS_PARAM = "number_of_shards";

    /**
     * The parameter representing the number of replicas key
     */
    static final String NUM_OF_REPLICAS_PARAM = "number_of_replicas";

    private static final String DASH = "-";

    private static final String UNDERSCORE = "_";

    private static final String COLON = ":";

    private static final String ASTERISK = "*";

    /**
     * ES script for adding/updating the document in the index.
     * The main reason to use scripts is that we want to modify the values of fields that contain an array of values.
     */
    private static final String ADD_UPDATE_SCRIPT = "if((ctx._source == null) || (ctx._source.<fieldName> == null) || (ctx._source.<fieldName>.empty == true)) "
            + "{ctx._source.<fieldName>=[params.fieldValue] } else {ctx._source.<fieldName>.add(params.fieldValue)}";

    /**
     * ES script for deleting a specific value of a field in the given document in the index.
     * The main reason to use scripts is that we want to delete a specific value from a field that contains an array of values.
     */
    private static final String DELETE_SCRIPT = "if((ctx._source != null) && (ctx._source.<fieldToRemove> != null) && (ctx._source.<fieldToRemove>.empty != true) "
            + "&& (ctx._source.<fieldToRemove>.indexOf(params.valueToRemove) >= 0)) "
            + "{ctx._source.<fieldToRemove>.remove(ctx._source.<fieldToRemove>.indexOf(params.valueToRemove))}";

    /**
     * Maximum number of results to return when no limit is specified on the search operation
     */
    static final Integer MAX_RESULTS = 10000;

    private static final Logger LOGGER = LoggerFactory.getLogger(TextIndexES.class);

    /**
     * Construct an instance of {@link TextIndexES} based on the provided {@link TextIndexConfig} and {@link ESSettings}.
     * The constructor is responsible for initializing a {@link TransportClient} based on the provided configuration
     * and for creating the index based on the provided {@link ESSettings}.
     * @param config an instance of {@link TextIndexConfig}
     * @param esSettings an instance of {@link ESSettings}
     */
    public TextIndexES(TextIndexConfig config, ESSettings esSettings) {

        this.indexName = esSettings.getIndexName();
        this.docDef = config.getEntDef();
        docDef.setLangField("lang");

        try {
            if (client == null) {

                LOGGER.debug("Initializing the Elastic Search Java Client with settings: " + esSettings);
                Settings settings = Settings.builder().put(CLUSTER_NAME_PARAM, esSettings.getClusterName()).build();
                List<TransportAddress> addresses = new ArrayList<>();
                for (String host : esSettings.getHostToPortMapping().keySet()) {
                    TransportAddress addr = new TransportAddress(InetAddress.getByName(host),
                            esSettings.getHostToPortMapping().get(host));
                    addresses.add(addr);
                }

                TransportAddress[] socketAddresses = new TransportAddress[addresses.size()];
                TransportClient tc = new PreBuiltTransportClient(settings);
                tc.addTransportAddresses(addresses.toArray(socketAddresses));
                client = tc;
                LOGGER.debug("Successfully initialized the client");
            }

            IndicesExistsResponse exists = client.admin().indices().exists(new IndicesExistsRequest(indexName))
                    .get();
            if (!exists.isExists()) {
                Settings indexSettings = Settings.builder().put(NUM_OF_SHARDS_PARAM, esSettings.getShards())
                        .put(NUM_OF_REPLICAS_PARAM, esSettings.getReplicas()).build();
                LOGGER.debug("Index with name " + indexName + " does not exist yet. Creating one with settings: "
                        + indexSettings.toString());
                client.admin().indices().prepareCreate(indexName).setSettings(indexSettings).get();
            }
        } catch (Exception e) {
            throw new TextIndexException("Exception occurred while instantiating ElasticSearch Text Index", e);
        }
    }

    /**
     * Constructor used mainly for performing integration tests
     * @param config an instance of {@link TextIndexConfig}
     * @param client an instance of {@link TransportClient}. The client should already have been initialized with an index
     * @param indexName the name of the index the client has been initialized with
     */
    public TextIndexES(TextIndexConfig config, Client client, String indexName) {
        this.docDef = config.getEntDef();
        TextIndexES.client = client;
        this.indexName = indexName;
    }

    /**
     * We do not have any specific logic to perform before committing
     */
    @Override
    public void prepareCommit() {
        //Do Nothing

    }

    /**
     * Commit happens in the individual get/add/delete operations
     */
    @Override
    public void commit() {
        // Do Nothing
    }

    /**
     * We do not do rollback
     */
    @Override
    public void rollback() {
        //Do Nothing

    }

    /**
     * We don't have resources that need to be closed explicitly
     */
    @Override
    public void close() {
        // Do Nothing

    }

    /**
     * Update an entity. Since {@link #addEntity(Entity)} performs an upsert anyway, we simply call
     * that method, which takes care of updating the entity as well.
     * @param entity the entity to update.
     */
    @Override
    public void updateEntity(Entity entity) {
        //Since Add entity also updates the indexed document in case it already exists,
        // we can simply call the addEntity from here.
        addEntity(entity);
    }

    /**
     * Add an entity to the Elasticsearch index.
     * The entity will be added as a new document in ES if it does not already exist.
     * If the entity exists, it will simply be updated.
     * The entity will never be replaced.
     * @param entity the entity to add
     */
    @Override
    public void addEntity(Entity entity) {
        LOGGER.debug("Adding/Updating the entity in ES");

        //The first field that has a non-null value in the current Entity instance.
        //Required mainly for building the script for the update command.
        String fieldToAdd = null;
        String fieldValueToAdd = null;
        try {
            XContentBuilder builder = jsonBuilder().startObject();

            for (String field : docDef.fields()) {
                if (entity.get(field) != null) {
                    if (entity.getLanguage() != null && !entity.getLanguage().isEmpty()) {
                        //Make sure that the field name contains only underscores and no dashes (e.g. when the lang value is en-GB),
                        //because the update script fails with an exception if the field name contains a "-".
                        fieldToAdd = normalizeFieldName(field, entity.getLanguage());
                    } else {
                        fieldToAdd = field;
                    }

                    fieldValueToAdd = (String) entity.get(field);
                    builder = builder.field(fieldToAdd, Arrays.asList(fieldValueToAdd));
                    break;
                } else {
                    //Make sure that the field is at least added to the index.
                    //This helps considerably when appending data later to an already indexed document.
                    builder = builder.field(field, Collections.emptyList());
                }
            }

            builder = builder.endObject();
            IndexRequest indexRequest = new IndexRequest(indexName, docDef.getEntityField(), entity.getId())
                    .source(builder);

            String addUpdateScript = ADD_UPDATE_SCRIPT.replaceAll("<fieldName>", fieldToAdd);
            Map<String, Object> params = new HashMap<>();
            params.put("fieldValue", fieldValueToAdd);

            UpdateRequest upReq = new UpdateRequest(indexName, docDef.getEntityField(), entity.getId()).script(
                    new Script(Script.DEFAULT_SCRIPT_TYPE, Script.DEFAULT_SCRIPT_LANG, addUpdateScript, params))
                    .upsert(indexRequest);

            UpdateResponse response = client.update(upReq).get();

            LOGGER.debug("Received the following Update response : " + response + " for the following entity: "
                    + entity);

        } catch (Exception e) {
            throw new TextIndexException("Unable to Index the Entity in ElasticSearch.", e);
        }
    }

    /**
     * Delete the entity's value from the existing document, if any.
     * The document itself will never be deleted; only the value is removed.
     * @param entity the entity whose value needs to be deleted
     */
    @Override
    public void deleteEntity(Entity entity) {

        String fieldToRemove = null;
        String valueToRemove = null;
        for (String field : docDef.fields()) {
            if (entity.get(field) != null) {
                fieldToRemove = field;
                if (entity.getLanguage() != null && !entity.getLanguage().isEmpty()) {
                    fieldToRemove = normalizeFieldName(fieldToRemove, entity.getLanguage());
                }
                valueToRemove = (String) entity.get(field);
                break;
            }
        }

        if (fieldToRemove != null && valueToRemove != null) {

            LOGGER.debug("deleting content related to entity: " + entity.getId());
            String deleteScript = DELETE_SCRIPT.replaceAll("<fieldToRemove>", fieldToRemove);
            Map<String, Object> params = new HashMap<>();
            params.put("valueToRemove", valueToRemove);

            UpdateRequest updateRequest = new UpdateRequest(indexName, docDef.getEntityField(), entity.getId())
                    .script(new Script(Script.DEFAULT_SCRIPT_TYPE, Script.DEFAULT_SCRIPT_LANG, deleteScript,
                            params));

            try {
                client.update(updateRequest).get();
            } catch (Exception e) {
                if (ExceptionUtils.getRootCause(e) instanceof DocumentMissingException) {
                    LOGGER.debug("Trying to delete values from a missing document. Ignoring deletion of entity: ",
                            entity);
                } else {
                    throw new TextIndexException("Unable to delete entity.", e);
                }
            }
        }
    }

    /**
     * Get an entity given its subject Id
     * @param uri the subject Id of the entity
     * @return a map of field names to field values
     */
    @Override
    public Map<String, Node> get(String uri) {

        GetResponse response;
        Map<String, Node> result = new HashMap<>();

        if (uri != null) {
            response = client.prepareGet(indexName, docDef.getEntityField(), uri).get();
            if (response != null && !response.isSourceEmpty()) {
                String entityField = response.getId();
                Node entity = NodeFactory.createURI(entityField);
                result.put(docDef.getEntityField(), entity);
                Map<String, Object> source = response.getSource();
                for (String field : docDef.fields()) {
                    Object fieldResponse = source.get(field);

                    if (fieldResponse == null) {
                        //We won't return it.
                        continue;
                    } else if (fieldResponse instanceof List<?>) {
                        //Field values are always stored as a List.
                        //If there are values stored in the list, we return the first value;
                        //otherwise we do not include the field in the returned field -> Node map.
                        List<?> responseList = (List<?>) fieldResponse;
                        if (!responseList.isEmpty()) {
                            String fieldValue = (String) responseList.get(0);
                            Node fieldNode = NodeFactoryExtra.createLiteralNode(fieldValue, null, null);
                            result.put(field, fieldNode);
                        }
                    }
                }
            }
        }

        return result;
    }

    @Override
    public List<TextHit> query(Node property, String qs, String graphURI, String lang) {
        return query(property, qs, graphURI, lang, MAX_RESULTS);
    }

    @Override
    public List<TextHit> query(Node property, String qs, String graphURI, String lang, int limit,
            String highlight) {
        return query(property, qs, graphURI, lang, limit);
    }

    /**
     * Query Elasticsearch for the given node, with the given query string and limit.
     * @param property the node property to search for
     * @param qs the query string
     * @param graphURI the graph URI (not used by this implementation)
     * @param lang the language tag to restrict the search to, if any
     * @param limit limit on the number of records to return
     * @return List of {@link TextHit}s containing the documents that have been found
     */
    @Override
    public List<TextHit> query(Node property, String qs, String graphURI, String lang, int limit) {
        if (property != null) {
            qs = parse(property.getLocalName(), qs, lang);
        } else {
            qs = parse(null, qs, lang);
        }

        LOGGER.debug("Querying ElasticSearch for QueryString: " + qs);
        SearchResponse response = client.prepareSearch(indexName).setTypes(docDef.getEntityField())
                .setQuery(QueryBuilders.queryStringQuery(qs))
                // Not fetching the source because we are currently interested only in the Id of
                // the document, not the actual values. This also speeds up the search.
                .setFetchSource(false).setFrom(0).setSize(limit).get();

        List<TextHit> results = new ArrayList<>();
        for (SearchHit hit : response.getHits()) {

            //It has been decided to return NULL literal values for now.
            String entityField = hit.getId();
            Node entityNode = TextQueryFuncs.stringToNode(entityField);
            Float score = hit.getScore();
            TextHit textHit = new TextHit(entityNode, score, null);
            results.add(textHit);

        }
        return results;
    }

    @Override
    public EntityDefinition getDocDef() {
        return docDef;
    }

    private String parse(String fieldName, String qs, String lang) {
        //Escape special characters if any in the query string
        qs = QueryParserBase.escape(qs);

        if (fieldName != null && !fieldName.isEmpty()) {
            if (lang != null && !lang.equals("none")) {
                if (!ASTERISK.equals(lang)) {
                    fieldName = fieldName + UNDERSCORE + lang.replaceAll(DASH, UNDERSCORE);
                    qs = fieldName + COLON + qs;
                } else {
                    if (!qs.contains("\\*")) {
                        fieldName = fieldName + ASTERISK;
                        qs = fieldName + COLON + qs;
                    }
                }

            } else {
                //Lang is null, but field name is not null
                qs = fieldName + COLON + qs;

            }
        }
        //Un-escape any asterisks again so that wild card search still works
        return qs.replaceAll("\\\\\\*", ASTERISK);

    }
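
    // Examples of the transformation performed by parse() above (illustrative only):
    //   parse("label", "hello", "en-GB") -> "label_en_GB:hello"
    //   parse("label", "hello", "*")     -> "label*:hello"   (field-name wildcard over the language-specific variants)
    //   parse(null,    "hello", null)    -> "hello"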

    private String normalizeFieldName(String fieldName, String lang) {
        //We know that the lang field is not null already
        StringBuilder sb = new StringBuilder(fieldName);
        return sb.append(UNDERSCORE).append(lang.replaceAll(DASH, UNDERSCORE)).toString();

    }

}
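
Usage example

The sketch below (not part of the original source) shows one way this class could be wired against a local Elasticsearch cluster, using the test-oriented constructor above. The cluster name, host, port, index name, the EntityDefinition/TextIndexConfig constructor arguments, and the TextHit accessors are assumptions for illustration and should be checked against the jena-text API in use.

import org.apache.jena.graph.NodeFactory;
import org.apache.jena.query.text.EntityDefinition;
import org.apache.jena.query.text.TextHit;
import org.apache.jena.query.text.TextIndexConfig;
import org.apache.jena.query.text.es.TextIndexES;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.TransportAddress;
import org.elasticsearch.transport.client.PreBuiltTransportClient;

import java.net.InetAddress;
import java.util.List;

public class TextIndexESUsage {

    public static void main(String[] args) throws Exception {
        // Connect to an assumed local Elasticsearch node on the default transport port
        Settings settings = Settings.builder().put("cluster.name", "elasticsearch").build();
        TransportClient client = new PreBuiltTransportClient(settings)
                .addTransportAddress(new TransportAddress(InetAddress.getByName("localhost"), 9300));

        // "uri" as the entity field and "text" as the default indexed field are assumed names
        EntityDefinition entDef = new EntityDefinition("uri", "text");
        TextIndexES index = new TextIndexES(new TextIndexConfig(entDef), client, "jena-text");

        // Full-text query: property node, query string, graph URI, language tag, result limit
        List<TextHit> hits = index.query(NodeFactory.createURI("http://example.org/text"),
                "hello", null, "en", 10);
        for (TextHit hit : hits) {
            System.out.println(hit.getNode() + " scored " + hit.getScore());
        }

        client.close();
    }
}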