com.ikanow.aleph2.analytics.services.DeduplicationService.java Source code

Introduction

Here is the source code for com.ikanow.aleph2.analytics.services.DeduplicationService.java
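
DeduplicationService is an Aleph2 enrichment module (IEnrichmentBatchModule) that deduplicates each incoming batch of records against an existing document store, keyed on the configured deduplication_fields (defaulting to _id). Depending on the configured DeduplicationPolicy it will leave or overwrite the stored document, update it when the incoming record is newer, or hand matching records to a custom deduplication module, optionally deleting any unhandled duplicates. Two illustrative sketches follow the listing under "Usage notes" below.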

Source

/*******************************************************************************
 * Copyright 2015, The IKANOW Open Source Project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
package com.ikanow.aleph2.analytics.services;

import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.TimeUnit;
import java.util.function.Consumer;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.stream.Collector;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import org.apache.logging.log4j.Level;

import scala.Tuple2;
import scala.Tuple3;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.ikanow.aleph2.analytics.data_model.DedupConfigBean;
import com.ikanow.aleph2.analytics.utils.ErrorUtils;
import com.ikanow.aleph2.core.shared.utils.BatchRecordUtils;
import com.ikanow.aleph2.core.shared.utils.DataServiceUtils;
import com.ikanow.aleph2.data_model.interfaces.data_analytics.IBatchRecord;
import com.ikanow.aleph2.data_model.interfaces.data_import.IEnrichmentBatchModule;
import com.ikanow.aleph2.data_model.interfaces.data_import.IEnrichmentModuleContext;
import com.ikanow.aleph2.data_model.interfaces.data_services.IDocumentService;
import com.ikanow.aleph2.data_model.interfaces.shared_services.IBucketLogger;
import com.ikanow.aleph2.data_model.interfaces.shared_services.ICrudService;
import com.ikanow.aleph2.data_model.interfaces.shared_services.IDataServiceProvider;
import com.ikanow.aleph2.data_model.interfaces.shared_services.ILoggingService;
import com.ikanow.aleph2.data_model.interfaces.shared_services.IUnderlyingService;
import com.ikanow.aleph2.data_model.objects.data_import.AnnotationBean;
import com.ikanow.aleph2.data_model.objects.data_import.DataBucketBean;
import com.ikanow.aleph2.data_model.objects.data_import.DataSchemaBean.DocumentSchemaBean;
import com.ikanow.aleph2.data_model.objects.data_import.DataSchemaBean.DocumentSchemaBean.CustomPolicy;
import com.ikanow.aleph2.data_model.objects.data_import.DataSchemaBean.DocumentSchemaBean.DeduplicationPolicy;
import com.ikanow.aleph2.data_model.objects.data_import.DataSchemaBean.DocumentSchemaBean.DeduplicationTiming;
import com.ikanow.aleph2.data_model.objects.data_import.EnrichmentControlMetadataBean;
import com.ikanow.aleph2.data_model.objects.shared.AuthorizationBean;
import com.ikanow.aleph2.data_model.objects.shared.BasicMessageBean;
import com.ikanow.aleph2.data_model.objects.shared.SharedLibraryBean;
import com.ikanow.aleph2.data_model.utils.BeanTemplateUtils;
import com.ikanow.aleph2.data_model.utils.BeanTemplateUtils.MethodNamingHelper;
import com.ikanow.aleph2.data_model.utils.BucketUtils;
import com.ikanow.aleph2.data_model.utils.CrudUtils;
import com.ikanow.aleph2.data_model.utils.Lambdas;
import com.ikanow.aleph2.data_model.utils.Optionals;
import com.ikanow.aleph2.data_model.utils.Patterns;
import com.ikanow.aleph2.data_model.utils.TimeUtils;
import com.ikanow.aleph2.data_model.utils.Tuples;
import com.ikanow.aleph2.data_model.utils.CrudUtils.QueryComponent;
import com.ikanow.aleph2.data_model.utils.JsonUtils;
import com.ikanow.aleph2.data_model.utils.SetOnce;

import fj.data.Either;
import fj.data.Validation;

/** An enrichment module that will perform deduplication using the provided document_schema
 * @author Alex
 */
public class DeduplicationService implements IEnrichmentBatchModule {
    protected final static ObjectMapper _mapper = BeanTemplateUtils.configureMapper(Optional.empty());
    protected final static MethodNamingHelper<AnnotationBean> _annot = BeanTemplateUtils.from(AnnotationBean.class);

    protected final SetOnce<ICrudService<JsonNode>> _dedup_context = new SetOnce<>();
    protected final SetOnce<IEnrichmentModuleContext> _context = new SetOnce<>();
    protected final SetOnce<DocumentSchemaBean> _doc_schema = new SetOnce<>();
    protected final SetOnce<String> _timestamp_field = new SetOnce<>();
    protected final SetOnce<Boolean> _deduplication_is_disabled = new SetOnce<>();
    protected final SetOnce<Boolean> _is_system_dedup_stage = new SetOnce<>(); // whether this executes the system-level deduplication or is part of the app logic (affects logging)

    protected final SetOnce<IEnrichmentBatchModule> _custom_handler = new SetOnce<>();
    protected final SetOnce<DeduplicationEnrichmentContext> _custom_context = new SetOnce<>();
    protected final SetOnce<EnrichmentControlMetadataBean> _control = new SetOnce<>();

    protected final SetOnce<List<String>> _dedup_fields = new SetOnce<>();
    protected final SetOnce<DeduplicationPolicy> _policy = new SetOnce<>();

    protected final SetOnce<IBucketLogger> _logger = new SetOnce<>();

    public static class MutableStats {
        int nonduplicate_keys = 0;
        int duplicates_incoming = 0;
        int duplicates_existing = 0;
        int duplicate_keys = 0;
        int deleted = 0;
    }

    protected final MutableStats _mutable_stats = new MutableStats();

    protected final LinkedList<CompletableFuture<Long>> mutable_uncompleted_deletes = new LinkedList<>();

    //TODO (ALEPH-20): move this into the ES service
    public static class ElasticsearchTechnologyOverride {
        protected ElasticsearchTechnologyOverride() {
        }

        public ElasticsearchTechnologyOverride(final String default_modifier,
                final Map<String, String> field_override) {
            this.default_modifier = default_modifier;
            this.field_override = field_override;
        }

        private String default_modifier;
        private Map<String, String> field_override;

        public final String default_modifier() {
            return Optional.ofNullable(default_modifier).orElse("");
        }

        public final Map<String, String> field_override() {
            return Optional.ofNullable(field_override).orElse(Collections.emptyMap());
        }
    }

    protected final SetOnce<Function<String, String>> _db_mapper = new SetOnce<>();

    /* (non-Javadoc)
     * @see com.ikanow.aleph2.data_model.interfaces.data_import.IEnrichmentBatchModule#validateModule(com.ikanow.aleph2.data_model.interfaces.data_import.IEnrichmentModuleContext, com.ikanow.aleph2.data_model.objects.data_import.DataBucketBean, com.ikanow.aleph2.data_model.objects.data_import.EnrichmentControlMetadataBean)
     */
    @Override
    public Collection<BasicMessageBean> validateModule(IEnrichmentModuleContext context, DataBucketBean bucket,
            EnrichmentControlMetadataBean control) {

        final LinkedList<BasicMessageBean> mutable_errs = new LinkedList<>();

        // Validation

        // 1) Check that the doc schema is enabled, unless an override is set

        final DedupConfigBean dedup_config = BeanTemplateUtils
                .from(Optional.ofNullable(control.config()).orElse(Collections.emptyMap()), DedupConfigBean.class)
                .get();

        final DocumentSchemaBean doc_schema = Optional.ofNullable(dedup_config.doc_schema_override())
                .orElse(bucket.data_schema().document_schema()); //(exists by construction)
        if (null == doc_schema) { // Has to either have a doc schema or an override 
            mutable_errs.add(ErrorUtils.buildErrorMessage(this.getClass().getSimpleName(), "validateModule",
                    ErrorUtils.get(ErrorUtils.MISSING_DOCUMENT_SERVICE)));
            return mutable_errs; //(no point going any further here)
        }

        if (!Optional.ofNullable(doc_schema.lookup_service_override()).filter(s -> !s.isEmpty()).isPresent()) {
            final boolean doc_schema_enabled = Optionals.of(() -> bucket.data_schema().document_schema())
                    .map(ds -> Optional.ofNullable(ds.enabled()).orElse(true)).orElse(false);
            if (!doc_schema_enabled) {
                mutable_errs.add(ErrorUtils.buildErrorMessage(this.getClass().getSimpleName(), "validateModule",
                        ErrorUtils.get(ErrorUtils.MISSING_DOCUMENT_SERVICE)));
            }
        }
        //(else up to the user to ensure that the required service is included)

        // 1.5) Validate that the service override is valid

        final Validation<String, Tuple2<Optional<Class<? extends IUnderlyingService>>, Optional<String>>> service_to_use = getDataService(
                doc_schema);

        if (service_to_use.isFail()) {
            mutable_errs.add(ErrorUtils.buildErrorMessage(this.getClass().getSimpleName(), "validateModule",
                    service_to_use.fail()));
        }

        // 2) Validate any child modules

        Optional<EnrichmentControlMetadataBean> custom_config = Optionals
                .ofNullable(doc_schema.custom_deduplication_configs()).stream()
                .filter(cfg -> Optional.ofNullable(cfg.enabled()).orElse(true)).findFirst();

        custom_config.ifPresent(cfg -> {
            mutable_errs.addAll(getEnrichmentModules(context, cfg).stream()
                    .flatMap(module -> module.validateModule(context, bucket, cfg).stream())
                    .collect(Collectors.toList()));
        });
        return mutable_errs;
    }

    /* (non-Javadoc)
     * @see com.ikanow.aleph2.data_model.interfaces.data_import.IEnrichmentBatchModule#onStageInitialize(com.ikanow.aleph2.data_model.interfaces.data_import.IEnrichmentModuleContext, com.ikanow.aleph2.data_model.objects.data_import.DataBucketBean, com.ikanow.aleph2.data_model.objects.data_import.EnrichmentControlMetadataBean, scala.Tuple2, java.util.Optional)
     */
    @Override
    public void onStageInitialize(final IEnrichmentModuleContext context, final DataBucketBean bucket,
            final EnrichmentControlMetadataBean dedup_control,
            final Tuple2<ProcessingStage, ProcessingStage> previous_next,
            final Optional<List<String>> next_grouping_fields) {
        _context.set(context);
        _control.set(dedup_control);

        context.getServiceContext().getService(ILoggingService.class, Optional.empty())
                .map(s -> s.getSystemLogger(bucket)).ifPresent(logger -> _logger.set(logger));

        final DedupConfigBean dedup_config = BeanTemplateUtils
                .from(Optional.ofNullable(dedup_control.config()).orElse(Collections.emptyMap()),
                        DedupConfigBean.class)
                .get();

        final DocumentSchemaBean doc_schema = Optional.ofNullable(dedup_config.doc_schema_override())
                .orElse(bucket.data_schema().document_schema()); //(exists by construction)
        _is_system_dedup_stage.set(null == dedup_config.doc_schema_override());

        _deduplication_is_disabled.set((null == doc_schema.deduplication_policy())
                && Optionals.ofNullable(doc_schema.deduplication_fields()).isEmpty()
                && Optionals.ofNullable(doc_schema.deduplication_contexts()).isEmpty());

        // override defaults:
        _doc_schema.set(BeanTemplateUtils.clone(doc_schema)
                .with(DocumentSchemaBean::enabled, Optional.ofNullable(doc_schema.enabled()).orElse(true))
                .with(DocumentSchemaBean::deduplication_timing,
                        Optional.ofNullable(doc_schema.deduplication_timing()).orElse(DeduplicationTiming.custom))
                .with(DocumentSchemaBean::deduplication_policy,
                        Optional.ofNullable(doc_schema.deduplication_policy()).orElse(DeduplicationPolicy.leave))
                .with(DocumentSchemaBean::custom_policy,
                        Optional.ofNullable(doc_schema.custom_policy()).orElse(CustomPolicy.strict))
                .with(DocumentSchemaBean::custom_finalize_all_objects,
                        Optional.ofNullable(doc_schema.custom_finalize_all_objects()).orElse(false))
                .with(DocumentSchemaBean::delete_unhandled_duplicates,
                        Optional.ofNullable(doc_schema.delete_unhandled_duplicates())
                                .orElse(CustomPolicy.very_strict == Optional.ofNullable(doc_schema.custom_policy())
                                        .orElse(CustomPolicy.strict)))
                .with(DocumentSchemaBean::allow_manual_deletion,
                        Optional.ofNullable(doc_schema.allow_manual_deletion()).orElse(false))
                .done());

        final String timestamp_field = Optionals.of(() -> bucket.data_schema().temporal_schema().time_field())
                .orElseGet(() -> AnnotationBean.ROOT_PATH + ".tp");
        _timestamp_field.set(timestamp_field);

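        // If explicit deduplication contexts are configured, wrap them in a synthetic multi-bucket
        // so the lookup CRUD service spans all of them; otherwise just look up against this bucket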
        final DataBucketBean context_holder = Optional.ofNullable(doc_schema.deduplication_contexts())
                .filter(l -> !l.isEmpty()) // (if empty or null then fall back to...)            
                .map(contexts -> BeanTemplateUtils.build(DataBucketBean.class)
                        .with(DataBucketBean::multi_bucket_children,
                                ImmutableSet.<String>builder().addAll(contexts).build())
                        .done().get())
                .orElse(bucket);

        //(valid by construction - see validateSchema)
        final Validation<String, Tuple2<Optional<Class<? extends IUnderlyingService>>, Optional<String>>> service_to_use = getDataService(
                _doc_schema.get());

        // Get secured data service -> CRUD for id checking and deletion
        final Optional<ICrudService<JsonNode>> maybe_read_crud = context.getServiceContext()
                .getService(service_to_use.success()._1().get(), service_to_use.success()._2())
                .map(ds -> ((IDataServiceProvider) ds)
                        .secured(context.getServiceContext(), new AuthorizationBean(bucket.owner_id())))
                .flatMap(ds -> ds.getDataService())
                .flatMap(ds -> _doc_schema.get().delete_unhandled_duplicates()
                        || _doc_schema.get().allow_manual_deletion()
                                ? ds.getUpdatableCrudService(JsonNode.class, Arrays.asList(context_holder),
                                        Optional.empty())
                                : ds.getReadableCrudService(JsonNode.class, Arrays.asList(context_holder),
                                        Optional.empty()).map(crud -> (ICrudService<JsonNode>) crud))
        //(just ensure it has a read/update interface even though the update might not be used)
        ;

        maybe_read_crud.ifPresent(read_crud -> _dedup_context.set(read_crud));

        //TODO (ALEPH-20): move this into the DB (See related top level comment)
        final ElasticsearchTechnologyOverride tech_override = BeanTemplateUtils.from(
                Optional.ofNullable(_doc_schema.get().technology_override_schema()).orElse(Collections.emptyMap()),
                ElasticsearchTechnologyOverride.class).get();

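        // Map each dedup field name onto the underlying DB field name, applying any per-field override
        // and the default modifier from the technology override ("_id" is always left untouched)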
        _db_mapper.set(f -> {
            return AnnotationBean._ID.equals(f) ? f
                    : tech_override.field_override().getOrDefault(f.replace(".", ":"),
                            f + tech_override.default_modifier());
        });

        _dedup_fields.set(Optional.ofNullable(_doc_schema.get().deduplication_fields())
                .orElse(Arrays.asList(AnnotationBean._ID)));
        _policy.set(
                Optional.ofNullable(_doc_schema.get().deduplication_policy()).orElse(DeduplicationPolicy.leave));

        if ((DeduplicationPolicy.custom == _policy.get()) || (DeduplicationPolicy.custom_update == _policy.get())) {

            Optional<EnrichmentControlMetadataBean> custom_config = Optionals
                    .ofNullable(_doc_schema.get().custom_deduplication_configs()).stream()
                    .filter(cfg -> Optional.ofNullable(cfg.enabled()).orElse(true)).findFirst();

            custom_config.ifPresent(cfg -> {

                getEnrichmentModules(context, cfg).stream().findFirst()
                        .ifPresent(module -> _custom_handler.set(module));

                _custom_context.set(new DeduplicationEnrichmentContext(context, _doc_schema.get(),
                        j -> getKeyFieldsAgain(j, getKeyFields(_dedup_fields.get()))));

                _custom_handler.optional()
                        .ifPresent(base_module -> base_module.onStageInitialize(_custom_context.get(), bucket, cfg,
                                previous_next, next_grouping_fields));
            });
        }
    }

    /* (non-Javadoc)
     * @see com.ikanow.aleph2.data_model.interfaces.data_import.IEnrichmentBatchModule#onObjectBatch(java.util.stream.Stream, java.util.Optional, java.util.Optional)
     */
    @Override
    public void onObjectBatch(final Stream<Tuple2<Long, IBatchRecord>> batch, final Optional<Integer> batch_size,
            final Optional<JsonNode> grouping_key) {
        if (_deduplication_is_disabled.get()) {
            // no deduplication, generally shouldn't be here...
            //.. but if we are, do the best we can
            batch.forEach(t2 -> _context.get().emitImmutableObject(t2._1(), t2._2().getJson(), Optional.empty(),
                    Optional.empty(), Optional.empty()));
            return;
        }

        // Create big query

        final Tuple3<QueryComponent<JsonNode>, List<Tuple2<JsonNode, Tuple2<Long, IBatchRecord>>>, Either<String, List<String>>> fieldinfo_dedupquery_keyfields = getDedupQuery(
                batch, _dedup_fields.get(), _db_mapper.get());

        // Get duplicate results

        final Tuple2<List<String>, Boolean> fields_include = getIncludeFields(_policy.get(), _dedup_fields.get(),
                _timestamp_field.get());

        final CompletableFuture<Iterator<JsonNode>> dedup_res = fieldinfo_dedupquery_keyfields._2().isEmpty()
                ? CompletableFuture.completedFuture(Collections.<JsonNode>emptyList().iterator())
                : _dedup_context.get().getObjectsBySpec(fieldinfo_dedupquery_keyfields._1(), fields_include._1(),
                        fields_include._2()).thenApply(cursor -> cursor.iterator());

        // Wait for it to finish

        //(create a handy results structure in the meantime)
        final LinkedHashMap<JsonNode, LinkedList<Tuple3<Long, IBatchRecord, ObjectNode>>> mutable_obj_map = fieldinfo_dedupquery_keyfields
                ._2().stream()
                .collect(Collector.of(
                        () -> new LinkedHashMap<JsonNode, LinkedList<Tuple3<Long, IBatchRecord, ObjectNode>>>(),
                        (acc, t2) -> {
                            // (group records by key: each incoming record is appended to its key's list)
                            final Tuple3<Long, IBatchRecord, ObjectNode> t3 = Tuples._3T(t2._2()._1(), t2._2()._2(),
                                    _mapper.createObjectNode());
                            acc.compute(t2._1(), (k, v) -> {
                                final LinkedList<Tuple3<Long, IBatchRecord, ObjectNode>> new_list = (null == v)
                                        ? new LinkedList<>()
                                        : v;
                                new_list.add(t3);
                                return new_list;
                            });
                        }, (map1, map2) -> {
                            map1.putAll(map2);
                            return map1;
                        }));

        //TODO (ALEPH-20): add timestamps to annotation
        //TODO (ALEPH-20): support different timestamp fields for the different buckets
        //TODO (ALEPH-20): really need to support >1 current enrichment job 
        //                 ^^(Really really longer term you should be able to decide what objects you want and what you don't  <- NOTE: don't remember what i meant here)

        final Iterator<JsonNode> cursor = dedup_res.join();

        // Handle the results

        final Stream<JsonNode> records_to_delete = Lambdas.get(() -> {
            if (isCustom(_doc_schema.get().deduplication_policy())
                    || _doc_schema.get().delete_unhandled_duplicates()) {
                return Optionals.streamOf(cursor, true)
                        .collect(Collectors.groupingBy(
                                ret_obj -> getKeyFieldsAgain(ret_obj, fieldinfo_dedupquery_keyfields._3())))
                        .entrySet().stream().<JsonNode>flatMap(kv -> {

                            final Optional<JsonNode> maybe_key = kv.getKey();
                            final Optional<LinkedList<Tuple3<Long, IBatchRecord, ObjectNode>>> matching_records = maybe_key
                                    .map(key -> mutable_obj_map.get(key));

                            // Stats:
                            _mutable_stats.duplicate_keys++;
                            _mutable_stats.duplicates_existing += kv.getValue().size();
                            _mutable_stats.duplicates_incoming += matching_records.map(l -> l.size()).orElse(0);

                            //DEBUG
                            //System.out.println("?? " + kv.getValue().size() + " vs " + maybe_key + " vs " + matching_records.map(x -> Integer.toString(x.size())).orElse("(no match)"));

                            return matching_records
                                    .<Stream<JsonNode>>map(records -> handleDuplicateRecord(_doc_schema.get(),
                                            _custom_handler.optional().map(
                                                    handler -> Tuples._2T(handler, this._custom_context.get())),
                                            _timestamp_field.get(), records, kv.getValue(), maybe_key.get(),
                                            mutable_obj_map))
                                    .orElse(Stream.empty());
                        });
            } else {
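                // Non-custom policy with no deletion of unhandled duplicates: just apply the policy
                // to each existing duplicate and return nothing for deletion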
                Optionals.streamOf(cursor, true).forEach(ret_obj -> {
                    final Optional<JsonNode> maybe_key = getKeyFieldsAgain(ret_obj,
                            fieldinfo_dedupquery_keyfields._3());
                    final Optional<LinkedList<Tuple3<Long, IBatchRecord, ObjectNode>>> matching_records = maybe_key
                            .map(key -> mutable_obj_map.get(key));

                    //DEBUG
                    //System.out.println("?? " + ret_obj + " vs " + maybe_key + " vs " + matching_record.map(x -> x._2().getJson().toString()).orElse("(no match)"));

                    // Stats:
                    _mutable_stats.duplicate_keys++;
                    _mutable_stats.duplicates_existing++;
                    _mutable_stats.duplicates_incoming += matching_records.map(l -> l.size()).orElse(0);

                    matching_records.ifPresent(records -> handleDuplicateRecord(_doc_schema.get(),
                            _custom_handler.optional()
                                    .map(handler -> Tuples._2T(handler, this._custom_context.get())),
                            _timestamp_field.get(), records, Arrays.asList(ret_obj), maybe_key.get(),
                            mutable_obj_map));
                });
                return Stream.<JsonNode>empty();
            }
        });

        final List<Object> ids = records_to_delete.map(j -> jsonToObject(j)).filter(j -> null != j)
                .collect(Collectors.toList());

        if (!ids.isEmpty()) { // fire a bulk deletion request
            mutable_uncompleted_deletes.add(
                    _dedup_context.get().deleteObjectsBySpec(CrudUtils.allOf().withAny(AnnotationBean._ID, ids)));

            _mutable_stats.deleted += ids.size();

            //(quickly see if we can reduce the number of outstanding requests)
            final Iterator<CompletableFuture<Long>> it = mutable_uncompleted_deletes.iterator();
            while (it.hasNext()) {
                final CompletableFuture<Long> cf = it.next();
                if (cf.isDone()) {
                    it.remove();
                } else
                    break; // (ie stop as soon as we hit one that isn't complete)
            }
        }

        _mutable_stats.nonduplicate_keys += mutable_obj_map.size();

        if (Optional.ofNullable(_doc_schema.get().custom_finalize_all_objects()).orElse(false)) {
            mutable_obj_map.entrySet().stream()
                    .forEach(kv -> handleCustomDeduplication(
                            _custom_handler.optional()
                                    .map(handler -> Tuples._2T(handler, this._custom_context.get())),
                            kv.getValue(), Collections.emptyList(), kv.getKey()));
        } else { // Just emit the last element of each grouped object set
            mutable_obj_map.values().stream().map(t -> t.peekLast())
                    .forEach(t -> _context.get().emitImmutableObject(t._1(), t._2().getJson(), Optional.of(t._3()),
                            Optional.empty(), Optional.empty()));
        }
    }

    /**Tidiness util (converts a long/int/double/float/string JsonNode value to its atomic type)
     * @param j
     * @return
     */
    protected static Object jsonToObject(final JsonNode j) {
        return Patterns.match(j).<Object>andReturn().when(jval -> jval.isTextual(), jval -> jval.asText())
                .when(jval -> jval.isLong(), jval -> jval.asLong()).when(jval -> jval.isInt(), jval -> jval.asInt())
                .when(jval -> jval.isFloat() || jval.isDouble(), jval -> jval.asDouble()).otherwise(__ -> null);
    }

    /** Tidiness util
     * @param policy
     * @return
     */
    private static boolean isCustom(final DeduplicationPolicy policy) {
        return (DeduplicationPolicy.custom == policy) || (DeduplicationPolicy.custom_update == policy);
    }

    /**Tidiness util
     * @param in
     * @return
     */
    private static Stream<JsonNode> deleteOtherDuplicates(final Stream<JsonNode> in) {
        return in.skip(1).map(j -> j.get(AnnotationBean._ID)).filter(j -> null != j);
    }

    /** The heart of the dedup logic
     *  (everything is handled in the correct order, except when the policy is custom, in which case the ordering is
     *   somewhat arbitrary anyway)
     * @param config
     * @param custom_handler
     * @param timestamp_field
     * @param new_records
     * @param old_records
     * @param key
     * @param mutable_obj_map
     * @return a stream of objects to delete
     */
    protected static Stream<JsonNode> handleDuplicateRecord(final DocumentSchemaBean config,
            Optional<Tuple2<IEnrichmentBatchModule, DeduplicationEnrichmentContext>> custom_handler,
            final String timestamp_field, final LinkedList<Tuple3<Long, IBatchRecord, ObjectNode>> new_records,
            final List<JsonNode> old_records, final JsonNode key,
            final Map<JsonNode, LinkedList<Tuple3<Long, IBatchRecord, ObjectNode>>> mutable_obj_map) {
        return Patterns.match(config.deduplication_policy()).<Stream<JsonNode>>andReturn()
                .when(p -> p == DeduplicationPolicy.leave, __ -> {
                    mutable_obj_map.remove(key); //(drop new record)
                    return Stream.empty();
                }).when(p -> p == DeduplicationPolicy.update, __ -> {
                    final Tuple3<Long, IBatchRecord, ObjectNode> last_record = new_records.peekLast();
                    final JsonNode old_record = old_records.stream().findFirst().get();
                    if (newRecordUpdatesOld(timestamp_field, last_record._2().getJson(), old_record)) {
                        last_record._3().set(AnnotationBean._ID, old_record.get(AnnotationBean._ID));
                        return config.delete_unhandled_duplicates() ? deleteOtherDuplicates(old_records.stream())
                                : Stream.empty();
                    } else {
                        mutable_obj_map.remove(key); //(drop new record)            
                        return Stream.empty();
                    }
                }).when(p -> p == DeduplicationPolicy.overwrite, __ -> {
                    final Tuple3<Long, IBatchRecord, ObjectNode> last_record = new_records.peekLast();
                    // Just update the new record's "_id" field
                    final JsonNode old_record = old_records.stream().findFirst().get();
                    last_record._3().set(AnnotationBean._ID, old_record.get(AnnotationBean._ID));
                    return config.delete_unhandled_duplicates() ? deleteOtherDuplicates(old_records.stream())
                            : Stream.empty();
                }).when(p -> p == DeduplicationPolicy.custom_update, __ -> {
                    final Tuple3<Long, IBatchRecord, ObjectNode> last_record = new_records.peekLast();
                    final JsonNode old_record = old_records.stream().findFirst().get();
                    if (newRecordUpdatesOld(timestamp_field, last_record._2().getJson(), old_record)) {
                        mutable_obj_map.remove(key); // (since the "final step" logic is responsible for calling the update code)
                        return handleCustomDeduplication(custom_handler, new_records, old_records, key);
                    } else {
                        mutable_obj_map.remove(key); //(drop new record)
                        return Stream.empty();
                    }
                }).otherwise(__ -> {
                    mutable_obj_map.remove(key); // (since the "final step" logic is responsible for calling the update code)      
                    return handleCustomDeduplication(custom_handler, new_records, old_records, key);
                });
    }

    /** Returns the minimal set of includes to return from the dedup query
     * @param policy
     * @param dedup_fields
     * @param timestamp_field
     * @return
     */
    protected static Tuple2<List<String>, Boolean> getIncludeFields(final DeduplicationPolicy policy,
            final List<String> dedup_fields, String timestamp_field) {
        final Tuple2<List<String>, Boolean> fields_include = Optional
                .of(Patterns.match(policy).<Tuple2<List<String>, Boolean>>andReturn()
                        .when(p -> p == DeduplicationPolicy.leave,
                                __ -> Tuples._2T(Arrays.asList(AnnotationBean._ID), true))
                        .when(p -> p == DeduplicationPolicy.update,
                                __ -> Tuples._2T(Arrays.asList(AnnotationBean._ID, timestamp_field), true))
                        .when(p -> p == DeduplicationPolicy.overwrite,
                                __ -> Tuples._2T(Arrays.asList(AnnotationBean._ID), true))
                        .otherwise(__ -> Tuples._2T(Arrays.asList(), false)))
                .map(t2 -> t2._2() ? Tuples._2T(
                        Stream.concat(t2._1().stream(), dedup_fields.stream()).collect(Collectors.toList()),
                        t2._2()) : t2)
                .get();

        return fields_include;
    }

    /** Logic to perform the custom deduplication with the current and new versions
     * @param maybe_custom_handler
     * @param new_records
     * @param old_records
     * @param key
     * @return a stream of JSON objects to delete
     */
    protected static Stream<JsonNode> handleCustomDeduplication(
            Optional<Tuple2<IEnrichmentBatchModule, DeduplicationEnrichmentContext>> maybe_custom_handler,
            final List<Tuple3<Long, IBatchRecord, ObjectNode>> new_records, final Collection<JsonNode> old_records,
            final JsonNode key) {
        return maybe_custom_handler.map(handler_context -> {
            handler_context._2().resetMutableState(old_records, key);

            final Consumer<IEnrichmentBatchModule> handler = new_module -> {
                final Stream<Tuple2<Long, IBatchRecord>> dedup_stream = Stream.concat(
                        new_records.stream().map(t3 -> Tuples._2T(t3._1(), t3._2())),
                        old_records.stream().map(old_record -> Tuples._2T(-1L,
                                (IBatchRecord) (new BatchRecordUtils.InjectedJsonBatchRecord(old_record)))));

                final int batch_size = new_records.size();

                new_module.onObjectBatch(dedup_stream, Optional.of(batch_size).filter(__ -> !old_records.isEmpty()), // (ie leave batch size blank if there's no dedup) 
                        Optional.of(key));

                new_module.onStageComplete(false);
            };

            handler.accept(handler_context._1());

            return handler_context._2().getObjectIdsToDelete();
        }).orElse(Stream.empty());
    }

    /** Compares the old and new records' timestamps (if either doesn't exist then assume we're leaving)
     *  (so that if the time isn't present we don't hammer the DB)
     * @param timestamp_field
     * @param new_record
     * @param old_record
     * @return
     */
    protected static boolean newRecordUpdatesOld(String timestamp_field, final JsonNode new_record,
            final JsonNode old_record) {
        final Optional<JsonNode> old_timestamp = JsonUtils.getProperty(timestamp_field, old_record);
        final Optional<JsonNode> new_timestamp = JsonUtils.getProperty(timestamp_field, new_record);
        final Optional<Tuple2<Long, Long>> maybe_old_new = old_timestamp
                .flatMap(old_ts -> getTimestampFromJsonNode(old_ts))
                .flatMap(old_ts -> new_timestamp.flatMap(new_ts -> getTimestampFromJsonNode(new_ts))
                        .map(new_ts -> Tuples._2T(old_ts, new_ts)));

        return maybe_old_new.filter(old_new -> old_new._2() > old_new._1()).isPresent();
    }

    /** Converts a JsonNode to a timestamp if possible
     * @param in
     * @return
     */
    public static Optional<Long> getTimestampFromJsonNode(final JsonNode in) {
        if (null == in) {
            return Optional.empty();
        } else if (in.isNumber()) {
            return Optional.of(in.asLong());
        } else if (in.isTextual()) {
            return Optional.ofNullable(
                    TimeUtils.parseIsoString(in.asText()).validation(fail -> null, success -> success.getTime()));
        } else {
            return Optional.empty();
        }
    }

    /** Utility that performs a cheap part of getDedupQuery
     * @param dedup_fields
     * @return
     */
    protected static Either<String, List<String>> getKeyFields(final List<String> dedup_fields) {
        if (1 == dedup_fields.size()) { // this is a simpler case
            final String key_field = dedup_fields.stream().findFirst().get();
            return Either.left(key_field);
        } else {
            return Either.right(dedup_fields);
        }
    }

    /** Creates the query and some associated metadata (see also getKeyFields)
     * @param batch
     * @param dedup_fields
     * @param db_field_mapper - allows the fields to be transformed (initial workaround for some ES issues, can just leave and pass f->f in once no longer needed)
     * @return a 3-tuple containing: the query to apply, the list of records indexed by the key, the field-or-fields that form the key
     */
    protected static Tuple3<QueryComponent<JsonNode>, List<Tuple2<JsonNode, Tuple2<Long, IBatchRecord>>>, Either<String, List<String>>> getDedupQuery(
            final Stream<Tuple2<Long, IBatchRecord>> batch, final List<String> dedup_fields,
            final Function<String, String> db_field_mapper) {
        if (1 == dedup_fields.size()) { // this is a simpler case
            final String key_field = dedup_fields.stream().findFirst().get();
            final List<Tuple2<JsonNode, Tuple2<Long, IBatchRecord>>> field_info = extractKeyField(batch, key_field);
            return Tuples._3T(
                    CrudUtils.allOf()
                            .withAny(db_field_mapper.apply(key_field),
                                    field_info.stream().map(t2 -> JsonUtils.jacksonToJava(t2._1()))
                                            .collect(Collectors.toList()))
                            .limit(Integer.MAX_VALUE),
                    field_info, getKeyFields(dedup_fields));
        } else {
            final List<Tuple2<JsonNode, Tuple2<Long, IBatchRecord>>> field_info = extractKeyFields(batch,
                    dedup_fields);

            final Stream<QueryComponent<JsonNode>> elements = field_info.stream().map(t2 -> {
                return Optionals.streamOf(t2._1().fields(), false).reduce(CrudUtils.allOf(), (acc, kv) -> acc
                        .when(db_field_mapper.apply(kv.getKey()), JsonUtils.jacksonToJava(kv.getValue())),
                        (acc1, acc2) -> acc1 // (not possible because the stream is not parallel())
                );
            });

            final QueryComponent<JsonNode> query_dedup = Optional
                    .of(CrudUtils.anyOf(elements).limit(Integer.MAX_VALUE))
                    //(consider putting a sort in here? I haven't so far because it seems like a lot of work when mostly there will just be a single object
                    // for each batch so we're doing a lot of sorting completely unnecessarily
                    // ... the downside is that if there are a large number of duplicates then the memory usage could get painful if they're not sorted...)
                    .get();

            return Tuples._3T(query_dedup, field_info, getKeyFields(dedup_fields));
        }
    }

    /** Utility to find fragments of a json object from single/multiple fields
     * @param in
     * @param key_fields
     * @return
     */
    protected static Optional<JsonNode> getKeyFieldsAgain(final JsonNode in,
            Either<String, List<String>> key_field_or_fields) {
        return key_field_or_fields.either(key_field -> extractKeyField(in, key_field),
                key_fields -> extractKeyFields(in, key_fields));
    }

    /** Utility to find a single field for dedup purposes 
     * @param in - stream of JSON objects
     * @param key_field
     * @return
     */
    protected static List<Tuple2<JsonNode, Tuple2<Long, IBatchRecord>>> extractKeyField(
            final Stream<Tuple2<Long, IBatchRecord>> in, final String key_field) {
        return in.map(x -> extractKeyField(x._2().getJson(), key_field).map(y -> Tuples._2T(y, x)).orElse(null))
                .filter(x -> null != x).collect(Collectors.toList());
    }

    /** Utility to find a single field for dedup purposes 
     * @param in - single JSON object
     * @param key_field
     * @return
     */
    protected static Optional<JsonNode> extractKeyField(final JsonNode in, final String key_field) {
        return JsonUtils.getProperty(key_field, in).filter(j -> j.isValueNode());
    }

    /** Utility to find a multiple-field set of values for dedup purposes 
     * @param in - stream of JSON objects
     * @param key_fields
     * @return
     */
    protected static List<Tuple2<JsonNode, Tuple2<Long, IBatchRecord>>> extractKeyFields(
            final Stream<Tuple2<Long, IBatchRecord>> in, final List<String> key_fields) {
        return in.map(x -> extractKeyFields(x._2().getJson(), key_fields).map(y -> Tuples._2T(y, x)).orElse(null))
                .filter(x -> null != x).collect(Collectors.toList());
    }

    /** Utility to find a multiple-field set of values for dedup purposes 
     * @param in - single JSON object
     * @param key_fields
     * @return
     */
    protected static Optional<JsonNode> extractKeyFields(final JsonNode in, final List<String> key_fields) {
        final ObjectNode on = key_fields.stream()
                .reduce(_mapper.createObjectNode(),
                        (acc, v) -> JsonUtils.getProperty(v, in)
                                .<ObjectNode>map(val -> (ObjectNode) acc.set(v, val)).orElse(acc),
                        (acc1, acc2) -> acc1 //(not possible because not parallel())
        );

        return Optional.of((JsonNode) on).filter(o -> 0 != o.size());
    }

    /* (non-Javadoc)
     * @see com.ikanow.aleph2.data_model.interfaces.data_import.IEnrichmentBatchModule#onStageComplete(boolean)
     */
    @SuppressWarnings("unchecked")
    @Override
    public void onStageComplete(final boolean is_original) {
        _custom_handler.optional().ifPresent(handler -> handler.onStageComplete(true));

        final Supplier<String> subsystem_builder = () -> (_is_system_dedup_stage.get() ? ""
                : ("." + Optional.ofNullable(_control.get().name()).orElse("no_name")));
        final Supplier<String> command_builder = () -> (_is_system_dedup_stage.get() ? "system"
                : Optional.ofNullable(_control.get().name()).orElse("no_name"));

        _logger.optional().ifPresent(l -> l.log(Level.DEBUG, ErrorUtils.lazyBuildMessage(true,
                () -> "DeduplicationService" + subsystem_builder.get(),
                () -> command_builder.get() + ".onStageComplete", () -> null,
                () -> ErrorUtils.get(
                        "Job {0} completed deduplication: nondup_keys={1}, dup_keys={2}, dups_inc={3}, dups_db={4}, del={5}",
                        command_builder.get(), Integer.toString(_mutable_stats.nonduplicate_keys),
                        Integer.toString(_mutable_stats.duplicate_keys),
                        Integer.toString(_mutable_stats.duplicates_incoming),
                        Integer.toString(_mutable_stats.duplicates_existing),
                        Integer.toString(_mutable_stats.deleted)),
                () -> (Map<String, Object>) _mapper.convertValue(_mutable_stats, Map.class))));

        if (!mutable_uncompleted_deletes.isEmpty()) {
            try {
                CompletableFuture.allOf(mutable_uncompleted_deletes.stream().toArray(CompletableFuture[]::new))
                        .get(60, TimeUnit.SECONDS);
            } catch (Exception e) {
                _logger.optional().ifPresent(l -> l.log(Level.ERROR,
                        ErrorUtils.lazyBuildMessage(false, () -> "DeduplicationService" + subsystem_builder.get(),
                                () -> command_builder.get() + ".onStageComplete", () -> null,
                                () -> ErrorUtils.get("Job {0}: error completing deleted ids: {1}",
                                        command_builder.get(), e.getMessage()),
                                () -> null)));
            }
        }
        _logger.optional().ifPresent(Lambdas.wrap_consumer_u(l -> l.flush().get(60, TimeUnit.SECONDS)));
    }

    /** Utility function that allows the user to override the data service used for the lookup
     * @param doc_schema
     * @return
     */
    protected static Validation<String, Tuple2<Optional<Class<? extends IUnderlyingService>>, Optional<String>>> getDataService(
            final DocumentSchemaBean doc_schema) {
        final Tuple2<Optional<Class<? extends IUnderlyingService>>, Optional<String>> service_to_use = Optional
                .ofNullable(doc_schema.lookup_service_override()).filter(s -> !s.isEmpty())
                .map(s -> s.split("[.:]", 2))
                .map(s2 -> Tuples._2T(DataServiceUtils.getUnderlyingServiceInterface(s2[0]),
                        (s2.length > 1) ? Optional.ofNullable(s2[1]) : Optional.<String>empty()))
                .orElseGet(() -> Tuples._2T(Optional.of(IDocumentService.class),
                        Optional.ofNullable(doc_schema.service_name())));

        if (!service_to_use._1().isPresent()) {
            return Validation
                    .fail(ErrorUtils.get(ErrorUtils.INVALID_LOOKUP_SERVICE, doc_schema.lookup_service_override()));
        } else
            return Validation.success(service_to_use);
    }

    /** Utility to get the list (currently 0/1) of enrichment modules that define deduplication handling
     * @param context
     * @param cfg
     * @return
     */
    final protected static Collection<IEnrichmentBatchModule> getEnrichmentModules(
            final IEnrichmentModuleContext context, final EnrichmentControlMetadataBean cfg) {

        final Optional<String> entry_point = Optional.ofNullable(cfg.entry_point()).map(Optional::of)
                .orElseGet(() -> {
                    // Get the shared library bean:

                    return BucketUtils
                            .getBatchEntryPoint(
                                    context.getServiceContext().getCoreManagementDbService().readOnlyVersion()
                                            .getSharedLibraryStore()
                                            .getObjectBySpec(CrudUtils.anyOf(SharedLibraryBean.class)
                                                    .when(SharedLibraryBean::_id, cfg.module_name_or_id())
                                                    .when(SharedLibraryBean::path_name, cfg.module_name_or_id()))
                                            .join()
                                            .map(bean -> (Map<String, SharedLibraryBean>) ImmutableMap
                                                    .of(cfg.module_name_or_id(), bean))
                                            .orElse(Collections.<String, SharedLibraryBean>emptyMap()),
                                    cfg);
                });

        return entry_point
                .map(Lambdas.wrap_u(ep -> (IEnrichmentBatchModule) Class
                        .forName(ep, true, Thread.currentThread().getContextClassLoader()).newInstance()))
                .map(i -> Arrays.asList(i)).orElse(Collections.emptyList());
    }

}
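
Usage notes

The sketch below is illustrative and is not part of DeduplicationService.java. It shows how a document schema carrying deduplication settings might be assembled with the same BeanTemplateUtils builder pattern the service itself uses in onStageInitialize; the "url" dedup field and the choice of the update policy are hypothetical, and it assumes the Aleph2 data-model classes imported above are on the classpath.

import java.util.Arrays;

import com.ikanow.aleph2.data_model.objects.data_import.DataSchemaBean.DocumentSchemaBean;
import com.ikanow.aleph2.data_model.objects.data_import.DataSchemaBean.DocumentSchemaBean.DeduplicationPolicy;
import com.ikanow.aleph2.data_model.utils.BeanTemplateUtils;

public class DedupSchemaExample {
    public static void main(String[] args) {
        // Hypothetical schema: deduplicate on a "url" field, and update the stored copy only when
        // the incoming record carries a newer timestamp (the "update" policy branch above)
        final DocumentSchemaBean doc_schema = BeanTemplateUtils.build(DocumentSchemaBean.class)
                .with(DocumentSchemaBean::enabled, true)
                .with(DocumentSchemaBean::deduplication_policy, DeduplicationPolicy.update)
                .with(DocumentSchemaBean::deduplication_fields, Arrays.asList("url"))
                .with(DocumentSchemaBean::delete_unhandled_duplicates, false)
                .done().get();

        // In a real bucket this bean would normally live under data_schema.document_schema, or be
        // supplied to the module as the doc_schema_override of its DedupConfigBean
        System.out.println("policy=" + doc_schema.deduplication_policy()
                + ", fields=" + doc_schema.deduplication_fields());
    }
}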
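
A second sketch (also illustrative, under the same classpath assumption) exercises the one public helper, getTimestampFromJsonNode, which newRecordUpdatesOld relies on when deciding whether an incoming record supersedes a stored one: numeric nodes are treated as epoch milliseconds, textual nodes are handed to TimeUtils.parseIsoString, and anything else (including null) yields Optional.empty() so the record is treated as having no usable timestamp.

import java.util.Optional;

import com.fasterxml.jackson.databind.node.LongNode;
import com.fasterxml.jackson.databind.node.TextNode;
import com.ikanow.aleph2.analytics.services.DeduplicationService;

public class TimestampExtractionExample {
    public static void main(String[] args) {
        // Numeric nodes are passed straight through as epoch milliseconds
        final Optional<Long> from_number = DeduplicationService
                .getTimestampFromJsonNode(LongNode.valueOf(1438970400000L));
        System.out.println(from_number); // Optional[1438970400000]

        // Unparseable text (or a null/non-scalar node) comes back empty, so the dedup logic
        // assumes there is no timestamp to compare and leaves the stored record alone
        final Optional<Long> from_text = DeduplicationService
                .getTimestampFromJsonNode(TextNode.valueOf("not a timestamp"));
        System.out.println(from_text); // Optional.empty
    }
}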