no.ssb.vtl.script.operations.join.AbstractJoinOperation.java Source code

Java tutorial

Introduction

Here is the source code for no.ssb.vtl.script.operations.join.AbstractJoinOperation.java

Source

package no.ssb.vtl.script.operations.join;

/*-
 * ========================LICENSE_START=================================
 * Java VTL
 * %%
 * Copyright (C) 2016 - 2017 Hadrien Kohl
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * =========================LICENSE_END==================================
 */

import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.ImmutableTable;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import com.google.common.collect.Sets;
import com.google.common.collect.Table;
import no.ssb.vtl.model.AbstractDatasetOperation;
import no.ssb.vtl.model.Component;
import no.ssb.vtl.model.DataPoint;
import no.ssb.vtl.model.DataStructure;
import no.ssb.vtl.model.Dataset;
import no.ssb.vtl.model.Order;
import no.ssb.vtl.model.VTLObject;
import no.ssb.vtl.script.support.Closer;
import no.ssb.vtl.script.support.JoinSpliterator;

import java.io.IOException;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.function.BiFunction;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;

import static com.google.common.base.Preconditions.*;
import static no.ssb.vtl.model.Order.Direction.*;

/**
 * Abstract join operation.
 * <p>
 * Contains the base logic for inner join and outer join operations.
 */
public abstract class AbstractJoinOperation extends AbstractDatasetOperation implements WorkingDataset {

    // Messages passed to checkArgument(...) by the constructor; the two containing
    // "%s" are templates that receive one argument each.
    private static final String ERROR_EMPTY_DATASET_LIST = "join operation impossible on empty dataset list";
    private static final String ERROR_INCOMPATIBLE_TYPES = "incompatible identifier types: %s";
    private static final String ERROR_NO_COMMON_IDENTIFIERS = "could not find common identifiers in the datasets %s";

    // Result of createComponentMapping(...): row = component of the join result,
    // column = source dataset, value = that dataset's own component.
    private final Table<Component, Dataset, Component> componentMapping;
    // Immutable copy of the named datasets given to the constructor, keyed by name.
    protected final ImmutableMap<String, Dataset> datasets;
    // Identifier rows of the component mapping that are present in every dataset
    // (restricted to the requested identifiers when that set is not empty).
    private final ImmutableSet<Component> commonIdentifiers;

    // Bindings created from the named datasets by createJoinScope(...).
    private final ComponentBindings joinScope;

    /**
     * Validates the datasets to join and pre-computes the join scope, the component
     * mapping and the set of common identifiers.
     *
     * @param namedDatasets the datasets to join, keyed by name; must not be null or empty
     * @param identifiers   the identifiers to join on; when empty, every identifier that is
     *                      present in all datasets is used; must not be null
     * @throws NullPointerException     if one of the arguments is null
     * @throws IllegalArgumentException if no dataset is given, if several datasets share no
     *                                  identifier, or if a common identifier has different
     *                                  types across the datasets
     */
    AbstractJoinOperation(Map<String, Dataset> namedDatasets, Set<Component> identifiers) {
        super(Lists.newArrayList(checkNotNull(namedDatasets).values()));

        checkArgument(!namedDatasets.isEmpty(), ERROR_EMPTY_DATASET_LIST);
        this.datasets = ImmutableMap.copyOf(namedDatasets);

        checkNotNull(identifiers);

        this.joinScope = createJoinScope(namedDatasets);
        this.componentMapping = createComponentMapping(this.datasets.values());

        // Keep the identifier rows that every dataset contributes to and, when specific
        // identifiers were requested, that involve at least one of them (the requested
        // identifiers can come from any dataset).
        final int datasetCount = this.datasets.size();
        Map<Component, Map<Dataset, Component>> joinRows = this.componentMapping.rowMap().entrySet().stream()
                .filter(row -> row.getKey().isIdentifier()
                        && row.getValue().size() == datasetCount
                        && (identifiers.isEmpty()
                                || !Collections.disjoint(row.getValue().values(), identifiers)))
                .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));

        // A single dataset does not need any common identifier.
        checkArgument(namedDatasets.size() == 1 || !joinRows.isEmpty(), ERROR_NO_COMMON_IDENTIFIERS, namedDatasets);

        this.commonIdentifiers = ImmutableSet.copyOf(joinRows.keySet());

        // Each common identifier must have one single type across all the datasets.
        List<String> mismatches = Lists.newArrayList();
        for (Map.Entry<Component, Map<Dataset, Component>> row : joinRows.entrySet()) {
            Multimap<Class<?>, Dataset> datasetsByType = ArrayListMultimap.create();
            for (Map.Entry<Dataset, Component> cell : row.getValue().entrySet()) {
                datasetsByType.put(cell.getValue().getType(), cell.getKey());
            }
            if (datasetsByType.keySet().size() != 1) {
                mismatches.add(String.format("%s -> (%s)", row.getKey(), datasetsByType));
            }
        }
        checkArgument(mismatches.isEmpty(), ERROR_INCOMPATIBLE_TYPES, String.join(", ", mismatches));
    }

    /**
     * Creates a Bindings that contains the unique components of this join operation and the
     * datasets.
     * <p>
     * The bindings are built by handing the named datasets to the
     * {@link ComponentBindings} constructor; this method adds nothing else.
     *
     * @param namedDatasets the datasets of the join, keyed by name
     * @return a new {@link ComponentBindings} created from the given datasets
     */
    @VisibleForTesting
    static ComponentBindings createJoinScope(Map<String, Dataset> namedDatasets) {
        return new ComponentBindings(namedDatasets);
    }

    /**
     * Builds the table that links each component of the join result (row) to the
     * component it originates from in each underlying dataset (column).
     *
     * <pre>
     * +------+-----------------+
     * |  re  |     Dataset     |
     * |  su  +-----+-----+-----+
     * |  lt  | ds1 | dsN | ... |
     * +------+-----+-----+-----+
     * | ref  | ref | ref |     |
     * +------+-----+-----+     +
     * | ref  | ref | ref |     |
     * +------+-----+-----+     +
     * | ...  |                 |
     * +------+-----+-----+-----+
     * </pre>
     * <p>
     * Identifiers that share a name are grouped in one row, keyed by the component of
     * the first dataset that declares that name. Every other component gets its own row.
     *
     * @param datasets the datasets
     * @return the component table
     */
    @VisibleForTesting
    static Table<Component, Dataset, Component> createComponentMapping(Iterable<Dataset> datasets) {

        // First identifier component encountered for each identifier name.
        Map<String, Component> identifierRows = Maps.newHashMap();
        ImmutableTable.Builder<Component, Dataset, Component> mapping = ImmutableTable.builder();

        for (Dataset dataset : datasets) {
            for (Map.Entry<String, Component> entry : dataset.getDataStructure().entrySet()) {
                String name = entry.getKey();
                Component own = entry.getValue();

                Component row = own.isIdentifier()
                        ? identifierRows.computeIfAbsent(name, unused -> own)
                        : own;

                mapping.put(row, dataset, own);
            }
        }

        return mapping.build();
    }

    /**
     * Creates a function that converts a data point into a map keyed by the components
     * of the given structure.
     *
     * @param structure the structure used to interpret the data points
     * @return a function returning {@code structure.asMap(dataPoint)}, or {@code null}
     *         when the data point itself is {@code null}
     */
    @Deprecated
    private static Function<DataPoint, Map<Component, VTLObject>> createKeyExtractor(
            final DataStructure structure) {
        return dataPoint -> {
            if (dataPoint == null) {
                return null;
            }
            return structure.asMap(dataPoint);
        };
    }

    /**
     * Returns the function used to merge two data points while joining.
     * <p>
     * This method is deprecated.
     * <p>
     * Merger are closely related to the type of join. OuterJoin will be
     * refactored to stop using this method.
     * <p>
     * The returned function is handed to the {@code JoinSpliterator} created for each
     * join step in {@code getData(Order, Filtering, Set)}.
     *
     * @param leftDataset  the dataset that precedes {@code rightDataset} in the join
     * @param rightDataset the dataset being joined in the current step
     * @return the merge function (how it combines the two data points is up to the
     *         implementing join type)
     */
    @Deprecated
    protected abstract BiFunction<DataPoint, DataPoint, DataPoint> getMerger(Dataset leftDataset,
            Dataset rightDataset);

    /**
     * Returns the data of the given dataset sorted according to the given order.
     * <p>
     * The order is expressed with the components of the join result, so it is first
     * translated to the dataset's own components using the component mapping. Only
     * the identifiers of the dataset are retained. The dataset is asked to provide
     * the data in that order; when it cannot, the data is sorted in memory.
     *
     * @param dataset the dataset to read
     * @param order   the order, expressed with components of the join result
     * @return the sorted data points of the dataset
     */
    protected Stream<DataPoint> sortIfNeeded(Dataset dataset, Order order) {
        Order.Builder translated = Order.create(dataset.getDataStructure());
        Table<Component, Dataset, Component> mapping = getComponentMapping();

        for (Map.Entry<Component, Order.Direction> requested : order.entrySet()) {
            // Null when the dataset has no counterpart for this component.
            Component own = mapping.row(requested.getKey()).get(dataset);
            if (own != null && own.isIdentifier()) {
                translated.put(own, requested.getValue());
            }
        }

        Order datasetOrder = translated.build();
        Optional<Stream<DataPoint>> alreadySorted = dataset.getData(datasetOrder);
        if (alreadySorted.isPresent()) {
            return alreadySorted.get();
        }
        return dataset.getData().sorted(datasetOrder);
    }

    /**
     * Creates the comparator used to compare the key of a left data point with the key
     * of a data point coming from {@code rightDataset}.
     * <p>
     * Only the common identifiers present in the order are compared. The left key is
     * read with the join components; the right key is read with the corresponding
     * components of the right dataset. A {@code null} left key is always reported as
     * smaller, and a {@code null} right key as greater.
     *
     * @param rightDataset the dataset the right keys are extracted from
     * @param order        the order the data points are sorted with
     * @return the key comparator
     */
    @Deprecated
    private Comparator<Map<Component, VTLObject>> createKeyComparator(Dataset rightDataset, Order order) {

        // Restrict the order to the identifiers the join is made on.
        Set<Component> joinIdentifiers = Sets.newHashSet(getCommonIdentifiers());
        final Map<Component, Order.Direction> joinOrder = Maps.filterKeys(order, joinIdentifiers::contains);
        final Table<Component, Dataset, Component> mapping = getComponentMapping();

        return (leftKey, rightKey) -> {
            if (leftKey == null) {
                return -1;
            }
            if (rightKey == null) {
                return 1;
            }
            if (leftKey == rightKey) {
                return 0;
            }

            for (Map.Entry<Component, Order.Direction> ordered : joinOrder.entrySet()) {
                Component leftComponent = ordered.getKey();
                Component rightComponent = mapping.row(leftComponent).get(rightDataset);

                int comparison = Order.NULLS_FIRST.compare(
                        leftKey.get(leftComponent), rightKey.get(rightComponent));

                if (comparison != 0) {
                    return ordered.getValue() == ASC ? comparison : -comparison;
                }
            }
            return 0;
        };
    }

    /**
     * Returns the joined data points, sorted with the requested order.
     * <p>
     * With a single dataset the data of that dataset is returned directly. Otherwise
     * the datasets are joined one after the other, each being sorted with
     * {@link #sortIfNeeded(Dataset, Order)} and merged into the running result by a
     * {@code JoinSpliterator}.
     * <p>
     * Every underlying stream is registered with a {@code Closer} that is closed when
     * the returned stream is closed. If building the join fails, the streams opened
     * so far are closed before the failure is propagated.
     *
     * @param requestedOrder the order the result should be sorted with
     * @param filtering      currently ignored (see TODO)
     * @param components     currently ignored (see TODO)
     * @return the joined data, or an empty optional if the requested order is not
     *         compatible with the join
     */
    @Override
    public Optional<Stream<DataPoint>> getData(Order requestedOrder, Filtering filtering, Set<String> components) {

        // Optimization.
        if (datasets.size() == 1) {
            Dataset dataset = datasets.values().iterator().next();
            return Optional.of(sortIfNeeded(dataset, requestedOrder));
        }

        // Check if requested order is compatible.
        Optional<Order> order = createCompatibleOrder(getDataStructure(), getCommonIdentifiers(), requestedOrder);
        if (!order.isPresent())
            return Optional.empty();

        // TODO: Filtering

        // TODO: Components

        Iterator<Dataset> iterator = datasets.values().iterator();

        Dataset left = iterator.next();
        Dataset right = left;

        // Create the resulting data points.
        final DataStructure joinStructure = getDataStructure();
        final DataStructure structure = left.getDataStructure();

        // Close all children
        Closer closer = Closer.create();

        try {
            Stream<DataPoint> result = closer.register(sortIfNeeded(left, requestedOrder)
                    .map(dataPoint -> joinStructure.fromMap(structure.asMap(dataPoint))));

            while (iterator.hasNext()) {
                left = right;
                right = iterator.next();
                result = StreamSupport.stream(new JoinSpliterator<>(createKeyComparator(right, requestedOrder),
                        result.spliterator(), closer.register(sortIfNeeded(right, requestedOrder)).spliterator(),
                        createKeyExtractor(joinStructure), createKeyExtractor(right.getDataStructure()),
                        getMerger(left, right)), false);
            }
            return Optional.of(result.onClose(() -> {
                try {
                    closer.close();
                } catch (IOException e) {
                    throw new RuntimeException(e);
                }
            }));
        } catch (RuntimeException | Error failure) {
            // The caller never receives a stream to close, so release the streams
            // that were already opened; the original failure stays the primary one.
            try {
                closer.close();
            } catch (IOException | RuntimeException closeFailure) {
                failure.addSuppressed(closeFailure);
            }
            throw failure;
        }
    }

    /**
     * Returns the identifiers the join is made on, as computed by the constructor.
     *
     * @return the common identifiers, expressed with components of the join result
     */
    protected ImmutableSet<Component> getCommonIdentifiers() {
        return this.commonIdentifiers;
    }

    /**
     * Returns the joined data sorted in ascending order on the common identifiers
     * only, which is the cheapest order for the join itself.
     *
     * @return the joined data points
     * @throws RuntimeException if the data cannot be obtained with that order
     */
    @Override
    public final Stream<DataPoint> getData() {
        Order.Builder joinOrder = Order.create(getDataStructure());
        // TODO: Direction.ANY
        getCommonIdentifiers().forEach(identifier -> joinOrder.put(identifier, ASC));

        Optional<Stream<DataPoint>> sorted = getData(joinOrder.build());
        if (!sorted.isPresent()) {
            throw new RuntimeException("could not sort data");
        }
        return sorted.get();
    }

    /**
     * Try to create an order that is compatible with the join using the requested order.
     * <p>
     * Join operations need the common identifiers to be first: the requested order is
     * walked in sequence and, as long as some of the {@code firstComponents} have not
     * been encountered yet, the next ordered component has to be one of them. An
     * order that ends before all {@code firstComponents} were encountered is accepted.
     *
     * @param structure       the structure the resulting order is created for
     * @param firstComponents the components that must come first in the order
     * @param requestedOrder  the requested order
     * @return an order with the same entries as the requested order, or an empty
     *         optional if another component precedes one of the {@code firstComponents}
     */
    @VisibleForTesting
    Optional<Order> createCompatibleOrder(DataStructure structure, ImmutableSet<Component> firstComponents,
            Order requestedOrder) {

        // Components that still have to show up before any other component.
        Set<Component> pending = Sets.newHashSet(firstComponents);

        Order.Builder compatible = Order.create(structure);
        for (Map.Entry<Component, Order.Direction> requested : requestedOrder.entrySet()) {
            Component component = requested.getKey();

            boolean expectingFirstComponent = !pending.isEmpty();
            if (expectingFirstComponent && !pending.remove(component)) {
                return Optional.empty();
            }

            compatible.put(component, requested.getValue());
        }
        return Optional.of(compatible.build());
    }

    /**
     * Checks that no dataset other than the named one declares a component with the
     * given name.
     *
     * @param datasetName   the name of the dataset that owns the component
     * @param componentName the name of the component to look for
     * @return {@code false} if another dataset has a component with that name,
     *         {@code true} otherwise
     */
    @VisibleForTesting
    boolean componentNameIsUnique(String datasetName, String componentName) {
        for (Map.Entry<String, Dataset> other : datasets.entrySet()) {
            if (datasetName.equals(other.getKey())) {
                // Skip the dataset that owns the component.
                continue;
            }
            if (other.getValue().getDataStructure().keySet().contains(componentName)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Computes the structure of the join result.
     * <p>
     * With a single dataset its structure is reused as is. Otherwise identifiers are
     * added once per name (the first dataset declaring the name wins), and every other
     * component keeps its name when no other dataset uses it or is renamed to
     * {@code datasetName_componentName} when it does.
     *
     * @return the structure of the joined dataset
     */
    @Override
    protected DataStructure computeDataStructure() {
        // Optimization.
        if (datasets.size() == 1) {
            return datasets.values().iterator().next().getDataStructure();
        }

        Set<String> identifierNames = Sets.newHashSet();
        DataStructure.Builder joined = DataStructure.builder();
        for (Map.Entry<String, Dataset> named : datasets.entrySet()) {
            String datasetName = named.getKey();
            for (Map.Entry<String, Component> entry : named.getValue().getDataStructure().entrySet()) {
                String componentName = entry.getKey();
                Component component = entry.getValue();

                if (component.isIdentifier()) {
                    // Identifiers sharing a name are merged: only the first is kept.
                    if (identifierNames.add(componentName)) {
                        joined.put(entry);
                    }
                    continue;
                }

                // Prefix ambiguous names with the name of the dataset.
                String resultName = componentNameIsUnique(datasetName, componentName)
                        ? componentName
                        : datasetName.concat("_".concat(componentName));
                joined.put(resultName, component);
            }
        }
        return joined.build();
    }

    /**
     * Returns the bindings created from the named datasets when this operation was
     * constructed.
     *
     * @return the join scope
     */
    public ComponentBindings getJoinScope() {
        return joinScope;
    }

    /**
     * Returns the table computed at construction time that maps each component of the
     * join result (row) and each dataset (column) to the dataset's own component.
     *
     * @return the component mapping
     */
    public Table<Component, Dataset, Component> getComponentMapping() {
        return componentMapping;
    }

}