org.apache.tinkerpop.gremlin.structure.util.batch.BatchGraph.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.tinkerpop.gremlin.structure.util.batch.BatchGraph.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.tinkerpop.gremlin.structure.util.batch;

import org.apache.commons.configuration.BaseConfiguration;
import org.apache.commons.configuration.Configuration;
import org.apache.tinkerpop.gremlin.process.T;
import org.apache.tinkerpop.gremlin.process.Traversal;
import org.apache.tinkerpop.gremlin.process.TraversalEngine;
import org.apache.tinkerpop.gremlin.process.computer.GraphComputer;
import org.apache.tinkerpop.gremlin.process.graph.traversal.GraphTraversal;
import org.apache.tinkerpop.gremlin.process.graph.traversal.VertexTraversal;
import org.apache.tinkerpop.gremlin.process.traversal.engine.StandardTraversalEngine;
import org.apache.tinkerpop.gremlin.structure.Direction;
import org.apache.tinkerpop.gremlin.structure.Edge;
import org.apache.tinkerpop.gremlin.structure.Element;
import org.apache.tinkerpop.gremlin.structure.Graph;
import org.apache.tinkerpop.gremlin.structure.Property;
import org.apache.tinkerpop.gremlin.structure.Transaction;
import org.apache.tinkerpop.gremlin.structure.Vertex;
import org.apache.tinkerpop.gremlin.structure.VertexProperty;
import org.apache.tinkerpop.gremlin.structure.util.ElementHelper;
import org.apache.tinkerpop.gremlin.structure.util.batch.cache.VertexCache;
import org.apache.tinkerpop.gremlin.util.iterator.IteratorUtils;

import java.util.Collections;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.Optional;
import java.util.Set;
import java.util.function.BiConsumer;
import java.util.function.Consumer;
import java.util.function.Function;

/**
 * {@code BatchGraph} is a wrapper that enables batch loading of a large number of edges and vertices by chunking the entire
 * load into smaller batches and maintaining a memory-efficient vertex cache so that the entire transactional state can
 * be flushed after each chunk is loaded.
 * <br />
 * {@code BatchGraph} is ONLY meant for loading data and does not support any retrieval or removal operations.
 * That is, BatchGraph only supports the following methods:
 * - {@link #addVertex(Object...)} for adding vertices
 * - {@link Vertex#addEdge(String, org.apache.tinkerpop.gremlin.structure.Vertex, Object...)} for adding edges
 * - {@link #V(Object...)} to be used when adding edges
 * - Property getter, setter and removal methods for vertices and edges.
 * <br />
 * An important limitation of BatchGraph is that edge properties can only be set immediately after the edge has been added.
 * If other vertices or edges have been created in the meantime, setting, getting or removing properties will throw
 * exceptions. This is done to avoid caching of edges, which would require a great amount of memory.
 * <br />
 * {@code BatchGraph} can also automatically set the provided element ids as properties on the respective element. Use
 * {@link Builder#vertexIdKey(String)} and {@link Builder#edgeIdKey(String)} to set the keys
 * for the vertex and edge properties respectively. This makes the loaded graph compatible with later
 * use of {@link org.apache.tinkerpop.gremlin.structure.strategy.IdStrategy}.
 * <br/>
 * Note that {@code BatchGraph} itself is not a {@link org.apache.tinkerpop.gremlin.structure.strategy.GraphStrategy} because
 * it requires that the {@link Vertex} implementation not hold on to the underlying {@link Vertex} reference and
 * {@link org.apache.tinkerpop.gremlin.structure.strategy.StrategyVertex} does that by its very nature.  While it might
 * be possible to work around this issue, it is likely better for performance to simply leave this as a "half-wrapper"
 * implementation, instead of forcing it into a {@link org.apache.tinkerpop.gremlin.structure.strategy.GraphStrategy}.
 *
 * @author Matthias Broecheler (http://www.matthiasb.com)
 * @author Stephen Mallette (http://stephen.genoprime.com)
 * @author Marko A. Rodriguez (http://markorodriguez.com)
 */
public class BatchGraph<G extends Graph> implements Graph, Graph.Iterators {
    /**
     * Default buffer size is 10000.
     */
    public static final long DEFAULT_BUFFER_SIZE = 10000;

    // The wrapped graph that actually receives every vertex and edge.
    private final G baseGraph;

    // Property key under which addVertex stores the supplied vertex id (unless it equals the T.id accessor);
    // also the key used to look up existing vertices in the base graph during incremental loading.
    private final String vertexIdKey;
    // Edge counterpart of vertexIdKey; only consulted by the incremental-loading branch of addEdge.
    private final String edgeIdKey;
    // When true, elements are looked up in the base graph before being created.
    private final boolean incrementalLoading;
    // Feature flags of the base graph, read once in the constructor.
    private final boolean baseSupportsSuppliedVertexId;
    private final boolean baseSupportsSuppliedEdgeId;
    private final boolean baseSupportsTransactions;
    // Callbacks invoked with (existing element, key/values) when incremental loading finds the element
    // already present in the base graph.
    private final BiConsumer<Element, Object[]> existingVertexStrategy;
    private final BiConsumer<Element, Object[]> existingEdgeStrategy;

    // Maps external vertex ids to either the base Vertex itself or its internal id (see retrieveFromCache).
    private final VertexCache cache;

    // Number of elements added between commits, and how many may still be added before the next commit.
    private final long bufferSize;
    private long remainingBufferSize;

    // The most recently added edge wrapper and the base edge it stands for; both are cleared whenever
    // another element is added, which is what puts older BatchEdge instances "out of scope".
    private BatchEdge currentEdge = null;
    private Edge currentEdgeCached = null;

    // External id of the out-vertex of the last added edge; lets vertexIterator skip the cache lookup.
    private Object previousOutVertexId = null;

    // Features object returned by features(); built from the base graph's features.
    private final BatchFeatures batchFeatures;

    // Transaction facade returned by tx().
    private final Transaction batchTransaction;

    /**
     * Creates a batch-loading wrapper around {@code graph}. Instances are created through
     * {@link Builder#create()}; this constructor performs no validation of its own.
     *
     * @param graph                  graph to be wrapped; its vertex-id, edge-id and transaction feature flags
     *                               are read here and kept for the lifetime of this instance
     * @param type                   type of vertex id expected; it supplies the vertex cache, so it determines
     *                               the memory footprint of that cache
     * @param bufferSize             number of vertices and edges loaded before the base graph transaction is
     *                               committed; the larger this value, the more memory is required but the
     *                               faster the loading process
     * @param vertexIdKey            property key under which supplied vertex ids are stored and looked up
     * @param edgeIdKey              property key under which supplied edge ids are stored and looked up
     * @param incrementalLoading     whether elements may already exist in {@code graph}
     * @param existingVertexStrategy callback applied to a vertex found to exist during incremental loading
     * @param existingEdgeStrategy   callback applied to an edge found to exist during incremental loading
     */
    private BatchGraph(final G graph, final VertexIdType type, final long bufferSize, final String vertexIdKey,
            final String edgeIdKey, final boolean incrementalLoading,
            final BiConsumer<Element, Object[]> existingVertexStrategy,
            final BiConsumer<Element, Object[]> existingEdgeStrategy) {
        // loading options, copied straight from the builder
        this.vertexIdKey = vertexIdKey;
        this.edgeIdKey = edgeIdKey;
        this.incrementalLoading = incrementalLoading;
        this.existingVertexStrategy = existingVertexStrategy;
        this.existingEdgeStrategy = existingEdgeStrategy;

        // commit window: start with a full buffer
        this.bufferSize = bufferSize;
        this.remainingBufferSize = bufferSize;

        // the wrapped graph has to be assigned before BatchTransaction is created, because the
        // transaction reads baseGraph's features in its own constructor
        this.baseGraph = graph;
        this.batchTransaction = new BatchTransaction();
        this.batchFeatures = new BatchFeatures(graph.features());
        this.cache = type.getVertexCache();

        // capability flags of the wrapped graph
        this.baseSupportsSuppliedEdgeId = this.baseGraph.features().edge().supportsUserSuppliedIds();
        this.baseSupportsSuppliedVertexId = this.baseGraph.features().vertex().supportsUserSuppliedIds();
        this.baseSupportsTransactions = this.baseGraph.features().graph().supportsTransactions();
    }

    /**
     * Bookkeeping performed before every vertex or edge is added: the previously added edge goes out of
     * scope and, once the buffer is used up, the base graph transaction is committed (if supported), the
     * vertex cache is notified of the new transaction and the buffer is refilled.
     */
    private void nextElement() {
        // any edge handed out earlier may no longer be touched
        currentEdgeCached = null;
        currentEdge = null;

        final boolean bufferExhausted = remainingBufferSize <= 0;
        if (bufferExhausted) {
            if (this.baseSupportsTransactions) {
                baseGraph.tx().commit();
            }
            cache.newTransaction();
            remainingBufferSize = bufferSize;
        }

        // account for the element that is about to be added
        remainingBufferSize -= 1;
    }

    /**
     * Resolves an external vertex id through the vertex cache.
     *
     * @param externalID the id supplied by the caller when the vertex was added
     * @return the base graph vertex, or {@code null} if the cache holds no entry for the id
     */
    private Vertex retrieveFromCache(final Object externalID) {
        final Object entry = cache.getEntry(externalID);
        if (entry == null) {
            return null;
        }
        if (entry instanceof Vertex) {
            return (Vertex) entry;
        }

        // the cache only holds the internal id: fetch the vertex from the base graph and re-cache it
        final Vertex resolved = baseGraph.V(entry).next();
        cache.set(resolved, externalID);
        return resolved;
    }

    /**
     * Same lookup as {@link #retrieveFromCache(Object)}, but a missing entry is treated as an error.
     *
     * @param externalID the id supplied by the caller when the vertex was added
     * @return the base graph vertex, never {@code null}
     * @throws IllegalArgumentException if the cache holds no entry for the id
     */
    private Vertex getCachedVertex(final Object externalID) {
        final Vertex cached = retrieveFromCache(externalID);
        if (null != cached) {
            return cached;
        }
        throw new IllegalArgumentException("Vertex for given ID cannot be found: " + externalID);
    }

    /**
     * Adds a vertex to the wrapped graph (or, during incremental loading, reuses the one already stored
     * under the same id) and registers it in the vertex cache under its external id.
     *
     * @param keyValues key/value pairs for the vertex; must contain an id
     * @return a lightweight handle that refers to the vertex by its external id only
     * @throws IllegalArgumentException if no id is supplied, or if the id is already cached while not
     *                                  loading incrementally
     * @throws IllegalStateException    if incremental loading finds more than one vertex for the id
     */
    @Override
    public Vertex addVertex(final Object... keyValues) {
        final Object id = ElementHelper.getIdValue(keyValues)
                .orElseThrow(() -> new IllegalArgumentException("Vertex id value cannot be null"));
        final boolean loadingFromScratch = !incrementalLoading;
        if (loadingFromScratch && retrieveFromCache(id) != null) {
            throw new IllegalArgumentException("Vertex id already exists");
        }
        nextElement();

        final boolean idKeyIsTokenId = T.id.getAccessor().equals(vertexIdKey);

        // unless the id key is T.id itself, store the id as a regular property under vertexIdKey
        // (overwriting whatever was supplied for that key)
        final Object[] withIdProperty;
        if (idKeyIsTokenId) {
            withIdProperty = keyValues;
        } else {
            withIdProperty = ElementHelper.upsert(keyValues, vertexIdKey, id);
        }

        // T.id is only handed to the base graph when it accepts user supplied ids and the id key is T.id;
        // otherwise the pair is stripped so that it cannot foul up the insertion
        final Optional<Object[]> insertArgs;
        if (this.baseSupportsSuppliedVertexId && idKeyIsTokenId) {
            insertArgs = Optional.ofNullable(keyValues);
        } else {
            insertArgs = ElementHelper.remove(T.id, withIdProperty);
        }

        final Vertex loaded;
        if (loadingFromScratch) {
            loaded = createBaseVertex(insertArgs);
        } else {
            final Traversal<Vertex, Vertex> matches = baseGraph.V().has(vertexIdKey, id);
            if (!matches.hasNext()) {
                loaded = createBaseVertex(insertArgs);
            } else {
                final Vertex found = matches.next();
                if (matches.hasNext()) {
                    throw new IllegalStateException(
                            String.format("There is more than one vertex identified by %s=%s", vertexIdKey, id));
                }

                // the caller-provided strategy decides what happens to the existing vertex
                insertArgs.ifPresent(args -> existingVertexStrategy.accept(found, args));
                loaded = found;
            }
        }

        cache.set(loaded, id);

        return new BatchVertex(id);
    }

    /**
     * Creates a vertex in the wrapped graph, with the given key/values when present and without any otherwise.
     */
    private Vertex createBaseVertex(final Optional<Object[]> insertArgs) {
        if (insertArgs.isPresent()) {
            return baseGraph.addVertex(insertArgs.get());
        }
        return baseGraph.addVertex();
    }

    /**
     * Edge retrieval is not available while batch loading.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public GraphTraversal<Edge, Edge> E(final Object... edgeIds) {
        throw retrievalNotSupported();
    }

    /**
     * Returns this instance, which implements {@code Graph.Iterators} itself.
     */
    @Override
    public Iterators iterators() {
        return this;
    }

    /**
     * Looks up a single, previously loaded vertex by its external id so that edges can be attached to it.
     * <p/>
     * Exactly one id must be supplied. The id of the out-vertex of the most recently added edge is answered
     * without consulting the cache. Otherwise the cache is consulted and, during incremental loading only,
     * the wrapped graph is searched by the vertex id key when the cache has no entry.
     *
     * @param vertexIds a single external vertex id
     * @return an iterator over one {@code BatchVertex} handle, or an empty iterator if the vertex is unknown
     * @throws UnsupportedOperationException if no id is supplied, since iterating all vertices is a retrieval
     *                                       operation that batch loading does not support
     * @throws IllegalArgumentException      if more than one id is supplied
     * @throws IllegalStateException         if the wrapped graph holds several vertices for the id
     */
    @Override
    public Iterator<Vertex> vertexIterator(final Object... vertexIds) {
        // previously an empty array fell through to vertexIds[0] and failed with an
        // ArrayIndexOutOfBoundsException; report the unsupported "all vertices" request explicitly
        if (vertexIds == null || vertexIds.length == 0)
            throw retrievalNotSupported();
        if (vertexIds.length > 1)
            throw new IllegalArgumentException("BatchGraph only allows a single vertex id at one time");

        final Object externalId = vertexIds[0];
        if ((this.previousOutVertexId != null) && (this.previousOutVertexId.equals(externalId))) {
            return IteratorUtils.of(new BatchVertex(this.previousOutVertexId));
        }

        Vertex vertex = retrieveFromCache(externalId);
        if (null == vertex) {
            // when loading from scratch the cache is authoritative
            if (!this.incrementalLoading)
                return Collections.emptyIterator();

            final Iterator<Vertex> iterator = this.baseGraph.V().has(this.vertexIdKey, externalId);
            if (!iterator.hasNext())
                return Collections.emptyIterator();
            vertex = iterator.next();
            if (iterator.hasNext())
                throw new IllegalStateException(
                        "There are multiple vertices with the provided id in the graph: " + externalId);
            this.cache.set(vertex, externalId);
        }
        return IteratorUtils.of(new BatchVertex(externalId));
    }

    /**
     * Edge retrieval is not available while batch loading.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public Iterator<Edge> edgeIterator(final Object... edgeIds) {
        throw retrievalNotSupported();
    }

    /**
     * Traversals are retrieval operations and therefore not available while batch loading.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public <T extends Traversal<S, S>, S> T of(final Class<T> traversalClass) {
        throw retrievalNotSupported();
    }

    /**
     * Graph computers are not available on a {@code BatchGraph}; this always throws the exception created
     * by {@code Exceptions.graphComputerNotSupported()}.
     */
    @Override
    public void compute(final Class<? extends GraphComputer> graphComputerClass) {
        throw Exceptions.graphComputerNotSupported();
    }

    /**
     * Graph computers are not available on a {@code BatchGraph}; this always throws the exception created
     * by {@code Exceptions.graphComputerNotSupported()}.
     */
    @Override
    public GraphComputer compute() {
        throw Exceptions.graphComputerNotSupported();
    }

    /**
     * Always reports {@code StandardTraversalEngine.standard}.
     */
    @Override
    public TraversalEngine engine() {
        return StandardTraversalEngine.standard;
    }

    /**
     * Does nothing: the supplied engine is ignored and {@link #engine()} keeps returning the standard engine.
     */
    @Override
    public void engine(final TraversalEngine traversalEngine) {

    }

    /**
     * Returns the batch transaction facade created in the constructor, not the base graph's own
     * transaction object.
     */
    @Override
    public Transaction tx() {
        return this.batchTransaction;
    }

    /**
     * Graph variables are not available on a {@code BatchGraph}; this always throws the exception created
     * by {@code Exceptions.variablesNotSupported()}.
     */
    @Override
    public Variables variables() {
        throw Exceptions.variablesNotSupported();
    }

    /**
     * Returns a newly constructed {@link BaseConfiguration} on every call; nothing is read from the
     * wrapped graph's configuration.
     */
    @Override
    public Configuration configuration() {
        return new BaseConfiguration();
    }

    /**
     * Returns the {@code BatchFeatures} instance that was built from the wrapped graph's features in the
     * constructor.
     */
    @Override
    public Features features() {
        return this.batchFeatures;
    }

    /**
     * Closes the wrapped graph and then resets the batch-loading state.
     *
     * @throws Exception whatever {@code baseGraph.close()} throws; the reset is skipped in that case
     */
    @Override
    public void close() throws Exception {
        baseGraph.close();

        // call reset after the close in case the close behavior fails
        reset();
    }

    /**
     * Resets the batch-loading state: the buffer is marked as used up, so the next added element starts a
     * fresh commit window, and the current edge goes out of scope.
     */
    private void reset() {
        remainingBufferSize = 0;
        currentEdgeCached = null;
        currentEdge = null;
    }

    /**
     * Starts building a {@code BatchGraph} around the given graph.
     *
     * @param g   the graph to load into; rejected by the {@link Builder} if it is {@code null} or already a
     *            {@code BatchGraph}
     * @param <G> type of the wrapped graph, carried through to {@link Builder#create()}
     * @return a builder bound to {@code g}
     */
    // The type parameter was previously called T, which shadowed the imported traversal token class T,
    // and the return type was the raw Builder, which discarded the graph type.
    public static <G extends Graph> Builder<G> build(final G g) {
        return new Builder<>(g);
    }

    /**
     * {@link Transaction} facade returned by {@link BatchGraph#tx()}. Lifecycle calls are forwarded to the
     * wrapped graph's transaction when that graph supports transactions and are skipped otherwise. Calls
     * that would change transaction behavior, roll back, or start threaded or submitted work are rejected.
     */
    private class BatchTransaction implements Transaction {
        private final boolean supportsTx;

        public BatchTransaction() {
            this.supportsTx = baseGraph.features().graph().supportsTransactions();
        }

        @Override
        public void open() {
            if (supportsTx) {
                baseGraph.tx().open();
            }
        }

        @Override
        public void commit() {
            if (supportsTx) {
                baseGraph.tx().commit();
            }

            // reset only after the commit, so it is skipped if the commit throws
            reset();
        }

        @Override
        public void rollback() {
            throw new UnsupportedOperationException("Cannot issue a rollback during batch loading");
        }

        @Override
        public void close() {
            if (supportsTx) {
                baseGraph.tx().close();
            }

            // reset only after the close, so it is skipped if the close throws
            reset();
        }

        @Override
        public void readWrite() {
            if (supportsTx) {
                baseGraph.tx().readWrite();
            }
        }

        @Override
        public boolean isOpen() {
            // without transaction support the facade always counts as open
            return supportsTx ? baseGraph.tx().isOpen() : true;
        }

        @Override
        public <G extends Graph> G create() {
            throw new UnsupportedOperationException("Cannot start threaded transaction during batch loading");
        }

        @Override
        public <R> Workload<R> submit(final Function<Graph, R> work) {
            throw new UnsupportedOperationException("Cannot submit a workload during batch loading");
        }

        @Override
        public Transaction onReadWrite(final Consumer<Transaction> consumer) {
            throw new UnsupportedOperationException(
                    "Transaction behavior cannot be altered in batch mode - set the behavior on the base graph");
        }

        @Override
        public Transaction onClose(final Consumer<Transaction> consumer) {
            throw new UnsupportedOperationException(
                    "Transaction behavior cannot be altered in batch mode - set the behavior on the base graph");
        }
    }

    /**
     * Vertex handle returned to callers of {@link BatchGraph}. It stores only the external id and resolves
     * the base graph vertex through the vertex cache on every access, so it never holds on to a base
     * {@link Vertex} reference. Adjacency retrieval and removal are not supported.
     */
    private class BatchVertex implements Vertex, Vertex.Iterators, VertexTraversal {

        private final Object externalID;

        BatchVertex(final Object id) {
            if (null == id) {
                throw new IllegalArgumentException("External id may not be null");
            }
            this.externalID = id;
        }

        /**
         * Resolves the base graph vertex this handle stands for; fails if it is not in the cache.
         */
        private Vertex resolveBaseVertex() {
            return getCachedVertex(externalID);
        }

        /**
         * Creates the edge in the base graph, with the given key/values when present and without any otherwise.
         */
        private Edge createBaseEdge(final Vertex out, final String label, final Vertex in,
                final Optional<Object[]> edgeArgs) {
            if (edgeArgs.isPresent()) {
                return out.addEdge(label, in, edgeArgs.get());
            }
            return out.addEdge(label, in);
        }

        /**
         * Adds an edge from this vertex to {@code inVertex} in the base graph. During incremental loading an
         * edge that already exists under the supplied id is reused and passed to the existing-edge strategy
         * instead. The returned edge is only usable until the next vertex or edge is added.
         *
         * @throws IllegalArgumentException if {@code inVertex} is not a handle created by this graph, or if
         *                                  either end point cannot be found in the cache
         */
        @Override
        public Edge addEdge(final String label, final Vertex inVertex, final Object... keyValues) {
            // Class#isInstance is used because BatchVertex is an inner class of a generic class
            if (!BatchVertex.class.isInstance(inVertex)) {
                throw new IllegalArgumentException("Given element was not created in this baseGraph");
            }
            nextElement();

            final Vertex ov = resolveBaseVertex();
            final Vertex iv = getCachedVertex(inVertex.id());

            // remembered so that a following lookup of the same out vertex can skip the cache
            previousOutVertexId = externalID;

            final boolean idKeyIsTokenId = T.id.getAccessor().equals(edgeIdKey);
            // T.id is only handed to the base graph when it accepts supplied edge ids and the id key is T.id
            final boolean passIdToBase = baseSupportsSuppliedEdgeId && idKeyIsTokenId;

            if (incrementalLoading) {
                final Optional<Object> suppliedId = ElementHelper.getIdValue(keyValues);

                // store a supplied id as a regular property under edgeIdKey unless that key is T.id itself
                final Object[] withIdProperty;
                if (suppliedId.isPresent() && !idKeyIsTokenId) {
                    withIdProperty = ElementHelper.upsert(keyValues, edgeIdKey, suppliedId.get());
                } else {
                    withIdProperty = keyValues;
                }

                final Optional<Object[]> edgeArgs = passIdToBase
                        ? Optional.ofNullable(keyValues)
                        : ElementHelper.remove(T.id, withIdProperty);

                if (!suppliedId.isPresent()) {
                    currentEdgeCached = createBaseEdge(ov, label, iv, edgeArgs);
                } else {
                    final Traversal<Edge, Edge> matches = baseGraph.E().has(edgeIdKey, suppliedId.get());
                    if (!matches.hasNext()) {
                        currentEdgeCached = createBaseEdge(ov, label, iv, edgeArgs);
                    } else {
                        final Edge found = matches.next();
                        // the caller-provided strategy decides what happens to the existing edge
                        edgeArgs.ifPresent(args -> existingEdgeStrategy.accept(found, args));
                        currentEdgeCached = found;
                    }
                }
            } else {
                final Optional<Object[]> edgeArgs = passIdToBase
                        ? Optional.ofNullable(keyValues)
                        : ElementHelper.remove(T.id, keyValues);
                currentEdgeCached = createBaseEdge(ov, label, iv, edgeArgs);
            }

            currentEdge = new BatchEdge();

            return currentEdge;
        }

        /**
         * Returns the external id this handle was created with, not the base graph's internal id.
         */
        @Override
        public Object id() {
            return this.externalID;
        }

        @Override
        public Graph graph() {
            return resolveBaseVertex().graph();
        }

        @Override
        public String label() {
            return resolveBaseVertex().label();
        }

        @Override
        public void remove() {
            throw removalNotSupported();
        }

        @Override
        public Set<String> keys() {
            return resolveBaseVertex().keys();
        }

        @Override
        public <V> VertexProperty<V> property(final String key) {
            return resolveBaseVertex().property(key);
        }

        @Override
        public <V> VertexProperty<V> property(final String key, final V value) {
            return resolveBaseVertex().property(key, value);
        }

        @Override
        public <V> VertexProperty<V> property(final String key, final V value, final Object... keyValues) {
            return resolveBaseVertex().property(key, value, keyValues);
        }

        @Override
        public <V> V value(final String key) throws NoSuchElementException {
            return resolveBaseVertex().value(key);
        }

        @Override
        public Vertex.Iterators iterators() {
            return this;
        }

        @Override
        public Iterator<Edge> edgeIterator(final Direction direction, final String... edgeLabels) {
            throw retrievalNotSupported();
        }

        @Override
        public Iterator<Vertex> vertexIterator(final Direction direction, final String... labels) {
            throw retrievalNotSupported();
        }

        @Override
        public <V> Iterator<VertexProperty<V>> propertyIterator(final String... propertyKeys) {
            return resolveBaseVertex().iterators().propertyIterator(propertyKeys);
        }
    }

    /**
     * Edge handle returned by {@code BatchVertex#addEdge}. It carries no state of its own: every call is
     * forwarded to the base graph edge that was added last, and only while this handle is still the
     * current edge. Once another vertex or edge has been added, every accessor throws
     * {@link UnsupportedOperationException}. Removal is never supported.
     */
    private class BatchEdge implements Edge, Edge.Iterators {

        @Override
        public Graph graph() {
            return getWrappedEdge().graph();
        }

        /**
         * Returns the id of the wrapped base graph edge.
         */
        @Override
        public Object id() {
            // this previously returned the wrapped edge's label() instead of its id()
            return getWrappedEdge().id();
        }

        @Override
        public String label() {
            return getWrappedEdge().label();
        }

        @Override
        public void remove() {
            throw removalNotSupported();
        }

        @Override
        public <V> Property<V> property(final String key) {
            return getWrappedEdge().property(key);
        }

        @Override
        public <V> Property<V> property(final String key, final V value) {
            return getWrappedEdge().property(key, value);
        }

        @Override
        public Set<String> keys() {
            return getWrappedEdge().keys();
        }

        @Override
        public <V> V value(final String key) throws NoSuchElementException {
            return getWrappedEdge().value(key);
        }

        /**
         * Returns the base graph edge behind this handle.
         *
         * @throws UnsupportedOperationException if this handle is no longer the most recently added edge
         */
        private Edge getWrappedEdge() {
            // identity check: only the handle created by the latest addEdge call is still in scope
            if (this != currentEdge) {
                throw new UnsupportedOperationException("This edge is no longer in scope");
            }
            return currentEdgeCached;
        }

        @Override
        public Edge.Iterators iterators() {
            return this;
        }

        @Override
        public <V> Iterator<Property<V>> propertyIterator(final String... propertyKeys) {
            return getWrappedEdge().iterators().propertyIterator(propertyKeys);
        }

        @Override
        public Iterator<Vertex> vertexIterator(final Direction direction) {
            return getWrappedEdge().iterators().vertexIterator(direction);
        }
    }

    /**
     * Builds (but does not throw) the exception used for every retrieval operation that batch loading
     * rejects.
     */
    private static UnsupportedOperationException retrievalNotSupported() {
        final String message = "Retrieval operations are not supported during batch loading";
        return new UnsupportedOperationException(message);
    }

    /**
     * Builds (but does not throw) the exception used for every removal operation that batch loading
     * rejects.
     */
    private static UnsupportedOperationException removalNotSupported() {
        final String message = "Removal operations are not supported during batch loading";
        return new UnsupportedOperationException(message);
    }

    /**
     * Fluent builder for {@link BatchGraph}. Unless overridden, the graph is loaded from scratch, both id
     * keys are the {@link T#id} accessor, the buffer size is {@link #DEFAULT_BUFFER_SIZE}, the vertex id type
     * is {@link VertexIdType#OBJECT} and existing elements are handled with {@link Exists#IGNORE}.
     *
     * @param <G> type of the graph being wrapped
     */
    public static class Builder<G extends Graph> {
        private final G graphToLoad;
        private boolean incrementalLoading = false;
        private String vertexIdKey = T.id.getAccessor();
        private String edgeIdKey = T.id.getAccessor();
        private long bufferSize = DEFAULT_BUFFER_SIZE;
        private VertexIdType vertexIdType = VertexIdType.OBJECT;
        private BiConsumer<Element, Object[]> existingVertexStrategy = Exists.IGNORE;
        private BiConsumer<Element, Object[]> existingEdgeStrategy = Exists.IGNORE;

        private Builder(final G g) {
            if (null == g)
                throw new IllegalArgumentException("Graph may not be null");
            if (g instanceof BatchGraph)
                throw new IllegalArgumentException("BatchGraph cannot wrap another BatchGraph instance");
            this.graphToLoad = g;
        }

        /**
         * Sets the key to be used when setting the vertex id as a property on the respective vertex. If this
         * value is not set it defaults to {@link T#id}.
         *
         * @param key Key to be used.
         * @return this builder
         * @throws IllegalArgumentException if {@code key} is {@code null}
         */
        public Builder<G> vertexIdKey(final String key) {
            if (null == key)
                throw new IllegalArgumentException("Key cannot be null");
            this.vertexIdKey = key;
            return this;
        }

        /**
         * Sets the key to be used when setting the edge id as a property on the respective edge. If this
         * value is not set it defaults to {@link T#id}.
         *
         * @param key Key to be used.
         * @return this builder
         * @throws IllegalArgumentException if {@code key} is {@code null}
         */
        public Builder<G> edgeIdKey(final String key) {
            if (null == key)
                throw new IllegalArgumentException("Optional value for key cannot be null");
            this.edgeIdKey = key;
            return this;
        }

        /**
         * Number of mutations to perform between calls to {@link org.apache.tinkerpop.gremlin.structure.Transaction#commit}.
         *
         * @param bufferSize number of vertices and edges per commit
         * @return this builder
         * @throws IllegalArgumentException if {@code bufferSize} is zero or negative
         */
        public Builder<G> bufferSize(long bufferSize) {
            if (bufferSize <= 0)
                throw new IllegalArgumentException("BufferSize must be positive");
            this.bufferSize = bufferSize;
            return this;
        }

        /**
         * Sets the type of the id used for the vertex which in turn determines the cache type that is used.
         *
         * @param type vertex id type
         * @return this builder
         * @throws IllegalArgumentException if {@code type} is {@code null}
         */
        public Builder<G> vertexIdType(final VertexIdType type) {
            if (null == type)
                throw new IllegalArgumentException("Type may not be null");
            this.vertexIdType = type;
            return this;
        }

        /**
         * Sets whether the graph loaded through this instance of {@link BatchGraph} is loaded from scratch
         * (i.e. the wrapped graph is initially empty) or whether graph is loaded incrementally into an
         * existing graph.
         * <p/>
         * In the former case, BatchGraph does not need to check for the existence of vertices with the wrapped
         * graph but only needs to consult its own cache which can be significantly faster. In the latter case,
         * an additional check against the wrapped graph is necessary.
         * <p/>
         * By default, BatchGraph assumes that the data is loaded from scratch. The strategies for handling
         * existing elements are left unchanged by this method.
         *
         * @param incrementalLoading {@code true} to load into a graph that may already contain the elements
         * @return this builder
         */
        public Builder<G> incrementalLoading(final boolean incrementalLoading) {
            this.incrementalLoading = incrementalLoading;
            return this;
        }

        /**
         * Same as {@link #incrementalLoading(boolean)}, and additionally sets the callbacks that decide what
         * happens when an element that is being loaded already exists in the wrapped graph. Each callback
         * receives the existing element and the key/values that were supplied for it.
         *
         * @param incrementalLoading     {@code true} to load into a graph that may already contain the elements
         * @param existingVertexStrategy callback applied to a vertex that already exists
         * @param existingEdgeStrategy   callback applied to an edge that already exists
         * @return this builder
         * @throws IllegalArgumentException if either strategy is {@code null}
         */
        public Builder<G> incrementalLoading(final boolean incrementalLoading,
                final BiConsumer<Element, Object[]> existingVertexStrategy,
                final BiConsumer<Element, Object[]> existingEdgeStrategy) {
            // fail here rather than with a NullPointerException in the middle of a load
            if (null == existingVertexStrategy)
                throw new IllegalArgumentException("Existing vertex strategy may not be null");
            if (null == existingEdgeStrategy)
                throw new IllegalArgumentException("Existing edge strategy may not be null");
            this.incrementalLoading = incrementalLoading;
            this.existingVertexStrategy = existingVertexStrategy;
            this.existingEdgeStrategy = existingEdgeStrategy;
            return this;
        }

        /**
         * Creates the {@link BatchGraph} with the settings collected so far.
         *
         * @return a new batch graph wrapping the graph this builder was created for
         */
        public BatchGraph<G> create() {
            return new BatchGraph<>(graphToLoad, vertexIdType, bufferSize, vertexIdKey, edgeIdKey,
                    incrementalLoading, this.existingVertexStrategy, this.existingEdgeStrategy);
        }
    }
}