org.apache.beam.runners.core.construction.PipelineTranslation.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.beam.runners.core.construction.PipelineTranslation.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.beam.runners.core.construction;

import static com.google.common.base.Preconditions.checkNotNull;

import com.google.auto.value.AutoValue;
import com.google.common.base.MoreObjects;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.ListMultimap;
import com.google.protobuf.ByteString;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import javax.annotation.Nullable;
import org.apache.beam.runners.core.construction.PTransformTranslation.RawPTransform;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.Pipeline.PipelineVisitor;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.common.runner.v1.RunnerApi;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.runners.AppliedPTransform;
import org.apache.beam.sdk.runners.TransformHierarchy;
import org.apache.beam.sdk.runners.TransformHierarchy.Node;
import org.apache.beam.sdk.transforms.display.DisplayData;
import org.apache.beam.sdk.transforms.display.HasDisplayData;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionView;
import org.apache.beam.sdk.values.PCollectionViews;
import org.apache.beam.sdk.values.PInput;
import org.apache.beam.sdk.values.POutput;
import org.apache.beam.sdk.values.PValue;
import org.apache.beam.sdk.values.TupleTag;

/** Utilities for converting a {@link Pipeline} to and from its Runner API proto representation. */
public class PipelineTranslation {

    /**
     * Converts the given {@link Pipeline} into a {@link RunnerApi.Pipeline} proto.
     *
     * <p>The pipeline is traversed topologically. Every non-root transform is registered with a
     * fresh {@link SdkComponents} together with the children collected for it, and the ids of
     * the root node's direct children become the proto's root transform ids.
     *
     * @param pipeline the pipeline to translate
     * @return a proto holding the registered components and the root transform ids, listed in
     *     the order in which the root's children were encountered
     * @throws RuntimeException if registering a composite transform fails with an
     *     {@link IOException}
     * @throws IllegalStateException if registering a primitive transform fails with an
     *     {@link IOException}
     */
    public static RunnerApi.Pipeline toProto(final Pipeline pipeline) {
        final SdkComponents components = SdkComponents.create();
        // Insertion-ordered: a plain HashSet would emit the root ids in hash order, making the
        // order of root transforms in the proto unrelated to the traversal order.
        final Collection<String> rootIds = new LinkedHashSet<>();
        pipeline.traverseTopologically(new PipelineVisitor.Defaults() {
            // Direct children of each enclosing node, in the order they were visited.
            private final ListMultimap<Node, AppliedPTransform<?, ?, ?>> children = ArrayListMultimap.create();

            @Override
            public void leaveCompositeTransform(Node node) {
                if (node.isRootNode()) {
                    // The root itself is not registered; only its direct children are recorded.
                    for (AppliedPTransform<?, ?, ?> pipelineRoot : children.get(node)) {
                        rootIds.add(components.getExistingPTransformId(pipelineRoot));
                    }
                } else {
                    // TODO: Include DisplayData in the proto
                    AppliedPTransform<?, ?, ?> appliedTransform = node.toAppliedPTransform(pipeline);
                    children.put(node.getEnclosingNode(), appliedTransform);
                    try {
                        components.registerPTransform(appliedTransform, children.get(node));
                    } catch (IOException e) {
                        throw new RuntimeException(e);
                    }
                }
            }

            @Override
            public void visitPrimitiveTransform(Node node) {
                // TODO: Include DisplayData in the proto
                AppliedPTransform<?, ?, ?> appliedTransform = node.toAppliedPTransform(pipeline);
                children.put(node.getEnclosingNode(), appliedTransform);
                try {
                    // Primitives are registered with no children.
                    components.registerPTransform(appliedTransform,
                            Collections.<AppliedPTransform<?, ?, ?>>emptyList());
                } catch (IOException e) {
                    throw new IllegalStateException(e);
                }
            }
        });
        return RunnerApi.Pipeline.newBuilder().setComponents(components.toComponents())
                .addAllRootTransformIds(rootIds).build();
    }

    /**
     * Builds the {@link DisplayData} for the given component by delegating to
     * {@link DisplayData#from(HasDisplayData)}.
     *
     * @param component the component whose display data is requested
     * @return the display data produced for {@code component}
     */
    private static DisplayData evaluateDisplayData(HasDisplayData component) {
        final DisplayData displayData = DisplayData.from(component);
        return displayData;
    }

    /**
     * Rebuilds a {@link Pipeline} from its Runner API proto.
     *
     * <p>A new {@link TransformHierarchy} is populated by rehydrating each root transform listed
     * in the proto (and, recursively, its subtransforms). The resulting pipeline is created with
     * options from {@link PipelineOptionsFactory#create()}.
     *
     * @param pipelineProto the proto to rehydrate
     * @return the pipeline backed by the rehydrated transform hierarchy
     * @throws IOException if rehydrating a transform fails
     */
    public static Pipeline fromProto(final RunnerApi.Pipeline pipelineProto) throws IOException {
        final TransformHierarchy hierarchy = new TransformHierarchy();
        final Pipeline rehydratedPipeline =
                Pipeline.forTransformHierarchy(hierarchy, PipelineOptionsFactory.create());

        final RunnerApi.Components protoComponents = pipelineProto.getComponents();

        // Keeping the PCollections straight is a semantic necessity, but being careful not to explode
        // the number of coders and windowing strategies is also nice, and helps testing.
        final RehydratedComponents rehydrated =
                RehydratedComponents.forComponents(protoComponents).withPipeline(rehydratedPipeline);

        final Map<String, RunnerApi.PTransform> allTransforms = protoComponents.getTransformsMap();
        for (String rootTransformId : pipelineProto.getRootTransformIdsList()) {
            RunnerApi.PTransform rootProto = protoComponents.getTransformsOrThrow(rootTransformId);
            addRehydratedTransform(hierarchy, rootProto, rehydratedPipeline, allTransforms, rehydrated);
        }

        return rehydratedPipeline;
    }

    /**
     * Rehydrates a single transform proto and adds it to the hierarchy, recursing into its
     * subtransforms when it is not primitive.
     *
     * <p>Inputs and outputs are resolved to PCollections through {@code rehydratedComponents}.
     * For ParDo and WriteFiles the side inputs named in the payload are exposed as additional
     * inputs; for Combine the accumulator coder is carried along as an additional coder.
     *
     * @param transforms the hierarchy being populated
     * @param transformProto the transform to rehydrate
     * @param pipeline the pipeline being rebuilt; passed through to recursive calls
     * @param transformProtos every transform proto of the pipeline, keyed by id
     * @param rehydratedComponents lookup for rehydrated PCollections and coders
     * @throws IOException if a payload cannot be parsed or a component cannot be rehydrated
     * @throws NullPointerException if a subtransform id is absent from {@code transformProtos}
     */
    private static void addRehydratedTransform(TransformHierarchy transforms, RunnerApi.PTransform transformProto,
            Pipeline pipeline, Map<String, RunnerApi.PTransform> transformProtos,
            RehydratedComponents rehydratedComponents) throws IOException {

        Map<TupleTag<?>, PValue> rehydratedInputs = new HashMap<>();
        for (Map.Entry<String, String> inputEntry : transformProto.getInputsMap().entrySet()) {
            rehydratedInputs.put(new TupleTag<>(inputEntry.getKey()),
                    rehydratedComponents.getPCollection(inputEntry.getValue()));
        }

        Map<TupleTag<?>, PValue> rehydratedOutputs = new HashMap<>();
        for (Map.Entry<String, String> outputEntry : transformProto.getOutputsMap().entrySet()) {
            rehydratedOutputs.put(new TupleTag<>(outputEntry.getKey()),
                    rehydratedComponents.getPCollection(outputEntry.getValue()));
        }

        RunnerApi.FunctionSpec transformSpec = transformProto.getSpec();
        String urn = transformSpec.getUrn();

        // By default, no "additional" inputs, since that is an SDK-specific thing.
        // Only ParDo and WriteFiles really separate main from side inputs
        Map<TupleTag<?>, PValue> additionalInputs = Collections.emptyMap();

        if (urn.equals(PTransformTranslation.PAR_DO_TRANSFORM_URN)) {
            // TODO: ParDoTranslation should own it - https://issues.apache.org/jira/browse/BEAM-2674
            RunnerApi.ParDoPayload payload = RunnerApi.ParDoPayload.parseFrom(transformSpec.getPayload());
            additionalInputs = sideInputMapToAdditionalInputs(transformProto, rehydratedComponents,
                    rehydratedInputs, payload.getSideInputsMap());
        } else if (urn.equals(PTransformTranslation.WRITE_FILES_TRANSFORM_URN)) {
            // TODO: WriteFilesTranslation should own it - https://issues.apache.org/jira/browse/BEAM-2674
            RunnerApi.WriteFilesPayload payload = RunnerApi.WriteFilesPayload.parseFrom(transformSpec.getPayload());
            additionalInputs = sideInputMapToAdditionalInputs(transformProto, rehydratedComponents,
                    rehydratedInputs, payload.getSideInputsMap());
        }

        // TODO: CombineTranslator should own it - https://issues.apache.org/jira/browse/BEAM-2674
        List<Coder<?>> additionalCoders = Collections.emptyList();
        if (urn.equals(PTransformTranslation.COMBINE_TRANSFORM_URN)) {
            RunnerApi.CombinePayload payload = RunnerApi.CombinePayload.parseFrom(transformSpec.getPayload());
            Coder<?> accumulatorCoder = rehydratedComponents.getCoder(payload.getAccumulatorCoderId());
            // Explicit type witness instead of a raw (List) cast.
            additionalCoders = Collections.<Coder<?>>singletonList(accumulatorCoder);
        }

        RehydratedPTransform transform = RehydratedPTransform.of(urn, transformSpec.getPayload(),
                additionalInputs, additionalCoders);

        if (isPrimitive(transformProto)) {
            transforms.addFinalizedPrimitiveNode(transformProto.getUniqueName(), rehydratedInputs, transform,
                    rehydratedOutputs);
        } else {
            transforms.pushFinalizedNode(transformProto.getUniqueName(), rehydratedInputs, transform,
                    rehydratedOutputs);

            for (String childTransformId : transformProto.getSubtransformsList()) {
                // Fail with a descriptive message on a dangling subtransform id rather than
                // with an anonymous NullPointerException inside the recursive call.
                RunnerApi.PTransform childProto = checkNotNull(transformProtos.get(childTransformId),
                        "Transform %s references subtransform id %s, which is not among the pipeline's transforms",
                        transformProto.getUniqueName(), childTransformId);
                addRehydratedTransform(transforms, childProto, pipeline, transformProtos, rehydratedComponents);
            }

            transforms.popNode();
        }
    }

    /**
     * Turns the side inputs declared in a transform payload into the "additional inputs" map
     * of the rehydrated transform.
     *
     * <p>Each side input's local name is looked up among the transform's already-rehydrated
     * inputs, a view is built for it via {@link ParDoTranslation#viewFromProto}, and the views
     * are converted with {@link PCollectionViews#toAdditionalInputs}.
     *
     * @param transformProto the transform that declares the side inputs
     * @param rehydratedComponents lookup passed on to the view translation
     * @param rehydratedInputs the transform's inputs, keyed by a tag built from the local name
     * @param sideInputsMap side input specs from the payload, keyed by local name
     * @return the additional inputs derived from the side input views
     * @throws IOException if a view cannot be rehydrated
     * @throws NullPointerException if a side input's local name has no matching entry in
     *     {@code rehydratedInputs}
     */
    private static Map<TupleTag<?>, PValue> sideInputMapToAdditionalInputs(RunnerApi.PTransform transformProto,
            RehydratedComponents rehydratedComponents, Map<TupleTag<?>, PValue> rehydratedInputs,
            Map<String, RunnerApi.SideInput> sideInputsMap) throws IOException {
        List<PCollectionView<?>> views = new ArrayList<>();
        for (Map.Entry<String, RunnerApi.SideInput> sideInputEntry : sideInputsMap.entrySet()) {
            String localName = sideInputEntry.getKey();
            RunnerApi.SideInput sideInput = sideInputEntry.getValue();
            // Name the offending side input and transform so a malformed proto is diagnosable.
            PValue sideInputValue = checkNotNull(rehydratedInputs.get(new TupleTag<>(localName)),
                    "Side input %s of transform %s is not among the transform's inputs",
                    localName, transformProto.getUniqueName());
            PCollection<?> pCollection = (PCollection<?>) sideInputValue;
            views.add(ParDoTranslation.viewFromProto(sideInput, localName, pCollection, transformProto,
                    rehydratedComponents));
        }
        return PCollectionViews.toAdditionalInputs(views);
    }

    /**
     * Decides whether a transform proto is treated as primitive.
     *
     * <p>A transform counts as primitive when it has no subtransforms and at least one of its
     * output PCollection ids is not also one of its input PCollection ids.
     *
     * @param transformProto the transform to classify
     * @return {@code true} if the transform is primitive by the rule above
     */
    private static boolean isPrimitive(RunnerApi.PTransform transformProto) {
        // Anything with subtransforms is a composite.
        if (transformProto.getSubtransformsCount() != 0) {
            return false;
        }
        Collection<String> inputIds = transformProto.getInputsMap().values();
        Collection<String> outputIds = transformProto.getOutputsMap().values();
        // If every output is already an input, the transform produces nothing of its own.
        boolean producesOnlyItsInputs = inputIds.containsAll(outputIds);
        return !producesOnlyItsInputs;
    }

    @AutoValue
    abstract static class RehydratedPTransform extends RawPTransform<PInput, POutput> {

        @Nullable
        public abstract String getUrn();

        @Nullable
        public abstract ByteString getPayload();

        @Override
        public abstract Map<TupleTag<?>, PValue> getAdditionalInputs();

        public abstract List<Coder<?>> getCoders();

        public static RehydratedPTransform of(String urn, ByteString payload,
                Map<TupleTag<?>, PValue> additionalInputs, List<Coder<?>> additionalCoders) {
            return new AutoValue_PipelineTranslation_RehydratedPTransform(urn, payload, additionalInputs,
                    additionalCoders);
        }

        @Override
        public POutput expand(PInput input) {
            throw new IllegalStateException(String.format(
                    "%s should never be asked to expand;"
                            + " it is the result of deserializing an already-constructed Pipeline",
                    getClass().getSimpleName()));
        }

        @Override
        public String toString() {
            return MoreObjects.toStringHelper(this).add("urn", getUrn()).add("payload", getPayload()).toString();
        }

        @Override
        public void registerComponents(SdkComponents components) {
            for (Coder<?> coder : getCoders()) {
                try {
                    components.registerCoder(coder);
                } catch (IOException e) {
                    throw new RuntimeException(e);
                }
            }
        }
    }
}