org.apache.beam.runners.dataflow.util.PackageUtil.java Source code

Introduction

Here is the source code for org.apache.beam.runners.dataflow.util.PackageUtil.java, a helper class used by the Dataflow runner to stage classpath elements (files, and directories zipped on the fly) to a staging location, skipping elements that are already staged and retrying failed uploads with exponential backoff.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.beam.runners.dataflow.util;

import static com.google.common.base.Preconditions.checkArgument;

import com.fasterxml.jackson.core.Base64Variants;
import com.google.api.client.util.BackOff;
import com.google.api.client.util.Sleeper;
import com.google.api.services.dataflow.model.DataflowPackage;
import com.google.cloud.hadoop.util.ApiErrorExtractor;
import com.google.common.collect.Lists;
import com.google.common.hash.Funnels;
import com.google.common.hash.Hasher;
import com.google.common.hash.Hashing;
import com.google.common.io.CountingOutputStream;
import com.google.common.io.Files;
import com.google.common.util.concurrent.Futures;
import com.google.common.util.concurrent.ListenableFuture;
import com.google.common.util.concurrent.ListeningExecutorService;
import com.google.common.util.concurrent.MoreExecutors;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.channels.Channels;
import java.nio.channels.WritableByteChannel;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.List;
import java.util.Objects;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Executors;
import java.util.concurrent.atomic.AtomicInteger;
import javax.annotation.Nullable;
import org.apache.beam.sdk.io.FileSystems;
import org.apache.beam.sdk.io.fs.CreateOptions;
import org.apache.beam.sdk.io.fs.ResolveOptions.StandardResolveOptions;
import org.apache.beam.sdk.util.BackOffAdapter;
import org.apache.beam.sdk.util.FluentBackoff;
import org.apache.beam.sdk.util.ZipFiles;
import org.joda.time.Duration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/** Helper routines for packages. */
class PackageUtil {
    private static final Logger LOG = LoggerFactory.getLogger(PackageUtil.class);
    /**
     * A reasonable upper bound on the number of jars required to launch a Dataflow job.
     */
    private static final int SANE_CLASSPATH_SIZE = 1000;

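    // Retry a failed upload up to 4 times (5 attempts in total), starting with a 5-second backoff.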
    private static final FluentBackoff BACKOFF_FACTORY = FluentBackoff.DEFAULT.withMaxRetries(4)
            .withInitialBackoff(Duration.standardSeconds(5));

    /**
     * Translates exceptions from API calls.
     */
    private static final ApiErrorExtractor ERROR_EXTRACTOR = new ApiErrorExtractor();

    /**
     * Compute and cache the attributes of a classpath element that we will need to stage it.
     *
     * @param source the file or directory to be staged.
     * @param stagingPath the base location for staged classpath elements.
     * @param overridePackageName if non-null, the value to use as the package name
     *                            instead of generating one automatically.
     * @return a {@link PackageAttributes} containing metadata about the object to be staged.
     */
    static PackageAttributes createPackageAttributes(File source, String stagingPath,
            @Nullable String overridePackageName) {
        boolean directory = source.isDirectory();

        // Compute size and hash in one pass over file or directory.
        Hasher hasher = Hashing.md5().newHasher();
        OutputStream hashStream = Funnels.asOutputStream(hasher);
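        // CountingOutputStream tracks the byte count while the wrapped stream feeds the hasher,
        // so a single pass over the content yields both its size and its MD5.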
        try (CountingOutputStream countingOutputStream = new CountingOutputStream(hashStream)) {
            if (!directory) {
                // Files are staged as-is.
                Files.asByteSource(source).copyTo(countingOutputStream);
            } else {
                // Directories are recursively zipped.
                ZipFiles.zipDirectory(source, countingOutputStream);
            }
            countingOutputStream.flush();

            long size = countingOutputStream.getCount();
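            // Encode the digest with a URL-safe Base64 variant so it can be embedded in a file name.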
            String hash = Base64Variants.MODIFIED_FOR_URL.encode(hasher.hash().asBytes());

            // Create the DataflowPackage with staging name and location.
            String uniqueName = getUniqueContentName(source, hash);
            String resourcePath = FileSystems.matchNewResource(stagingPath, true)
                    .resolve(uniqueName, StandardResolveOptions.RESOLVE_FILE).toString();
            DataflowPackage target = new DataflowPackage();
            target.setName(overridePackageName != null ? overridePackageName : uniqueName);
            target.setLocation(resourcePath);

            return new PackageAttributes(size, hash, directory, target, source.getPath());
        } catch (IOException e) {
            throw new RuntimeException("Package setup failure for " + source, e);
        }
    }

    /** Comparator that orders packages by decreasing size, so that the largest uploads start first. */
    private static class PackageUploadOrder implements Comparator<PackageAttributes> {
        @Override
        public int compare(PackageAttributes o1, PackageAttributes o2) {
            // Smaller size compares high so that bigger packages are uploaded first.
            long sizeDiff = o2.getSize() - o1.getSize();
            if (sizeDiff != 0) {
                // Use signum rather than a narrowing cast to avoid int overflow on the long difference.
                return Long.signum(sizeDiff);
            }

            // Otherwise, choose arbitrarily based on hash.
            return o1.getHash().compareTo(o2.getHash());
        }
    }

    /**
     * Utility function that computes sizes and hashes of packages so that we can validate whether
     * they have already been correctly staged.
     */
    private static List<PackageAttributes> computePackageAttributes(Collection<String> classpathElements,
            final String stagingPath, ListeningExecutorService executorService) {
        List<ListenableFuture<PackageAttributes>> futures = new LinkedList<>();
        for (String classpathElement : classpathElements) {
            @Nullable
            String userPackageName = null;
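            // A classpath element of the form "name=path" stages the file at "path" under the
            // explicit package name "name" rather than an auto-generated one.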
            if (classpathElement.contains("=")) {
                String[] components = classpathElement.split("=", 2);
                userPackageName = components[0];
                classpathElement = components[1];
            }
            @Nullable
            final String packageName = userPackageName;

            final File file = new File(classpathElement);
            if (!file.exists()) {
                LOG.warn("Skipping non-existent classpath element {} that was specified.", classpathElement);
                continue;
            }

            ListenableFuture<PackageAttributes> future = executorService.submit(new Callable<PackageAttributes>() {
                @Override
                public PackageAttributes call() throws Exception {
                    return createPackageAttributes(file, stagingPath, packageName);
                }
            });
            futures.add(future);
        }

        try {
            return Futures.allAsList(futures).get();
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            throw new RuntimeException("Interrupted while staging packages", e);
        } catch (ExecutionException e) {
            throw new RuntimeException("Error while staging packages", e.getCause());
        }
    }

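    /** Opens a channel for writing to {@code target}, which is matched as a single file. */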
    private static WritableByteChannel makeWriter(String target, CreateOptions createOptions) throws IOException {
        return FileSystems.create(FileSystems.matchNewResource(target, false), createOptions);
    }

    /**
     * Utility to verify whether a package has already been staged and, if not, copy it to the
     * staging location.
     */
    private static void stageOnePackage(PackageAttributes attributes, AtomicInteger numUploaded,
            AtomicInteger numCached, Sleeper retrySleeper, CreateOptions createOptions) {
        String source = attributes.getSourcePath();
        String target = attributes.getDataflowPackage().getLocation();

        // TODO: Should we attempt to detect the Mime type rather than
        // always using MimeTypes.BINARY?
        try {
            try {
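                // The staged name embeds the content hash, so an existing file of matching size
                // at that location is assumed identical and the upload is skipped.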
                long remoteLength = FileSystems.matchSingleFileSpec(target).sizeBytes();
                if (remoteLength == attributes.getSize()) {
                    LOG.debug("Skipping classpath element already staged: {} at {}", attributes.getSourcePath(),
                            target);
                    numCached.incrementAndGet();
                    return;
                }
            } catch (FileNotFoundException expected) {
                // If the file doesn't exist, it means we need to upload it.
            }

            // Upload file, retrying on failure.
            BackOff backoff = BackOffAdapter.toGcpBackOff(BACKOFF_FACTORY.backoff());
            while (true) {
                try {
                    LOG.debug("Uploading classpath element {} to {}", source, target);
                    try (WritableByteChannel writer = makeWriter(target, createOptions)) {
                        copyContent(source, writer);
                    }
                    numUploaded.incrementAndGet();
                    break;
                } catch (IOException e) {
                    if (ERROR_EXTRACTOR.accessDenied(e)) {
                        String errorMessage = String
                                .format("Uploaded failed due to permissions error, will NOT retry staging "
                                        + "of classpath %s. Please verify credentials are valid and that you have "
                                        + "write access to %s. Stale credentials can be resolved by executing "
                                        + "'gcloud auth application-default login'.", source, target);
                        LOG.error(errorMessage);
                        throw new IOException(errorMessage, e);
                    }
                    long sleep = backoff.nextBackOffMillis();
                    if (sleep == BackOff.STOP) {
                        // Rethrow last error, to be included as a cause in the catch below.
                        LOG.error("Upload failed, will NOT retry staging of classpath: {}", source, e);
                        throw e;
                    } else {
                        LOG.warn("Upload attempt failed, sleeping before retrying staging of classpath: {}", source,
                                e);
                        retrySleeper.sleep(sleep);
                    }
                }
            }
        } catch (Exception e) {
            throw new RuntimeException("Could not stage classpath element: " + source, e);
        }
    }

    /**
     * Transfers the classpath elements to the staging location.
     *
     * @param classpathElements The elements to stage.
     * @param stagingPath The base location to stage the elements to.
     * @return A list of cloud workflow packages, each representing a classpath element.
     */
    static List<DataflowPackage> stageClasspathElements(Collection<String> classpathElements, String stagingPath,
            CreateOptions createOptions) {
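        // Hash and upload the packages in parallel on a fixed pool of 32 threads.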
        ListeningExecutorService executorService = MoreExecutors
                .listeningDecorator(Executors.newFixedThreadPool(32));
        try {
            return stageClasspathElements(classpathElements, stagingPath, Sleeper.DEFAULT, executorService,
                    createOptions);
        } finally {
            executorService.shutdown();
        }
    }

    // Visible for testing.
    static List<DataflowPackage> stageClasspathElements(Collection<String> classpathElements,
            final String stagingPath, final Sleeper retrySleeper, ListeningExecutorService executorService,
            final CreateOptions createOptions) {
        LOG.info("Uploading {} files from PipelineOptions.filesToStage to staging location to "
                + "prepare for execution.", classpathElements.size());

        if (classpathElements.size() > SANE_CLASSPATH_SIZE) {
            LOG.warn(
                    "Your classpath contains {} elements, which Google Cloud Dataflow automatically "
                            + "copies to all workers. Having this many entries on your classpath may be indicative "
                            + "of an issue in your pipeline. You may want to consider trimming the classpath to "
                            + "necessary dependencies only, using --filesToStage pipeline option to override "
                            + "what files are being staged, or bundling several dependencies into one.",
                    classpathElements.size());
        }

        checkArgument(stagingPath != null,
                "Can't stage classpath elements because no staging location has been provided");

        // Copy into a mutable list: computePackageAttributes returns an immutable list and we
        // sort it below.
        List<PackageAttributes> packageAttributes = new LinkedList<>(
                computePackageAttributes(classpathElements, stagingPath, executorService));

        // Compute the returned list of DataflowPackage objects here so that they are returned in the
        // same order as on the classpath.
        List<DataflowPackage> packages = Lists.newArrayListWithExpectedSize(packageAttributes.size());
        for (final PackageAttributes attributes : packageAttributes) {
            packages.add(attributes.getDataflowPackage());
        }

        // Order package attributes in descending size order so that we upload the largest files first.
        Collections.sort(packageAttributes, new PackageUploadOrder());
        final AtomicInteger numUploaded = new AtomicInteger(0);
        final AtomicInteger numCached = new AtomicInteger(0);

        List<ListenableFuture<?>> futures = new LinkedList<>();
        for (final PackageAttributes attributes : packageAttributes) {
            futures.add(executorService.submit(new Runnable() {
                @Override
                public void run() {
                    stageOnePackage(attributes, numUploaded, numCached, retrySleeper, createOptions);
                }
            }));
        }
        try {
            Futures.allAsList(futures).get();
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            throw new RuntimeException("Interrupted while staging packages", e);
        } catch (ExecutionException e) {
            throw new RuntimeException("Error while staging packages", e.getCause());
        }

        LOG.info("Staging files complete: {} files cached, {} files newly uploaded", numCached.get(),
                numUploaded.get());

        return packages;
    }

    /**
     * Returns a unique name for a file with a given content hash.
     *
     * <p>Directory paths are removed. Example:
     * <pre>
     * dir="a/b/c/d", contentHash="f000" => d-f000.jar
     * file="a/b/c/d.txt", contentHash="f000" => d-f000.txt
     * file="a/b/c/d", contentHash="f000" => d-f000
     * </pre>
     */
    static String getUniqueContentName(File classpathElement, String contentHash) {
        String fileName = Files.getNameWithoutExtension(classpathElement.getAbsolutePath());
        String fileExtension = Files.getFileExtension(classpathElement.getAbsolutePath());
        if (classpathElement.isDirectory()) {
            return fileName + "-" + contentHash + ".jar";
        } else if (fileExtension.isEmpty()) {
            return fileName + "-" + contentHash;
        }
        return fileName + "-" + contentHash + "." + fileExtension;
    }

    /**
     * Copies the contents of the classpathElement to the output channel.
     *
     * <p>If the classpathElement is a directory, a Zip stream is constructed on the fly,
     * otherwise the file contents are copied as-is.
     *
     * <p>The output channel is not closed.
     */
    private static void copyContent(String classpathElement, WritableByteChannel outputChannel) throws IOException {
        final File classpathElementFile = new File(classpathElement);
        if (classpathElementFile.isDirectory()) {
            ZipFiles.zipDirectory(classpathElementFile, Channels.newOutputStream(outputChannel));
        } else {
            Files.asByteSource(classpathElementFile).copyTo(Channels.newOutputStream(outputChannel));
        }
    }

    /**
     * Holds the metadata necessary to stage a file or confirm that a staged file has not changed.
     */
    static class PackageAttributes {
        private final boolean directory;
        private final long size;
        private final String hash;
        private final String sourcePath;
        private DataflowPackage dataflowPackage;

        public PackageAttributes(long size, String hash, boolean directory, DataflowPackage dataflowPackage,
                String sourcePath) {
            this.size = size;
            this.hash = Objects.requireNonNull(hash, "hash");
            this.directory = directory;
            this.sourcePath = Objects.requireNonNull(sourcePath, "sourcePath");
            this.dataflowPackage = Objects.requireNonNull(dataflowPackage, "dataflowPackage");
        }

        /**
         * @return the dataflowPackage
         */
        public DataflowPackage getDataflowPackage() {
            return dataflowPackage;
        }

        /**
         * @return whether the source is a directory
         */
        public boolean isDirectory() {
            return directory;
        }

        /**
         * @return the size
         */
        public long getSize() {
            return size;
        }

        /**
         * @return the hash
         */
        public String getHash() {
            return hash;
        }

        /**
         * @return the path of the file to be uploaded
         */
        public String getSourcePath() {
            return sourcePath;
        }
    }
}
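
Example usage

The following sketch shows how this helper might be invoked. It is illustrative only: PackageUtil and stageClasspathElements are package-private (the real caller is the Dataflow runner in the same package), and the staging bucket, jar paths, and the "myAlias" override name below are hypothetical.

package org.apache.beam.runners.dataflow.util;

import com.google.api.services.dataflow.model.DataflowPackage;
import java.util.Arrays;
import java.util.List;
import org.apache.beam.sdk.io.fs.CreateOptions.StandardCreateOptions;
import org.apache.beam.sdk.util.MimeTypes;

public class PackageUtilExample {
    public static void main(String[] args) {
        // Two hypothetical elements to stage: a plain jar, and one whose staged name is
        // overridden with the "name=path" syntax handled by computePackageAttributes.
        List<String> classpath = Arrays.asList(
                "/tmp/deps/library.jar",
                "myAlias=/tmp/deps/custom.jar");

        // Stage everything as opaque binary content; see the TODO in stageOnePackage
        // about Mime type detection.
        StandardCreateOptions createOptions = StandardCreateOptions.builder()
                .setMimeType(MimeTypes.BINARY)
                .build();

        // Each staged file is named <basename>-<md5>.<extension>, so a rerun with
        // unchanged files reports them as cached instead of uploading them again.
        List<DataflowPackage> staged = PackageUtil.stageClasspathElements(
                classpath, "gs://my-bucket/staging", createOptions);

        for (DataflowPackage pkg : staged) {
            System.out.println(pkg.getName() + " -> " + pkg.getLocation());
        }
    }
}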