Java tutorial
/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.beam.sdk.util; import com.google.api.client.util.BackOffUtils; import com.google.api.client.util.Sleeper; import com.google.api.services.dataflow.model.DataflowPackage; import com.google.cloud.hadoop.util.ApiErrorExtractor; import com.google.common.hash.Funnels; import com.google.common.hash.Hasher; import com.google.common.hash.Hashing; import com.google.common.io.CountingOutputStream; import com.google.common.io.Files; import com.fasterxml.jackson.core.Base64Variants; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.io.OutputStream; import java.nio.channels.Channels; import java.nio.channels.WritableByteChannel; import java.util.ArrayList; import java.util.Collection; import java.util.List; import java.util.Objects; /** Helper routines for packages. */ public class PackageUtil { private static final Logger LOG = LoggerFactory.getLogger(PackageUtil.class); /** * A reasonable upper bound on the number of jars required to launch a Dataflow job. */ public static final int SANE_CLASSPATH_SIZE = 1000; /** * The initial interval to use between package staging attempts. */ private static final long INITIAL_BACKOFF_INTERVAL_MS = 5000L; /** * The maximum number of attempts when staging a file. */ private static final int MAX_ATTEMPTS = 5; /** * Translates exceptions from API calls. */ private static final ApiErrorExtractor ERROR_EXTRACTOR = new ApiErrorExtractor(); /** * Creates a DataflowPackage containing information about how a classpath element should be * staged, including the staging destination as well as its size and hash. * * @param classpathElement The local path for the classpath element. * @param stagingPath The base location for staged classpath elements. * @param overridePackageName If non-null, use the given value as the package name * instead of generating one automatically. * @return The package. */ @Deprecated public static DataflowPackage createPackage(File classpathElement, String stagingPath, String overridePackageName) { return createPackageAttributes(classpathElement, stagingPath, overridePackageName).getDataflowPackage(); } /** * Compute and cache the attributes of a classpath element that we will need to stage it. * * @param classpathElement the file or directory to be staged. * @param stagingPath The base location for staged classpath elements. * @param overridePackageName If non-null, use the given value as the package name * instead of generating one automatically. * @return a {@link PackageAttributes} that containing metadata about the object to be staged. */ static PackageAttributes createPackageAttributes(File classpathElement, String stagingPath, String overridePackageName) { try { boolean directory = classpathElement.isDirectory(); // Compute size and hash in one pass over file or directory. Hasher hasher = Hashing.md5().newHasher(); OutputStream hashStream = Funnels.asOutputStream(hasher); CountingOutputStream countingOutputStream = new CountingOutputStream(hashStream); if (!directory) { // Files are staged as-is. Files.asByteSource(classpathElement).copyTo(countingOutputStream); } else { // Directories are recursively zipped. ZipFiles.zipDirectory(classpathElement, countingOutputStream); } long size = countingOutputStream.getCount(); String hash = Base64Variants.MODIFIED_FOR_URL.encode(hasher.hash().asBytes()); // Create the DataflowPackage with staging name and location. String uniqueName = getUniqueContentName(classpathElement, hash); String resourcePath = IOChannelUtils.resolve(stagingPath, uniqueName); DataflowPackage target = new DataflowPackage(); target.setName(overridePackageName != null ? overridePackageName : uniqueName); target.setLocation(resourcePath); return new PackageAttributes(size, hash, directory, target); } catch (IOException e) { throw new RuntimeException("Package setup failure for " + classpathElement, e); } } /** * Transfers the classpath elements to the staging location. * * @param classpathElements The elements to stage. * @param stagingPath The base location to stage the elements to. * @return A list of cloud workflow packages, each representing a classpath element. */ public static List<DataflowPackage> stageClasspathElements(Collection<String> classpathElements, String stagingPath) { return stageClasspathElements(classpathElements, stagingPath, Sleeper.DEFAULT); } // Visible for testing. static List<DataflowPackage> stageClasspathElements(Collection<String> classpathElements, String stagingPath, Sleeper retrySleeper) { LOG.info("Uploading {} files from PipelineOptions.filesToStage to staging location to " + "prepare for execution.", classpathElements.size()); if (classpathElements.size() > SANE_CLASSPATH_SIZE) { LOG.warn( "Your classpath contains {} elements, which Google Cloud Dataflow automatically " + "copies to all workers. Having this many entries on your classpath may be indicative " + "of an issue in your pipeline. You may want to consider trimming the classpath to " + "necessary dependencies only, using --filesToStage pipeline option to override " + "what files are being staged, or bundling several dependencies into one.", classpathElements.size()); } ArrayList<DataflowPackage> packages = new ArrayList<>(); if (stagingPath == null) { throw new IllegalArgumentException( "Can't stage classpath elements on because no staging location has been provided"); } int numUploaded = 0; int numCached = 0; for (String classpathElement : classpathElements) { String packageName = null; if (classpathElement.contains("=")) { String[] components = classpathElement.split("=", 2); packageName = components[0]; classpathElement = components[1]; } File file = new File(classpathElement); if (!file.exists()) { LOG.warn("Skipping non-existent classpath element {} that was specified.", classpathElement); continue; } PackageAttributes attributes = createPackageAttributes(file, stagingPath, packageName); DataflowPackage workflowPackage = attributes.getDataflowPackage(); packages.add(workflowPackage); String target = workflowPackage.getLocation(); // TODO: Should we attempt to detect the Mime type rather than // always using MimeTypes.BINARY? try { try { long remoteLength = IOChannelUtils.getSizeBytes(target); if (remoteLength == attributes.getSize()) { LOG.debug("Skipping classpath element already staged: {} at {}", classpathElement, target); numCached++; continue; } } catch (FileNotFoundException expected) { // If the file doesn't exist, it means we need to upload it. } // Upload file, retrying on failure. AttemptBoundedExponentialBackOff backoff = new AttemptBoundedExponentialBackOff(MAX_ATTEMPTS, INITIAL_BACKOFF_INTERVAL_MS); while (true) { try { LOG.debug("Uploading classpath element {} to {}", classpathElement, target); try (WritableByteChannel writer = IOChannelUtils.create(target, MimeTypes.BINARY)) { copyContent(classpathElement, writer); } numUploaded++; break; } catch (IOException e) { if (ERROR_EXTRACTOR.accessDenied(e)) { String errorMessage = String .format("Uploaded failed due to permissions error, will NOT retry staging " + "of classpath %s. Please verify credentials are valid and that you have " + "write access to %s. Stale credentials can be resolved by executing " + "'gcloud auth login'.", classpathElement, target); LOG.error(errorMessage); throw new IOException(errorMessage, e); } else if (!backoff.atMaxAttempts()) { LOG.warn("Upload attempt failed, sleeping before retrying staging of classpath: {}", classpathElement, e); BackOffUtils.next(retrySleeper, backoff); } else { // Rethrow last error, to be included as a cause in the catch below. LOG.error("Upload failed, will NOT retry staging of classpath: {}", classpathElement, e); throw e; } } } } catch (Exception e) { throw new RuntimeException("Could not stage classpath element: " + classpathElement, e); } } LOG.info("Uploading PipelineOptions.filesToStage complete: {} files newly uploaded, " + "{} files cached", numUploaded, numCached); return packages; } /** * Returns a unique name for a file with a given content hash. * * <p>Directory paths are removed. Example: * <pre> * dir="a/b/c/d", contentHash="f000" => d-f000.jar * file="a/b/c/d.txt", contentHash="f000" => d-f000.txt * file="a/b/c/d", contentHash="f000" => d-f000 * </pre> */ static String getUniqueContentName(File classpathElement, String contentHash) { String fileName = Files.getNameWithoutExtension(classpathElement.getAbsolutePath()); String fileExtension = Files.getFileExtension(classpathElement.getAbsolutePath()); if (classpathElement.isDirectory()) { return fileName + "-" + contentHash + ".jar"; } else if (fileExtension.isEmpty()) { return fileName + "-" + contentHash; } return fileName + "-" + contentHash + "." + fileExtension; } /** * Copies the contents of the classpathElement to the output channel. * * <p>If the classpathElement is a directory, a Zip stream is constructed on the fly, * otherwise the file contents are copied as-is. * * <p>The output channel is not closed. */ private static void copyContent(String classpathElement, WritableByteChannel outputChannel) throws IOException { final File classpathElementFile = new File(classpathElement); if (classpathElementFile.isDirectory()) { ZipFiles.zipDirectory(classpathElementFile, Channels.newOutputStream(outputChannel)); } else { Files.asByteSource(classpathElementFile).copyTo(Channels.newOutputStream(outputChannel)); } } /** * Holds the metadata necessary to stage a file or confirm that a staged file has not changed. */ static class PackageAttributes { private final boolean directory; private final long size; private final String hash; private DataflowPackage dataflowPackage; public PackageAttributes(long size, String hash, boolean directory, DataflowPackage dataflowPackage) { this.size = size; this.hash = Objects.requireNonNull(hash, "hash"); this.directory = directory; this.dataflowPackage = Objects.requireNonNull(dataflowPackage, "dataflowPackage"); } /** * @return the dataflowPackage */ public DataflowPackage getDataflowPackage() { return dataflowPackage; } /** * @return the directory */ public boolean isDirectory() { return directory; } /** * @return the size */ public long getSize() { return size; } /** * @return the hash */ public String getHash() { return hash; } } }