Java tutorial: working with Hadoop's distributed cache (org.apache.crunch.util.DistCache)

The listing below is the DistCache utility class from Apache Crunch. It wraps Hadoop's DistributedCache API for two purposes: shipping job-specific serialized objects to tasks, and adding library jars to the classpath of MapReduce tasks.
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.crunch.util;

import java.io.File;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.net.URI;
import java.net.URL;
import java.net.URLDecoder;
import java.util.Enumeration;

import org.apache.crunch.CrunchRuntimeException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

/**
 * Provides functions for working with Hadoop's distributed cache. These include:
 * <ul>
 * <li>Functions for working with a job-specific distributed cache of objects, like the
 * serialized runtime nodes in a MapReduce.</li>
 * <li>Functions for adding library jars to the distributed cache, which will be added to the
 * classpath of MapReduce tasks.</li>
 * </ul>
 */
public class DistCache {

  // Configuration key holding the paths of jars to export to the distributed cache.
  private static final String TMPJARS_KEY = "tmpjars";

  public static void write(Configuration conf, Path path, Object value) throws IOException {
    ObjectOutputStream oos = new ObjectOutputStream(path.getFileSystem(conf).create(path));
    oos.writeObject(value);
    oos.close();

    DistributedCache.addCacheFile(path.toUri(), conf);
  }

  public static Object read(Configuration conf, Path path) throws IOException {
    URI target = null;
    for (URI uri : DistributedCache.getCacheFiles(conf)) {
      if (uri.toString().equals(path.toString())) {
        target = uri;
        break;
      }
    }
    Object value = null;
    if (target != null) {
      Path targetPath = new Path(target.toString());
      ObjectInputStream ois = new ObjectInputStream(targetPath.getFileSystem(conf).open(targetPath));
      try {
        value = ois.readObject();
      } catch (ClassNotFoundException e) {
        throw new CrunchRuntimeException(e);
      }
      ois.close();
    }
    return value;
  }

  public static void addCacheFile(Path path, Configuration conf) {
    DistributedCache.addCacheFile(path.toUri(), conf);
  }

  public static Path getPathToCacheFile(Path path, Configuration conf) {
    try {
      for (Path localPath : DistributedCache.getLocalCacheFiles(conf)) {
        if (localPath.toString().endsWith(path.getName())) {
          return localPath.makeQualified(FileSystem.getLocal(conf));
        }
      }
    } catch (IOException e) {
      throw new CrunchRuntimeException(e);
    }
    return null;
  }

  /**
   * Adds the specified jar to the distributed cache of jobs using the provided configuration.
   * The jar will be placed on the classpath of tasks run by the job.
   *
   * @param conf
   *          The configuration used to add the jar to the distributed cache.
   * @param jarFile
   *          The jar file to add to the distributed cache.
   * @throws IOException
   *           If the jar file does not exist or there is a problem accessing the file.
   */
  public static void addJarToDistributedCache(Configuration conf, File jarFile) throws IOException {
    if (!jarFile.exists()) {
      throw new IOException("Jar file: " + jarFile.getCanonicalPath() + " does not exist.");
    }
    if (!jarFile.getName().endsWith(".jar")) {
      throw new IllegalArgumentException("File: " + jarFile.getCanonicalPath() + " is not a .jar "
          + "file.");
    }

    // Get a qualified path for the jar.
    FileSystem fileSystem = FileSystem.getLocal(conf);
    Path jarPath = new Path(jarFile.getCanonicalPath());
    String qualifiedPath = jarPath.makeQualified(fileSystem).toString();

    // Add the jar to the configuration variable.
    String jarConfiguration = conf.get(TMPJARS_KEY, "");
    if (!jarConfiguration.isEmpty()) {
      jarConfiguration += ",";
    }
    jarConfiguration += qualifiedPath;
    conf.set(TMPJARS_KEY, jarConfiguration);
  }

  /**
   * Adds the jar at the specified path to the distributed cache of jobs using the provided
   * configuration. The jar will be placed on the classpath of tasks run by the job.
   *
   * @param conf
   *          The configuration used to add the jar to the distributed cache.
   * @param jarFile
   *          The path to the jar file to add to the distributed cache.
   * @throws IOException
   *           If the jar file does not exist or there is a problem accessing the file.
   */
  public static void addJarToDistributedCache(Configuration conf, String jarFile) throws IOException {
    addJarToDistributedCache(conf, new File(jarFile));
  }

  /**
   * Finds the path to a jar that contains the class provided, if any. There is no guarantee that
   * the jar returned will be the first on the classpath to contain the file. This method is
   * basically lifted out of Hadoop's {@link org.apache.hadoop.mapred.JobConf} class.
   *
   * @param jarClass
   *          The class the jar file should contain.
   * @return The path to a jar file that contains the class, or <code>null</code> if no such jar
   *         exists.
   * @throws IOException
   *           If there is a problem searching for the jar file.
   */
  public static String findContainingJar(Class<?> jarClass) throws IOException {
    ClassLoader loader = jarClass.getClassLoader();
    String classFile = jarClass.getName().replaceAll("\\.", "/") + ".class";
    for (Enumeration<URL> itr = loader.getResources(classFile); itr.hasMoreElements();) {
      URL url = itr.nextElement();
      if ("jar".equals(url.getProtocol())) {
        String toReturn = url.getPath();
        if (toReturn.startsWith("file:")) {
          toReturn = toReturn.substring("file:".length());
        }
        // URLDecoder is a misnamed class, since it actually decodes
        // x-www-form-urlencoded MIME type rather than actual
        // URL encoding (which the file path has). Therefore it would
        // decode +s to ' 's which is incorrect (spaces are actually
        // either unencoded or encoded as "%20"). Replace +s first, so
        // that they are kept sacred during the decoding process.
        toReturn = toReturn.replaceAll("\\+", "%2B");
        toReturn = URLDecoder.decode(toReturn, "UTF-8");
        return toReturn.replaceAll("!.*$", "");
      }
    }
    return null;
  }

  /**
   * Adds all jars under the specified directory to the distributed cache of jobs using the
   * provided configuration. The jars will be placed on the classpath of tasks run by the job.
   * This method does not descend into subdirectories when adding jars.
   *
   * @param conf
   *          The configuration used to add jars to the distributed cache.
   * @param jarDirectory
   *          A directory containing jar files to add to the distributed cache.
   * @throws IOException
   *           If the directory does not exist or there is a problem accessing the directory.
   */
  public static void addJarDirToDistributedCache(Configuration conf, File jarDirectory) throws IOException {
    if (!jarDirectory.exists() || !jarDirectory.isDirectory()) {
      throw new IOException("Jar directory: " + jarDirectory.getCanonicalPath() + " does not "
          + "exist or is not a directory.");
    }
    for (File file : jarDirectory.listFiles()) {
      if (!file.isDirectory() && file.getName().endsWith(".jar")) {
        addJarToDistributedCache(conf, file);
      }
    }
  }

  /**
   * Adds all jars under the directory at the specified path to the distributed cache of jobs
   * using the provided configuration. The jars will be placed on the classpath of the tasks run
   * by the job. This method does not descend into subdirectories when adding jars.
   *
   * @param conf
   *          The configuration used to add jars to the distributed cache.
   * @param jarDirectory
   *          The path to a directory containing jar files to add to the distributed cache.
   * @throws IOException
   *           If the directory does not exist or there is a problem accessing the directory.
   */
  public static void addJarDirToDistributedCache(Configuration conf, String jarDirectory) throws IOException {
    addJarDirToDistributedCache(conf, new File(jarDirectory));
  }
}
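To see the object-caching half of the class in action, here is a minimal sketch, not part of the Crunch sources, that writes a small serializable value into the distributed cache and reads it back with the same Configuration. The JobSettings class and the /tmp path are hypothetical; a real job would choose a job-specific path on its own file system, call write() in the driver, and call read() from a task (for example in a Mapper's setup method).

import java.io.IOException;
import java.io.Serializable;

import org.apache.crunch.util.DistCache;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

public class ObjectCacheExample {

  // The value must be serializable, since DistCache.write() uses ObjectOutputStream.
  public static class JobSettings implements Serializable {
    public int maxRetries = 3;
  }

  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    // Hypothetical job-specific path for the serialized object.
    Path cachePath = new Path("/tmp/crunch-example/job-settings.ser");

    // Driver side: serialize the object and register it in the distributed cache.
    DistCache.write(conf, cachePath, new JobSettings());

    // Task side: look the entry up by path and deserialize it.
    JobSettings settings = (JobSettings) DistCache.read(conf, cachePath);
    System.out.println("maxRetries = " + settings.maxRetries);
  }
}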
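addCacheFile and getPathToCacheFile pair up the same way for files that should be copied to every task node rather than deserialized. The sketch below splits the two halves into separate methods because getPathToCacheFile matches against the localized cache files, which only exist inside a running task; the lookup file path is hypothetical.

import org.apache.crunch.util.DistCache;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

public class SideFileExample {

  // Hypothetical lookup file that every task needs to read locally.
  private static final Path LOOKUP_FILE = new Path("/data/lookup/countries.txt");

  // Driver side: register the file so the framework copies it to each task node.
  public static void configureJob(Configuration conf) {
    DistCache.addCacheFile(LOOKUP_FILE, conf);
  }

  // Task side (e.g. in a Mapper's setup()): resolve the node-local copy.
  // The helper matches on the file name against the localized cache files.
  public static Path locateInTask(Configuration conf) {
    return DistCache.getPathToCacheFile(LOOKUP_FILE, conf);
  }
}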
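The jar-shipping helpers only manipulate the job configuration, so they can be exercised from any driver program. The sketch below is an illustration under assumed locations: the /opt/myapp/lib jars are hypothetical and must exist locally, since addJarToDistributedCache verifies the file before appending its qualified path to the "tmpjars" key (the same key Hadoop's -libjars option populates).

import java.io.IOException;

import org.apache.crunch.util.DistCache;
import org.apache.hadoop.conf.Configuration;

public class JarShippingExample {

  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();

    // Ship a single jar; its qualified local path is appended to "tmpjars".
    DistCache.addJarToDistributedCache(conf, "/opt/myapp/lib/extra-codecs.jar");

    // Ship every .jar directly under a directory (subdirectories are skipped).
    DistCache.addJarDirToDistributedCache(conf, "/opt/myapp/lib");

    // Ship whichever jar on the classpath contains a given class, if one is found.
    String crunchJar = DistCache.findContainingJar(DistCache.class);
    if (crunchJar != null) {
      DistCache.addJarToDistributedCache(conf, crunchJar);
    }

    System.out.println("tmpjars = " + conf.get("tmpjars"));
  }
}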