/*******************************************************************************
 * Copyright 2015, The IKANOW Open Source Project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package com.ikanow.aleph2.analytics.storm.utils;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.InputStream;
import java.util.Collection;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;

import org.apache.commons.io.FileUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.yaml.snakeyaml.Yaml;

import backtype.storm.generated.AlreadyAliveException;
import backtype.storm.generated.ClusterSummary;
import backtype.storm.generated.StormTopology;
import backtype.storm.generated.TopologyInfo;

import com.google.common.collect.Sets;
import com.ikanow.aleph2.analytics.storm.data_model.IStormController;
import com.ikanow.aleph2.analytics.storm.services.LocalStormController;
import com.ikanow.aleph2.analytics.storm.services.RemoteStormController;
import com.ikanow.aleph2.core.shared.utils.JarBuilderUtil;
import com.ikanow.aleph2.core.shared.utils.LiveInjector;
import com.ikanow.aleph2.data_model.objects.data_import.DataBucketBean;
import com.ikanow.aleph2.data_model.objects.shared.BasicMessageBean;
import com.ikanow.aleph2.data_model.objects.shared.GlobalPropertiesBean;
import com.ikanow.aleph2.data_model.utils.BucketUtils;
import com.ikanow.aleph2.data_model.utils.Lambdas;
import com.ikanow.aleph2.data_model.utils.ModuleUtils;
import com.ikanow.aleph2.data_model.utils.Optionals;
import com.ikanow.aleph2.data_model.utils.Tuples;

/** Factory for returning a local or remote storm controller.
 * 
 *  Also contains static functions for using that cluster to perform various actions.
 * 
 * @author Burch
 */
public class StormControllerUtil {
	private static final Logger _logger = LogManager.getLogger();
	private final static Set<String> dirs_to_ignore = Sets.newHashSet("org/slf4j", "org/apache/log4j");
	protected final static ConcurrentHashMap<String, Date> storm_topology_jars_cache = new ConcurrentHashMap<>();
	protected final static long MAX_RETRIES = 60; // 60 retries at 1s each == 1m max retry time
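	// A minimal usage sketch (an illustration, not from the original source): obtain a
	// controller, then drive jobs through the static helpers below. The job name
	// "my_topology" and the topology/config variables are hypothetical placeholders;
	// for a local controller the jar location can be null (see submitJob below).
	//
	//   IStormController storm = StormControllerUtil.getLocalStormController();
	//   StormControllerUtil.submitJob(storm, "my_topology", null, topology, config_override);
	//   TopologyInfo stats = StormControllerUtil.getJobStats(storm, "my_topology");
	//   StormControllerUtil.stopJob(storm, "my_topology");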
	/** Returns an instance of a local storm controller.
	 * 
	 * @return a local storm controller
	 */
	public static IStormController getLocalStormController() {
		return new LocalStormController();
	}
	
	/** Returns an instance of a remote storm controller pointed at the given nimbus server.
	 *  For storm_thrift_transport_plugin we typically use "backtype.storm.security.auth.SimpleTransportPlugin".
	 * 
	 * @param nimbus_host
	 * @param nimbus_thrift_port
	 * @param storm_thrift_transport_plugin
	 * @return a remote storm controller
	 */
	public static IStormController getRemoteStormController(String nimbus_host, int nimbus_thrift_port, String storm_thrift_transport_plugin) {
		return new RemoteStormController(nimbus_host, nimbus_thrift_port, storm_thrift_transport_plugin);
	}
	
	/** Returns an instance of a remote storm controller pointed at the given config.
	 * 
	 * @param config
	 * @return a remote storm controller
	 */
	public static IStormController getRemoteStormController(Map<String, Object> config) {
		return new RemoteStormController(config);
	}
	
	/** Submits a job to the given storm cluster. When submitting to a local cluster, input_jar_location
	 *  can be null (it won't be used); when submitting remotely it should be the local file path
	 *  of the jar to be submitted.
	 * 
	 *  To check the status of a job, call getJobStats.
	 * 
	 * @param storm_controller
	 * @param job_name
	 * @param input_jar_location
	 * @param topology
	 * @param config_override
	 * @throws Exception
	 */
	public static void submitJob(IStormController storm_controller, String job_name, String input_jar_location, StormTopology topology, Map<String, Object> config_override) throws Exception {
		storm_controller.submitJob(job_name, input_jar_location, topology, config_override);
	}
	
	/** Stops a job on the storm cluster with the given job_name; the status of the stop can
	 *  be checked via getJobStats.
	 * 
	 * @param storm_controller
	 * @param job_name
	 * @throws Exception
	 */
	public static void stopJob(IStormController storm_controller, String job_name) throws Exception {
		storm_controller.stopJob(job_name);
	}
	
	/** Returns the job statistics for the job running on the storm cluster with the given job_name.
	 * 
	 * @param storm_controller
	 * @param job_name
	 * @return the topology info for the named job
	 * @throws Exception
	 */
	public static TopologyInfo getJobStats(IStormController storm_controller, String job_name) throws Exception {
		return storm_controller.getJobStats(job_name);
	}
	
	/** Util function to create a storm jar at the given location. This
	 *  can be used to create the jar to pass to submitJob.
	 * 
	 *  Note: do not pass the storm library to this function if you intend to use the result
	 *  to submit a storm job - storm jobs cannot contain that library.
	 * 
	 *  Also note: JarBuilderUtil merges files in order, so if jars_to_merge[0] contains
	 *  a file at the same location as one in jars_to_merge[1], only the copy from
	 *  jars_to_merge[0] will exist in the final jar.
	 * 
	 * @param jars_to_merge
	 * @param input_jar_location location of the jar to send
	 * @return true if the jar was built successfully, false otherwise
	 */
	public static boolean buildStormTopologyJar(final Collection<String> jars_to_merge, final String input_jar_location) {
		try {
			_logger.debug("creating jar to submit at: " + input_jar_location);
			//final String input_jar_location = System.getProperty("java.io.tmpdir") + File.separator + UuidUtils.get().getTimeBasedUuid() + ".jar";
			JarBuilderUtil.mergeJars(jars_to_merge, input_jar_location, dirs_to_ignore);
			return true;
		}
		catch (Exception e) {
			_logger.error(ErrorUtils.getLongForm("Error building storm jar {0}", e));
			return false;
		}
	}
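	// Sketch of building a submittable jar (the paths here are hypothetical, not from the
	// original source). Per the notes above, earlier jars win on conflicting entries and the
	// storm library itself must not be included:
	//
	//   final Collection<String> jars = Arrays.asList("/opt/aleph2/lib/aleph2_core.jar",
	//                                                 "/opt/user/lib/my_topology_lib.jar");
	//   final String out_path = System.getProperty("java.io.tmpdir") + File.separator + "topology.jar";
	//   if (StormControllerUtil.buildStormTopologyJar(jars, out_path)) {
	//       StormControllerUtil.submitJob(remote_controller, "my_job", out_path, topology, config_override);
	//   }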
	/** Returns a remote storm controller given a yarn config directory. Will look for
	 *  the storm config at yarn_config_dir/storm.yaml
	 * 
	 * @param yarn_config_dir
	 * @return a remote storm controller
	 * @throws FileNotFoundException
	 */
	public static IStormController getStormControllerFromYarnConfig(String yarn_config_dir) throws FileNotFoundException {
		Yaml yaml = new Yaml();
		InputStream input = new FileInputStream(new File(yarn_config_dir + File.separator + "storm.yaml"));
		@SuppressWarnings("unchecked")
		Map<String, Object> object = (Map<String, Object>) yaml.load(input);
		IStormController storm = getRemoteStormController(object);
		return storm;
	}
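	// For illustration only (assumed values, not from the original source): a minimal
	// storm.yaml of the kind parsed above, using standard Storm client settings and the
	// transport plugin mentioned earlier:
	//
	//   nimbus.host: "nimbus.example.com"
	//   nimbus.thrift.port: 6627
	//   storm.thrift.transport: "backtype.storm.security.auth.SimpleTransportPlugin"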
	/** Starts up a storm job.
	 * 
	 *  1. Gets the storm instance (passed in as storm_controller)
	 *  2. Makes a mega jar consisting of:
	 *     A. Underlying artefacts (system libs)
	 *     B. User supplied libraries
	 *  3. Submits the mega jar to storm with a job name derived from the bucket id
	 * 
	 * @param storm_controller
	 * @param bucket
	 * @param sub_job
	 * @param underlying_artefacts
	 * @param user_lib_paths
	 * @param topology
	 * @param config
	 * @param cached_jar_dir
	 * @return a future containing the submission outcome
	 */
	public static CompletableFuture<BasicMessageBean> startJob(final IStormController storm_controller, final DataBucketBean bucket,
			final Optional<String> sub_job, final Collection<Object> underlying_artefacts, final Collection<String> user_lib_paths,
			final StormTopology topology, final Map<String, String> config, final String cached_jar_dir)
	{
		if (null == topology) {
			return CompletableFuture.completedFuture(ErrorUtils.buildErrorMessage(StormControllerUtil.class, "startJob",
					ErrorUtils.TOPOLOGY_NULL_ERROR, bucket.full_name()));
		}
		
		_logger.info("Retrieved user Storm config topology: spouts=" + topology.get_spouts_size() + " bolts="
				+ topology.get_bolts_size() + " configs=" + config.toString());
		
		final Set<String> jars_to_merge = new TreeSet<String>();
		
		final CompletableFuture<String> jar_future = Lambdas.get(() -> {
			if (RemoteStormController.class.isAssignableFrom(storm_controller.getClass())) {
				// (This is only necessary in the remote case)
				
				jars_to_merge.addAll(underlying_artefacts.stream()
						.map(artefact -> LiveInjector.findPathJar(artefact.getClass(), ""))
						.filter(f -> !f.equals("")).collect(Collectors.toSet()));
				
				if (jars_to_merge.isEmpty()) { // special case: no aleph2 libs found, this is almost certainly because this is being run from eclipse...
					final GlobalPropertiesBean globals = ModuleUtils.getGlobalProperties();
					_logger.warn("WARNING: no library files found, probably because this is running from an IDE - instead taking all JARs from: "
							+ (globals.local_root_dir() + "/lib/"));
					try {
						// ... and LiveInjector doesn't work on classes ... as a backup just copy everything from "<LOCAL_ALEPH2_HOME>/lib" in there
						jars_to_merge.addAll(FileUtils
								.listFiles(new File(globals.local_root_dir() + "/lib/"), new String[] { "jar" }, false)
								.stream().map(File::toString).collect(Collectors.toList()));
					}
					catch (Exception e) {
						throw new RuntimeException("In eclipse/IDE mode, directory not found: " + (globals.local_root_dir() + "/lib/"));
					}
				}
				// add in the user libs
				jars_to_merge.addAll(user_lib_paths);
				
				// create the jar
				return buildOrReturnCachedStormTopologyJar(jars_to_merge, cached_jar_dir);
			}
			else {
				return CompletableFuture.completedFuture("/unused/dummy.jar");
			}
		});
		
		// submit to storm
		@SuppressWarnings("unchecked")
		final CompletableFuture<BasicMessageBean> submit_future = Lambdas.get(() -> {
			long retries = 0;
			while (retries < MAX_RETRIES) {
				try {
					_logger.debug("Trying to submit job, try: " + retries + " of " + MAX_RETRIES);
					final String jar_file_location = jar_future.get();
					return storm_controller.submitJob(bucketPathToTopologyName(bucket, sub_job), jar_file_location, topology,
							(Map<String, Object>) (Map<String, ?>) config);
				}
				catch (Exception ex) {
					if (ex instanceof AlreadyAliveException) {
						retries++;
						// sleep 1s - was seeing about 2s of sleep required before a job successfully submitted on restart
						try {
							Thread.sleep(1000);
						}
						catch (Exception e) {
							final CompletableFuture<BasicMessageBean> error_future = new CompletableFuture<BasicMessageBean>();
							error_future.completeExceptionally(e);
							return error_future;
						}
					}
					else {
						retries = MAX_RETRIES; // we threw some other exception, bail out
						final CompletableFuture<BasicMessageBean> error_future = new CompletableFuture<BasicMessageBean>();
						error_future.completeExceptionally(ex);
						return error_future;
					}
				}
			}
			// we maxed out our retries, throw failure
			final CompletableFuture<BasicMessageBean> error_future = new CompletableFuture<BasicMessageBean>();
			error_future.completeExceptionally(new Exception(
					"Error submitting job, ran out of retries (previous (same name) job is probably still alive)"));
			return error_future;
		});
		return submit_future;
	}
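	// Illustrative call (variable names are assumed placeholders): start a topology for a
	// bucket and block on the returned future for the submission outcome:
	//
	//   final CompletableFuture<BasicMessageBean> result = StormControllerUtil.startJob(
	//       storm_controller, bucket, Optional.empty(), underlying_artefacts, user_lib_paths,
	//       topology, config, "/tmp/aleph2_jar_cache");
	//   if (!result.get().success()) { /* handle submission failure */ }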
	/** Checks the jar cache to see if an entry already exists for this list of jars;
	 *  returns the path of that entry if it does exist, otherwise creates the jar, adds
	 *  the path to the cache and returns it.
	 * 
	 * @param jars_to_merge
	 * @param cached_jar_dir
	 * @return a future containing the path of the (possibly cached) jar
	 */
	public static synchronized CompletableFuture<String> buildOrReturnCachedStormTopologyJar(final Collection<String> jars_to_merge, final String cached_jar_dir) {
		CompletableFuture<String> future = new CompletableFuture<String>();
		final String hashed_jar_name = JarBuilderUtil.getHashedJarName(jars_to_merge, cached_jar_dir);
		
		// 1. check cache for this jar via hash of jar names
		if (storm_topology_jars_cache.containsKey(hashed_jar_name)) {
			// 2. if it exists: validate that the jars have not been updated
			Date most_recent_update = JarBuilderUtil.getMostRecentlyUpdatedFile(jars_to_merge);
			// if the cache is more recent than any of the files, we assume nothing has been updated
			if (storm_topology_jars_cache.get(hashed_jar_name).getTime() > most_recent_update.getTime()) {
				// return the cached jar file path
				_logger.debug("Returning a cached copy of the jar");
				// update the cached copy's modified time to now so we don't clean it up
				JarBuilderUtil.updateJarModifiedTime(hashed_jar_name);
				future.complete(hashed_jar_name);
				return future;
			}
			else {
				// delete the cached copy
				_logger.debug("Removing an expired cached copy of the jar");
				removeCachedJar(hashed_jar_name);
			}
		}
		
		// 3. (if we fall through) create the jar
		_logger.debug("Fell through or cache copy is old, have to create a new version");
		if (buildStormTopologyJar(jars_to_merge, hashed_jar_name)) {
			// 4. add the jar to the cache with the current (newest) file timestamp
			storm_topology_jars_cache.put(hashed_jar_name, new Date());
			// return the new jar file path
			future.complete(hashed_jar_name);
		}
		else {
			// had an error creating the jar, fail the future
			future.completeExceptionally(new Exception("Error trying to create storm jar, see logs"));
		}
		return future;
	}
	
	/** Removes the given file from the cache, and from local disk if it exists.
	 * 
	 * @param hashed_jar_name
	 */
	private static void removeCachedJar(String hashed_jar_name) {
		storm_topology_jars_cache.remove(hashed_jar_name);
		File hashed_file = new File(hashed_jar_name);
		if (hashed_file.exists()) {
			hashed_file.delete();
		}
	}
	
	/** Stops a storm job, using the bucket id to find the job to stop.
	 * 
	 * @param storm_controller
	 * @param bucket
	 * @param sub_job
	 * @return a future containing the outcome of the stop request
	 */
	public static CompletableFuture<BasicMessageBean> stopJob(IStormController storm_controller, DataBucketBean bucket, final Optional<String> sub_job) {
		CompletableFuture<BasicMessageBean> stop_future = new CompletableFuture<BasicMessageBean>();
		try {
			storm_controller.stopJob(bucketPathToTopologyName(bucket, sub_job));
		}
		catch (Exception ex) {
			stop_future.complete(ErrorUtils.buildErrorMessage(StormControllerUtil.class, "stopJob",
					ErrorUtils.getLongForm("Error stopping storm job: {0}", ex)));
			return stop_future;
		}
		stop_future.complete(ErrorUtils.buildSuccessMessage(StormControllerUtil.class, "stopJob", "Stopped storm job successfully"));
		return stop_future;
	}
	
	/** Restarts a storm job by first calling stop, then calling start.
	 * 
	 * @param storm_controller
	 * @param bucket
	 * @param sub_job
	 * @param underlying_artefacts
	 * @param user_lib_paths
	 * @param topology
	 * @param config
	 * @param cached_jar_dir
	 * @return a future containing the outcome of the restart
	 */
	public static CompletableFuture<BasicMessageBean> restartJob(final IStormController storm_controller, final DataBucketBean bucket,
			final Optional<String> sub_job, final Collection<Object> underlying_artefacts, final Collection<String> user_lib_paths,
			final StormTopology topology, final Map<String, String> config, final String cached_jar_dir)
	{
		CompletableFuture<BasicMessageBean> stop_future = stopJob(storm_controller, bucket, sub_job);
		try {
			stop_future.get(5, TimeUnit.SECONDS);
			waitForJobToDie(storm_controller, bucket, sub_job, 30L);
		}
		catch (Exception e) {
			CompletableFuture<BasicMessageBean> error_future = new CompletableFuture<BasicMessageBean>();
			error_future.complete(ErrorUtils.buildErrorMessage(StormControllerUtil.class, "restartJob",
					ErrorUtils.getLongForm("Error stopping storm job: {0}", e)));
			return error_future;
		}
		return startJob(storm_controller, bucket, sub_job, underlying_artefacts, user_lib_paths, topology, config, cached_jar_dir);
	}
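	// Caching behavior sketch (paths are hypothetical, not from the original source): repeated
	// calls with the same jar list reuse the hashed jar on disk until one of the input jars
	// becomes newer than the cache entry:
	//
	//   CompletableFuture<String> jar1 = buildOrReturnCachedStormTopologyJar(jars, "/tmp/jar_cache"); // builds the jar
	//   CompletableFuture<String> jar2 = buildOrReturnCachedStormTopologyJar(jars, "/tmp/jar_cache"); // cache hit
	//   // jar1.get().equals(jar2.get()) - both resolve to the same hashed path under /tmp/jar_cache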
	/** Continually checks whether a job has died: returns once it has, or once the timeout
	 *  (seconds_to_wait) elapses.
	 * 
	 * @param storm_controller
	 * @param bucket
	 * @param sub_job
	 * @param seconds_to_wait
	 * @throws Exception
	 */
	public static void waitForJobToDie(IStormController storm_controller, DataBucketBean bucket, final Optional<String> sub_job, long seconds_to_wait) throws Exception {
		long start_time = System.currentTimeMillis();
		long num_tries = 0;
		long expire_time = System.currentTimeMillis() + (seconds_to_wait * 1000);
		while (System.currentTimeMillis() < expire_time) {
			TopologyInfo info = null;
			try {
				info = getJobStats(storm_controller, bucketPathToTopologyName(bucket, sub_job));
			}
			catch (Exception ex) {}
			if (null == info) {
				_logger.debug("JOB_STATUS: no longer exists, assuming that job is dead and gone, spent: "
						+ (System.currentTimeMillis() - start_time) + "ms waiting");
				return;
			}
			num_tries++;
			_logger.debug("Waiting for job status to go away, try number: " + num_tries);
			Thread.sleep(2000); // wait 2s between checks - in tests it was taking ~8s to clear
		}
	}
	
	/** Converts a bucket's path to a usable topology name.
	 *  This is a one-way conversion, i.e. it can't be converted back.
	 *  Uses the standard format:
	 *    name1_name2_name3[_&lt;subjob&gt;]__&lt;uuid&gt;
	 *  All &lt;&gt;s are truncated and normalized.
	 *  The uuid does not include the sub-job, i.e. it can be used to match on all sub-jobs of a job.
	 * 
	 * @param bucket
	 * @param sub_job
	 * @return the unique topology name for this bucket (and optional sub-job)
	 */
	public static String bucketPathToTopologyName(final DataBucketBean bucket, Optional<String> sub_job) {
		return BucketUtils.getUniqueSignature(bucket.full_name(), sub_job);
	}
	
	/** Stops all jobs corresponding to a given bucket.
	 * 
	 * @param storm_controller
	 * @param bucket
	 */
	public static void stopAllJobsForBucket(IStormController storm_controller, DataBucketBean bucket) {
		final List<String> jobs = storm_controller.getJobNamesForBucket(bucket.full_name());
		jobs.forEach(job -> {
			storm_controller.stopJob(job);
		});
	}
	
	/** Gets a list of (aleph2-side) job names for a given bucket.
	 * 
	 * @param bucket_path
	 * @param cluster_summary
	 * @return the list of matching topology names
	 */
	public static List<String> getJobNamesForBucket(String bucket_path, final ClusterSummary cluster_summary) {
		final String base_bucket_sig = BucketUtils.getUniqueSignature(bucket_path, Optional.empty());
		final String bucket_uuid = Optional.of(base_bucket_sig).map(s -> s.substring(s.lastIndexOf("__"))).get();
		final String start_of_bucket_sig = Optional.of(base_bucket_sig).map(s -> s.substring(0, s.lastIndexOf("__"))).get();
		
		// Has to start with the bucket path and end with the UUID, but the "middle" (i.e. the job name) can be anything
		return Optionals.streamOf(cluster_summary.get_topologies_iterator(), false)
				.map(top_summary -> top_summary.get_name())
				.filter(top_summary -> top_summary.startsWith(start_of_bucket_sig))
				.map(top_summary -> Tuples._2T(top_summary, top_summary.indexOf(bucket_uuid)))
				.filter(t2 -> (t2._2() > 0))
				.map(t2 -> t2._1().substring(0, t2._2() + bucket_uuid.length()))
				.collect(Collectors.toList());
	}
}
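// Naming sketch (hypothetical values, not from the original source): for a bucket
// "/test/bucket1", the base signature produced by bucketPathToTopologyName might look like
// "test_bucket1__<uuid>", so getJobNamesForBucket matches any topology named
// "test_bucket1[_<subjob>]__<uuid>" - i.e. the same prefix and the same trailing uuid,
// with any sub-job name in the middle.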