Java tutorial
/** * Copyright 2015 Cerner Corporation. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.kitesdk.apps.spi.oozie; import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.Maps; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.joda.time.DateTimeZone; import org.joda.time.Instant; import org.joda.time.format.DateTimeFormat; import org.joda.time.format.DateTimeFormatter; import org.kitesdk.apps.AppContext; import org.kitesdk.apps.DataIn; import org.kitesdk.apps.DataOut; import org.kitesdk.apps.scheduled.Schedule; import org.codehaus.plexus.util.WriterFactory; import org.codehaus.plexus.util.xml.PrettyPrintXMLWriter; import org.codehaus.plexus.util.xml.XMLWriter; import org.codehaus.plexus.util.xml.XmlStreamWriter; import org.kitesdk.apps.spi.jobs.JobManagers; import org.kitesdk.apps.spi.jobs.SchedulableJobManager; import org.kitesdk.data.Datasets; import org.kitesdk.data.View; import java.io.IOException; import java.io.OutputStream; import java.net.URI; import java.util.Collection; import java.util.List; import java.util.Map; /** * Support for creating and using Oozie workflows. */ public class OozieScheduling { static final String OOZIE_COORD_NS = "uri:oozie:coordinator:0.4"; static final String OOZIE_WORKFLOW_NS = "uri:oozie:workflow:0.5"; static final String OOZIE_BUNDLE_NS = "uri:oozie:bundle:0.2"; static final String OOZIE_SPARK_NS = "uri:oozie:spark-action:0.1"; private static final String WORKFLOW_ELEMENT = "workflow-app"; private static final String COORDINATOR_ELEMENT = "coordinator-app"; private static final String BUNDLE_ELEMENT = "bundle-app"; private static final DateTimeFormatter formatter = DateTimeFormat.forPattern("yyyy-MM-dd'T'HH:mm'Z'") .withZone(DateTimeZone.UTC); private static final String COORD_NOMINAL_TIME = "coordNominalTime"; private static final String WORKFLOW_NOMINAL_TIME = "workflowNominalTime"; private static final String HIVE_METASTORE_URIS = "hive.metastore.uris"; /** * Relative path to where Oozie workflows are stored. */ private static final String WORKFLOW_DIR = "oozie/workflows"; /** * Relative path to where Oozie coordinators are stored. */ private static final String COORD_DIR = "oozie/coordinators"; /** * Returns the relative path for the workflow generated for the schedule. */ public static String workflowPath(Schedule schedule) { return WORKFLOW_DIR + "/" + schedule.getName(); } /** * Returns the relative path for the coordinator generated for the schedule. */ public static String coordPath(Schedule schedule) { return COORD_DIR + "/" + schedule.getName(); } public static Instant getNominalTime(Configuration conf) { String timeString = conf.get(WORKFLOW_NOMINAL_TIME); if (timeString == null) { return null; } return new Instant(formatter.parseMillis(timeString)); } public static void element(XMLWriter writer, String name, String attributeName, String attributeValue) { writer.startElement(name); writer.addAttribute(attributeName, attributeValue); writer.endElement(); } public static void element(XMLWriter writer, String name, String text) { writer.startElement(name); writer.writeText(text); writer.endElement(); } /** * Generates an Oozie workflow to execute the job in the given schedule. * * @param schedule the schedule to write * @param context the application's context * @param output an output stream to which the workflow is written */ public static void writeWorkFlow(Schedule schedule, AppContext context, OutputStream output) throws IOException { XmlStreamWriter streamWriter = WriterFactory.newXmlWriter(output); PrettyPrintXMLWriter writer = new PrettyPrintXMLWriter(streamWriter); writer.startElement(WORKFLOW_ELEMENT); writer.addAttribute("name", schedule.getName()); writer.addAttribute("xmlns", OOZIE_WORKFLOW_NS); element(writer, "start", "to", schedule.getName()); writer.startElement("action"); writer.addAttribute("name", schedule.getName()); // Reduce retry attempts. TODO: make this configurable? writer.addAttribute("retry-max", "2"); writer.addAttribute("retry-interval", "1"); // Write the appropriate action to be used in the job. SchedulableJobManager manager = JobManagers.createSchedulable(schedule.getJobClass(), context); manager.writeOozieActionBlock(writer, schedule); element(writer, "ok", "to", "end"); element(writer, "error", "to", "kill"); writer.endElement(); // action writer.startElement("kill"); writer.addAttribute("name", "kill"); element(writer, "message", "Error in workflow for " + schedule.getName()); writer.endElement(); // kill element(writer, "end", "name", "end"); writer.endElement(); // workflow streamWriter.flush(); } private static final String toIdentifier(String name) { return name.replace(".", "_"); } /** * Qualifies the given URI template with metastore URI information * from the given configuration, if it doesn't exist. */ @VisibleForTesting static final String qualifyUri(Configuration conf, String uriTemplate) { // Create a URI that doesn't include the initial view: or dataset: prefix // and excludes the parameters. URI baseURI = URI.create(uriTemplate.substring(uriTemplate.indexOf(':') + 1, uriTemplate.indexOf('?'))); // We only need to qualify hive URIs that // are not already qualified with a host, // when there is a Hive metastore URI configured. if (!"hive".equals(baseURI.getScheme()) || baseURI.getHost() != null || conf.get(HIVE_METASTORE_URIS) == null) { return uriTemplate; } String[] uris = conf.get(HIVE_METASTORE_URIS).split(","); if (uris.length == 0) { return uriTemplate; } URI hiveUri = URI.create(uris[0]); String host = hiveUri.getHost(); int port = hiveUri.getPort(); // Recreate a template this is qualified with host and port information. String prefix = uriTemplate.startsWith("dataset") ? "dataset:hive:" : "view:hive:"; String rest = uriTemplate.substring(prefix.length()); return prefix + "//" + host + ":" + port + "/" + rest; } private static final void writeCoordinatorDatasets(XMLWriter writer, Schedule schedule, SchedulableJobManager manager) { writer.startElement("datasets"); for (Map.Entry<String, Schedule.ViewTemplate> entry : schedule.getViewTemplates().entrySet()) { Schedule.ViewTemplate template = entry.getValue(); writer.startElement("dataset"); writer.addAttribute("name", "ds_" + toIdentifier(entry.getKey())); // Write the frequency in Oozie's pre-cron format. This should be removed // when See https://issues.apache.org/jira/browse/OOZIE-1431 is available. writer.addAttribute("frequency", CronConverter.toFrequency(template.getFrequency())); Instant initialTime = CronConverter.nextInstant(template.getFrequency(), new Instant()); writer.addAttribute("initial-instance", formatter.print(initialTime)); writer.addAttribute("timezone", "UTC"); String qualifiedTemplate = qualifyUri(manager.getAppContext().getHadoopConf(), entry.getValue().getUriTemplate()); element(writer, "uri-template", qualifiedTemplate); // Don't createSchedulable a done flag. This may be something we can remove when // when using a URI handler aware of Kite. element(writer, "done-flag", ""); writer.endElement(); // dataset } writer.endElement(); // datasets Collection<DataIn> inputs = manager.getInputs().values(); if (!inputs.isEmpty()) { writer.startElement("input-events"); for (DataIn input : inputs) { writer.startElement("data-in"); writer.addAttribute("name", "datain_" + toIdentifier(input.name())); writer.addAttribute("dataset", "ds_" + toIdentifier(input.name())); element(writer, "instance", "${coord:current(0)}"); writer.endElement(); // data-in } writer.endElement(); // input-events } Collection<DataOut> outputs = manager.getOutputs().values(); if (!outputs.isEmpty()) { writer.startElement("output-events"); for (DataOut output : outputs) { writer.startElement("data-out"); writer.addAttribute("name", "dataout_" + toIdentifier(output.name())); writer.addAttribute("dataset", "ds_" + toIdentifier(output.name())); element(writer, "instance", "${coord:current(0)}"); writer.endElement(); // data-out } writer.endElement(); // output-events } } /** * Generates an Oozie coordinator XML for the given schedule. * * @param schedule the schedule for which a coordinator is to be written * @param manager the manager instance fot the scheduled job * @param output an output stream to which the generated schedule is written */ public static void writeCoordinator(Schedule schedule, SchedulableJobManager manager, OutputStream output) throws IOException { XmlStreamWriter streamWriter = WriterFactory.newXmlWriter(output); PrettyPrintXMLWriter writer = new PrettyPrintXMLWriter(streamWriter); String jobName = schedule.getJobClass().getCanonicalName(); writer.startElement(COORDINATOR_ELEMENT); writer.addAttribute("name", jobName); writer.addAttribute("xmlns", OOZIE_COORD_NS); writer.addAttribute("frequency", schedule.getFrequency()); writer.addAttribute("start", formatter.print(schedule.getStartTime())); writer.addAttribute("end", "3000-01-01T00:00Z"); writer.addAttribute("timezone", "UTC"); writeCoordinatorDatasets(writer, schedule, manager); writer.startElement("action"); writer.startElement("workflow"); element(writer, "app-path", "${kiteAppRoot}/" + workflowPath(schedule)); writer.startElement("configuration"); property(writer, "kiteAppRoot", "${kiteAppRoot}"); property(writer, COORD_NOMINAL_TIME, "${coord:nominalTime()}"); // Include the dataset inputs to make them visible to the workflow. for (DataIn dataIn : manager.getInputs().values()) { property(writer, "coord_" + toIdentifier(dataIn.name()), "${coord:dataIn('datain_" + toIdentifier(dataIn.name()) + "')}"); } for (DataOut dataOut : manager.getOutputs().values()) { property(writer, "coord_" + toIdentifier(dataOut.name()), "${coord:dataOut('dataout_" + toIdentifier(dataOut.name()) + "')}"); } writer.endElement(); // configuration writer.endElement(); // workflow writer.endElement(); // action writer.endElement(); // coordinator streamWriter.flush(); } /** * Writes a Hadoop-style configuration property. */ public static void property(XMLWriter writer, String name, String value) { writer.startElement("property"); element(writer, "name", name); element(writer, "value", value); writer.endElement(); } public static void writeBundle(Class appClass, AppContext context, Path appPath, List<Schedule> schedules, OutputStream output) throws IOException { Configuration conf = context.getHadoopConf(); XmlStreamWriter streamWriter = WriterFactory.newXmlWriter(output); PrettyPrintXMLWriter writer = new PrettyPrintXMLWriter(streamWriter); writer.startElement(BUNDLE_ELEMENT); writer.addAttribute("name", appClass.getCanonicalName()); writer.addAttribute("xmlns", OOZIE_BUNDLE_NS); writer.startElement("parameters"); // Default to the HDFS scheme for the root path if none is provided. Path qualifiedPath = appPath.toUri().getScheme() == null ? appPath.makeQualified(URI.create("hdfs:/"), appPath) : appPath; property(writer, "kiteAppRoot", qualifiedPath.toString()); property(writer, "oozie.libpath", "${kiteAppRoot}/lib"); property(writer, "nameNode", conf.get("fs.default.name")); String resourceManager = conf.get("yarn.resourcemanager.address"); // MR2 uses YARN for the job tracker, but some Hadoop deployments // don't have the resoure manager setting visible. We work around this // by grabbing the job tracker setting and swapping to the resource // manager port. // TODO: is there a better way to deal with this? if (resourceManager == null) { String jobTracker = conf.get("mapred.job.tracker"); if (jobTracker != null) resourceManager = jobTracker.replace("8021", "8032"); } if (resourceManager != null) property(writer, "jobTracker", resourceManager); // TODO: handle application configuration. // if (appConfigPath != null) // property(writer, "appConfigPath", appConfigPath.toString()); writer.endElement(); // parameters int i = 0; for (Schedule schedule : schedules) { writer.startElement("coordinator"); writer.addAttribute("name", schedule.getName()); element(writer, "app-path", "${kiteAppRoot}/" + coordPath(schedule)); writer.endElement(); // coordinator } writer.endElement(); // bundle streamWriter.flush(); } /** * Loads the Kite views to be passed to a job at runtime. * * @return A map of named inputs to the corresponding views */ public static Map<String, View> loadViews(SchedulableJobManager manager, Configuration conf) { Map<String, View> views = Maps.newHashMap(); Collection<DataIn> inputs = manager.getInputs().values(); for (DataIn input : inputs) { String uri = conf.get("wf_" + OozieScheduling.toIdentifier(input.name())); views.put(input.name(), Datasets.load(uri, input.type())); } Collection<DataOut> outputs = manager.getOutputs().values(); for (DataOut output : outputs) { String uri = conf.get("wf_" + OozieScheduling.toIdentifier(output.name())); views.put(output.name(), Datasets.load(uri, output.type())); } return views; } /** * Returns the settings to be passed to a job runner. */ public static Map<String, String> getJobSettings(Schedule schedule, Configuration conf) { Map<String, String> settings = Maps.newHashMap(); settings.put(WORKFLOW_NOMINAL_TIME, "${" + COORD_NOMINAL_TIME + "}"); // Write the Hive metastore URI, if available. It may be null // in local or testing environments. if (conf.get(HIVE_METASTORE_URIS) != null) { settings.put(HIVE_METASTORE_URIS, conf.get(HIVE_METASTORE_URIS)); } // Include the dataset inputs to make them visible to the workflow. for (String name : schedule.getViewTemplates().keySet()) { settings.put("wf_" + toIdentifier(name), "${coord_" + toIdentifier(name) + "}"); } return settings; } public static void writeJobConfiguration(XMLWriter writer, Schedule schedule, Configuration conf) { property(writer, WORKFLOW_NOMINAL_TIME, "${" + COORD_NOMINAL_TIME + "}"); // Write the Hive metastore URI, if available. It may be null // in local or testing environments. if (conf.get(HIVE_METASTORE_URIS) != null) { property(writer, HIVE_METASTORE_URIS, conf.get(HIVE_METASTORE_URIS)); } // Include the dataset inputs to make them visible to the workflow. for (String name : schedule.getViewTemplates().keySet()) { property(writer, "wf_" + toIdentifier(name), "${coord_" + toIdentifier(name) + "}"); } } }