List of usage examples for org.apache.hadoop.mapred.jobcontrol.Job#setJobName(String)
public void setJobName(String jobName)
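Before the real-world sources below, here is a minimal, self-contained sketch (not taken from either project) of the typical call site: each JobConf is wrapped in a jobcontrol Job, named with setJobName, chained with addDependingJob, and driven by JobControl. The input/output paths, the "stage-1"/"stage-2" names, and the NamedPipeline class are placeholders chosen for illustration.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.jobcontrol.Job;
import org.apache.hadoop.mapred.jobcontrol.JobControl;

public class NamedPipeline {
    public static void main(String[] args) throws Exception {
        // first stage: reads the raw input, writes an intermediate directory (placeholder paths)
        JobConf stage1Conf = new JobConf(NamedPipeline.class);
        FileInputFormat.setInputPaths(stage1Conf, new Path("/data/in"));
        FileOutputFormat.setOutputPath(stage1Conf, new Path("/data/stage1"));

        // second stage: consumes the first stage's output
        JobConf stage2Conf = new JobConf(NamedPipeline.class);
        FileInputFormat.setInputPaths(stage2Conf, new Path("/data/stage1"));
        FileOutputFormat.setOutputPath(stage2Conf, new Path("/data/out"));

        Job stage1 = new Job(stage1Conf);
        stage1.setJobName("stage-1");              // the method documented on this page

        Job stage2 = new Job(stage2Conf);
        stage2.setJobName("stage-2");
        stage2.addDependingJob(stage1);            // stage-2 is held back until stage-1 succeeds

        JobControl control = new JobControl("named-pipeline");
        control.addJob(stage1);
        control.addJob(stage2);

        Thread runner = new Thread(control);       // JobControl implements Runnable
        runner.start();
        while (!control.allFinished()) {
            Thread.sleep(500);
        }
        control.stop();
    }
}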
From source file: com.ebay.erl.mobius.core.MobiusJob.java
License: Apache License
/**
 * Add a job, represented by the <code>aNewJob</code> object, into the execution queue.
 * <p>
 *
 * Users can use this method to add one or more jobs' configurations into the job queue, and the Mobius engine
 * will analyze the <code>aNewJob</code> objects within the queue to understand the dependencies between jobs.
 * For example, if job B's input is from job A, then job B won't be submitted until A has completed
 * successfully. If A fails, B will not be submitted.
 * <p>
 *
 * @param aNewJobConf a {@link Configuration} object that represents a Hadoop job.
 * @throws IOException
 */
protected void addToExecQueue(Configuration aNewJobConf) throws IOException {
    // Add the new job into the execution engine and realize
    // its dependencies, if any.
    //
    // To realize the job dependencies, we need to analyze the input
    // path of this new job.
    //
    // The inputs of a job could be:
    // 1) if aNewJob is not a derived job (e.g., the result of another MR job),
    //    then the inputs of the job can be retrieved from "mapred.input.dir",
    //    or from {@link MultipleInputs} (e.g., joining different types of datasets);
    // 2) if aNewJob is a derived job, the input is the output of the previous
    //    MR job.
    String inputFolders = aNewJobConf.get("mapred.input.dir", "");
    if (inputFolders.length() == 0) {
        // the value of "mapred.input.dir" is empty, assuming the inputs of this job
        // are coming from {@link MultipleInputs}.
        String multipleInputs = aNewJobConf.get(
                "mapred.input.dir.mappers" /* for the old MultipleInputs, v0.20.X */,
                aNewJobConf.get("mapreduce.input.multipleinputs.dir.formats" /* for the new MultipleInputs, v0.23.X */,
                        ""));

        if (multipleInputs.length() > 0) {
            // the input paths of this job come from MultipleInputs; extract the input paths.
            // The format from {@link MultipleInputs} is:
            // hadoop_path1;corresponding_mapper1,hadoop_path2;corresponding_mapper2,...
            String[] pathAndMapperPairs = multipleInputs.split(",");
            for (String aPair : pathAndMapperPairs) {
                String[] pathToMapper = aPair.split(";");
                String path = pathToMapper[0];
                String mapper = pathToMapper[1];

                if (inputFolders.length() == 0) {
                    inputFolders = getPathOnly(path);
                } else {
                    inputFolders = inputFolders + "," + getPathOnly(path);
                }
            }
        } else {
            throw new IllegalArgumentException("Cannot find input path(s) of job: ["
                    + aNewJobConf.get("mapred.job.name") + "] from the following attributes: "
                    + "mapred.input.dir, mapred.input.dir.mappers, nor mapreduce.input.multipleinputs.dir.formats. "
                    + "Please specify the input path(s) of this job.");
        }
    } else {
        // the input path of this job is specified in mapred.input.dir
        inputFolders = getPathOnly(inputFolders);
    }

    ////////////////////////////////////////////////////////////
    // validate the output path of this job, to ensure it doesn't
    // use the same folder as another job's output.
    ////////////////////////////////////////////////////////////
    String outputPath = aNewJobConf.get("mapred.output.dir", "");
    if (outputPath.isEmpty())
        throw new IllegalStateException(
                "Please specify the output directory of job:" + aNewJobConf.get("mapred.job.name"));

    if (this.isOutputOfAnotherJob(outputPath)) {
        throw new IllegalArgumentException("Job [" + aNewJobConf.get("mapred.job.name") + "]'s output ["
                + outputPath + "] is " + "the output of job[" + jobTopology.get(outputPath).getJobName() + "], "
                + "please make sure to use different output folder for each job.");
    }

    //////////////////////////////////////////////////////////////////
    // passed all the validation, start to build the dependencies.
    //////////////////////////////////////////////////////////////////
    Job newJob = new ConfigurableJob(new JobConf(aNewJobConf, this.getClass()));

    newJob.setJobName(aNewJobConf.get("mapred.job.name", aNewJobConf.get("mapreduce.job.name", "Mobius Job")));
    for (String anInputOfNewJob : inputFolders.split(",")) {
        // Added to track inputs for local PC sampling
        inputPaths.add(anInputOfNewJob);

        Job dependsOn = jobTopology.get(this.getFS().makeQualified(new Path(anInputOfNewJob)).toUri());
        if (dependsOn != null) {
            List<Job> dependingJobs = newJob.getDependingJobs();

            boolean alreadyInDependency = dependingJobs != null && dependingJobs.contains(dependsOn);
            if (alreadyInDependency) {
                // already added, do nothing.
            } else {
                LOGGER.info(newJob.getJobName() + " depends on " + dependsOn.getJobName());
                newJob.addDependingJob(dependsOn);
            }
        }
    }

    // put the output of this <code>newJob</code> into the job topology
    // so that later, if a job reads this <code>newJob</code>'s output
    // as its input, the system can detect the dependency.
    URI outputPathURI = this.getFS().makeQualified(new Path(outputPath)).toUri();
    LOGGER.info("Adding Job:" + newJob.getJobName() + "\tOutput:[" + outputPath.toString() + "]");
    jobTopology.put(outputPathURI, newJob);
}
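MobiusJob only queues and wires the jobs at this point. A hypothetical follow-up step (not part of the class as shown, and assuming it runs inside MobiusJob where jobTopology is in scope) could drain the topology into a JobControl and poll it until everything finishes:

// Hypothetical submission step (not part of MobiusJob as shown): hand the wired-up
// jobs from jobTopology to a JobControl and block until they complete.
protected void runQueuedJobs() throws IOException, InterruptedException {
    JobControl control = new JobControl("mobius-flow");
    for (Job queued : jobTopology.values()) {
        control.addJob(queued);            // dependencies were declared via addDependingJob above
    }
    Thread runner = new Thread(control);   // JobControl implements Runnable
    runner.start();
    while (!control.allFinished()) {
        Thread.sleep(1000);
    }
    control.stop();
    if (!control.getFailedJobs().isEmpty()) {
        throw new IOException("Some Mobius jobs failed: " + control.getFailedJobs());
    }
}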
From source file: org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler.java
License: Apache License
/**
 * The method that creates the Job corresponding to a MapReduceOper.
 * The assumption is that
 * every MapReduceOper will have a load and a store. The JobConf removes
 * the load operator and serializes the input filespec so that PigInputFormat can
 * take over the creation of splits. It also removes the store operator
 * and serializes the output filespec so that PigOutputFormat can take over
 * record writing. The remaining portion of the map plan and reduce plans are
 * serialized and stored for the PigMapReduce or PigMapOnly objects to take over
 * the actual running of the plans.
 * The Mapper & Reducer classes and the required key value formats are set.
 * Checks if this is a map only job and uses PigMapOnly class as the mapper
 * and uses PigMapReduce otherwise.
 * If it is a Map Reduce job, it is bound to have a package operator. Remove it from
 * the reduce plan and serializes it so that the PigMapReduce class can use it to package
 * the indexed tuples received by the reducer.
 * @param mro - The MapReduceOper for which the JobConf is required
 * @param config - the Configuration object from which JobConf is built
 * @param pigContext - The PigContext passed on from execution engine
 * @return Job corresponding to mro
 * @throws JobCreationException
 */
@SuppressWarnings({ "unchecked" })
private Job getJob(MROperPlan plan, MapReduceOper mro, Configuration config, PigContext pigContext)
        throws JobCreationException {
    org.apache.hadoop.mapreduce.Job nwJob = null;

    try {
        nwJob = new org.apache.hadoop.mapreduce.Job(config);
    } catch (Exception e) {
        throw new JobCreationException(e);
    }

    Configuration conf = nwJob.getConfiguration();

    ArrayList<FileSpec> inp = new ArrayList<FileSpec>();
    ArrayList<List<OperatorKey>> inpTargets = new ArrayList<List<OperatorKey>>();
    ArrayList<String> inpSignatureLists = new ArrayList<String>();
    ArrayList<Long> inpLimits = new ArrayList<Long>();
    ArrayList<POStore> storeLocations = new ArrayList<POStore>();
    Path tmpLocation = null;

    // add settings for pig statistics
    String setScriptProp = conf.get(PigConfiguration.INSERT_ENABLED, "true");
    if (setScriptProp.equalsIgnoreCase("true")) {
        MRScriptState ss = MRScriptState.get();
        ss.addSettingsToConf(mro, conf);
    }

    conf.set(MRConfiguration.MAPPER_NEW_API, "true");
    conf.set(MRConfiguration.REDUCER_NEW_API, "true");

    String buffPercent = conf.get(MRConfiguration.JOB_REDUCE_MARKRESET_BUFFER_PERCENT);
    if (buffPercent == null || Double.parseDouble(buffPercent) <= 0) {
        log.info(MRConfiguration.JOB_REDUCE_MARKRESET_BUFFER_PERCENT + " is not set, set to default 0.3");
        conf.set(MRConfiguration.JOB_REDUCE_MARKRESET_BUFFER_PERCENT, "0.3");
    } else {
        log.info(MRConfiguration.JOB_REDUCE_MARKRESET_BUFFER_PERCENT + " is set to "
                + conf.get(MRConfiguration.JOB_REDUCE_MARKRESET_BUFFER_PERCENT));
    }

    configureCompression(conf);

    try {
        // Process the POLoads
        List<POLoad> lds = PlanHelper.getPhysicalOperators(mro.mapPlan, POLoad.class);

        if (lds != null && lds.size() > 0) {
            for (POLoad ld : lds) {
                LoadFunc lf = ld.getLoadFunc();
                lf.setLocation(ld.getLFile().getFileName(), nwJob);

                // Store the inp filespecs
                inp.add(ld.getLFile());
            }
        }

        if (!mro.reducePlan.isEmpty()) {
            log.info("Reduce phase detected, estimating # of required reducers.");
            adjustNumReducers(plan, mro, nwJob);
        } else {
            nwJob.setNumReduceTasks(0);
        }

        for (String udf : mro.UDFs) {
            if (udf.contains("GFCross")) {
                Object func = pigContext.instantiateFuncFromSpec(new FuncSpec(udf));
                if (func instanceof GFCross) {
                    String crossKey = ((GFCross) func).getCrossKey();
                    // If non GFCross has been processed yet
                    if (pigContext.getProperties()
                            .get(PigConfiguration.PIG_CROSS_PARALLELISM_HINT + "." + crossKey) == null) {
                        pigContext.getProperties().setProperty(
                                PigConfiguration.PIG_CROSS_PARALLELISM_HINT + "." + crossKey,
                                Integer.toString(nwJob.getNumReduceTasks()));
                    }
                    conf.set(PigConfiguration.PIG_CROSS_PARALLELISM_HINT + "." + crossKey, (String) pigContext
                            .getProperties().get(PigConfiguration.PIG_CROSS_PARALLELISM_HINT + "." + crossKey));
                }
            }
        }

        if (lds != null && lds.size() > 0) {
            for (POLoad ld : lds) {
                // Store the target operators for tuples read
                // from this input
                List<PhysicalOperator> ldSucs = mro.mapPlan.getSuccessors(ld);
                List<OperatorKey> ldSucKeys = new ArrayList<OperatorKey>();
                if (ldSucs != null) {
                    for (PhysicalOperator operator2 : ldSucs) {
                        ldSucKeys.add(operator2.getOperatorKey());
                    }
                }
                inpTargets.add(ldSucKeys);
                inpSignatureLists.add(ld.getSignature());
                inpLimits.add(ld.getLimit());

                // Remove the POLoad from the plan
                if (!pigContext.inIllustrator)
                    mro.mapPlan.remove(ld);
            }
        }

        if (!pigContext.inIllustrator && !pigContext.getExecType().isLocal()) {
            if (okToRunLocal(nwJob, mro, lds)) {
                log.info(SMALL_JOB_LOG_MSG);
                // override with the default conf to run in local mode
                for (Entry<String, String> entry : defaultConf) {
                    String key = entry.getKey();
                    if (key.equals(MRConfiguration.REDUCE_TASKS) || key.equals(MRConfiguration.JOB_REDUCES)) {
                        // this must not be set back to the default in case it has been set to 0 for example.
                        continue;
                    }
                    if (key.startsWith("fs.")) {
                        // we don't want to change fs settings back
                        continue;
                    }
                    if (key.startsWith("io.")) {
                        // we don't want to change io settings back
                        continue;
                    }
                    String value = entry.getValue();
                    if (conf.get(key) == null || !conf.get(key).equals(value)) {
                        conf.set(key, value);
                    }
                }

                conf.setBoolean(PigImplConstants.CONVERTED_TO_LOCAL, true);
            } else {
                log.info(BIG_JOB_LOG_MSG);
                // Setup the DistributedCache for this job
                List<URL> allJars = new ArrayList<URL>();

                for (URL extraJar : pigContext.extraJars) {
                    if (!allJars.contains(extraJar)) {
                        allJars.add(extraJar);
                    }
                }

                for (String scriptJar : pigContext.scriptJars) {
                    URL jar = new File(scriptJar).toURI().toURL();
                    if (!allJars.contains(jar)) {
                        allJars.add(jar);
                    }
                }

                for (String defaultJar : JarManager.getDefaultJars()) {
                    URL jar = new File(defaultJar).toURI().toURL();
                    if (!allJars.contains(jar)) {
                        allJars.add(jar);
                    }
                }

                for (URL jar : allJars) {
                    boolean predeployed = false;
                    for (String predeployedJar : pigContext.predeployedJars) {
                        if (predeployedJar.contains(new File(jar.toURI()).getName())) {
                            predeployed = true;
                        }
                    }
                    if (!predeployed) {
                        log.info("Adding jar to DistributedCache: " + jar);
                        putJarOnClassPathThroughDistributedCache(pigContext, conf, jar);
                    }
                }

                File scriptUDFJarFile = JarManager.createPigScriptUDFJar(pigContext);
                if (scriptUDFJarFile != null) {
                    putJarOnClassPathThroughDistributedCache(pigContext, conf,
                            scriptUDFJarFile.toURI().toURL());
                }
            }
        }

        if (Utils.isLocal(pigContext, conf)) {
            ConfigurationUtil.replaceConfigForLocalMode(conf);
        }

        conf.set("pig.inputs", ObjectSerializer.serialize(inp));
        conf.set("pig.inpTargets", ObjectSerializer.serialize(inpTargets));
        conf.set("pig.inpSignatures", ObjectSerializer.serialize(inpSignatureLists));
        conf.set("pig.inpLimits", ObjectSerializer.serialize(inpLimits));
        conf.set("pig.pigContext", ObjectSerializer.serialize(pigContext));
        conf.set("udf.import.list", ObjectSerializer.serialize(PigContext.getPackageImportList()));
        // this is for unit tests since some don't create PigServer

        // if user specified the job name using -D switch, Pig won't reset the
        // name then.
        if (System.getProperty(MRConfiguration.JOB_NAME) == null
                && pigContext.getProperties().getProperty(PigContext.JOB_NAME) != null) {
            nwJob.setJobName(pigContext.getProperties().getProperty(PigContext.JOB_NAME));
        }

        if (pigContext.getProperties().getProperty(PigContext.JOB_PRIORITY) != null) {
            // If the job priority was set, attempt to get the corresponding enum value
            // and set the hadoop job priority.
            String jobPriority = pigContext.getProperties().getProperty(PigContext.JOB_PRIORITY).toUpperCase();
            try {
                // Allow arbitrary case; the Hadoop job priorities are all upper case.
                conf.set(MRConfiguration.JOB_PRIORITY, JobPriority.valueOf(jobPriority).toString());
            } catch (IllegalArgumentException e) {
                StringBuffer sb = new StringBuffer("The job priority must be one of [");
                JobPriority[] priorities = JobPriority.values();
                for (int i = 0; i < priorities.length; ++i) {
                    if (i > 0)
                        sb.append(", ");
                    sb.append(priorities[i]);
                }
                sb.append("]. You specified [" + jobPriority + "]");
                throw new JobCreationException(sb.toString());
            }
        }

        setupDistributedCache(pigContext, conf, pigContext.getProperties(), "pig.streaming.ship.files", true);
        setupDistributedCache(pigContext, conf, pigContext.getProperties(), "pig.streaming.cache.files", false);

        nwJob.setInputFormatClass(PigInputFormat.class);

        // tmp file compression setups
        // PIG-3741 This must be done before setStoreLocation on POStores
        Utils.setTmpFileCompressionOnConf(pigContext, conf);

        // Process POStore and remove it from the plan
        LinkedList<POStore> mapStores = PlanHelper.getPhysicalOperators(mro.mapPlan, POStore.class);
        LinkedList<POStore> reduceStores = PlanHelper.getPhysicalOperators(mro.reducePlan, POStore.class);

        for (POStore st : mapStores) {
            storeLocations.add(st);
            StoreFuncInterface sFunc = st.getStoreFunc();
            sFunc.setStoreLocation(st.getSFile().getFileName(), nwJob);
            if (sFunc instanceof OverwritableStoreFunc) {
                OverwritableStoreFunc osf = (OverwritableStoreFunc) sFunc;
                if (osf.shouldOverwrite()) {
                    osf.cleanupOutput(st, nwJob);
                }
            }
        }

        for (POStore st : reduceStores) {
            storeLocations.add(st);
            StoreFuncInterface sFunc = st.getStoreFunc();
            sFunc.setStoreLocation(st.getSFile().getFileName(), nwJob);
            if (sFunc instanceof OverwritableStoreFunc) {
                OverwritableStoreFunc osf = (OverwritableStoreFunc) sFunc;
                if (osf.shouldOverwrite()) {
                    osf.cleanupOutput(st, nwJob);
                }
            }
        }

        setOutputFormat(nwJob);

        if (mapStores.size() + reduceStores.size() == 1) { // single store case
            log.info("Setting up single store job");

            POStore st;
            if (reduceStores.isEmpty()) {
                st = mapStores.get(0);
                if (!pigContext.inIllustrator)
                    mro.mapPlan.remove(st);
            } else {
                st = reduceStores.get(0);
                if (!pigContext.inIllustrator)
                    mro.reducePlan.remove(st);
            }

            MapRedUtil.setupStreamingDirsConfSingle(st, pigContext, conf);
        } else if (mapStores.size() + reduceStores.size() > 0) { // multi store case
            log.info("Setting up multi store job");
            MapRedUtil.setupStreamingDirsConfMulti(pigContext, conf);

            boolean disableCounter = conf.getBoolean("pig.disable.counter", false);
            if (disableCounter) {
                log.info("Disable Pig custom output counters");
            }
            int idx = 0;
            for (POStore sto : storeLocations) {
                sto.setDisableCounter(disableCounter);
                sto.setMultiStore(true);
                sto.setIndex(idx++);
            }
        }

        // store map key type
        // this is needed when the key is null to create
        // an appropriate NullableXXXWritable object
        conf.set("pig.map.keytype", ObjectSerializer.serialize(new byte[] { mro.mapKeyType }));

        // set parent plan in all operators in map and reduce plans
        // currently the parent plan is really used only when POStream is present in the plan
        new PhyPlanSetter(mro.mapPlan).visit();
        new PhyPlanSetter(mro.reducePlan).visit();

        // this call modifies the ReplFiles names of POFRJoin operators
        // within the MR plans, must be called before the plans are
        // serialized
        setupDistributedCacheForJoin(mro, pigContext, conf);

        // Search to see if we have any UDFs that need to pack things into the
        // distributed cache.
        setupDistributedCacheForUdfs(mro, pigContext, conf);

        SchemaTupleFrontend.copyAllGeneratedToDistributedCache(pigContext, conf);

        POPackage pack = null;
        if (mro.reducePlan.isEmpty()) {
            // MapOnly Job
            nwJob.setMapperClass(PigMapOnly.Map.class);

            if (!pigContext.inIllustrator)
                conf.set("pig.mapPlan", ObjectSerializer.serialize(mro.mapPlan));
            if (mro.isEndOfAllInputSetInMap()) {
                // this is used in Map.close() to decide whether the
                // pipeline needs to be rerun one more time in the close()
                // The pipeline is rerun if there either was a stream or POMergeJoin
                conf.set(END_OF_INP_IN_MAP, "true");
            }
        } else {
            // Map Reduce Job
            // Process the POPackage operator and remove it from the reduce plan
            if (!mro.combinePlan.isEmpty()) {
                POPackage combPack = (POPackage) mro.combinePlan.getRoots().get(0);
                mro.combinePlan.remove(combPack);
                nwJob.setCombinerClass(PigCombiner.Combine.class);
                conf.set("pig.combinePlan", ObjectSerializer.serialize(mro.combinePlan));
                conf.set("pig.combine.package", ObjectSerializer.serialize(combPack));
            } else if (mro.needsDistinctCombiner()) {
                nwJob.setCombinerClass(DistinctCombiner.Combine.class);
                log.info("Setting identity combiner class.");
            }

            pack = (POPackage) mro.reducePlan.getRoots().get(0);
            if (!pigContext.inIllustrator)
                mro.reducePlan.remove(pack);
            nwJob.setMapperClass(PigMapReduce.Map.class);
            nwJob.setReducerClass(PigMapReduce.Reduce.class);

            if (mro.customPartitioner != null)
                nwJob.setPartitionerClass(PigContext.resolveClassName(mro.customPartitioner));

            if (!pigContext.inIllustrator)
                conf.set("pig.mapPlan", ObjectSerializer.serialize(mro.mapPlan));
            if (mro.isEndOfAllInputSetInMap()) {
                // this is used in Map.close() to decide whether the
                // pipeline needs to be rerun one more time in the close()
                // The pipeline is rerun only if there was a stream or merge-join.
                conf.set(END_OF_INP_IN_MAP, "true");
            }
            if (!pigContext.inIllustrator)
                conf.set("pig.reducePlan", ObjectSerializer.serialize(mro.reducePlan));
            if (mro.isEndOfAllInputSetInReduce()) {
                // this is used in Map.close() to decide whether the
                // pipeline needs to be rerun one more time in the close()
                // The pipeline is rerun only if there was a stream
                conf.set("pig.stream.in.reduce", "true");
            }
            if (!pigContext.inIllustrator)
                conf.set("pig.reduce.package", ObjectSerializer.serialize(pack));
            conf.set("pig.reduce.key.type", Byte.toString(pack.getPkgr().getKeyType()));

            if (mro.getUseSecondaryKey()) {
                nwJob.setGroupingComparatorClass(PigSecondaryKeyGroupComparator.class);
                nwJob.setPartitionerClass(SecondaryKeyPartitioner.class);
                nwJob.setSortComparatorClass(PigSecondaryKeyComparator.class);
                nwJob.setOutputKeyClass(NullableTuple.class);
                conf.set("pig.secondarySortOrder", ObjectSerializer.serialize(mro.getSecondarySortOrder()));
            } else {
                Class<? extends WritableComparable> keyClass = HDataType
                        .getWritableComparableTypes(pack.getPkgr().getKeyType()).getClass();
                nwJob.setOutputKeyClass(keyClass);
                selectComparator(mro, pack.getPkgr().getKeyType(), nwJob);
            }
            nwJob.setOutputValueClass(NullableTuple.class);
        }

        if (mro.isGlobalSort() || mro.isLimitAfterSort()) {
            if (mro.isGlobalSort()) {
                String symlink = addSingleFileToDistributedCache(pigContext, conf, mro.getQuantFile(),
                        "pigsample");
                conf.set("pig.quantilesFile", symlink);
                nwJob.setPartitionerClass(WeightedRangePartitioner.class);
            }

            if (mro.isUDFComparatorUsed) {
                boolean usercomparator = false;
                for (String compFuncSpec : mro.UDFs) {
                    Class comparator = PigContext.resolveClassName(compFuncSpec);
                    if (ComparisonFunc.class.isAssignableFrom(comparator)) {
                        nwJob.setMapperClass(PigMapReduce.MapWithComparator.class);
                        nwJob.setReducerClass(PigMapReduce.ReduceWithComparator.class);
                        conf.set("pig.reduce.package", ObjectSerializer.serialize(pack));
                        conf.set("pig.usercomparator", "true");
                        nwJob.setOutputKeyClass(NullableTuple.class);
                        nwJob.setSortComparatorClass(comparator);
                        usercomparator = true;
                        break;
                    }
                }
                if (!usercomparator) {
                    String msg = "Internal error. Can't find the UDF comparator";
                    throw new IOException(msg);
                }
            } else {
                conf.set("pig.sortOrder", ObjectSerializer.serialize(mro.getSortOrder()));
            }
        }

        if (mro.isSkewedJoin()) {
            String symlink = addSingleFileToDistributedCache(pigContext, conf, mro.getSkewedJoinPartitionFile(),
                    "pigdistkey");
            conf.set("pig.keyDistFile", symlink);
            nwJob.setPartitionerClass(SkewedPartitioner.class);
            nwJob.setMapperClass(PigMapReduce.MapWithPartitionIndex.class);
            nwJob.setMapOutputKeyClass(NullablePartitionWritable.class);
            nwJob.setGroupingComparatorClass(PigGroupingPartitionWritableComparator.class);
        }

        if (mro.isCounterOperation()) {
            if (mro.isRowNumber()) {
                nwJob.setMapperClass(PigMapReduceCounter.PigMapCounter.class);
            } else {
                nwJob.setReducerClass(PigMapReduceCounter.PigReduceCounter.class);
            }
        }

        if (mro.isRankOperation()) {
            Iterator<String> operationIDs = mro.getRankOperationId().iterator();

            while (operationIDs.hasNext()) {
                String operationID = operationIDs.next();
                Iterator<Pair<String, Long>> itPairs = globalCounters.get(operationID).iterator();
                Pair<String, Long> pair = null;
                while (itPairs.hasNext()) {
                    pair = itPairs.next();
                    conf.setLong(pair.first, pair.second);
                }
            }
        }

        if (!pigContext.inIllustrator) {
            // unset inputs for POStore, otherwise, map/reduce plan will be unnecessarily deserialized
            for (POStore st : mapStores) {
                st.setInputs(null);
                st.setParentPlan(null);
            }
            for (POStore st : reduceStores) {
                st.setInputs(null);
                st.setParentPlan(null);
            }
            conf.set(PIG_MAP_STORES, ObjectSerializer.serialize(mapStores));
            conf.set(PIG_REDUCE_STORES, ObjectSerializer.serialize(reduceStores));
        }

        String tmp;
        long maxCombinedSplitSize = 0;
        if (!mro.combineSmallSplits()
                || pigContext.getProperties().getProperty("pig.splitCombination", "true").equals("false"))
            conf.setBoolean("pig.noSplitCombination", true);
        else if ((tmp = pigContext.getProperties().getProperty("pig.maxCombinedSplitSize", null)) != null) {
            try {
                maxCombinedSplitSize = Long.parseLong(tmp);
            } catch (NumberFormatException e) {
                log.warn(
                        "Invalid numeric format for pig.maxCombinedSplitSize; use the default maximum combined split size");
            }
        }
        if (maxCombinedSplitSize > 0)
            conf.setLong("pig.maxCombinedSplitSize", maxCombinedSplitSize);

        // It's a hack to set distributed cache file for hadoop 23. Once MiniMRCluster do not require local
        // jar on fixed location, this can be removed
        if (pigContext.getExecType() == ExecType.MAPREDUCE) {
            String newfiles = conf.get("alternative.mapreduce.job.cache.files");
            if (newfiles != null) {
                String files = conf.get(MRConfiguration.JOB_CACHE_FILES);
                conf.set(MRConfiguration.JOB_CACHE_FILES,
                        files == null ? newfiles.toString() : files + "," + newfiles);
            }
        }

        // Serialize the UDF specific context info.
        UDFContext.getUDFContext().serialize(conf);
        Job cjob = new Job(new JobConf(conf), new ArrayList<Job>());
        jobStoreMap.put(cjob, new Pair<List<POStore>, Path>(storeLocations, tmpLocation));
        return cjob;
    } catch (JobCreationException jce) {
        throw jce;
    } catch (Exception e) {
        int errCode = 2017;
        String msg = "Internal error creating job configuration.";
        throw new JobCreationException(msg, errCode, PigException.BUG, e);
    }
}
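One detail worth calling out (this aside is ours, not Pig's): the name is set on the mapreduce-API nwJob, yet it still reaches the jobcontrol Job that getJob returns, because cjob is built from nwJob's Configuration. A stripped-down sketch of that hand-off, assuming the usual Hadoop imports and using "demo job" as a placeholder name:

// Our aside, not Pig source: the name set on the mapreduce-API job lives in its
// Configuration, so it survives the copy into the JobConf backing the jobcontrol Job.
Configuration config = new Configuration();
org.apache.hadoop.mapreduce.Job nwJob = new org.apache.hadoop.mapreduce.Job(config);
nwJob.setJobName("demo job");                              // stored as mapreduce.job.name in the conf

JobConf wrapped = new JobConf(nwJob.getConfiguration());   // copies the conf, name included
Job cjob = new Job(wrapped, new ArrayList<Job>());         // org.apache.hadoop.mapred.jobcontrol.Job
log.info("registered job name: " + wrapped.getJobName());  // logs "demo job"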