List of usage examples for the org.apache.hadoop.mapred.jobcontrol.Job constructor
@SuppressWarnings("unchecked") public Job(JobConf jobConf, ArrayList<?> dependingJobs) throws IOException
From source file:org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler.java
License:Apache License
/** * The method that creates the Job corresponding to a MapReduceOper. * The assumption is that//from w w w.j a va 2 s . co m * every MapReduceOper will have a load and a store. The JobConf removes * the load operator and serializes the input filespec so that PigInputFormat can * take over the creation of splits. It also removes the store operator * and serializes the output filespec so that PigOutputFormat can take over * record writing. The remaining portion of the map plan and reduce plans are * serialized and stored for the PigMapReduce or PigMapOnly objects to take over * the actual running of the plans. * The Mapper & Reducer classes and the required key value formats are set. * Checks if this is a map only job and uses PigMapOnly class as the mapper * and uses PigMapReduce otherwise. * If it is a Map Reduce job, it is bound to have a package operator. Remove it from * the reduce plan and serializes it so that the PigMapReduce class can use it to package * the indexed tuples received by the reducer. 
* @param mro - The MapReduceOper for which the JobConf is required * @param config - the Configuration object from which JobConf is built * @param pigContext - The PigContext passed on from execution engine * @return Job corresponding to mro * @throws JobCreationException */ @SuppressWarnings({ "unchecked" }) private Job getJob(MROperPlan plan, MapReduceOper mro, Configuration config, PigContext pigContext) throws JobCreationException { org.apache.hadoop.mapreduce.Job nwJob = null; try { nwJob = new org.apache.hadoop.mapreduce.Job(config); } catch (Exception e) { throw new JobCreationException(e); } Configuration conf = nwJob.getConfiguration(); ArrayList<FileSpec> inp = new ArrayList<FileSpec>(); ArrayList<List<OperatorKey>> inpTargets = new ArrayList<List<OperatorKey>>(); ArrayList<String> inpSignatureLists = new ArrayList<String>(); ArrayList<Long> inpLimits = new ArrayList<Long>(); ArrayList<POStore> storeLocations = new ArrayList<POStore>(); Path tmpLocation = null; // add settings for pig statistics String setScriptProp = conf.get(PigConfiguration.INSERT_ENABLED, "true"); if (setScriptProp.equalsIgnoreCase("true")) { MRScriptState ss = MRScriptState.get(); ss.addSettingsToConf(mro, conf); } conf.set(MRConfiguration.MAPPER_NEW_API, "true"); conf.set(MRConfiguration.REDUCER_NEW_API, "true"); String buffPercent = conf.get(MRConfiguration.JOB_REDUCE_MARKRESET_BUFFER_PERCENT); if (buffPercent == null || Double.parseDouble(buffPercent) <= 0) { log.info(MRConfiguration.JOB_REDUCE_MARKRESET_BUFFER_PERCENT + " is not set, set to default 0.3"); conf.set(MRConfiguration.JOB_REDUCE_MARKRESET_BUFFER_PERCENT, "0.3"); } else { log.info(MRConfiguration.JOB_REDUCE_MARKRESET_BUFFER_PERCENT + " is set to " + conf.get(MRConfiguration.JOB_REDUCE_MARKRESET_BUFFER_PERCENT)); } configureCompression(conf); try { //Process the POLoads List<POLoad> lds = PlanHelper.getPhysicalOperators(mro.mapPlan, POLoad.class); if (lds != null && lds.size() > 0) { for (POLoad ld : lds) { LoadFunc lf 
= ld.getLoadFunc(); lf.setLocation(ld.getLFile().getFileName(), nwJob); //Store the inp filespecs inp.add(ld.getLFile()); } } if (!mro.reducePlan.isEmpty()) { log.info("Reduce phase detected, estimating # of required reducers."); adjustNumReducers(plan, mro, nwJob); } else { nwJob.setNumReduceTasks(0); } for (String udf : mro.UDFs) { if (udf.contains("GFCross")) { Object func = pigContext.instantiateFuncFromSpec(new FuncSpec(udf)); if (func instanceof GFCross) { String crossKey = ((GFCross) func).getCrossKey(); // If non GFCross has been processed yet if (pigContext.getProperties() .get(PigConfiguration.PIG_CROSS_PARALLELISM_HINT + "." + crossKey) == null) { pigContext.getProperties().setProperty( PigConfiguration.PIG_CROSS_PARALLELISM_HINT + "." + crossKey, Integer.toString(nwJob.getNumReduceTasks())); } conf.set(PigConfiguration.PIG_CROSS_PARALLELISM_HINT + "." + crossKey, (String) pigContext .getProperties().get(PigConfiguration.PIG_CROSS_PARALLELISM_HINT + "." + crossKey)); } } } if (lds != null && lds.size() > 0) { for (POLoad ld : lds) { //Store the target operators for tuples read //from this input List<PhysicalOperator> ldSucs = mro.mapPlan.getSuccessors(ld); List<OperatorKey> ldSucKeys = new ArrayList<OperatorKey>(); if (ldSucs != null) { for (PhysicalOperator operator2 : ldSucs) { ldSucKeys.add(operator2.getOperatorKey()); } } inpTargets.add(ldSucKeys); inpSignatureLists.add(ld.getSignature()); inpLimits.add(ld.getLimit()); //Remove the POLoad from the plan if (!pigContext.inIllustrator) mro.mapPlan.remove(ld); } } if (!pigContext.inIllustrator && !pigContext.getExecType().isLocal()) { if (okToRunLocal(nwJob, mro, lds)) { log.info(SMALL_JOB_LOG_MSG); // override with the default conf to run in local mode for (Entry<String, String> entry : defaultConf) { String key = entry.getKey(); if (key.equals(MRConfiguration.REDUCE_TASKS) || key.equals(MRConfiguration.JOB_REDUCES)) { // this must not be set back to the default in case it has been set to 0 for example. 
continue; } if (key.startsWith("fs.")) { // we don't want to change fs settings back continue; } if (key.startsWith("io.")) { // we don't want to change io settings back continue; } String value = entry.getValue(); if (conf.get(key) == null || !conf.get(key).equals(value)) { conf.set(key, value); } } conf.setBoolean(PigImplConstants.CONVERTED_TO_LOCAL, true); } else { log.info(BIG_JOB_LOG_MSG); // Setup the DistributedCache for this job List<URL> allJars = new ArrayList<URL>(); for (URL extraJar : pigContext.extraJars) { if (!allJars.contains(extraJar)) { allJars.add(extraJar); } } for (String scriptJar : pigContext.scriptJars) { URL jar = new File(scriptJar).toURI().toURL(); if (!allJars.contains(jar)) { allJars.add(jar); } } for (String defaultJar : JarManager.getDefaultJars()) { URL jar = new File(defaultJar).toURI().toURL(); if (!allJars.contains(jar)) { allJars.add(jar); } } for (URL jar : allJars) { boolean predeployed = false; for (String predeployedJar : pigContext.predeployedJars) { if (predeployedJar.contains(new File(jar.toURI()).getName())) { predeployed = true; } } if (!predeployed) { log.info("Adding jar to DistributedCache: " + jar); putJarOnClassPathThroughDistributedCache(pigContext, conf, jar); } } File scriptUDFJarFile = JarManager.createPigScriptUDFJar(pigContext); if (scriptUDFJarFile != null) { putJarOnClassPathThroughDistributedCache(pigContext, conf, scriptUDFJarFile.toURI().toURL()); } } } if (Utils.isLocal(pigContext, conf)) { ConfigurationUtil.replaceConfigForLocalMode(conf); } conf.set("pig.inputs", ObjectSerializer.serialize(inp)); conf.set("pig.inpTargets", ObjectSerializer.serialize(inpTargets)); conf.set("pig.inpSignatures", ObjectSerializer.serialize(inpSignatureLists)); conf.set("pig.inpLimits", ObjectSerializer.serialize(inpLimits)); conf.set("pig.pigContext", ObjectSerializer.serialize(pigContext)); conf.set("udf.import.list", ObjectSerializer.serialize(PigContext.getPackageImportList())); // this is for unit tests since some 
don't create PigServer // if user specified the job name using -D switch, Pig won't reset the name then. if (System.getProperty(MRConfiguration.JOB_NAME) == null && pigContext.getProperties().getProperty(PigContext.JOB_NAME) != null) { nwJob.setJobName(pigContext.getProperties().getProperty(PigContext.JOB_NAME)); } if (pigContext.getProperties().getProperty(PigContext.JOB_PRIORITY) != null) { // If the job priority was set, attempt to get the corresponding enum value // and set the hadoop job priority. String jobPriority = pigContext.getProperties().getProperty(PigContext.JOB_PRIORITY).toUpperCase(); try { // Allow arbitrary case; the Hadoop job priorities are all upper case. conf.set(MRConfiguration.JOB_PRIORITY, JobPriority.valueOf(jobPriority).toString()); } catch (IllegalArgumentException e) { StringBuffer sb = new StringBuffer("The job priority must be one of ["); JobPriority[] priorities = JobPriority.values(); for (int i = 0; i < priorities.length; ++i) { if (i > 0) sb.append(", "); sb.append(priorities[i]); } sb.append("]. 
You specified [" + jobPriority + "]"); throw new JobCreationException(sb.toString()); } } setupDistributedCache(pigContext, conf, pigContext.getProperties(), "pig.streaming.ship.files", true); setupDistributedCache(pigContext, conf, pigContext.getProperties(), "pig.streaming.cache.files", false); nwJob.setInputFormatClass(PigInputFormat.class); // tmp file compression setups // PIG-3741 This must be done before setStoreLocation on POStores Utils.setTmpFileCompressionOnConf(pigContext, conf); //Process POStore and remove it from the plan LinkedList<POStore> mapStores = PlanHelper.getPhysicalOperators(mro.mapPlan, POStore.class); LinkedList<POStore> reduceStores = PlanHelper.getPhysicalOperators(mro.reducePlan, POStore.class); for (POStore st : mapStores) { storeLocations.add(st); StoreFuncInterface sFunc = st.getStoreFunc(); sFunc.setStoreLocation(st.getSFile().getFileName(), nwJob); if (sFunc instanceof OverwritableStoreFunc) { OverwritableStoreFunc osf = (OverwritableStoreFunc) sFunc; if (osf.shouldOverwrite()) { osf.cleanupOutput(st, nwJob); } } } for (POStore st : reduceStores) { storeLocations.add(st); StoreFuncInterface sFunc = st.getStoreFunc(); sFunc.setStoreLocation(st.getSFile().getFileName(), nwJob); if (sFunc instanceof OverwritableStoreFunc) { OverwritableStoreFunc osf = (OverwritableStoreFunc) sFunc; if (osf.shouldOverwrite()) { osf.cleanupOutput(st, nwJob); } } } setOutputFormat(nwJob); if (mapStores.size() + reduceStores.size() == 1) { // single store case log.info("Setting up single store job"); POStore st; if (reduceStores.isEmpty()) { st = mapStores.get(0); if (!pigContext.inIllustrator) mro.mapPlan.remove(st); } else { st = reduceStores.get(0); if (!pigContext.inIllustrator) mro.reducePlan.remove(st); } MapRedUtil.setupStreamingDirsConfSingle(st, pigContext, conf); } else if (mapStores.size() + reduceStores.size() > 0) { // multi store case log.info("Setting up multi store job"); MapRedUtil.setupStreamingDirsConfMulti(pigContext, conf); boolean 
disableCounter = conf.getBoolean("pig.disable.counter", false); if (disableCounter) { log.info("Disable Pig custom output counters"); } int idx = 0; for (POStore sto : storeLocations) { sto.setDisableCounter(disableCounter); sto.setMultiStore(true); sto.setIndex(idx++); } } // store map key type // this is needed when the key is null to create // an appropriate NullableXXXWritable object conf.set("pig.map.keytype", ObjectSerializer.serialize(new byte[] { mro.mapKeyType })); // set parent plan in all operators in map and reduce plans // currently the parent plan is really used only when POStream is present in the plan new PhyPlanSetter(mro.mapPlan).visit(); new PhyPlanSetter(mro.reducePlan).visit(); // this call modifies the ReplFiles names of POFRJoin operators // within the MR plans, must be called before the plans are // serialized setupDistributedCacheForJoin(mro, pigContext, conf); // Search to see if we have any UDFs that need to pack things into the // distributed cache. setupDistributedCacheForUdfs(mro, pigContext, conf); SchemaTupleFrontend.copyAllGeneratedToDistributedCache(pigContext, conf); POPackage pack = null; if (mro.reducePlan.isEmpty()) { //MapOnly Job nwJob.setMapperClass(PigMapOnly.Map.class); if (!pigContext.inIllustrator) conf.set("pig.mapPlan", ObjectSerializer.serialize(mro.mapPlan)); if (mro.isEndOfAllInputSetInMap()) { // this is used in Map.close() to decide whether the // pipeline needs to be rerun one more time in the close() // The pipeline is rerun if there either was a stream or POMergeJoin conf.set(END_OF_INP_IN_MAP, "true"); } } else { //Map Reduce Job //Process the POPackage operator and remove it from the reduce plan if (!mro.combinePlan.isEmpty()) { POPackage combPack = (POPackage) mro.combinePlan.getRoots().get(0); mro.combinePlan.remove(combPack); nwJob.setCombinerClass(PigCombiner.Combine.class); conf.set("pig.combinePlan", ObjectSerializer.serialize(mro.combinePlan)); conf.set("pig.combine.package", 
ObjectSerializer.serialize(combPack)); } else if (mro.needsDistinctCombiner()) { nwJob.setCombinerClass(DistinctCombiner.Combine.class); log.info("Setting identity combiner class."); } pack = (POPackage) mro.reducePlan.getRoots().get(0); if (!pigContext.inIllustrator) mro.reducePlan.remove(pack); nwJob.setMapperClass(PigMapReduce.Map.class); nwJob.setReducerClass(PigMapReduce.Reduce.class); if (mro.customPartitioner != null) nwJob.setPartitionerClass(PigContext.resolveClassName(mro.customPartitioner)); if (!pigContext.inIllustrator) conf.set("pig.mapPlan", ObjectSerializer.serialize(mro.mapPlan)); if (mro.isEndOfAllInputSetInMap()) { // this is used in Map.close() to decide whether the // pipeline needs to be rerun one more time in the close() // The pipeline is rerun only if there was a stream or merge-join. conf.set(END_OF_INP_IN_MAP, "true"); } if (!pigContext.inIllustrator) conf.set("pig.reducePlan", ObjectSerializer.serialize(mro.reducePlan)); if (mro.isEndOfAllInputSetInReduce()) { // this is used in Map.close() to decide whether the // pipeline needs to be rerun one more time in the close() // The pipeline is rerun only if there was a stream conf.set("pig.stream.in.reduce", "true"); } if (!pigContext.inIllustrator) conf.set("pig.reduce.package", ObjectSerializer.serialize(pack)); conf.set("pig.reduce.key.type", Byte.toString(pack.getPkgr().getKeyType())); if (mro.getUseSecondaryKey()) { nwJob.setGroupingComparatorClass(PigSecondaryKeyGroupComparator.class); nwJob.setPartitionerClass(SecondaryKeyPartitioner.class); nwJob.setSortComparatorClass(PigSecondaryKeyComparator.class); nwJob.setOutputKeyClass(NullableTuple.class); conf.set("pig.secondarySortOrder", ObjectSerializer.serialize(mro.getSecondarySortOrder())); } else { Class<? 
extends WritableComparable> keyClass = HDataType .getWritableComparableTypes(pack.getPkgr().getKeyType()).getClass(); nwJob.setOutputKeyClass(keyClass); selectComparator(mro, pack.getPkgr().getKeyType(), nwJob); } nwJob.setOutputValueClass(NullableTuple.class); } if (mro.isGlobalSort() || mro.isLimitAfterSort()) { if (mro.isGlobalSort()) { String symlink = addSingleFileToDistributedCache(pigContext, conf, mro.getQuantFile(), "pigsample"); conf.set("pig.quantilesFile", symlink); nwJob.setPartitionerClass(WeightedRangePartitioner.class); } if (mro.isUDFComparatorUsed) { boolean usercomparator = false; for (String compFuncSpec : mro.UDFs) { Class comparator = PigContext.resolveClassName(compFuncSpec); if (ComparisonFunc.class.isAssignableFrom(comparator)) { nwJob.setMapperClass(PigMapReduce.MapWithComparator.class); nwJob.setReducerClass(PigMapReduce.ReduceWithComparator.class); conf.set("pig.reduce.package", ObjectSerializer.serialize(pack)); conf.set("pig.usercomparator", "true"); nwJob.setOutputKeyClass(NullableTuple.class); nwJob.setSortComparatorClass(comparator); usercomparator = true; break; } } if (!usercomparator) { String msg = "Internal error. 
Can't find the UDF comparator"; throw new IOException(msg); } } else { conf.set("pig.sortOrder", ObjectSerializer.serialize(mro.getSortOrder())); } } if (mro.isSkewedJoin()) { String symlink = addSingleFileToDistributedCache(pigContext, conf, mro.getSkewedJoinPartitionFile(), "pigdistkey"); conf.set("pig.keyDistFile", symlink); nwJob.setPartitionerClass(SkewedPartitioner.class); nwJob.setMapperClass(PigMapReduce.MapWithPartitionIndex.class); nwJob.setMapOutputKeyClass(NullablePartitionWritable.class); nwJob.setGroupingComparatorClass(PigGroupingPartitionWritableComparator.class); } if (mro.isCounterOperation()) { if (mro.isRowNumber()) { nwJob.setMapperClass(PigMapReduceCounter.PigMapCounter.class); } else { nwJob.setReducerClass(PigMapReduceCounter.PigReduceCounter.class); } } if (mro.isRankOperation()) { Iterator<String> operationIDs = mro.getRankOperationId().iterator(); while (operationIDs.hasNext()) { String operationID = operationIDs.next(); Iterator<Pair<String, Long>> itPairs = globalCounters.get(operationID).iterator(); Pair<String, Long> pair = null; while (itPairs.hasNext()) { pair = itPairs.next(); conf.setLong(pair.first, pair.second); } } } if (!pigContext.inIllustrator) { // unset inputs for POStore, otherwise, map/reduce plan will be unnecessarily deserialized for (POStore st : mapStores) { st.setInputs(null); st.setParentPlan(null); } for (POStore st : reduceStores) { st.setInputs(null); st.setParentPlan(null); } conf.set(PIG_MAP_STORES, ObjectSerializer.serialize(mapStores)); conf.set(PIG_REDUCE_STORES, ObjectSerializer.serialize(reduceStores)); } String tmp; long maxCombinedSplitSize = 0; if (!mro.combineSmallSplits() || pigContext.getProperties().getProperty("pig.splitCombination", "true").equals("false")) conf.setBoolean("pig.noSplitCombination", true); else if ((tmp = pigContext.getProperties().getProperty("pig.maxCombinedSplitSize", null)) != null) { try { maxCombinedSplitSize = Long.parseLong(tmp); } catch (NumberFormatException e) { 
log.warn( "Invalid numeric format for pig.maxCombinedSplitSize; use the default maximum combined split size"); } } if (maxCombinedSplitSize > 0) conf.setLong("pig.maxCombinedSplitSize", maxCombinedSplitSize); // It's a hack to set distributed cache file for hadoop 23. Once MiniMRCluster do not require local // jar on fixed location, this can be removed if (pigContext.getExecType() == ExecType.MAPREDUCE) { String newfiles = conf.get("alternative.mapreduce.job.cache.files"); if (newfiles != null) { String files = conf.get(MRConfiguration.JOB_CACHE_FILES); conf.set(MRConfiguration.JOB_CACHE_FILES, files == null ? newfiles.toString() : files + "," + newfiles); } } // Serialize the UDF specific context info. UDFContext.getUDFContext().serialize(conf); Job cjob = new Job(new JobConf(conf), new ArrayList<Job>()); jobStoreMap.put(cjob, new Pair<List<POStore>, Path>(storeLocations, tmpLocation)); return cjob; } catch (JobCreationException jce) { throw jce; } catch (Exception e) { int errCode = 2017; String msg = "Internal error creating job configuration."; throw new JobCreationException(msg, errCode, PigException.BUG, e); } }