Usage examples for org.apache.hadoop.mapred.TaskReport#getStartTime()
public long getStartTime()
From source file:azkaban.jobtype.MapReduceJobState.java
License:Apache License
/**
 * Builds a point-in-time snapshot of a running map-reduce job from the
 * {@link RunningJob} handle and the per-task reports of both phases.
 *
 * @param runningJob      handle to the submitted job
 * @param mapTaskReport   reports for all map tasks
 * @param reduceTaskReport reports for all reduce tasks
 * @throws IOException if querying the job client fails
 */
public MapReduceJobState(RunningJob runningJob, TaskReport[] mapTaskReport, TaskReport[] reduceTaskReport)
        throws IOException {
    jobId = runningJob.getID().toString();
    jobName = runningJob.getJobName();
    trackingURL = runningJob.getTrackingURL();
    isComplete = runningJob.isComplete();
    isSuccessful = runningJob.isSuccessful();
    mapProgress = runningJob.mapProgress();
    reduceProgress = runningJob.reduceProgress();
    failureInfo = runningJob.getFailureInfo();

    totalMappers = mapTaskReport.length;
    totalReducers = reduceTaskReport.length;

    for (TaskReport mapper : mapTaskReport) {
        // Job start time = earliest map-task start; jobStartTime == 0 means "unset".
        long started = mapper.getStartTime();
        if (jobStartTime == 0L || started < jobStartTime) {
            jobStartTime = started;
        }
        TIPStatus mapStatus = mapper.getCurrentStatus();
        if (mapStatus != TIPStatus.PENDING && mapStatus != TIPStatus.RUNNING) {
            finishedMappersCount++;
        }
    }

    for (TaskReport reducer : reduceTaskReport) {
        // Last-update time = latest reduce-task finish seen so far.
        long finished = reducer.getFinishTime();
        if (finished > jobLastUpdateTime) {
            jobLastUpdateTime = finished;
        }
        TIPStatus reduceStatus = reducer.getCurrentStatus();
        if (reduceStatus != TIPStatus.PENDING && reduceStatus != TIPStatus.RUNNING) {
            finishedReducersCount++;
        }
    }

    // While any reducer is still outstanding (or no finish time was seen),
    // treat "now" as the last update time.
    if (finishedReducersCount != reduceTaskReport.length || jobLastUpdateTime == 0) {
        jobLastUpdateTime = System.currentTimeMillis();
    }

    counters = runningJob.getCounters();
}
From source file:co.cask.cdap.app.mapreduce.MRJobClient.java
License:Apache License
private List<MRTaskInfo> toMRTaskInfos(TaskReport[] taskReports) { List<MRTaskInfo> taskInfos = Lists.newArrayList(); for (TaskReport taskReport : taskReports) { taskInfos.add(new MRTaskInfo(taskReport.getTaskId(), taskReport.getState(), taskReport.getStartTime(), taskReport.getFinishTime(), taskReport.getProgress(), groupToMap(taskReport.getCounters().getGroup(TaskCounter.class.getName())))); }//from w w w . j a v a 2 s .c om return taskInfos; }
From source file:com.atlantbh.jmeter.plugins.hadooputilities.jobstatistics.TaskLayer.java
License:Apache License
public String getTaskStatisticsByJobId(String jobTracker, String jobId) throws IOException { StringBuilder taskStatistics = new StringBuilder(); long taskDuration; String duration;//from w ww. j a v a2s . c o m JobID id = this.convertToJobId(jobId); JobClient client = this.prepareJobClient(jobTracker); RunningJob job = client.getJob(id); TaskReport[] mapTaskReports = client.getMapTaskReports(id); TaskReport[] reduceTaskReports = client.getReduceTaskReports(id); taskStatistics.append("<job id='").append(jobId).append("' name='").append(job.getJobName()).append("'>\n"); taskStatistics.append(" <mapTasks>\n"); for (TaskReport mapTaskReport : mapTaskReports) { taskDuration = mapTaskReport.getFinishTime() - mapTaskReport.getStartTime(); if (taskDuration < 0) { duration = "N/A"; } else { duration = String.valueOf(taskDuration); } double progress = mapTaskReport.getProgress() * 100; String taskProgress = Double.toString(progress) + "%"; taskStatistics.append(" <task id='").append(mapTaskReport.getTaskID().toString()).append("'\n"); taskStatistics.append(" <progress>").append(taskProgress).append("</progress>\n"); taskStatistics.append(" <duration>").append(duration).append("</duration>\n"); taskStatistics.append(" <status>").append(mapTaskReport.getCurrentStatus().toString()) .append("</status>\n"); taskStatistics.append(" </task>\n"); } taskStatistics.append(" </mapTasks>\n"); taskStatistics.append(" <reduceTasks>\n"); for (TaskReport reduceTaskReport : reduceTaskReports) { taskDuration = reduceTaskReport.getFinishTime() - reduceTaskReport.getStartTime(); if (taskDuration < 0) { duration = "N/A"; } else { duration = String.valueOf(taskDuration); } double progress = reduceTaskReport.getProgress() * 100; String taskProgress = Double.toString(progress) + "%"; taskStatistics.append(" <task id='").append(reduceTaskReport.getTaskID().toString()).append("'\n"); taskStatistics.append(" <progress>").append(taskProgress).append("</progress>\n"); taskStatistics.append(" 
<duration>").append(duration).append("</duration>\n"); taskStatistics.append(" <status>").append(reduceTaskReport.getCurrentStatus().toString()) .append("</status>\n"); taskStatistics.append(" </task>\n"); } taskStatistics.append(" </reduceTasks>\n"); taskStatistics.append("</job>"); return taskStatistics.toString(); }
From source file:com.impetus.ankush2.hadoop.monitor.JobStatusProvider.java
License:Open Source License
/** * Gets the task report./*from w ww . j ava2 s .c o m*/ * * @param taskReports * the task reports * @return the task report */ private Map<String, Object> getTaskReport(TaskReport[] taskReports) { Map<String, Object> taskReportsInfo = new HashMap<String, Object>(); try { LOG.info("Total Task : " + taskReports.length); List<Map> taskLists = new ArrayList<Map>(); // A report on the state of a task. if (taskReports != null) { int completeTask = 0; int failedTask = 0; int killedTask = 0; int runningTask = 0; int pendingTask = 0; Map<String, Object[]> diagInfo = new HashMap<String, Object[]>(); // Iterating over the task reports for (TaskReport mtr : taskReports) { // Creating an empty map for storing task details Map<String, Object> taskReport = new HashMap<String, Object>(); // The current status of the task TIPStatus currentStatus = mtr.getCurrentStatus(); // Checking for task's current status COMPLETE if (currentStatus == TIPStatus.COMPLETE) { completeTask++; } // Checking for task's current status KILLED if (currentStatus == TIPStatus.KILLED) { killedTask++; } // Checking for task's current status RUNNING if (currentStatus == TIPStatus.RUNNING) { runningTask++; } // Checking for task's current status PENDING if (currentStatus == TIPStatus.PENDING) { pendingTask++; } // The id of the task. 
TaskID taskId = mtr.getTaskID(); float progress = mtr.getProgress(); // The most recent state String state = mtr.getState(); // Putting value in a map taskReport.put("taskId", taskId.toString()); taskReport.put("successfulTaskAttemp", mtr.getSuccessfulTaskAttempt().toString()); taskReport.put("startTime", mtr.getStartTime()); taskReport.put("finishTime", mtr.getFinishTime()); taskReport.put("progress", progress * 100); taskReport.put("state", state); taskReport.put("currentStatus", currentStatus); Counters counters = mtr.getCounters(); List countersList = new ArrayList(); for (Group group : counters) { Map<String, Object> counterMap = new HashMap<String, Object>(); counterMap.put("name", group.getDisplayName()); List subCounters = new ArrayList(); for (Counter counter : group) { Map subCounter = new HashMap(); subCounter.put("name", counter.getDisplayName()); subCounter.put("value", counter.getCounter()); subCounters.add(subCounter); } counterMap.put("subCounters", subCounters); countersList.add(counterMap); } taskReport.put("counters", countersList); taskLists.add(taskReport); // A list of error messages. 
String[] diagnostics = mtr.getDiagnostics(); if (diagnostics != null) { int count = 0; // Iterating over the list of error messages for (String di : diagnostics) { Object[] diagStatus = new Object[2]; diagStatus[0] = taskId; diagStatus[1] = di; diagInfo.put(taskId + "_" + count, diagStatus); count++; } } } // Putting value in a map taskReportsInfo.put("completedTask", completeTask); taskReportsInfo.put("pendingTask", pendingTask); taskReportsInfo.put("killedTask", killedTask); taskReportsInfo.put("runningTask", runningTask); taskReportsInfo.put("failedTask", failedTask); taskReportsInfo.put("failedOrKilledTask", failedTask); taskReportsInfo.put("diagInfo", diagInfo); taskReportsInfo.put("tasks", taskLists); } } catch (Exception e) { HadoopUtils.addAndLogError(this.LOG, this.clusterConfig, "Could not get task report", Constant.Component.Name.HADOOP, e); } return taskReportsInfo; }
From source file:com.netflix.lipstick.pigtolipstick.BasicP2LClient.java
License:Apache License
/**
 * Updates the plan status for a job that has finished, approximating the
 * job's start/finish times from its task reports.
 *
 * @param planStatus plan status object to update
 * @param jobId      id of the completed job
 */
protected void updatePlanStatusForCompletedJobId(P2jPlanStatus planStatus, String jobId) {
    LOG.info("Updating plan status for completed job " + jobId);
    updatePlanStatusForJobId(planStatus, jobId);
    JobClient jobClient = PigStats.get().getJobClient();
    JobID jobID = JobID.forName(jobId);
    long startTime = Long.MAX_VALUE;
    long finishTime = Long.MIN_VALUE;
    /* The JobClient doesn't expose a way to get the start and finish time of
       the overall job, so we approximate with the min task start time and
       max task finish time across all task types. */
    try {
        List<TaskReport> reports = Lists.newArrayList();
        reports.addAll(Arrays.asList(jobClient.getMapTaskReports(jobID)));
        reports.addAll(Arrays.asList(jobClient.getReduceTaskReports(jobID)));
        reports.addAll(Arrays.asList(jobClient.getCleanupTaskReports(jobID)));
        reports.addAll(Arrays.asList(jobClient.getSetupTaskReports(jobID)));
        for (TaskReport rpt : reports) {
            /* rpt.getStartTime() sometimes returns zero, meaning the start
               time is unknown; skip those so they don't clobber the true
               minimum. */
            long taskStartTime = rpt.getStartTime();
            if (0 != taskStartTime) {
                startTime = Math.min(startTime, taskStartTime);
            }
            finishTime = Math.max(finishTime, rpt.getFinishTime());
        }
        P2jJobStatus jobStatus = jobIdToJobStatusMap.get(jobId);
        if (jobStatus == null) {
            // FIX: previously an unguarded dereference — a job id with no
            // recorded status caused an NPE here.
            LOG.warn("No recorded status for job " + jobId + "; skipping start/finish times");
            return;
        }
        // Only record times we actually observed.
        if (startTime < Long.MAX_VALUE) {
            jobStatus.setStartTime(startTime);
        }
        if (finishTime > Long.MIN_VALUE) {
            jobStatus.setFinishTime(finishTime);
        }
        LOG.info("Determined start and finish times for job " + jobId);
    } catch (IOException e) {
        LOG.error("Error getting job info.", e);
    }
}
From source file:com.twitter.hraven.hadoopJobMonitor.AppStatusCheckerTest.java
License:Apache License
public boolean testTask(TaskType taskType, String confParamName, long durationMin, final int MAX_RUN, float progress, boolean enforce, boolean dryRun, TIPStatus status, boolean wellBahaved, boolean killed) throws Exception { setTaskAttemptXML(durationMin * MIN, progress); TaskReport taskReport = mock(TaskReport.class); when(taskReport.getCurrentStatus()).thenReturn(status); Collection<TaskAttemptID> attempts = new ArrayList<TaskAttemptID>(); attempts.add(taskAttemptId);// w w w.j a v a2s . c o m when(taskReport.getRunningTaskAttemptIds()).thenReturn(attempts); when(taskReport.getTaskID()).thenReturn(org.apache.hadoop.mapred.TaskID.downgrade(taskId)); when(taskReport.getProgress()).thenReturn(progress); vConf.setBoolean(HadoopJobMonitorConfiguration.DRY_RUN, dryRun); Configuration remoteAppConf = new Configuration(); remoteAppConf.setInt(confParamName, MAX_RUN); remoteAppConf.setBoolean(HadoopJobMonitorConfiguration.enforced(confParamName), enforce); when(taskReport.getStartTime()).thenReturn(now - durationMin * MIN); AppConfiguraiton appConf = new AppConfiguraiton(remoteAppConf, vConf); AppConfCache.getInstance().put(appId, appConf); appStatusChecker.init(); appStatusChecker.loadClientService(); boolean res = appStatusChecker.checkTask(taskType, taskReport, now); if (wellBahaved) assertEquals("Well-bahved task does not pass the check", wellBahaved, res); else assertEquals("Not Well-bahved task passes the check", wellBahaved, res); if (killed) { killCounter++; verify(clientService, times(killCounter)).killTask(any(TaskAttemptID.class), Mockito.anyBoolean()); } else verify(clientService, times(killCounter)).killTask(any(TaskAttemptID.class), Mockito.anyBoolean()); return res; }
From source file:datafu.hourglass.jobs.StagedOutputJob.java
License:Apache License
/** * Writes Hadoop counters and other task statistics to a file in the file system. * /* ww w.jav a 2 s. com*/ * @param fs * @throws IOException */ private void writeCounters(final FileSystem fs) throws IOException { final Path actualOutputPath = FileOutputFormat.getOutputPath(this); SimpleDateFormat timestampFormat = new SimpleDateFormat("yyyyMMddHHmmss"); String suffix = timestampFormat.format(new Date()); if (_countersParentPath != null) { if (!fs.exists(_countersParentPath)) { _log.info("Creating counter parent path " + _countersParentPath); fs.mkdirs(_countersParentPath, FsPermission.valueOf("-rwxrwxr-x")); } // make the name as unique as possible in this case because this may be a directory // where other counter files will be dropped _countersPath = new Path(_countersParentPath, ".counters." + suffix); } else { _countersPath = new Path(actualOutputPath, ".counters." + suffix); } _log.info(String.format("Writing counters to %s", _countersPath)); FSDataOutputStream counterStream = fs.create(_countersPath); BufferedOutputStream buffer = new BufferedOutputStream(counterStream, 256 * 1024); OutputStreamWriter writer = new OutputStreamWriter(buffer); for (String groupName : getCounters().getGroupNames()) { for (Counter counter : getCounters().getGroup(groupName)) { writeAndLog(writer, String.format("%s=%d", counter.getName(), counter.getValue())); } } JobID jobID = this.getJobID(); org.apache.hadoop.mapred.JobID oldJobId = new org.apache.hadoop.mapred.JobID(jobID.getJtIdentifier(), jobID.getId()); long minStart = Long.MAX_VALUE; long maxFinish = 0; long setupStart = Long.MAX_VALUE; long cleanupFinish = 0; DescriptiveStatistics mapStats = new DescriptiveStatistics(); DescriptiveStatistics reduceStats = new DescriptiveStatistics(); boolean success = true; JobClient jobClient = new JobClient(this.conf); Map<String, String> taskIdToType = new HashMap<String, String>(); TaskReport[] setupReports = jobClient.getSetupTaskReports(oldJobId); if (setupReports.length > 0) { 
_log.info("Processing setup reports"); for (TaskReport report : jobClient.getSetupTaskReports(oldJobId)) { taskIdToType.put(report.getTaskID().toString(), "SETUP"); if (report.getStartTime() == 0) { _log.warn("Skipping report with zero start time"); continue; } setupStart = Math.min(setupStart, report.getStartTime()); } } else { _log.error("No setup reports"); } TaskReport[] mapReports = jobClient.getMapTaskReports(oldJobId); if (mapReports.length > 0) { _log.info("Processing map reports"); for (TaskReport report : mapReports) { taskIdToType.put(report.getTaskID().toString(), "MAP"); if (report.getFinishTime() == 0 || report.getStartTime() == 0) { _log.warn("Skipping report with zero start or finish time"); continue; } minStart = Math.min(minStart, report.getStartTime()); mapStats.addValue(report.getFinishTime() - report.getStartTime()); } } else { _log.error("No map reports"); } TaskReport[] reduceReports = jobClient.getReduceTaskReports(oldJobId); if (reduceReports.length > 0) { _log.info("Processing reduce reports"); for (TaskReport report : reduceReports) { taskIdToType.put(report.getTaskID().toString(), "REDUCE"); if (report.getFinishTime() == 0 || report.getStartTime() == 0) { _log.warn("Skipping report with zero start or finish time"); continue; } maxFinish = Math.max(maxFinish, report.getFinishTime()); reduceStats.addValue(report.getFinishTime() - report.getStartTime()); } } else { _log.error("No reduce reports"); } TaskReport[] cleanupReports = jobClient.getCleanupTaskReports(oldJobId); if (cleanupReports.length > 0) { _log.info("Processing cleanup reports"); for (TaskReport report : cleanupReports) { taskIdToType.put(report.getTaskID().toString(), "CLEANUP"); if (report.getFinishTime() == 0) { _log.warn("Skipping report with finish time of zero"); continue; } cleanupFinish = Math.max(cleanupFinish, report.getFinishTime()); } } else { _log.error("No cleanup reports"); } if (minStart == Long.MAX_VALUE) { _log.error("Could not determine map-reduce start 
time"); success = false; } if (maxFinish == 0) { _log.error("Could not determine map-reduce finish time"); success = false; } if (setupStart == Long.MAX_VALUE) { _log.error("Could not determine setup start time"); success = false; } if (cleanupFinish == 0) { _log.error("Could not determine cleanup finish time"); success = false; } // Collect statistics on successful/failed/killed task attempts, categorized by setup/map/reduce/cleanup. // Unfortunately the job client doesn't have an easier way to get these statistics. Map<String, Integer> attemptStats = new HashMap<String, Integer>(); _log.info("Processing task attempts"); for (TaskCompletionEvent event : getTaskCompletionEvents(jobClient, oldJobId)) { String type = taskIdToType.get(event.getTaskAttemptId().getTaskID().toString()); String status = event.getTaskStatus().toString(); String key = String.format("%s_%s_ATTEMPTS", status, type); if (!attemptStats.containsKey(key)) { attemptStats.put(key, 0); } attemptStats.put(key, attemptStats.get(key) + 1); } if (success) { writeAndLog(writer, String.format("SETUP_START_TIME_MS=%d", setupStart)); writeAndLog(writer, String.format("CLEANUP_FINISH_TIME_MS=%d", cleanupFinish)); writeAndLog(writer, String.format("COMPLETE_WALL_CLOCK_TIME_MS=%d", cleanupFinish - setupStart)); writeAndLog(writer, String.format("MAP_REDUCE_START_TIME_MS=%d", minStart)); writeAndLog(writer, String.format("MAP_REDUCE_FINISH_TIME_MS=%d", maxFinish)); writeAndLog(writer, String.format("MAP_REDUCE_WALL_CLOCK_TIME_MS=%d", maxFinish - minStart)); writeAndLog(writer, String.format("MAP_TOTAL_TASKS=%d", (long) mapStats.getN())); writeAndLog(writer, String.format("MAP_MAX_TIME_MS=%d", (long) mapStats.getMax())); writeAndLog(writer, String.format("MAP_MIN_TIME_MS=%d", (long) mapStats.getMin())); writeAndLog(writer, String.format("MAP_AVG_TIME_MS=%d", (long) mapStats.getMean())); writeAndLog(writer, String.format("MAP_STD_TIME_MS=%d", (long) mapStats.getStandardDeviation())); writeAndLog(writer, 
String.format("MAP_SUM_TIME_MS=%d", (long) mapStats.getSum())); writeAndLog(writer, String.format("REDUCE_TOTAL_TASKS=%d", (long) reduceStats.getN())); writeAndLog(writer, String.format("REDUCE_MAX_TIME_MS=%d", (long) reduceStats.getMax())); writeAndLog(writer, String.format("REDUCE_MIN_TIME_MS=%d", (long) reduceStats.getMin())); writeAndLog(writer, String.format("REDUCE_AVG_TIME_MS=%d", (long) reduceStats.getMean())); writeAndLog(writer, String.format("REDUCE_STD_TIME_MS=%d", (long) reduceStats.getStandardDeviation())); writeAndLog(writer, String.format("REDUCE_SUM_TIME_MS=%d", (long) reduceStats.getSum())); writeAndLog(writer, String.format("MAP_REDUCE_SUM_TIME_MS=%d", (long) mapStats.getSum() + (long) reduceStats.getSum())); for (Map.Entry<String, Integer> attemptStat : attemptStats.entrySet()) { writeAndLog(writer, String.format("%s=%d", attemptStat.getKey(), attemptStat.getValue())); } } writer.close(); buffer.close(); counterStream.close(); }
From source file:dataload.LogFetchJobTracker.java
License:Apache License
/** * This does the insertion of a given Task Report into the table * @param prepStatement/*from w w w . ja v a 2s . c om*/ * @param reports * @param id * @throws SQLException */ public void insertTaskIntoTable(PreparedStatement prepStatement, TaskReport[] reports, JobID id) throws SQLException { for (TaskReport rep : reports) { Counters c = rep.getCounters(); Iterator<Counters.Group> itrG = c.iterator(); prepStatement = connection.prepareStatement("INSERT INTO " + id + " VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"); for (int i = 1; i < 24; i++) { prepStatement.setLong(i, 0); } prepStatement.setString(24, rep.getTaskID().toString()); prepStatement.setLong(25, 0); if (!(rep.getFinishTime() == 0) && !(rep.getStartTime() == 0)) { prepStatement.setLong(25, (rep.getFinishTime() - rep.getStartTime()) / 1000); totalTime += (rep.getFinishTime() - rep.getStartTime()); } else { prepStatement.setLong(25, 0); } while (itrG.hasNext()) { Iterator<Counters.Counter> itrC = itrG.next().iterator(); while (itrC.hasNext()) { Counters.Counter counter = itrC.next(); if (mapCounter.get(counter.getName()) != null) { prepStatement.setLong(mapCounter.get(counter.getName()), counter.getCounter()); } } } prepStatement.executeUpdate(); } }
From source file:org.apache.pig.backend.hadoop.executionengine.Launcher.java
License:Apache License
/**
 * Sums (finishTime - startTime) over all task reports.
 *
 * @param taskReports reports to total; may be {@code null}
 * @return total task time in milliseconds, 0 if there are no reports
 */
protected long computeTimeSpent(TaskReport[] taskReports) {
    long timeSpent = 0;
    // Robustness: JobClient report getters can hand back null.
    if (taskReports != null) {
        for (TaskReport r : taskReports) {
            timeSpent += (r.getFinishTime() - r.getStartTime());
        }
    }
    return timeSpent;
}
From source file:org.apache.pig.backend.hadoop.executionengine.mapreduceExec.MapReduceLauncher.java
License:Apache License
/**
 * Submits a single Pig map-reduce job (described by {@code pom}) to Hadoop,
 * polls it to completion while logging progress, ensures an output file
 * exists on success, and accumulates per-task time spent.
 *
 * @param pom the physical map-reduce operator describing map/group/reduce
 *            functions, inputs, output, parallelism and the pig context
 * @return true if the job completed successfully, false otherwise
 * @throws IOException on submission or post-processing failure
 */
public boolean launchPig(POMapreduce pom) throws IOException {
    JobConf conf = new JobConf(config);
    setJobProperties(conf, pom);
    Properties properties = pom.pigContext.getProperties();
    ConfigurationValidator.validatePigProperties(properties);
    String jobName = properties.getProperty(PigContext.JOB_NAME);
    conf.setJobName(jobName);
    boolean success = false;
    // Collect every UDF referenced by the map, group and reduce specs so
    // they can be packaged into the job jar.
    List<String> funcs = new ArrayList<String>();
    if (pom.toMap != null) {
        for (EvalSpec es : pom.toMap)
            funcs.addAll(es.getFuncs());
    }
    if (pom.groupFuncs != null) {
        for (EvalSpec es : pom.groupFuncs)
            funcs.addAll(es.getFuncs());
    }
    if (pom.toReduce != null) {
        funcs.addAll(pom.toReduce.getFuncs());
    }
    // create jobs.jar locally and pass it to hadoop
    File submitJarFile = File.createTempFile("Job", ".jar");
    try {
        FileOutputStream fos = new FileOutputStream(submitJarFile);
        JarManager.createJar(fos, funcs, null, pom.pigContext);
        log.debug("Job jar size = " + submitJarFile.length());
        conf.setJar(submitJarFile.getPath());
        String user = System.getProperty("user.name");
        conf.setUser(user != null ? user : "Pigster");
        conf.set("pig.spill.size.threshold", properties.getProperty("pig.spill.size.threshold"));
        conf.set("pig.spill.gc.activation.size", properties.getProperty("pig.spill.gc.activation.size"));
        if (pom.reduceParallelism != -1) {
            conf.setNumReduceTasks(pom.reduceParallelism);
        }
        // Serialize the eval specs into the job configuration for the
        // remote PigMapReduce tasks to deserialize.
        if (pom.toMap != null) {
            conf.set("pig.mapFuncs", ObjectSerializer.serialize(pom.toMap));
        }
        if (pom.toCombine != null) {
            conf.set("pig.combineFunc", ObjectSerializer.serialize(pom.toCombine));
            // this is to make sure that combiner is only called once
            // since we can't handle no combine or multiple combines
            conf.setCombineOnceOnly(true);
        }
        if (pom.groupFuncs != null) {
            conf.set("pig.groupFuncs", ObjectSerializer.serialize(pom.groupFuncs));
        }
        if (pom.toReduce != null) {
            conf.set("pig.reduceFunc", ObjectSerializer.serialize(pom.toReduce));
        }
        if (pom.toSplit != null) {
            conf.set("pig.splitSpec", ObjectSerializer.serialize(pom.toSplit));
        }
        if (pom.pigContext != null) {
            conf.set("pig.pigContext", ObjectSerializer.serialize(pom.pigContext));
        }
        conf.setMapRunnerClass(PigMapReduce.class);
        if (pom.toCombine != null) {
            conf.setCombinerClass(PigCombine.class);
            //conf.setCombinerClass(PigMapReduce.class);
        }
        if (pom.quantilesFile != null) {
            conf.set("pig.quantilesFile", pom.quantilesFile);
        } else {
            // this is not a sort job - can use byte comparison to speed up processing
            conf.setOutputKeyComparatorClass(PigWritableComparator.class);
        }
        if (pom.partitionFunction != null) {
            conf.setPartitionerClass(SortPartitioner.class);
        }
        conf.setReducerClass(PigMapReduce.class);
        conf.setInputFormat(PigInputFormat.class);
        conf.setOutputFormat(PigOutputFormat.class);
        // not used starting with 0.15
        conf.setInputKeyClass(Text.class);
        // not used starting with 0.15
        conf.setInputValueClass(Tuple.class);
        conf.setOutputKeyClass(Tuple.class);
        if (pom.userComparator != null) {
            conf.setOutputKeyComparatorClass(pom.userComparator);
        }
        conf.setOutputValueClass(IndexedTuple.class);
        conf.set("pig.inputs", ObjectSerializer.serialize(pom.inputFileSpecs));
        conf.setOutputPath(new Path(pom.outputFileSpec.getFileName()));
        conf.set("pig.storeFunc", ObjectSerializer.serialize(pom.outputFileSpec.getFuncSpec()));
        // Setup the DistributedCache for this job
        setupDistributedCache(pom.pigContext, conf, pom.properties, "pig.streaming.ship.files", true);
        setupDistributedCache(pom.pigContext, conf, pom.properties, "pig.streaming.cache.files", false);
        // Setup the logs directory for this job
        String jobOutputFileName = pom.pigContext.getJobOutputFile();
        if (jobOutputFileName != null && jobOutputFileName.length() > 0) {
            Path jobOutputFile = new Path(pom.pigContext.getJobOutputFile());
            conf.set("pig.output.dir", jobOutputFile.getParent().toString());
            conf.set("pig.streaming.log.dir", new Path(jobOutputFile, LOG_DIR).toString());
        }
        //
        // Now, actually submit the job (using the submit name)
        //
        JobClient jobClient = execEngine.getJobClient();
        RunningJob status = jobClient.submitJob(conf);
        log.debug("submitted job: " + status.getJobID());
        long sleepTime = 1000;
        double lastQueryProgress = -1.0;
        int lastJobsQueued = -1;
        double lastMapProgress = -1.0;
        double lastReduceProgress = -1.0;
        // Poll the job once per second until it completes, logging progress
        // deltas at debug level and overall query progress at info level.
        while (true) {
            try {
                Thread.sleep(sleepTime);
            } catch (Exception e) {
                // best-effort polling delay; interruption is intentionally ignored
            }
            if (status.isComplete()) {
                success = status.isSuccessful();
                if (log.isDebugEnabled()) {
                    StringBuilder sb = new StringBuilder();
                    sb.append("Job finished ");
                    sb.append((success ? "" : "un"));
                    sb.append("successfully");
                    log.debug(sb.toString());
                }
                if (success) {
                    mrJobNumber++;
                }
                double queryProgress = ((double) mrJobNumber) / ((double) numMRJobs);
                if (queryProgress > lastQueryProgress) {
                    if (log.isInfoEnabled()) {
                        StringBuilder sbProgress = new StringBuilder();
                        sbProgress.append("Pig progress = ");
                        sbProgress.append(((int) (queryProgress * 100)));
                        sbProgress.append("%");
                        log.info(sbProgress.toString());
                    }
                    lastQueryProgress = queryProgress;
                }
                break;
            } else // still running
            {
                double mapProgress = status.mapProgress();
                double reduceProgress = status.reduceProgress();
                if (lastMapProgress != mapProgress || lastReduceProgress != reduceProgress) {
                    if (log.isDebugEnabled()) {
                        StringBuilder sbProgress = new StringBuilder();
                        sbProgress.append("Hadoop job progress: Map=");
                        sbProgress.append((int) (mapProgress * 100));
                        sbProgress.append("% Reduce=");
                        sbProgress.append((int) (reduceProgress * 100));
                        sbProgress.append("%");
                        log.debug(sbProgress.toString());
                    }
                    lastMapProgress = mapProgress;
                    lastReduceProgress = reduceProgress;
                }
                // Overall query progress: completed jobs plus the average of
                // this job's map and reduce progress, over the total job count.
                double numJobsCompleted = mrJobNumber;
                double thisJobProgress = (mapProgress + reduceProgress) / 2.0;
                double queryProgress = (numJobsCompleted + thisJobProgress) / ((double) numMRJobs);
                if (queryProgress > lastQueryProgress) {
                    if (log.isInfoEnabled()) {
                        StringBuilder sbProgress = new StringBuilder();
                        sbProgress.append("Pig progress = ");
                        sbProgress.append(((int) (queryProgress * 100)));
                        sbProgress.append("%");
                        log.info(sbProgress.toString());
                    }
                    lastQueryProgress = queryProgress;
                }
            }
        }
        // bug 1030028: if the input file is empty; hadoop doesn't create the output file!
        Path outputFile = conf.getOutputPath();
        String outputName = outputFile.getName();
        // Strip a ":suffix" qualifier from the output name if present.
        int colon = outputName.indexOf(':');
        if (colon != -1) {
            outputFile = new Path(outputFile.getParent(), outputName.substring(0, colon));
        }
        try {
            ElementDescriptor descriptor = ((HDataStorage) (pom.pigContext.getDfs()))
                    .asElement(outputFile.toString());
            if (success && !descriptor.exists()) {
                // create an empty output file
                PigFile f = new PigFile(outputFile.toString(), false);
                f.store(BagFactory.getInstance().newDefaultBag(), new PigStorage(), pom.pigContext);
            }
        } catch (DataStorageException e) {
            throw WrappedIOException.wrap("Failed to obtain descriptor for " + outputFile.toString(), e);
        }
        if (!success) {
            // go find the error messages
            getErrorMessages(jobClient.getMapTaskReports(status.getJobID()), "map");
            getErrorMessages(jobClient.getReduceTaskReports(status.getJobID()), "reduce");
        } else {
            long timeSpent = 0;
            // NOTE: this call is crashing due to a bug in Hadoop; the bug is known and the patch has not been applied yet.
            TaskReport[] mapReports = jobClient.getMapTaskReports(status.getJobID());
            TaskReport[] reduceReports = jobClient.getReduceTaskReports(status.getJobID());
            for (TaskReport r : mapReports) {
                timeSpent += (r.getFinishTime() - r.getStartTime());
            }
            for (TaskReport r : reduceReports) {
                timeSpent += (r.getFinishTime() - r.getStartTime());
            }
            totalHadoopTimeSpent += timeSpent;
        }
    } catch (Exception e) {
        // Do we need different handling for different exceptions
        e.printStackTrace();
        throw WrappedIOException.wrap(e);
    } finally {
        submitJarFile.delete();
    }
    return success;
}