List of usage examples for org.apache.hadoop.mapred JobStatus getRunState
public synchronized int getRunState()
From source file:com.ikanow.infinit.e.core.mapreduce.HadoopJobRunner.java
License:Open Source License
/** * Checks any running/queued jobs and updates their status if they've completed *//*from w ww .j a v a 2 s. c om*/ public void updateJobStatus() { Map<ObjectId, String> incompleteJobsMap = new HashMap<ObjectId, String>(); //get mongo entries that have jobids? try { JobClient jc = null; CustomMapReduceJobPojo cmr = getJobsToMakeComplete(); while (cmr != null) { boolean markedComplete = false; //make sure its an actual ID, we now set jobidS to "" when running the job if (!cmr.jobidS.equals("")) { if (null == jc) { try { jc = new JobClient(getJobClientConnection(), new Configuration()); } catch (Exception e) { // Better delete this, no idea what's going on.... _logger.info( "job_update_status_error_title=" + cmr.jobtitle + " job_update_status_error_id=" + cmr._id.toString() + " job_update_status_error_message=Skipping job: " + cmr.jobidS + cmr.jobidN + ", this node does not run mapreduce"); setJobComplete(cmr, true, true, -1, -1, "Failed to launch job, unknown error (check configuration in /opt/hadoop-infinite/mapreduce/hadoop/, jobtracker may be localhost?)."); cmr = getJobsToMakeComplete(); continue; } } //check if job is done, and update if it is JobStatus[] jobs = jc.getAllJobs(); boolean bFound = false; for (JobStatus j : jobs) { if (j.getJobID().getJtIdentifier().equals(cmr.jobidS) && j.getJobID().getId() == cmr.jobidN) { bFound = true; boolean error = false; markedComplete = j.isJobComplete(); String errorMessage = null; if (JobStatus.FAILED == j.getRunState()) { markedComplete = true; error = true; errorMessage = "Job failed while running, check for errors in the mapper/reducer or that your key/value classes are set up correctly?"; } setJobComplete(cmr, markedComplete, error, j.mapProgress(), j.reduceProgress(), errorMessage); break; // (from mini loop over hadoop jobs, not main loop over infinite tasks) } } if (!bFound) { // Possible error //check if its been longer than 5min and mark job as complete (it failed to launch) Date currDate = new Date(); Date lastDate = cmr.lastRunTime; //if its been more than 5 min (5m*60s*1000ms) if (currDate.getTime() - lastDate.getTime() > 300000) { markedComplete = true; setJobComplete(cmr, true, true, -1, -1, "Failed to launch job, unknown error #2."); } } } else // this job hasn't been started yet: { //check if its been longer than 5min and mark job as complete (it failed to launch) Date currDate = new Date(); Date lastDate = cmr.lastRunTime; //if its been more than 5 min (5m*60s*1000ms) if (currDate.getTime() - lastDate.getTime() > 300000) { markedComplete = true; setJobComplete(cmr, true, true, -1, -1, "Failed to launch job, unknown error #1."); } } //job was not done, need to set flag back if (!markedComplete) { incompleteJobsMap.put(cmr._id, cmr.jobidS); } cmr = getJobsToMakeComplete(); } } catch (Exception ex) { _logger.info("job_error_checking_status_message=" + HarvestExceptionUtils.createExceptionMessage(ex)); } catch (Error err) { // Really really want to get to the next line of code, and clear the status... } //set all incomplete jobs back for (ObjectId id : incompleteJobsMap.keySet()) { BasicDBObject update = new BasicDBObject(CustomMapReduceJobPojo.jobidS_, incompleteJobsMap.get(id)); DbManager.getCustom().getLookup().update(new BasicDBObject(CustomMapReduceJobPojo._id_, id), new BasicDBObject(MongoDbManager.set_, update)); } }
From source file:com.ikanow.infinit.e.processing.custom.CustomProcessingController.java
License:Open Source License
public boolean checkRunningJobs(CustomMapReduceJobPojo jobOverride) { Map<ObjectId, String> incompleteJobsMap = new HashMap<ObjectId, String>(); //get mongo entries that have jobids? try {/*from w ww . j a v a2s.c om*/ JobClient jc = null; CustomMapReduceJobPojo cmr = jobOverride; if (null == cmr) cmr = CustomScheduleManager.getJobsToMakeComplete(_bHadoopEnabled, incompleteJobsMap); else if (null == cmr.jobidS) return true; while (cmr != null) { boolean markedComplete = false; //make sure its an actual ID, we now set jobidS to "" when running the job if (!cmr.jobidS.equals("")) // non null by construction { if (null == jc) { try { jc = new JobClient(InfiniteHadoopUtils.getJobClientConnection(prop_custom), new Configuration()); } catch (Exception e) { // Better delete this, no idea what's going on.... _logger.info( "job_update_status_error_title=" + cmr.jobtitle + " job_update_status_error_id=" + cmr._id.toString() + " job_update_status_error_message=Skipping job: " + cmr.jobidS + cmr.jobidN + ", this node does not run mapreduce"); _statusManager.setJobComplete(cmr, true, true, -1, -1, "Failed to launch job, unknown error (check configuration in /opt/hadoop-infinite/mapreduce/hadoop/, jobtracker may be localhost?)."); incompleteJobsMap.remove(cmr._id); cmr = CustomScheduleManager.getJobsToMakeComplete(_bHadoopEnabled, incompleteJobsMap); continue; } } //check if job is done, and update if it is JobStatus[] jobs = jc.getAllJobs(); boolean bFound = false; for (JobStatus j : jobs) { if (j.getJobID().getJtIdentifier().equals(cmr.jobidS) && j.getJobID().getId() == cmr.jobidN) { bFound = true; boolean error = false; markedComplete = j.isJobComplete(); String errorMessage = null; if (JobStatus.FAILED == j.getRunState()) { markedComplete = true; error = true; errorMessage = "Job failed while running, check for errors in the mapper/reducer or that your key/value classes are set up correctly? " + j.getFailureInfo(); } _statusManager.setJobComplete(cmr, markedComplete, error, j.mapProgress(), j.reduceProgress(), errorMessage); break; // (from mini loop over hadoop jobs, not main loop over infinite tasks) } } if (!bFound) { // Possible error //check if its been longer than 5min and mark job as complete (it failed to launch) Date currDate = new Date(); Date lastDate = cmr.lastRunTime; //if its been more than 5 min (5m*60s*1000ms) if (currDate.getTime() - lastDate.getTime() > 300000) { markedComplete = true; _statusManager.setJobComplete(cmr, true, true, -1, -1, "Failed to launch job, unknown error #2."); } } } else // this job hasn't been started yet: { //check if its been longer than 5min and mark job as complete (it failed to launch) Date currDate = new Date(); Date lastDate = cmr.lastRunTime; //if its been more than 5 min (5m*60s*1000ms) if (currDate.getTime() - lastDate.getTime() > 300000) { markedComplete = true; _statusManager.setJobComplete(cmr, true, true, -1, -1, "Failed to launch job, unknown error #1."); } } //job was done, remove flag if (markedComplete) { incompleteJobsMap.remove(cmr._id); } if (null == jobOverride) cmr = CustomScheduleManager.getJobsToMakeComplete(_bHadoopEnabled, incompleteJobsMap); else cmr = null; } } catch (Exception ex) { _logger.info("job_error_checking_status_message=" + InfiniteHadoopUtils.createExceptionMessage(ex)); } catch (Error err) { // Really really want to get to the next line of code, and clear the status... _logger.info("job_error_checking_status_message=" + InfiniteHadoopUtils.createExceptionMessage(err)); } if (null == jobOverride) { //set all incomplete jobs' status back for (ObjectId id : incompleteJobsMap.keySet()) { BasicDBObject update = new BasicDBObject(CustomMapReduceJobPojo.jobidS_, incompleteJobsMap.get(id)); DbManager.getCustom().getLookup().update(new BasicDBObject(CustomMapReduceJobPojo._id_, id), new BasicDBObject(MongoDbManager.set_, update)); } } return incompleteJobsMap.isEmpty(); }
From source file:com.impetus.ankush2.hadoop.monitor.JobStatusProvider.java
License:Open Source License
/** * @param jobClient/*from ww w. j a v a 2 s. c om*/ * @param jobSts * @return * @throws IOException */ private Map<String, Object> getJobReport(JobStatus jobSts) throws IOException { // Creating an empty map for storing job information Map<String, Object> jobReport = new HashMap<String, Object>(); // Returns the jobid of the Job org.apache.hadoop.mapred.JobID jobId = jobSts.getJobID(); // Get an RunningJob object to track an ongoing Map-Reduce // job. RunningJob job = jobClient.getJob(jobId); String jobName = ""; if (job != null) { // Get the name of the job. jobName = job.getJobName(); } // Percentage of progress in maps float mapProgress = jobSts.mapProgress() * 100; // Percentage of progress in reduce float reduceProgress = jobSts.reduceProgress() * 100; int mapTotal = 0; int reduceTotal = 0; int mapComp = 0; int reduceComp = 0; // Count for Map and Reduce Complete try { // Get the information of the current state of the map // tasks of a job TaskReport[] mapTaskReports = jobClient.getMapTaskReports(jobId); // Get the total map mapTotal = mapTaskReports.length; // Iterating over the map tasks for (TaskReport taskReport : mapTaskReports) { // The current state of a map TaskInProgress as seen // by the JobTracker. TIPStatus currentStatus = taskReport.getCurrentStatus(); if (currentStatus == TIPStatus.COMPLETE) { mapComp++; } } // Get the information of the current state of the // reduce tasks of a job. TaskReport[] reduceTaskReport = jobClient.getReduceTaskReports(jobId); // Get the total reduce reduceTotal = reduceTaskReport.length; // Iterating over the reduce tasks for (TaskReport taskReport : reduceTaskReport) { // The current state of a reduce TaskInProgress as // seen by the JobTracker. TIPStatus currentStatus = taskReport.getCurrentStatus(); if (currentStatus == TIPStatus.COMPLETE) { reduceComp++; } } } catch (Exception e) { LOG.error(e.getMessage(), e); } // Percentage of progress in setup float setupProgress = jobSts.setupProgress() * 100; // The progress made on cleanup float cleanupProgress = jobSts.cleanupProgress() * 100; // gets any available info on the reason of failure of the // job..Returns the diagnostic information on why a job // might have failed. String failureInfo = jobSts.getFailureInfo(); // Putting Job Sttaus information in map jobReport.put("jobId", jobId.toString()); jobReport.put("jobName", jobName); jobReport.put("jobPriority", jobSts.getJobPriority().toString()); jobReport.put("jobStartTime", jobSts.getStartTime()); jobReport.put("userName", jobSts.getUsername()); jobReport.put("jobComplete", jobSts.isJobComplete()); jobReport.put("mapProgress", mapProgress); jobReport.put("reduceProgress", reduceProgress); jobReport.put("mapTotal", mapTotal); jobReport.put("reduceTotal", reduceTotal); jobReport.put("mapCompleted", mapComp); jobReport.put("reduceCompleted", reduceComp); jobReport.put("setupProgress", setupProgress); jobReport.put("cleanupProgress", cleanupProgress); jobReport.put("schedulingInfo", jobSts.getSchedulingInfo()); jobReport.put("jobState", JobStatus.getJobRunState(jobSts.getRunState())); jobReport.put("failureInfo", failureInfo); jobReport.put("jobFile", job.getJobFile()); jobReport.put("trackingURL", job.getTrackingURL()); jobReport.putAll(getDetailedJobReport(jobId)); return jobReport; }
From source file:org.apache.hive.hcatalog.templeton.tool.LaunchMapper.java
License:Apache License
/** * Attempts to reconnect to an already running child job of the templeton launcher. This * is used in cases where the templeton launcher task has failed and is retried by the * MR framework. If reconnect to the child job is possible, the method will continue * tracking its progress until completion. * @return Returns true if reconnect was successful, false if not supported or * no child jobs were found./* w ww. j av a 2s .c o m*/ */ private boolean tryReconnectToRunningJob(Configuration conf, Context context, LauncherDelegator.JobType jobType, String statusdir) throws IOException, InterruptedException { if (!reconnectToRunningJobEnabledAndSupported(conf, jobType)) { return false; } long startTime = getTempletonLaunchTime(conf); UserGroupInformation ugi = UserGroupInformation.getCurrentUser(); WebHCatJTShim tracker = ShimLoader.getHadoopShims().getWebHCatShim(conf, ugi); try { Set<String> childJobs = tracker.getJobs(context.getJobID().toString(), startTime); if (childJobs.size() == 0) { LOG.info("No child jobs found to reconnect with"); return false; } if (childJobs.size() > 1) { LOG.warn(String.format("Found more than one child job to reconnect with: %s, skipping reconnect", Arrays.toString(childJobs.toArray()))); return false; } String childJobIdString = childJobs.iterator().next(); org.apache.hadoop.mapred.JobID childJobId = org.apache.hadoop.mapred.JobID.forName(childJobIdString); LOG.info(String.format("Reconnecting to an existing job %s", childJobIdString)); // Update job state with the childJob id updateJobStatePercentAndChildId(conf, context.getJobID().toString(), null, childJobIdString); do { org.apache.hadoop.mapred.JobStatus jobStatus = tracker.getJobStatus(childJobId); if (jobStatus.isJobComplete()) { LOG.info(String.format("Child job %s completed", childJobIdString)); int exitCode = 0; if (jobStatus.getRunState() != org.apache.hadoop.mapred.JobStatus.SUCCEEDED) { exitCode = 1; } updateJobStateToDoneAndWriteExitValue(conf, statusdir, context.getJobID().toString(), exitCode); break; } String percent = String.format("map %s%%, reduce %s%%", jobStatus.mapProgress() * 100, jobStatus.reduceProgress() * 100); updateJobStatePercentAndChildId(conf, context.getJobID().toString(), percent, null); LOG.info("KeepAlive Heart beat"); context.progress(); Thread.sleep(POLL_JOBPROGRESS_MSEC); } while (true); // Reconnect was successful return true; } catch (IOException ex) { LOG.error("Exception encountered in tryReconnectToRunningJob", ex); throw ex; } finally { tracker.close(); } }
From source file:org.estado.core.JobStatusChecker.java
License:Apache License
public void checkStatus() { List<org.estado.spi.JobStatus> jobStatusList = new ArrayList<org.estado.spi.JobStatus>(); try {/*from w w w.j ava2s . c om*/ Configuration conf = new Configuration(); JobClient client = new JobClient(new JobConf(conf)); JobStatus[] jobStatuses = client.getAllJobs(); showFilter(); int jobCount = 0; for (JobStatus jobStatus : jobStatuses) { Long lastTaskEndTime = 0L; TaskReport[] mapReports = client.getMapTaskReports(jobStatus.getJobID()); for (TaskReport r : mapReports) { if (lastTaskEndTime < r.getFinishTime()) { lastTaskEndTime = r.getFinishTime(); } } TaskReport[] reduceReports = client.getReduceTaskReports(jobStatus.getJobID()); for (TaskReport r : reduceReports) { if (lastTaskEndTime < r.getFinishTime()) { lastTaskEndTime = r.getFinishTime(); } } client.getSetupTaskReports(jobStatus.getJobID()); client.getCleanupTaskReports(jobStatus.getJobID()); String jobId = jobStatus.getJobID().toString(); String jobName = client.getJob(jobStatus.getJobID()).getJobName(); Long startTime = jobStatus.getStartTime(); String user = jobStatus.getUsername(); int mapProgress = (int) (jobStatus.mapProgress() * 100); int reduceProgress = (int) (jobStatus.reduceProgress() * 100); org.estado.spi.JobStatus jobStat = null; ++jobCount; int runState = jobStatus.getRunState(); switch (runState) { case JobStatus.SUCCEEDED: if (filter.contains("s")) { Long duration = lastTaskEndTime - jobStatus.getStartTime(); jobStat = new org.estado.spi.JobStatus(cluster, jobId, jobName, null, user, startTime, lastTaskEndTime, duration, mapProgress, reduceProgress, "completed"); ++sCount; } break; case JobStatus.RUNNING: if (filter.contains("r")) { long duration = System.currentTimeMillis() - jobStatus.getStartTime(); jobStat = new org.estado.spi.JobStatus(cluster, jobId, jobName, null, user, startTime, lastTaskEndTime, duration, mapProgress, reduceProgress, "running"); ++rCount; } break; case JobStatus.FAILED: if (filter.contains("f")) { long duration = lastTaskEndTime - jobStatus.getStartTime(); jobStat = new org.estado.spi.JobStatus(cluster, jobId, jobName, null, user, startTime, lastTaskEndTime, duration, mapProgress, reduceProgress, "failed"); RunningJob job = client.getJob(jobStatus.getJobID()); jobStat.setJobTasks(getTaskDetails(job)); ++fCount; } break; case JobStatus.PREP: if (filter.contains("p")) { jobStat = new org.estado.spi.JobStatus(cluster, jobId, jobName, null, user, null, null, null, 0, 0, "preparing"); ++pCount; } break; case JobStatus.KILLED: if (filter.contains("k")) { long duration = lastTaskEndTime - jobStatus.getStartTime(); jobStat = new org.estado.spi.JobStatus(cluster, jobId, jobName, null, user, startTime, lastTaskEndTime, duration, mapProgress, reduceProgress, "killed"); RunningJob job = client.getJob(jobStatus.getJobID()); jobStat.setJobTasks(getTaskDetails(job)); ++kCount; } break; } jobStatusList.add(jobStat); } //get counters for (org.estado.spi.JobStatus jobStat : jobStatusList) { if (!jobStat.getStatus().equals("preparing")) { List<JobCounterGroup> counterGroups = getJobCounters(jobStat.getJobId()); jobStat.setCounterGroups(counterGroups); //additional data from counters setJobInfo(jobStat); } } //publish to all consumers for (JobStatusConsumer consumer : consumers) { consumer.handle(jobStatusList); } showJobCounts(); } catch (Exception ex) { System.out.println("Jobs status checker failed" + ex.getMessage()); } }