List of usage examples for org.apache.hadoop.mapred JobStatus isJobComplete
public synchronized boolean isJobComplete()
From source file:com.ikanow.infinit.e.core.mapreduce.HadoopJobRunner.java
License:Open Source License
/** * Checks any running/queued jobs and updates their status if they've completed *///from www .j a va 2s . co m public void updateJobStatus() { Map<ObjectId, String> incompleteJobsMap = new HashMap<ObjectId, String>(); //get mongo entries that have jobids? try { JobClient jc = null; CustomMapReduceJobPojo cmr = getJobsToMakeComplete(); while (cmr != null) { boolean markedComplete = false; //make sure its an actual ID, we now set jobidS to "" when running the job if (!cmr.jobidS.equals("")) { if (null == jc) { try { jc = new JobClient(getJobClientConnection(), new Configuration()); } catch (Exception e) { // Better delete this, no idea what's going on.... _logger.info( "job_update_status_error_title=" + cmr.jobtitle + " job_update_status_error_id=" + cmr._id.toString() + " job_update_status_error_message=Skipping job: " + cmr.jobidS + cmr.jobidN + ", this node does not run mapreduce"); setJobComplete(cmr, true, true, -1, -1, "Failed to launch job, unknown error (check configuration in /opt/hadoop-infinite/mapreduce/hadoop/, jobtracker may be localhost?)."); cmr = getJobsToMakeComplete(); continue; } } //check if job is done, and update if it is JobStatus[] jobs = jc.getAllJobs(); boolean bFound = false; for (JobStatus j : jobs) { if (j.getJobID().getJtIdentifier().equals(cmr.jobidS) && j.getJobID().getId() == cmr.jobidN) { bFound = true; boolean error = false; markedComplete = j.isJobComplete(); String errorMessage = null; if (JobStatus.FAILED == j.getRunState()) { markedComplete = true; error = true; errorMessage = "Job failed while running, check for errors in the mapper/reducer or that your key/value classes are set up correctly?"; } setJobComplete(cmr, markedComplete, error, j.mapProgress(), j.reduceProgress(), errorMessage); break; // (from mini loop over hadoop jobs, not main loop over infinite tasks) } } if (!bFound) { // Possible error //check if its been longer than 5min and mark job as complete (it failed to launch) Date currDate = new Date(); Date lastDate = cmr.lastRunTime; //if its been more than 5 min (5m*60s*1000ms) if (currDate.getTime() - lastDate.getTime() > 300000) { markedComplete = true; setJobComplete(cmr, true, true, -1, -1, "Failed to launch job, unknown error #2."); } } } else // this job hasn't been started yet: { //check if its been longer than 5min and mark job as complete (it failed to launch) Date currDate = new Date(); Date lastDate = cmr.lastRunTime; //if its been more than 5 min (5m*60s*1000ms) if (currDate.getTime() - lastDate.getTime() > 300000) { markedComplete = true; setJobComplete(cmr, true, true, -1, -1, "Failed to launch job, unknown error #1."); } } //job was not done, need to set flag back if (!markedComplete) { incompleteJobsMap.put(cmr._id, cmr.jobidS); } cmr = getJobsToMakeComplete(); } } catch (Exception ex) { _logger.info("job_error_checking_status_message=" + HarvestExceptionUtils.createExceptionMessage(ex)); } catch (Error err) { // Really really want to get to the next line of code, and clear the status... } //set all incomplete jobs back for (ObjectId id : incompleteJobsMap.keySet()) { BasicDBObject update = new BasicDBObject(CustomMapReduceJobPojo.jobidS_, incompleteJobsMap.get(id)); DbManager.getCustom().getLookup().update(new BasicDBObject(CustomMapReduceJobPojo._id_, id), new BasicDBObject(MongoDbManager.set_, update)); } }
From source file:com.ikanow.infinit.e.processing.custom.CustomProcessingController.java
License:Open Source License
public boolean checkRunningJobs(CustomMapReduceJobPojo jobOverride) { Map<ObjectId, String> incompleteJobsMap = new HashMap<ObjectId, String>(); //get mongo entries that have jobids? try {/*from ww w . ja va 2 s .c om*/ JobClient jc = null; CustomMapReduceJobPojo cmr = jobOverride; if (null == cmr) cmr = CustomScheduleManager.getJobsToMakeComplete(_bHadoopEnabled, incompleteJobsMap); else if (null == cmr.jobidS) return true; while (cmr != null) { boolean markedComplete = false; //make sure its an actual ID, we now set jobidS to "" when running the job if (!cmr.jobidS.equals("")) // non null by construction { if (null == jc) { try { jc = new JobClient(InfiniteHadoopUtils.getJobClientConnection(prop_custom), new Configuration()); } catch (Exception e) { // Better delete this, no idea what's going on.... _logger.info( "job_update_status_error_title=" + cmr.jobtitle + " job_update_status_error_id=" + cmr._id.toString() + " job_update_status_error_message=Skipping job: " + cmr.jobidS + cmr.jobidN + ", this node does not run mapreduce"); _statusManager.setJobComplete(cmr, true, true, -1, -1, "Failed to launch job, unknown error (check configuration in /opt/hadoop-infinite/mapreduce/hadoop/, jobtracker may be localhost?)."); incompleteJobsMap.remove(cmr._id); cmr = CustomScheduleManager.getJobsToMakeComplete(_bHadoopEnabled, incompleteJobsMap); continue; } } //check if job is done, and update if it is JobStatus[] jobs = jc.getAllJobs(); boolean bFound = false; for (JobStatus j : jobs) { if (j.getJobID().getJtIdentifier().equals(cmr.jobidS) && j.getJobID().getId() == cmr.jobidN) { bFound = true; boolean error = false; markedComplete = j.isJobComplete(); String errorMessage = null; if (JobStatus.FAILED == j.getRunState()) { markedComplete = true; error = true; errorMessage = "Job failed while running, check for errors in the mapper/reducer or that your key/value classes are set up correctly? " + j.getFailureInfo(); } _statusManager.setJobComplete(cmr, markedComplete, error, j.mapProgress(), j.reduceProgress(), errorMessage); break; // (from mini loop over hadoop jobs, not main loop over infinite tasks) } } if (!bFound) { // Possible error //check if its been longer than 5min and mark job as complete (it failed to launch) Date currDate = new Date(); Date lastDate = cmr.lastRunTime; //if its been more than 5 min (5m*60s*1000ms) if (currDate.getTime() - lastDate.getTime() > 300000) { markedComplete = true; _statusManager.setJobComplete(cmr, true, true, -1, -1, "Failed to launch job, unknown error #2."); } } } else // this job hasn't been started yet: { //check if its been longer than 5min and mark job as complete (it failed to launch) Date currDate = new Date(); Date lastDate = cmr.lastRunTime; //if its been more than 5 min (5m*60s*1000ms) if (currDate.getTime() - lastDate.getTime() > 300000) { markedComplete = true; _statusManager.setJobComplete(cmr, true, true, -1, -1, "Failed to launch job, unknown error #1."); } } //job was done, remove flag if (markedComplete) { incompleteJobsMap.remove(cmr._id); } if (null == jobOverride) cmr = CustomScheduleManager.getJobsToMakeComplete(_bHadoopEnabled, incompleteJobsMap); else cmr = null; } } catch (Exception ex) { _logger.info("job_error_checking_status_message=" + InfiniteHadoopUtils.createExceptionMessage(ex)); } catch (Error err) { // Really really want to get to the next line of code, and clear the status... _logger.info("job_error_checking_status_message=" + InfiniteHadoopUtils.createExceptionMessage(err)); } if (null == jobOverride) { //set all incomplete jobs' status back for (ObjectId id : incompleteJobsMap.keySet()) { BasicDBObject update = new BasicDBObject(CustomMapReduceJobPojo.jobidS_, incompleteJobsMap.get(id)); DbManager.getCustom().getLookup().update(new BasicDBObject(CustomMapReduceJobPojo._id_, id), new BasicDBObject(MongoDbManager.set_, update)); } } return incompleteJobsMap.isEmpty(); }
From source file:com.impetus.ankush2.hadoop.monitor.JobStatusProvider.java
License:Open Source License
/** * @param jobClient/*w ww. ja v a 2 s .co m*/ * @param jobSts * @return * @throws IOException */ private Map<String, Object> getJobReport(JobStatus jobSts) throws IOException { // Creating an empty map for storing job information Map<String, Object> jobReport = new HashMap<String, Object>(); // Returns the jobid of the Job org.apache.hadoop.mapred.JobID jobId = jobSts.getJobID(); // Get an RunningJob object to track an ongoing Map-Reduce // job. RunningJob job = jobClient.getJob(jobId); String jobName = ""; if (job != null) { // Get the name of the job. jobName = job.getJobName(); } // Percentage of progress in maps float mapProgress = jobSts.mapProgress() * 100; // Percentage of progress in reduce float reduceProgress = jobSts.reduceProgress() * 100; int mapTotal = 0; int reduceTotal = 0; int mapComp = 0; int reduceComp = 0; // Count for Map and Reduce Complete try { // Get the information of the current state of the map // tasks of a job TaskReport[] mapTaskReports = jobClient.getMapTaskReports(jobId); // Get the total map mapTotal = mapTaskReports.length; // Iterating over the map tasks for (TaskReport taskReport : mapTaskReports) { // The current state of a map TaskInProgress as seen // by the JobTracker. TIPStatus currentStatus = taskReport.getCurrentStatus(); if (currentStatus == TIPStatus.COMPLETE) { mapComp++; } } // Get the information of the current state of the // reduce tasks of a job. TaskReport[] reduceTaskReport = jobClient.getReduceTaskReports(jobId); // Get the total reduce reduceTotal = reduceTaskReport.length; // Iterating over the reduce tasks for (TaskReport taskReport : reduceTaskReport) { // The current state of a reduce TaskInProgress as // seen by the JobTracker. TIPStatus currentStatus = taskReport.getCurrentStatus(); if (currentStatus == TIPStatus.COMPLETE) { reduceComp++; } } } catch (Exception e) { LOG.error(e.getMessage(), e); } // Percentage of progress in setup float setupProgress = jobSts.setupProgress() * 100; // The progress made on cleanup float cleanupProgress = jobSts.cleanupProgress() * 100; // gets any available info on the reason of failure of the // job..Returns the diagnostic information on why a job // might have failed. String failureInfo = jobSts.getFailureInfo(); // Putting Job Sttaus information in map jobReport.put("jobId", jobId.toString()); jobReport.put("jobName", jobName); jobReport.put("jobPriority", jobSts.getJobPriority().toString()); jobReport.put("jobStartTime", jobSts.getStartTime()); jobReport.put("userName", jobSts.getUsername()); jobReport.put("jobComplete", jobSts.isJobComplete()); jobReport.put("mapProgress", mapProgress); jobReport.put("reduceProgress", reduceProgress); jobReport.put("mapTotal", mapTotal); jobReport.put("reduceTotal", reduceTotal); jobReport.put("mapCompleted", mapComp); jobReport.put("reduceCompleted", reduceComp); jobReport.put("setupProgress", setupProgress); jobReport.put("cleanupProgress", cleanupProgress); jobReport.put("schedulingInfo", jobSts.getSchedulingInfo()); jobReport.put("jobState", JobStatus.getJobRunState(jobSts.getRunState())); jobReport.put("failureInfo", failureInfo); jobReport.put("jobFile", job.getJobFile()); jobReport.put("trackingURL", job.getTrackingURL()); jobReport.putAll(getDetailedJobReport(jobId)); return jobReport; }
From source file:eu.scape_project.tb.hadoopjobtracker.HadoobJobTrackerClient.java
License:Apache License
public HDJobStatus[] UUIDJobs(String UUID) { JobStatus[] allHadoopJobs = null;/*from www . ja v a 2s . co m*/ try { allHadoopJobs = myJobClient.getAllJobs(); } catch (IOException ex) { logger.error("Error retrieving ALL jobs from JobTracker. ERR: " + ex.getMessage()); } int numberOfJobs = allHadoopJobs.length; logger.info("Number of ALL jobs on the cluster: " + numberOfJobs); logger.info("Searching for jobs containing the UUID: '" + UUID + "'"); ArrayList<HDJobStatus> allUUIDJobs = new ArrayList<HDJobStatus>(); if (numberOfJobs > 0) { for (JobStatus singleJobStatus : allHadoopJobs) { JobID singleJobID = singleJobStatus.getJobID(); String singleJobName = "N/A"; try { singleJobName = getJobName(myJobClient, singleJobID); } catch (IOException ex) { logger.error("Error retrieving jobName for job " + singleJobID + ". ERR: " + ex.getMessage()); } if ((singleJobName.toLowerCase().indexOf(UUID.toLowerCase()) >= 0) || (UUID.equals(""))) { HDJobStatus newJobStatus = new HDJobStatus(); newJobStatus.setJobID(singleJobStatus.getJobID().toString()); newJobStatus.setJobIsComplete(singleJobStatus.isJobComplete()); newJobStatus.setJobUserName(singleJobStatus.getUsername()); newJobStatus.setJobFailureInfo(singleJobStatus.getFailureInfo()); newJobStatus.setJobName(singleJobName); newJobStatus.setJobMapProgress(Math.round(singleJobStatus.mapProgress() * 100)); newJobStatus.setJobReduceProgress(Math.round(singleJobStatus.reduceProgress() * 100)); allUUIDJobs.add(newJobStatus); } } } return allUUIDJobs.toArray(new HDJobStatus[allUUIDJobs.size()]); }
From source file:org.apache.accumulo.server.master.CoordinateRecoveryTask.java
License:Apache License
void cleanupOldJobs() { try {/* www .j a v a2 s . c om*/ Configuration conf = CachedConfiguration.getInstance(); @SuppressWarnings("deprecation") JobClient jc = new JobClient(new org.apache.hadoop.mapred.JobConf(conf)); for (JobStatus status : jc.getAllJobs()) { if (!status.isJobComplete()) { RunningJob job = jc.getJob(status.getJobID()); if (job.getJobName().equals(LogSort.getJobName())) { log.info("found a running " + job.getJobName()); Configuration jobConfig = new Configuration(false); log.info("fetching configuration from " + job.getJobFile()); jobConfig.addResource(TraceFileSystem .wrap(FileUtil.getFileSystem(conf, ServerConfiguration.getSiteConfiguration())) .open(new Path(job.getJobFile()))); if (HdfsZooInstance.getInstance().getInstanceID() .equals(jobConfig.get(LogSort.INSTANCE_ID_PROPERTY))) { log.info("Killing job " + job.getID().toString()); } } } } FileStatus[] children = fs.listStatus(new Path(ServerConstants.getRecoveryDir())); if (children != null) { for (FileStatus child : children) { log.info("Deleting recovery directory " + child); fs.delete(child.getPath(), true); } } } catch (IOException e) { log.error("Error cleaning up old Log Sort jobs" + e); } catch (Exception e) { log.error("Unknown error cleaning up old jobs", e); } }
From source file:org.apache.hive.hcatalog.templeton.tool.LaunchMapper.java
License:Apache License
/** * Attempts to reconnect to an already running child job of the templeton launcher. This * is used in cases where the templeton launcher task has failed and is retried by the * MR framework. If reconnect to the child job is possible, the method will continue * tracking its progress until completion. * @return Returns true if reconnect was successful, false if not supported or * no child jobs were found./*from w w w . j a v a 2s. c o m*/ */ private boolean tryReconnectToRunningJob(Configuration conf, Context context, LauncherDelegator.JobType jobType, String statusdir) throws IOException, InterruptedException { if (!reconnectToRunningJobEnabledAndSupported(conf, jobType)) { return false; } long startTime = getTempletonLaunchTime(conf); UserGroupInformation ugi = UserGroupInformation.getCurrentUser(); WebHCatJTShim tracker = ShimLoader.getHadoopShims().getWebHCatShim(conf, ugi); try { Set<String> childJobs = tracker.getJobs(context.getJobID().toString(), startTime); if (childJobs.size() == 0) { LOG.info("No child jobs found to reconnect with"); return false; } if (childJobs.size() > 1) { LOG.warn(String.format("Found more than one child job to reconnect with: %s, skipping reconnect", Arrays.toString(childJobs.toArray()))); return false; } String childJobIdString = childJobs.iterator().next(); org.apache.hadoop.mapred.JobID childJobId = org.apache.hadoop.mapred.JobID.forName(childJobIdString); LOG.info(String.format("Reconnecting to an existing job %s", childJobIdString)); // Update job state with the childJob id updateJobStatePercentAndChildId(conf, context.getJobID().toString(), null, childJobIdString); do { org.apache.hadoop.mapred.JobStatus jobStatus = tracker.getJobStatus(childJobId); if (jobStatus.isJobComplete()) { LOG.info(String.format("Child job %s completed", childJobIdString)); int exitCode = 0; if (jobStatus.getRunState() != org.apache.hadoop.mapred.JobStatus.SUCCEEDED) { exitCode = 1; } updateJobStateToDoneAndWriteExitValue(conf, statusdir, context.getJobID().toString(), exitCode); break; } String percent = String.format("map %s%%, reduce %s%%", jobStatus.mapProgress() * 100, jobStatus.reduceProgress() * 100); updateJobStatePercentAndChildId(conf, context.getJobID().toString(), percent, null); LOG.info("KeepAlive Heart beat"); context.progress(); Thread.sleep(POLL_JOBPROGRESS_MSEC); } while (true); // Reconnect was successful return true; } catch (IOException ex) { LOG.error("Exception encountered in tryReconnectToRunningJob", ex); throw ex; } finally { tracker.close(); } }