List of usage examples for org.apache.hadoop.mapreduce.JobID.getId()
public int getId()
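Before the full examples below, here is a minimal, self-contained sketch of the method itself (the jobtracker identifier and sequence number are made up for illustration): getId() returns the integer sequence part of the job ID, while getJtIdentifier() returns the identifier string.

import org.apache.hadoop.mapreduce.JobID;

public class JobIdGetIdExample {
    public static void main(String[] args) {
        // A JobID is composed of a jobtracker/cluster identifier and an integer sequence number.
        JobID jobId = new JobID("202401011200", 7);

        System.out.println(jobId.getJtIdentifier()); // 202401011200
        System.out.println(jobId.getId());           // 7
        System.out.println(jobId);                   // job_202401011200_0007
    }
}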
From source file:com.marklogic.contentpump.LocalJobRunner.java
License:Apache License
/**
 * Run the job. Get the input splits, create map tasks and submit them to
 * the thread pool if there is one; otherwise, run the tasks one by one.
 *
 * @param <INKEY>
 * @param <INVALUE>
 * @param <OUTKEY>
 * @param <OUTVALUE>
 * @throws Exception
 */
@SuppressWarnings("unchecked")
public <INKEY, INVALUE, OUTKEY, OUTVALUE, T extends org.apache.hadoop.mapreduce.InputSplit> void run()
        throws Exception {
    Configuration conf = job.getConfiguration();
    InputFormat<INKEY, INVALUE> inputFormat = (InputFormat<INKEY, INVALUE>) ReflectionUtils
            .newInstance(job.getInputFormatClass(), conf);
    List<InputSplit> splits = inputFormat.getSplits(job);
    T[] array = (T[]) splits.toArray(new org.apache.hadoop.mapreduce.InputSplit[splits.size()]);

    // sort the splits into order based on size, so that the biggest goes first
    Arrays.sort(array, new SplitLengthComparator());
    OutputFormat<OUTKEY, OUTVALUE> outputFormat = (OutputFormat<OUTKEY, OUTVALUE>) ReflectionUtils
            .newInstance(job.getOutputFormatClass(), conf);
    Class<? extends Mapper<?, ?, ?, ?>> mapperClass = job.getMapperClass();
    Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE> mapper = (Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE>) ReflectionUtils
            .newInstance(mapperClass, conf);
    try {
        outputFormat.checkOutputSpecs(job);
    } catch (Exception ex) {
        if (LOG.isDebugEnabled()) {
            LOG.debug("Error checking output specification: ", ex);
        } else {
            LOG.error("Error checking output specification: ");
            LOG.error(ex.getMessage());
        }
        return;
    }
    conf = job.getConfiguration();
    progress = new AtomicInteger[splits.size()];
    for (int i = 0; i < splits.size(); i++) {
        progress[i] = new AtomicInteger();
    }
    Monitor monitor = new Monitor();
    monitor.start();
    reporter = new ContentPumpReporter();
    List<Future<Object>> taskList = new ArrayList<Future<Object>>();
    for (int i = 0; i < array.length; i++) {
        InputSplit split = array[i];
        if (pool != null) {
            LocalMapTask<INKEY, INVALUE, OUTKEY, OUTVALUE> task = new LocalMapTask<INKEY, INVALUE, OUTKEY, OUTVALUE>(
                    inputFormat, outputFormat, conf, i, split, reporter, progress[i]);
            availableThreads = assignThreads(i, array.length);
            Class<? extends Mapper<?, ?, ?, ?>> runtimeMapperClass = job.getMapperClass();
            if (availableThreads > 1 && availableThreads != threadsPerSplit) {
                // possible runtime adjustment
                if (runtimeMapperClass != (Class) MultithreadedMapper.class) {
                    runtimeMapperClass = (Class<? extends Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE>>) cmd
                            .getRuntimeMapperClass(job, mapperClass, threadsPerSplit, availableThreads);
                }
                if (runtimeMapperClass != mapperClass) {
                    task.setMapperClass(runtimeMapperClass);
                }
                if (runtimeMapperClass == (Class) MultithreadedMapper.class) {
                    task.setThreadCount(availableThreads);
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Thread Count for Split#" + i + " : " + availableThreads);
                    }
                }
            }
            if (runtimeMapperClass == (Class) MultithreadedMapper.class) {
                synchronized (pool) {
                    taskList.add(pool.submit(task));
                    pool.wait();
                }
            } else {
                pool.submit(task);
            }
        } else { // single-threaded
            JobID jid = new JobID();
            TaskID taskId = new TaskID(jid.getJtIdentifier(), jid.getId(), TaskType.MAP, i);
            TaskAttemptID taskAttemptId = new TaskAttemptID(taskId, 0);
            TaskAttemptContext context = ReflectionUtil.createTaskAttemptContext(conf, taskAttemptId);
            RecordReader<INKEY, INVALUE> reader = inputFormat.createRecordReader(split, context);
            RecordWriter<OUTKEY, OUTVALUE> writer = outputFormat.getRecordWriter(context);
            OutputCommitter committer = outputFormat.getOutputCommitter(context);
            TrackingRecordReader trackingReader = new TrackingRecordReader(reader, progress[i]);
            Mapper.Context mapperContext = ReflectionUtil.createMapperContext(mapper, conf, taskAttemptId,
                    trackingReader, writer, committer, reporter, split);
            trackingReader.initialize(split, mapperContext);

            // no thread pool (only 1 thread specified)
            Class<? extends Mapper<?, ?, ?, ?>> mapClass = job.getMapperClass();
            mapperContext.getConfiguration().setClass(CONF_MAPREDUCE_JOB_MAP_CLASS, mapClass, Mapper.class);
            mapper = (Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE>) ReflectionUtils.newInstance(mapClass,
                    mapperContext.getConfiguration());
            mapper.run(mapperContext);
            trackingReader.close();
            writer.close(mapperContext);
            committer.commitTask(context);
        }
    }

    // wait till all tasks are done
    if (pool != null) {
        for (Future<Object> f : taskList) {
            f.get();
        }
        pool.shutdown();
        while (!pool.awaitTermination(1, TimeUnit.DAYS));
        jobComplete.set(true);
    }
    monitor.interrupt();
    monitor.join(1000);

    // report counters
    Iterator<CounterGroup> groupIt = reporter.counters.iterator();
    while (groupIt.hasNext()) {
        CounterGroup group = groupIt.next();
        LOG.info(group.getDisplayName() + ": ");
        Iterator<Counter> counterIt = group.iterator();
        while (counterIt.hasNext()) {
            Counter counter = counterIt.next();
            LOG.info(counter.getDisplayName() + ": " + counter.getValue());
        }
    }
    LOG.info("Total execution time: " + (System.currentTimeMillis() - startTime) / 1000 + " sec");
}
From source file:datafu.hourglass.jobs.StagedOutputJob.java
License:Apache License
/**
 * Writes Hadoop counters and other task statistics to a file in the file system.
 *
 * @param fs
 * @throws IOException
 */
private void writeCounters(final FileSystem fs) throws IOException {
    final Path actualOutputPath = FileOutputFormat.getOutputPath(this);

    SimpleDateFormat timestampFormat = new SimpleDateFormat("yyyyMMddHHmmss");
    String suffix = timestampFormat.format(new Date());

    if (_countersParentPath != null) {
        if (!fs.exists(_countersParentPath)) {
            _log.info("Creating counter parent path " + _countersParentPath);
            fs.mkdirs(_countersParentPath, FsPermission.valueOf("-rwxrwxr-x"));
        }
        // make the name as unique as possible in this case because this may be a directory
        // where other counter files will be dropped
        _countersPath = new Path(_countersParentPath, ".counters." + suffix);
    } else {
        _countersPath = new Path(actualOutputPath, ".counters." + suffix);
    }

    _log.info(String.format("Writing counters to %s", _countersPath));
    FSDataOutputStream counterStream = fs.create(_countersPath);
    BufferedOutputStream buffer = new BufferedOutputStream(counterStream, 256 * 1024);
    OutputStreamWriter writer = new OutputStreamWriter(buffer);

    for (String groupName : getCounters().getGroupNames()) {
        for (Counter counter : getCounters().getGroup(groupName)) {
            writeAndLog(writer, String.format("%s=%d", counter.getName(), counter.getValue()));
        }
    }

    JobID jobID = this.getJobID();

    org.apache.hadoop.mapred.JobID oldJobId = new org.apache.hadoop.mapred.JobID(jobID.getJtIdentifier(),
            jobID.getId());

    long minStart = Long.MAX_VALUE;
    long maxFinish = 0;
    long setupStart = Long.MAX_VALUE;
    long cleanupFinish = 0;
    DescriptiveStatistics mapStats = new DescriptiveStatistics();
    DescriptiveStatistics reduceStats = new DescriptiveStatistics();
    boolean success = true;

    JobClient jobClient = new JobClient(this.conf);

    Map<String, String> taskIdToType = new HashMap<String, String>();

    TaskReport[] setupReports = jobClient.getSetupTaskReports(oldJobId);
    if (setupReports.length > 0) {
        _log.info("Processing setup reports");
        for (TaskReport report : jobClient.getSetupTaskReports(oldJobId)) {
            taskIdToType.put(report.getTaskID().toString(), "SETUP");
            if (report.getStartTime() == 0) {
                _log.warn("Skipping report with zero start time");
                continue;
            }
            setupStart = Math.min(setupStart, report.getStartTime());
        }
    } else {
        _log.error("No setup reports");
    }

    TaskReport[] mapReports = jobClient.getMapTaskReports(oldJobId);
    if (mapReports.length > 0) {
        _log.info("Processing map reports");
        for (TaskReport report : mapReports) {
            taskIdToType.put(report.getTaskID().toString(), "MAP");
            if (report.getFinishTime() == 0 || report.getStartTime() == 0) {
                _log.warn("Skipping report with zero start or finish time");
                continue;
            }
            minStart = Math.min(minStart, report.getStartTime());
            mapStats.addValue(report.getFinishTime() - report.getStartTime());
        }
    } else {
        _log.error("No map reports");
    }

    TaskReport[] reduceReports = jobClient.getReduceTaskReports(oldJobId);
    if (reduceReports.length > 0) {
        _log.info("Processing reduce reports");
        for (TaskReport report : reduceReports) {
            taskIdToType.put(report.getTaskID().toString(), "REDUCE");
            if (report.getFinishTime() == 0 || report.getStartTime() == 0) {
                _log.warn("Skipping report with zero start or finish time");
                continue;
            }
            maxFinish = Math.max(maxFinish, report.getFinishTime());
            reduceStats.addValue(report.getFinishTime() - report.getStartTime());
        }
    } else {
        _log.error("No reduce reports");
    }

    TaskReport[] cleanupReports = jobClient.getCleanupTaskReports(oldJobId);
    if (cleanupReports.length > 0) {
        _log.info("Processing cleanup reports");
        for (TaskReport report : cleanupReports) {
            taskIdToType.put(report.getTaskID().toString(), "CLEANUP");
            if (report.getFinishTime() == 0) {
                _log.warn("Skipping report with finish time of zero");
                continue;
            }
            cleanupFinish = Math.max(cleanupFinish, report.getFinishTime());
        }
    } else {
        _log.error("No cleanup reports");
    }

    if (minStart == Long.MAX_VALUE) {
        _log.error("Could not determine map-reduce start time");
        success = false;
    }
    if (maxFinish == 0) {
        _log.error("Could not determine map-reduce finish time");
        success = false;
    }
    if (setupStart == Long.MAX_VALUE) {
        _log.error("Could not determine setup start time");
        success = false;
    }
    if (cleanupFinish == 0) {
        _log.error("Could not determine cleanup finish time");
        success = false;
    }

    // Collect statistics on successful/failed/killed task attempts, categorized by setup/map/reduce/cleanup.
    // Unfortunately the job client doesn't have an easier way to get these statistics.
    Map<String, Integer> attemptStats = new HashMap<String, Integer>();
    _log.info("Processing task attempts");
    for (TaskCompletionEvent event : getTaskCompletionEvents(jobClient, oldJobId)) {
        String type = taskIdToType.get(event.getTaskAttemptId().getTaskID().toString());
        String status = event.getTaskStatus().toString();

        String key = String.format("%s_%s_ATTEMPTS", status, type);
        if (!attemptStats.containsKey(key)) {
            attemptStats.put(key, 0);
        }
        attemptStats.put(key, attemptStats.get(key) + 1);
    }

    if (success) {
        writeAndLog(writer, String.format("SETUP_START_TIME_MS=%d", setupStart));
        writeAndLog(writer, String.format("CLEANUP_FINISH_TIME_MS=%d", cleanupFinish));
        writeAndLog(writer, String.format("COMPLETE_WALL_CLOCK_TIME_MS=%d", cleanupFinish - setupStart));
        writeAndLog(writer, String.format("MAP_REDUCE_START_TIME_MS=%d", minStart));
        writeAndLog(writer, String.format("MAP_REDUCE_FINISH_TIME_MS=%d", maxFinish));
        writeAndLog(writer, String.format("MAP_REDUCE_WALL_CLOCK_TIME_MS=%d", maxFinish - minStart));
        writeAndLog(writer, String.format("MAP_TOTAL_TASKS=%d", (long) mapStats.getN()));
        writeAndLog(writer, String.format("MAP_MAX_TIME_MS=%d", (long) mapStats.getMax()));
        writeAndLog(writer, String.format("MAP_MIN_TIME_MS=%d", (long) mapStats.getMin()));
        writeAndLog(writer, String.format("MAP_AVG_TIME_MS=%d", (long) mapStats.getMean()));
        writeAndLog(writer, String.format("MAP_STD_TIME_MS=%d", (long) mapStats.getStandardDeviation()));
        writeAndLog(writer, String.format("MAP_SUM_TIME_MS=%d", (long) mapStats.getSum()));
        writeAndLog(writer, String.format("REDUCE_TOTAL_TASKS=%d", (long) reduceStats.getN()));
        writeAndLog(writer, String.format("REDUCE_MAX_TIME_MS=%d", (long) reduceStats.getMax()));
        writeAndLog(writer, String.format("REDUCE_MIN_TIME_MS=%d", (long) reduceStats.getMin()));
        writeAndLog(writer, String.format("REDUCE_AVG_TIME_MS=%d", (long) reduceStats.getMean()));
        writeAndLog(writer, String.format("REDUCE_STD_TIME_MS=%d", (long) reduceStats.getStandardDeviation()));
        writeAndLog(writer, String.format("REDUCE_SUM_TIME_MS=%d", (long) reduceStats.getSum()));
        writeAndLog(writer, String.format("MAP_REDUCE_SUM_TIME_MS=%d",
                (long) mapStats.getSum() + (long) reduceStats.getSum()));
        for (Map.Entry<String, Integer> attemptStat : attemptStats.entrySet()) {
            writeAndLog(writer, String.format("%s=%d", attemptStat.getKey(), attemptStat.getValue()));
        }
    }

    writer.close();
    buffer.close();
    counterStream.close();
}
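The example above bridges the new-API JobID to the old-API org.apache.hadoop.mapred.JobID by copying getJtIdentifier() and getId() into the old constructor. As a side note (not part of the original source), the same conversion can also be written with the static downgrade helper on the old-API class; a minimal sketch with an illustrative identifier:

import org.apache.hadoop.mapreduce.JobID;

public class JobIdDowngradeSketch {
    public static void main(String[] args) {
        JobID newApiId = new JobID("202401011200", 42);

        // Equivalent to: new org.apache.hadoop.mapred.JobID(newApiId.getJtIdentifier(), newApiId.getId())
        org.apache.hadoop.mapred.JobID oldApiId = org.apache.hadoop.mapred.JobID.downgrade(newApiId);

        System.out.println(oldApiId.getId()); // 42
    }
}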
From source file:hydrograph.engine.cascading.flow.CustomFlowStepListener.java
License:Apache License
@Override
public boolean onStepThrowable(FlowStep flowstep, Throwable arg1) {
    for (Flow flow : flows) {
        List<FlowStepStats> flows = flow.getFlowStats().getFlowStepStats();
        for (FlowStepStats flowStat : flows) {
            HadoopStepStats stats = (HadoopStepStats) flowStat;
            try {
                RunningJob runningJob = stats.getJobStatusClient();
                if (runningJob != null) {
                    JobID jobID = runningJob.getID();
                    LOG.error("Killing Job " + jobID.getId());
                    runningJob.killJob();
                    LOG.info("Job: '" + jobID.getId() + "' started at: " + stats.getStartTime()
                            + " killed successfully!");
                }
            } catch (Exception e) {
                LOG.error("", e);
                throw new RuntimeException(e);
            }
        }
    }
    return true;
}
From source file:org.apache.ignite.client.hadoop.GridHadoopClientProtocol.java
License:Apache License
/** {@inheritDoc} */
@Override
public JobStatus submitJob(JobID jobId, String jobSubmitDir, Credentials ts)
        throws IOException, InterruptedException {
    try {
        conf.setLong(JOB_SUBMISSION_START_TS_PROPERTY, U.currentTimeMillis());

        GridHadoopJobStatus status = cli.compute().execute(GridHadoopProtocolSubmitJobTask.class.getName(),
                new GridHadoopProtocolTaskArguments(jobId.getJtIdentifier(), jobId.getId(), createJobInfo(conf)));

        assert status != null;

        return processStatus(status);
    } catch (GridClientException | IgniteCheckedException e) {
        throw new IOException("Failed to submit job.", e);
    }
}
From source file:org.apache.ignite.client.hadoop.GridHadoopClientProtocol.java
License:Apache License
/** {@inheritDoc} */
@Override
public void killJob(JobID jobId) throws IOException, InterruptedException {
    try {
        cli.compute().execute(GridHadoopProtocolKillJobTask.class.getName(),
                new GridHadoopProtocolTaskArguments(jobId.getJtIdentifier(), jobId.getId()));
    } catch (GridClientException e) {
        throw new IOException("Failed to kill job: " + jobId, e);
    }
}
From source file:org.apache.ignite.client.hadoop.GridHadoopClientProtocol.java
License:Apache License
/** {@inheritDoc} */
@Override
public JobStatus getJobStatus(JobID jobId) throws IOException, InterruptedException {
    try {
        Long delay = conf.getLong(GridHadoopJobProperty.JOB_STATUS_POLL_DELAY.propertyName(), -1);

        GridHadoopProtocolTaskArguments args = delay >= 0
                ? new GridHadoopProtocolTaskArguments(jobId.getJtIdentifier(), jobId.getId(), delay)
                : new GridHadoopProtocolTaskArguments(jobId.getJtIdentifier(), jobId.getId());

        GridHadoopJobStatus status = cli.compute().execute(GridHadoopProtocolJobStatusTask.class.getName(), args);

        if (status == null)
            throw new IOException("Job tracker doesn't have any information about the job: " + jobId);

        return processStatus(status);
    } catch (GridClientException e) {
        throw new IOException("Failed to get job status: " + jobId, e);
    }
}
From source file:org.apache.ignite.client.hadoop.GridHadoopClientProtocol.java
License:Apache License
/** {@inheritDoc} */
@Override
public Counters getJobCounters(JobID jobId) throws IOException, InterruptedException {
    try {
        final GridHadoopCounters counters = cli.compute().execute(
                GridHadoopProtocolJobCountersTask.class.getName(),
                new GridHadoopProtocolTaskArguments(jobId.getJtIdentifier(), jobId.getId()));

        if (counters == null)
            throw new IOException("Job tracker doesn't have any information about the job: " + jobId);

        return new GridHadoopClientCounters(counters);
    } catch (GridClientException e) {
        throw new IOException("Failed to get job counters: " + jobId, e);
    }
}
From source file:org.apache.ignite.internal.processors.hadoop.impl.proto.HadoopClientProtocol.java
License:Apache License
/** {@inheritDoc} */
@Override
public JobStatus submitJob(JobID jobId, String jobSubmitDir, Credentials ts)
        throws IOException, InterruptedException {
    try {
        conf.setLong(HadoopCommonUtils.JOB_SUBMISSION_START_TS_PROPERTY, U.currentTimeMillis());

        HadoopJobStatus status = execute(HadoopProtocolSubmitJobTask.class, jobId.getJtIdentifier(),
                jobId.getId(), createJobInfo(conf));

        if (status == null)
            throw new IOException("Failed to submit job (null status obtained): " + jobId);

        return processStatus(status);
    } catch (GridClientException | IgniteCheckedException e) {
        throw new IOException("Failed to submit job.", e);
    }
}
From source file:org.apache.ignite.internal.processors.hadoop.impl.proto.HadoopClientProtocol.java
License:Apache License
/** {@inheritDoc} */
@Override
public void killJob(JobID jobId) throws IOException, InterruptedException {
    try {
        execute(HadoopProtocolKillJobTask.class, jobId.getJtIdentifier(), jobId.getId());
    } catch (GridClientException e) {
        throw new IOException("Failed to kill job: " + jobId, e);
    }
}
From source file:org.apache.ignite.internal.processors.hadoop.impl.proto.HadoopClientProtocol.java
License:Apache License
/** {@inheritDoc} */
@Override
public JobStatus getJobStatus(JobID jobId) throws IOException, InterruptedException {
    try {
        Long delay = conf.getLong(HadoopJobProperty.JOB_STATUS_POLL_DELAY.propertyName(), -1);

        HadoopJobStatus status;

        if (delay >= 0)
            status = execute(HadoopProtocolJobStatusTask.class, jobId.getJtIdentifier(), jobId.getId(), delay);
        else
            status = execute(HadoopProtocolJobStatusTask.class, jobId.getJtIdentifier(), jobId.getId());

        if (status == null)
            throw new IOException("Job tracker doesn't have any information about the job: " + jobId);

        return processStatus(status);
    } catch (GridClientException e) {
        throw new IOException("Failed to get job status: " + jobId, e);
    }
}