List of usage examples for the org.apache.hadoop.mapreduce.TaskAttemptID constructor
public TaskAttemptID(TaskID taskId, int id)
From source file:com.splout.db.hadoop.TupleSampler.java
License:Apache License
/** * Random sampling method a-la-TeraSort, getting some consecutive samples from each InputSplit * without using a Job.//from www.ja v a2 s . c om * The output is SequenceFile with keys. * * @return The number of retrieved samples */ private long randomSampling(long sampleSize, Configuration hadoopConf, Path outFile, List<InputSplit> splits, Map<InputSplit, TableSpec> splitToTableSpec, Map<InputSplit, InputFormat<ITuple, NullWritable>> splitToFormat, Map<InputSplit, Map<String, String>> specificHadoopConf, Map<InputSplit, RecordProcessor> recordProcessorPerSplit, Map<InputSplit, JavascriptEngine> splitToJsEngine, int maxSplitsToVisit) throws IOException { // Instantiate the writer we will write samples to FileSystem fs = FileSystem.get(outFile.toUri(), hadoopConf); if (splits.size() == 0) { throw new IllegalArgumentException("There are no splits to sample from!"); } @SuppressWarnings("deprecation") SequenceFile.Writer writer = new SequenceFile.Writer(fs, hadoopConf, outFile, Text.class, NullWritable.class); logger.info("Sequential sampling options, max splits to visit: " + maxSplitsToVisit + ", samples to take: " + sampleSize + ", total number of splits: " + splits.size()); int blocks = Math.min(maxSplitsToVisit, splits.size()); blocks = Math.min((int) sampleSize, blocks); long recordsPerSample = sampleSize / blocks; int sampleStep = splits.size() / blocks; long records = 0; CounterInterface counterInterface = new CounterInterface(null) { public Counter getCounter(String group, String name) { return Mockito.mock(Counter.class); } ; }; // Take N samples from different parts of the input for (int i = 0; i < blocks; ++i) { TaskAttemptID attemptId = new TaskAttemptID(new TaskID(), 1); TaskAttemptContext attemptContext = null; try { attemptContext = TaskAttemptContextFactory.get(hadoopConf, attemptId); } catch (Exception e) { throw new RuntimeException(e); } InputSplit split = splits.get(sampleStep * i); if (specificHadoopConf.get(split) != null) { for 
(Map.Entry<String, String> specificConf : specificHadoopConf.get(split).entrySet()) { attemptContext.getConfiguration().set(specificConf.getKey(), specificConf.getValue()); } } logger.info("Sampling split: " + split); RecordReader<ITuple, NullWritable> reader = null; try { reader = splitToFormat.get(split).createRecordReader(split, attemptContext); reader.initialize(split, attemptContext); RecordProcessor processor = recordProcessorPerSplit.get(split); Text key = new Text(); while (reader.nextKeyValue()) { // ITuple tuple = reader.getCurrentKey(); ITuple uTuple; try { uTuple = processor.process(tuple, tuple.getSchema().getName(), counterInterface); } catch (Throwable e) { throw new RuntimeException(e); } if (uTuple != null) { // user may have filtered the record try { key.set(TablespaceGenerator.getPartitionByKey(uTuple, splitToTableSpec.get(split), splitToJsEngine.get(split))); } catch (Throwable e) { throw new RuntimeException("Error when determining partition key.", e); } writer.append(key, NullWritable.get()); records += 1; if ((i + 1) * recordsPerSample <= records) { break; } } } } catch (InterruptedException e) { throw new RuntimeException(e); } } writer.close(); return records; }
From source file:com.twitter.hraven.hadoopJobMonitor.AppStatusCheckerTest.java
License:Apache License
public AppStatusCheckerTest() throws ConfigurationAccessException, RestException, SAXException, IOException, ParserConfigurationException, YarnException { appId = new MyApplicationId(); appId.setId(oldJobId.getId());// ww w . ja v a 2s .c o m appId.setClusterTimestamp(Long.parseLong(oldJobId.getJtIdentifier())); taskId = new TaskID(oldJobId, TaskType.MAP, 0); taskAttemptId = new TaskAttemptID(taskId, 0); vConf.setFloat(HadoopJobMonitorConfiguration.TASK_PROGRESS_THRESHOLD, 0.2f); vConf.getInt(HadoopJobMonitorConfiguration.MAX_CACHED_TASK_PROGRESSES, 10); vConf.getInt(HadoopJobMonitorConfiguration.MAX_CACHED_APP_CONFS, 10); AppConfCache.init(vConf); ProgressCache.init(vConf); HadoopJobMonitorMetrics.initSingleton(vConf); taskProgressCache = ProgressCache.getTaskProgressCache(); attemptProgressCache = ProgressCache.getAttemptProgressCache(); when(clientCache.getClient(any(JobID.class))).thenReturn(clientService); appReport = mock(ApplicationReport.class); when(appReport.getApplicationId()).thenReturn(appId); appStatusChecker = new AppStatusChecker(vConf, appReport, clientCache, rm, new AppCheckerProgress() { @Override public void finished() { } }); mockStatic(RestClient.class); restClient = mock(RestClient.class); when(RestClient.getInstance()).thenReturn(restClient); }
From source file:io.ssc.trackthetrackers.extraction.hadoop.util.Compaction.java
License:Open Source License
/**
 * Copies the records of every {@code .parquet} file found in an input folder into one
 * Snappy-compressed Parquet output file. An existing output path is deleted first.
 *
 * @param args {@code <input folder> <output file>}; with any other argument count a usage line is
 *             printed and the JVM exits with status -1
 * @throws IOException          if listing, reading or writing fails
 * @throws InterruptedException if reading or writing is interrupted
 */
public static void main(String[] args) throws IOException, InterruptedException {

    if (args.length != 2) {
        System.out.println("Usage: <input folder> <output file>");
        System.exit(-1);
    }

    String inputPath = args[0];
    String outputFile = args[1];

    Configuration conf = new Configuration();

    FileSystem fs = FileSystem.get(conf);

    FileStatus[] input = fs.listStatus(new Path(inputPath), new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.toString().endsWith(".parquet");
        }
    });

    Path output = new Path(outputFile);

    fs.delete(output, true);

    ProtoParquetInputFormat<ParsedPageProtos.ParsedPageOrBuilder> inputFormat =
            new ProtoParquetInputFormat<ParsedPageProtos.ParsedPageOrBuilder>();
    // NOTE(review): the read support class is set on a throw-away JobConf - confirm this is intended.
    inputFormat.setReadSupportClass(new JobConf(conf), ProtoReadSupport.class);

    Job job = new Job(conf);
    ProtoParquetOutputFormat<ParsedPageProtos.ParsedPage> outputFormat =
            new ProtoParquetOutputFormat<ParsedPageProtos.ParsedPage>(ParsedPageProtos.ParsedPage.class);
    ProtoParquetOutputFormat.setProtobufClass(job, ParsedPageProtos.ParsedPage.class);
    ProtoParquetOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
    ProtoParquetOutputFormat.setEnableDictionary(job, true);

    RecordWriter<Void, ParsedPageProtos.ParsedPage> recordWriter =
            outputFormat.getRecordWriter(conf, output, CompressionCodecName.SNAPPY);

    // Collect the splits of all input files up front.
    List<ParquetInputSplit> splits = new ArrayList<ParquetInputSplit>();

    for (FileStatus fileStatus : input) {
        System.out.println(fileStatus.getPath().toString());
        splits.addAll(inputFormat.getSplits(conf, ParquetFileReader.readFooters(conf, fileStatus)));
    }

    int splitIndex = 0;
    for (ParquetInputSplit split : splits) {

        System.out.println("Processing split: " + split.getPath().toString() + "(" + splitIndex + " of "
                + splits.size() + ")");

        TaskAttemptID taskAttemptID =
                new TaskAttemptID(new TaskID("identifier", splitIndex, true, splitIndex), splitIndex);
        TaskAttemptContext ctx = new org.apache.hadoop.mapreduce.TaskAttemptContext(conf, taskAttemptID);

        RecordReader<Void, ParsedPageProtos.ParsedPageOrBuilder> reader =
                inputFormat.createRecordReader(split, ctx);
        try {
            reader.initialize(split, ctx);

            while (reader.nextKeyValue()) {

                ParsedPageProtos.ParsedPageOrBuilder record = reader.getCurrentValue();

                // Rebuild a concrete message from the (possibly builder-typed) record before writing it.
                ParsedPageProtos.ParsedPage.Builder builder = ParsedPageProtos.ParsedPage.newBuilder();

                builder.setUrl(record.getUrl());
                builder.setArchiveTime(record.getArchiveTime());

                builder.addAllScripts(record.getScriptsList());
                builder.addAllIframes(record.getIframesList());
                builder.addAllLinks(record.getLinksList());
                builder.addAllImages(record.getImagesList());

                recordWriter.write(null, builder.build());
            }
        } finally {
            // Release the split's reader even when reading or writing a record fails.
            reader.close();
        }

        splitIndex++;
    }

    // The writer is closed with a fresh attempt context of its own.
    TaskAttemptID taskAttemptID = new TaskAttemptID(new TaskID("identifier", 1, true, 1), 1);
    TaskAttemptContext ctx = new org.apache.hadoop.mapreduce.TaskAttemptContext(conf, taskAttemptID);

    if (recordWriter != null) {
        recordWriter.close(ctx);
    }
}
From source file:it.crs4.pydoop.mapreduce.pipes.TestPipeApplication.java
License:Apache License
/** * test PipesMapRunner test the transfer data from reader * * @throws Exception/*from w w w. ja v a2s . c o m*/ */ @Test public void testRunner() throws Exception { // clean old password files File[] psw = cleanTokenPasswordFile(); try { JobID jobId = new JobID("201408272347", 0); TaskID taskId = new TaskID(jobId, TaskType.MAP, 0); TaskAttemptID taskAttemptid = new TaskAttemptID(taskId, 0); Job job = new Job(new Configuration()); job.setJobID(jobId); Configuration conf = job.getConfiguration(); conf.set(Submitter.IS_JAVA_RR, "true"); conf.set(MRJobConfig.TASK_ATTEMPT_ID, taskAttemptid.toString()); job.setInputFormatClass(DummyInputFormat.class); FileSystem fs = new RawLocalFileSystem(); fs.setConf(conf); DummyInputFormat input_format = new DummyInputFormat(); List<InputSplit> isplits = input_format.getSplits(job); InputSplit isplit = isplits.get(0); TaskAttemptContextImpl tcontext = new TaskAttemptContextImpl(conf, taskAttemptid); RecordReader<FloatWritable, NullWritable> rReader = input_format.createRecordReader(isplit, tcontext); TestMapContext context = new TestMapContext(conf, taskAttemptid, rReader, null, null, null, isplit); // stub for client File fCommand = getFileCommand("it.crs4.pydoop.mapreduce.pipes.PipeApplicationRunnableStub"); conf.set(MRJobConfig.CACHE_LOCALFILES, fCommand.getAbsolutePath()); // token for authorization Token<AMRMTokenIdentifier> token = new Token<AMRMTokenIdentifier>("user".getBytes(), "password".getBytes(), new Text("kind"), new Text("service")); TokenCache.setJobToken(token, job.getCredentials()); conf.setBoolean(MRJobConfig.SKIP_RECORDS, true); PipesMapper<FloatWritable, NullWritable, IntWritable, Text> mapper = new PipesMapper<FloatWritable, NullWritable, IntWritable, Text>( context); initStdOut(conf); mapper.run(context); String stdOut = readStdOut(conf); // test part of translated data. 
As common file for client and test - // clients stdOut // check version assertTrue(stdOut.contains("CURRENT_PROTOCOL_VERSION:0")); // check key and value classes assertTrue(stdOut.contains("Key class:org.apache.hadoop.io.FloatWritable")); assertTrue(stdOut.contains("Value class:org.apache.hadoop.io.NullWritable")); // test have sent all data from reader assertTrue(stdOut.contains("value:0.0")); assertTrue(stdOut.contains("value:9.0")); } finally { if (psw != null) { // remove password files for (File file : psw) { file.deleteOnExit(); } } } }
From source file:it.crs4.pydoop.mapreduce.pipes.TestPipeApplication.java
License:Apache License
/** * test org.apache.hadoop.mapreduce.pipes.Application * test a internal functions: /*w w w. j ava 2s. c o m*/ * MessageType.REGISTER_COUNTER, INCREMENT_COUNTER, STATUS, PROGRESS... * * @throws Throwable */ @Test public void testApplication() throws Throwable { System.err.println("testApplication"); File[] psw = cleanTokenPasswordFile(); try { JobID jobId = new JobID("201408272347", 0); TaskID taskId = new TaskID(jobId, TaskType.MAP, 0); TaskAttemptID taskAttemptid = new TaskAttemptID(taskId, 0); Job job = new Job(new Configuration()); job.setJobID(jobId); Configuration conf = job.getConfiguration(); conf.set(MRJobConfig.TASK_ATTEMPT_ID, taskAttemptid.toString()); FileSystem fs = new RawLocalFileSystem(); fs.setConf(conf); File fCommand = getFileCommand("it.crs4.pydoop.mapreduce.pipes.PipeApplicationStub"); //getFileCommand("it.crs4.pydoop.mapreduce.pipes.PipeApplicationRunnableStub"); conf.set(MRJobConfig.CACHE_LOCALFILES, fCommand.getAbsolutePath()); System.err.println("fCommand" + fCommand.getAbsolutePath()); Token<AMRMTokenIdentifier> token = new Token<AMRMTokenIdentifier>("user".getBytes(), "password".getBytes(), new Text("kind"), new Text("service")); TokenCache.setJobToken(token, job.getCredentials()); conf.setBoolean(MRJobConfig.SKIP_RECORDS, true); TestReporter reporter = new TestReporter(); DummyInputFormat input_format = new DummyInputFormat(); List<InputSplit> isplits = input_format.getSplits(job); InputSplit isplit = isplits.get(0); TaskAttemptContextImpl tcontext = new TaskAttemptContextImpl(conf, taskAttemptid); DummyRecordReader reader = (DummyRecordReader) input_format.createRecordReader(isplit, tcontext); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(Text.class); RecordWriter<IntWritable, Text> writer = new TestRecordWriter( new FileOutputStream(workSpace.getAbsolutePath() + File.separator + "outfile")); MapContextImpl<IntWritable, Text, IntWritable, Text> context = new MapContextImpl<IntWritable, Text, IntWritable, Text>( 
conf, taskAttemptid, null, writer, null, reporter, null); System.err.println("ready to launch application"); Application<IntWritable, Text, IntWritable, Text> application = new Application<IntWritable, Text, IntWritable, Text>( context, reader); System.err.println("done"); application.getDownlink().flush(); application.getDownlink().mapItem(new IntWritable(3), new Text("txt")); application.getDownlink().flush(); application.waitForFinish(); // test getDownlink().mapItem(); String stdOut = readStdOut(conf); assertTrue(stdOut.contains("key:3")); assertTrue(stdOut.contains("value:txt")); assertEquals(0.0, context.getProgress(), 0.01); assertNotNull(context.getCounter("group", "name")); // test status MessageType.STATUS assertEquals(context.getStatus(), "PROGRESS"); // check MessageType.PROGRESS assertEquals(0.55f, reader.getProgress(), 0.001); application.getDownlink().close(); // test MessageType.OUTPUT stdOut = readFile(new File(workSpace.getAbsolutePath() + File.separator + "outfile")); assertTrue(stdOut.contains("key:123")); assertTrue(stdOut.contains("value:value")); try { // try to abort application.abort(new Throwable()); fail(); } catch (IOException e) { // abort works ? assertEquals("pipe child exception", e.getMessage()); } } finally { if (psw != null) { // remove password files for (File file : psw) { file.deleteOnExit(); } } } }
From source file:it.crs4.pydoop.mapreduce.pipes.TestPipeApplication.java
License:Apache License
/** * test org.apache.hadoop.mapreduce.pipes.PipesReducer * test the transfer of data: key and value * * @throws Exception/*from w ww . j a v a 2 s . com*/ */ @Test public void testPipesReducer() throws Exception { System.err.println("testPipesReducer"); File[] psw = cleanTokenPasswordFile(); try { JobID jobId = new JobID("201408272347", 0); TaskID taskId = new TaskID(jobId, TaskType.MAP, 0); TaskAttemptID taskAttemptid = new TaskAttemptID(taskId, 0); Job job = new Job(new Configuration()); job.setJobID(jobId); Configuration conf = job.getConfiguration(); conf.set(MRJobConfig.TASK_ATTEMPT_ID, taskAttemptid.toString()); FileSystem fs = new RawLocalFileSystem(); fs.setConf(conf); File fCommand = getFileCommand("it.crs4.pydoop.mapreduce.pipes.PipeReducerStub"); conf.set(MRJobConfig.CACHE_LOCALFILES, fCommand.getAbsolutePath()); System.err.println("fCommand" + fCommand.getAbsolutePath()); Token<AMRMTokenIdentifier> token = new Token<AMRMTokenIdentifier>("user".getBytes(), "password".getBytes(), new Text("kind"), new Text("service")); TokenCache.setJobToken(token, job.getCredentials()); conf.setBoolean(MRJobConfig.SKIP_RECORDS, true); TestReporter reporter = new TestReporter(); DummyInputFormat input_format = new DummyInputFormat(); List<InputSplit> isplits = input_format.getSplits(job); InputSplit isplit = isplits.get(0); TaskAttemptContextImpl tcontext = new TaskAttemptContextImpl(conf, taskAttemptid); RecordWriter<IntWritable, Text> writer = new TestRecordWriter( new FileOutputStream(workSpace.getAbsolutePath() + File.separator + "outfile")); BooleanWritable bw = new BooleanWritable(true); List<Text> texts = new ArrayList<Text>(); texts.add(new Text("first")); texts.add(new Text("second")); texts.add(new Text("third")); DummyRawKeyValueIterator kvit = new DummyRawKeyValueIterator(); ReduceContextImpl<BooleanWritable, Text, IntWritable, Text> context = new ReduceContextImpl<BooleanWritable, Text, IntWritable, Text>( conf, taskAttemptid, kvit, null, null, writer, null, 
null, null, BooleanWritable.class, Text.class); PipesReducer<BooleanWritable, Text, IntWritable, Text> reducer = new PipesReducer<BooleanWritable, Text, IntWritable, Text>(); reducer.setup(context); initStdOut(conf); reducer.reduce(bw, texts, context); reducer.cleanup(context); String stdOut = readStdOut(conf); // test data: key assertTrue(stdOut.contains("reducer key :true")); // and values assertTrue(stdOut.contains("reduce value :first")); assertTrue(stdOut.contains("reduce value :second")); assertTrue(stdOut.contains("reduce value :third")); } finally { if (psw != null) { // remove password files for (File file : psw) { file.deleteOnExit(); } } } }
From source file:it.crs4.pydoop.mapreduce.pipes.TestPipesNonJavaInputFormat.java
License:Apache License
/** * test PipesNonJavaInputFormat/*from www . jav a 2 s. c o m*/ */ @Test public void testFormat() throws IOException, InterruptedException { JobID jobId = new JobID("201408272347", 0); TaskID taskId = new TaskID(jobId, TaskType.MAP, 0); TaskAttemptID taskAttemptid = new TaskAttemptID(taskId, 0); Job job = new Job(new Configuration()); job.setJobID(jobId); Configuration conf = job.getConfiguration(); TaskAttemptContextImpl tcontext = new TaskAttemptContextImpl(conf, taskAttemptid); PipesNonJavaInputFormat input_format = new PipesNonJavaInputFormat(); DummyRecordReader reader = (DummyRecordReader) input_format.createRecordReader(new FileSplit(), tcontext); assertEquals(0.0f, reader.getProgress(), 0.001); // input and output files File input1 = new File(workSpace + File.separator + "input1"); if (!input1.getParentFile().exists()) { Assert.assertTrue(input1.getParentFile().mkdirs()); } if (!input1.exists()) { Assert.assertTrue(input1.createNewFile()); } File input2 = new File(workSpace + File.separator + "input2"); if (!input2.exists()) { Assert.assertTrue(input2.createNewFile()); } // THIS fill fail without hdfs support. // // set data for splits // conf.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR, // StringUtils.escapeString(input1.getAbsolutePath()) + "," // + StringUtils.escapeString(input2.getAbsolutePath())); // List<InputSplit> splits = input_format.getSplits(job); // assertTrue(splits.size() >= 2); PipesNonJavaInputFormat.PipesDummyRecordReader dummyRecordReader = new PipesNonJavaInputFormat.PipesDummyRecordReader( new FileSplit(), tcontext); // empty dummyRecordReader assertEquals(0.0, dummyRecordReader.getProgress(), 0.001); // test method next assertTrue(dummyRecordReader.next(new FloatWritable(2.0f), NullWritable.get())); assertEquals(2.0, dummyRecordReader.getProgress(), 0.001); dummyRecordReader.close(); }
From source file:org.apache.beam.sdk.io.hadoop.format.HadoopFormats.java
License:Apache License
/**
 * Builds the {@link TaskAttemptContext} of the job setup task: task 0 of type
 * {@link TaskType#JOB_SETUP} in the given job, attempt 0.
 *
 * @param conf hadoop {@link Configuration} the context is created with
 * @param jobID {@link JobID} the setup task belongs to
 * @return new setup {@link TaskAttemptContext}
 */
static TaskAttemptContext createSetupTaskContext(Configuration conf, JobID jobID) {
  final TaskAttemptID setupAttemptId =
      new TaskAttemptID(new TaskID(jobID, TaskType.JOB_SETUP, 0), 0);
  return createTaskAttemptContext(conf, setupAttemptId);
}
From source file:org.apache.beam.sdk.io.hadoop.format.HadoopFormats.java
License:Apache License
/**
 * Builds a {@link TaskAttemptID} from a job id, a task number and an attempt number. The
 * intermediate task id is obtained from {@code createTaskID(jobID, taskId)}.
 *
 * @param jobID job the task belongs to
 * @param taskId number of the task within the job
 * @param attemptId number of the attempt of that task
 * @return new {@link TaskAttemptID}
 */
static TaskAttemptID createTaskAttemptID(JobID jobID, int taskId, int attemptId) {
  return new TaskAttemptID(createTaskID(jobID, taskId), attemptId);
}
From source file:org.apache.beam.sdk.io.hadoop.format.HadoopFormats.java
License:Apache License
/** * Creates cleanup {@link TaskAttemptContext} for given {@link JobID}. * * @param conf hadoop configuration/* ww w . j a v a 2s. c o m*/ * @param jobID jobId of the created {@link TaskID} * @return new cleanup {@link TaskID} for given {@link JobID} */ static TaskAttemptContext createCleanupTaskContext(Configuration conf, JobID jobID) { final TaskID taskId = new TaskID(jobID, TaskType.JOB_CLEANUP, 0); return createTaskAttemptContext(conf, new TaskAttemptID(taskId, 0)); }