List of usage examples for the org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl constructor
public TaskAttemptContextImpl(Configuration conf, TaskAttemptID taskId)
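Before the per-project snippets below, here is a minimal, self-contained sketch of how this constructor is typically called. The job identifier "demo", the job/task/attempt numbers, and the class name TaskAttemptContextExample are illustrative assumptions, not taken from any of the source files listed on this page.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.JobID;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.TaskID;
import org.apache.hadoop.mapreduce.TaskType;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

public class TaskAttemptContextExample {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Build a synthetic attempt id: job "demo"/1, reduce task 0, attempt 0 (placeholder values).
        TaskAttemptID attemptId = new TaskAttemptID(new TaskID(new JobID("demo", 1), TaskType.REDUCE, 0), 0);
        // Wrap the configuration and id in a context usable by InputFormats, OutputFormats and committers.
        TaskAttemptContext context = new TaskAttemptContextImpl(conf, attemptId);
        System.out.println(context.getTaskAttemptID() + " -> " + context.getConfiguration().size() + " config entries");
    }
}

Most of the examples that follow do exactly this: they fabricate a TaskAttemptID (or parse one with TaskAttemptID.forName) and pair it with a Configuration so that code written against the MapReduce task API can be driven outside of a real task.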
From source file:com.netflix.bdp.s3.TestS3MultipartOutputCommitter.java
License:Apache License
@Before
public void setupCommitter() throws Exception {
    getConfiguration().set("s3.multipart.committer.num-threads", String.valueOf(numThreads));
    getConfiguration().set(UPLOAD_UUID, UUID.randomUUID().toString());
    this.job = new JobContextImpl(getConfiguration(), JOB_ID);
    this.jobCommitter = new MockedS3Committer(S3_OUTPUT_PATH, job);
    jobCommitter.setupJob(job);
    this.uuid = job.getConfiguration().get(UPLOAD_UUID);
    this.tac = new TaskAttemptContextImpl(new Configuration(job.getConfiguration()), AID);
    // get the task's configuration copy so modifications take effect
    this.conf = tac.getConfiguration();
    conf.set("mapred.local.dir", "/tmp/local-0,/tmp/local-1");
    conf.setInt(UPLOAD_SIZE, 100);
    this.committer = new MockedS3Committer(S3_OUTPUT_PATH, tac);
}
From source file:com.netflix.bdp.s3.TestS3MultipartOutputCommitter.java
License:Apache License
private static Set<String> runTasks(JobContext job, int numTasks, int numFiles) throws IOException {
    Set<String> uploads = Sets.newHashSet();
    for (int taskId = 0; taskId < numTasks; taskId += 1) {
        TaskAttemptID attemptID = new TaskAttemptID(new TaskID(JOB_ID, TaskType.REDUCE, taskId),
                (taskId * 37) % numTasks);
        TaskAttemptContext attempt = new TaskAttemptContextImpl(new Configuration(job.getConfiguration()),
                attemptID);
        MockedS3Committer taskCommitter = new MockedS3Committer(S3_OUTPUT_PATH, attempt);
        commitTask(taskCommitter, attempt, numFiles);
        uploads.addAll(taskCommitter.results.getUploads());
    }
    return uploads;
}
From source file:com.phantom.hadoop.examples.terasort.TeraInputFormat.java
License:Apache License
/**
 * Use the input splits to take samples of the input and generate sample keys.
 * By default reads 100,000 keys from 10 locations in the input, sorts them and
 * picks N-1 keys to generate N equally sized partitions.
 *
 * @param job the job to sample
 * @param partFile where to write the output file to
 * @throws Throwable if something goes wrong
 */
public static void writePartitionFile(final JobContext job, Path partFile) throws Throwable {
    long t1 = System.currentTimeMillis();
    Configuration conf = job.getConfiguration();
    final TeraInputFormat inFormat = new TeraInputFormat();
    final TextSampler sampler = new TextSampler();
    int partitions = job.getNumReduceTasks();
    long sampleSize = conf.getLong(SAMPLE_SIZE, 100000);
    final List<InputSplit> splits = inFormat.getSplits(job);
    long t2 = System.currentTimeMillis();
    System.out.println("Computing input splits took " + (t2 - t1) + "ms");
    int samples = Math.min(conf.getInt(NUM_PARTITIONS, 10), splits.size());
    System.out.println("Sampling " + samples + " splits of " + splits.size());
    final long recordsPerSample = sampleSize / samples;
    final int sampleStep = splits.size() / samples;
    Thread[] samplerReader = new Thread[samples];
    SamplerThreadGroup threadGroup = new SamplerThreadGroup("Sampler Reader Thread Group");
    // take N samples from different parts of the input
    for (int i = 0; i < samples; ++i) {
        final int idx = i;
        samplerReader[i] = new Thread(threadGroup, "Sampler Reader " + idx) {
            {
                setDaemon(true);
            }

            public void run() {
                long records = 0;
                try {
                    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(),
                            new TaskAttemptID());
                    RecordReader<Text, Text> reader = inFormat.createRecordReader(splits.get(sampleStep * idx),
                            context);
                    reader.initialize(splits.get(sampleStep * idx), context);
                    while (reader.nextKeyValue()) {
                        sampler.addKey(new Text(reader.getCurrentKey()));
                        records += 1;
                        if (recordsPerSample <= records) {
                            break;
                        }
                    }
                } catch (IOException ie) {
                    System.err.println(
                            "Got an exception while reading splits " + StringUtils.stringifyException(ie));
                    throw new RuntimeException(ie);
                } catch (InterruptedException e) {
                }
            }
        };
        samplerReader[i].start();
    }
    FileSystem outFs = partFile.getFileSystem(conf);
    DataOutputStream writer = outFs.create(partFile, true, 64 * 1024, (short) 10,
            outFs.getDefaultBlockSize(partFile));
    for (int i = 0; i < samples; i++) {
        try {
            samplerReader[i].join();
            if (threadGroup.getThrowable() != null) {
                throw threadGroup.getThrowable();
            }
        } catch (InterruptedException e) {
        }
    }
    for (Text split : sampler.createPartitions(partitions)) {
        split.write(writer);
    }
    writer.close();
    long t3 = System.currentTimeMillis();
    System.out.println("Computing partitions took " + (t3 - t2) + "ms");
}
From source file:com.scaleoutsoftware.soss.hserver.hadoop.HadoopVersionSpecificCode_CDH4.java
License:Apache License
@Override
public TaskAttemptContext createTaskAttemptContext(Configuration configuration, TaskAttemptID id) {
    return new TaskAttemptContextImpl(configuration, id);
}
From source file:com.splicemachine.derby.impl.io.WholeTextInputFormatTest.java
License:Apache License
@Test
public void testGetsStreamForDirectory() throws Exception {
    /*
     * This test failed before changes to WholeTextInputFormat (hooray for test-driven development!),
     * so this constitutes an effective regression test for SPLICE-739. Of course, we'll be certain
     * about it by ALSO writing an IT, but this is a nice little Unit test of the same thing.
     */
    Configuration configuration = HConfiguration.unwrapDelegate();
    String dirPath = SpliceUnitTest.getResourceDirectory() + "multiLineDirectory";
    configuration.set("mapred.input.dir", dirPath);
    WholeTextInputFormat wtif = new WholeTextInputFormat();
    wtif.setConf(configuration);

    JobContext ctx = new JobContextImpl(configuration, new JobID("test", 1));
    List<InputSplit> splits = wtif.getSplits(ctx);

    int i = 0;
    Set<String> files = readFileNames(dirPath);
    Assert.assertEquals("We didn't get a split per file", files.size(), splits.size());

    Set<String> readFiles = new HashSet<>();
    long totalRecords = 0;
    for (InputSplit is : splits) {
        TaskAttemptContext tac = new TaskAttemptContextImpl(configuration,
                new TaskAttemptID("test", 1, true, i, 1));
        RecordReader<String, InputStream> recordReader = wtif.createRecordReader(is, tac);
        CombineFileSplit cfs = (CombineFileSplit) is;
        System.out.println(cfs);
        totalRecords += collectRecords(readFiles, recordReader);
        i++;
    }
    Assert.assertEquals("did not read all data!", 28, totalRecords);
    Assert.assertEquals("Did not read all files!", files.size(), readFiles.size());
    for (String expectedFile : files) {
        Assert.assertTrue("Did not read file <" + expectedFile + "> read =" + readFiles + " exp",
                readFiles.contains(expectedFile));
    }
}
From source file:com.streamsets.pipeline.stage.origin.hdfs.cluster.ClusterHdfsSource.java
License:Apache License
private List<Map.Entry> previewTextBatch(FileStatus fileStatus, int batchSize)
        throws IOException, InterruptedException {
    TextInputFormat textInputFormat = new TextInputFormat();
    InputSplit fileSplit = new FileSplit(fileStatus.getPath(), 0, fileStatus.getLen(), null);
    TaskAttemptContext taskAttemptContext = new TaskAttemptContextImpl(hadoopConf,
            TaskAttemptID.forName("attempt_1439420318532_0011_m_000000_0"));
    RecordReader<LongWritable, Text> recordReader = textInputFormat.createRecordReader(fileSplit,
            taskAttemptContext);
    recordReader.initialize(fileSplit, taskAttemptContext);
    boolean hasNext = recordReader.nextKeyValue();
    List<Map.Entry> batch = new ArrayList<>();
    while (hasNext && batch.size() < batchSize) {
        batch.add(new Pair(fileStatus.getPath().toUri().getPath() + "::" + recordReader.getCurrentKey(),
                String.valueOf(recordReader.getCurrentValue())));
        hasNext = recordReader.nextKeyValue(); // not like iterator.hasNext, actually advances
    }
    return batch;
}
From source file:com.toshiba.mwcloud.gs.hadoop.mapred.GSRowRecordWriter.java
License:Apache License
/**
 * Constructor.
 *
 * @param conf Configuration object
 * @throws IOException an exception occurred in GridDB
 */
public GSRowRecordWriter(JobConf conf) throws IOException {
    TaskAttemptContext context = new TaskAttemptContextImpl(conf,
            TaskAttemptID.forName(conf.get("mapred.task.id")));
    writer_ = new GDRecordWriter(context);
}
From source file:cz.seznam.euphoria.hadoop.HadoopUtils.java
License:Apache License
public static TaskAttemptContext createTaskContext(Configuration conf, int taskNumber) {
    // TODO uses some default hard-coded values
    TaskAttemptID taskAttemptID = new TaskAttemptID(
            "0",             // job tracker ID
            0,               // job number
            TaskType.REDUCE, // task type
            taskNumber,      // task ID
            0);              // task attempt
    return new TaskAttemptContextImpl(conf, taskAttemptID);
}
From source file:edu.uci.ics.hyracks.hdfs.ContextFactory.java
License:Apache License
public TaskAttemptContext createContext(Configuration conf, TaskAttemptID tid) throws HyracksDataException {
    try {
        return new TaskAttemptContextImpl(conf, tid);
    } catch (Exception e) {
        throw new HyracksDataException(e);
    }
}
From source file:edu.uci.ics.hyracks.hdfs.ContextFactory.java
License:Apache License
public TaskAttemptContext createContext(Configuration conf, int partition) throws HyracksDataException {
    try {
        TaskAttemptID tid = new TaskAttemptID("", 0, TaskType.REDUCE, partition, 0);
        return new TaskAttemptContextImpl(conf, tid);
    } catch (Exception e) {
        throw new HyracksDataException(e);
    }
}