List of usage examples for org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl
public TaskAttemptContextImpl(Configuration conf, TaskAttemptID taskId)
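The constructor pairs a Configuration with a TaskAttemptID to produce the concrete TaskAttemptContext that record readers and writers expect. Before the per-project examples, a minimal sketch of that pattern in isolation (the no-argument TaskAttemptID() creates a placeholder attempt id, a pattern several of the examples below also rely on):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

// A placeholder attempt id is sufficient when no real task is running,
// e.g. when driving a RecordReader from a test or a driver-side utility.
Configuration conf = new Configuration();
TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());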
From source file:com.cloudera.recordservice.examples.terasort.TeraInputFormat.java
License:Apache License
/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 10 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 * @param job the job to sample
 * @param partFile where to write the output file to
 * @throws Throwable if something goes wrong
 */
public static void writePartitionFile(final JobContext job, Path partFile) throws Throwable {
    long t1 = System.currentTimeMillis();
    Configuration conf = job.getConfiguration();
    final TeraInputFormat inFormat = new TeraInputFormat();
    final TextSampler sampler = new TextSampler();
    int partitions = job.getNumReduceTasks();
    long sampleSize = conf.getLong(SAMPLE_SIZE, 100000);
    final List<InputSplit> splits = inFormat.getSplits(job);
    long t2 = System.currentTimeMillis();
    System.out.println("Computing input splits took " + (t2 - t1) + "ms");
    int samples = Math.min(conf.getInt(NUM_PARTITIONS, 10), splits.size());
    System.out.println("Sampling " + samples + " splits of " + splits.size());
    final long recordsPerSample = sampleSize / samples;
    final int sampleStep = splits.size() / samples;
    Thread[] samplerReader = new Thread[samples];
    SamplerThreadGroup threadGroup = new SamplerThreadGroup("Sampler Reader Thread Group");
    // take N samples from different parts of the input
    for (int i = 0; i < samples; ++i) {
        final int idx = i;
        samplerReader[i] = new Thread(threadGroup, "Sampler Reader " + idx) {
            {
                setDaemon(true);
            }

            @Override
            public void run() {
                long records = 0;
                try {
                    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(),
                            new TaskAttemptID());
                    RecordReader<Text, Text> reader = inFormat.createRecordReader(splits.get(sampleStep * idx),
                            context);
                    reader.initialize(splits.get(sampleStep * idx), context);
                    while (reader.nextKeyValue()) {
                        sampler.addKey(new Text(reader.getCurrentKey()));
                        records += 1;
                        if (recordsPerSample <= records) {
                            break;
                        }
                    }
                } catch (IOException ie) {
                    System.err.println("Got an exception while reading splits "
                            + StringUtils.stringifyException(ie));
                    throw new RuntimeException(ie);
                } catch (InterruptedException e) {
                }
            }
        };
        samplerReader[i].start();
    }
    FileSystem outFs = partFile.getFileSystem(conf);
    DataOutputStream writer = outFs.create(partFile, true, 64 * 1024, (short) 10,
            outFs.getDefaultBlockSize(partFile));
    for (int i = 0; i < samples; i++) {
        try {
            samplerReader[i].join();
            if (threadGroup.getThrowable() != null) {
                throw threadGroup.getThrowable();
            }
        } catch (InterruptedException e) {
        }
    }
    for (Text split : sampler.createPartitions(partitions)) {
        split.write(writer);
    }
    writer.close();
    long t3 = System.currentTimeMillis();
    System.out.println("Computing partitions took " + (t3 - t2) + "ms");
}
From source file:com.cloudera.recordservice.mapreduce.MapReduceTest.java
License:Apache License
@Test
// TODO: make this generic. This should be extensible to test all the input
// formats we support. How do we do this?
public void testReadNation() throws IOException, InterruptedException {
    Configuration config = new Configuration();
    RecordServiceInputFormat.RecordServiceRecordReader reader =
        new RecordServiceInputFormat.RecordServiceRecordReader();
    try {
        RecordServiceConfig.setInputTable(config, null, "tpch.nation");
        List<InputSplit> splits = PlanUtil.getSplits(config, new Credentials()).splits;
        reader.initialize(splits.get(0),
                new TaskAttemptContextImpl(new JobConf(config), new TaskAttemptID()));

        int numRows = 0;
        while (reader.nextKeyValue()) {
            RecordServiceRecord value = reader.getCurrentValue();
            ++numRows;
            if (numRows == 10) {
                assertEquals("INDONESIA", value.getColumnValue(1).toString());
            }
        }
        assertFalse(reader.nextKeyValue());
        assertFalse(reader.nextRecord());
        assertEquals(25, numRows);

        config.clear();
        RecordServiceConfig.setInputTable(config, "tpch", "nation", "n_comment");
        splits = PlanUtil.getSplits(config, new Credentials()).splits;
        reader.initialize(splits.get(0),
                new TaskAttemptContextImpl(new JobConf(config), new TaskAttemptID()));

        numRows = 0;
        while (reader.nextKeyValue()) {
            RecordServiceRecord value = reader.getCurrentValue();
            if (numRows == 12) {
                assertEquals("ously. final, express gifts cajole a",
                        value.getColumnValue(0).toString());
            }
            ++numRows;
        }
        assertEquals(25, numRows);
    } finally {
        reader.close();
    }
}
From source file:com.cloudera.recordservice.mapreduce.MapReduceTest.java
License:Apache License
@Test
public void testReadAllTypes() throws IOException, InterruptedException {
    Configuration config = new Configuration();
    RecordServiceInputFormat.RecordServiceRecordReader reader =
        new RecordServiceInputFormat.RecordServiceRecordReader();
    SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd");
    format.setTimeZone(TimeZone.getTimeZone("GMT"));
    try {
        RecordServiceConfig.setInputTable(config, null, "rs.alltypes");
        List<InputSplit> splits = PlanUtil.getSplits(config, new Credentials()).splits;

        int numRows = 0;
        for (InputSplit split : splits) {
            reader.initialize(split,
                    new TaskAttemptContextImpl(new JobConf(config), new TaskAttemptID()));
            while (reader.nextKeyValue()) {
                RecordServiceRecord value = reader.getCurrentValue();
                if (((BooleanWritable) value.getColumnValue(0)).get()) {
                    assertEquals(0, ((ByteWritable) value.getColumnValue(1)).get());
                    assertEquals(1, ((ShortWritable) value.getColumnValue(2)).get());
                    assertEquals(2, ((IntWritable) value.getColumnValue(3)).get());
                    assertEquals(3, ((LongWritable) value.getColumnValue(4)).get());
                    assertEquals(4.0, ((FloatWritable) value.getColumnValue(5)).get(), 0.1);
                    assertEquals(5.0, ((DoubleWritable) value.getColumnValue(6)).get(), 0.1);
                    assertEquals("hello", value.getColumnValue(7).toString());
                    assertEquals("vchar1", value.getColumnValue(8).toString());
                    assertEquals("char1", value.getColumnValue(9).toString());
                    assertEquals("2015-01-01", format.format(
                            ((TimestampNanosWritable) value.getColumnValue(10)).get().toTimeStamp()));
                    assertEquals(new BigDecimal("3.1415920000"),
                            ((DecimalWritable) value.getColumnValue(11)).get().toBigDecimal());
                } else {
                    assertEquals(6, ((ByteWritable) value.getColumnValue(1)).get());
                    assertEquals(7, ((ShortWritable) value.getColumnValue(2)).get());
                    assertEquals(8, ((IntWritable) value.getColumnValue(3)).get());
                    assertEquals(9, ((LongWritable) value.getColumnValue(4)).get());
                    assertEquals(10.0, ((FloatWritable) value.getColumnValue(5)).get(), 0.1);
                    assertEquals(11.0, ((DoubleWritable) value.getColumnValue(6)).get(), 0.1);
                    assertEquals("world", value.getColumnValue(7).toString());
                    assertEquals("vchar2", value.getColumnValue(8).toString());
                    assertEquals("char2", value.getColumnValue(9).toString());
                    assertEquals("2016-01-01", format.format(
                            ((TimestampNanosWritable) value.getColumnValue(10)).get().toTimeStamp()));
                    assertEquals(new BigDecimal("1234.5678900000"),
                            ((DecimalWritable) value.getColumnValue(11)).get().toBigDecimal());
                }
                ++numRows;
            }
        }
        assertEquals(2, numRows);
    } finally {
        reader.close();
    }
}
From source file:com.cloudera.recordservice.mapreduce.MapReduceTest.java
License:Apache License
@Test
public void testReadAllTypesNull() throws IOException, InterruptedException {
    Configuration config = new Configuration();
    RecordServiceInputFormat.RecordServiceRecordReader reader =
        new RecordServiceInputFormat.RecordServiceRecordReader();
    try {
        RecordServiceConfig.setInputTable(config, null, "rs.alltypes_null");
        List<InputSplit> splits = PlanUtil.getSplits(config, new Credentials()).splits;

        int numRows = 0;
        for (InputSplit split : splits) {
            reader.initialize(split,
                    new TaskAttemptContextImpl(new JobConf(config), new TaskAttemptID()));
            while (reader.nextKeyValue()) {
                RecordServiceRecord value = reader.getCurrentValue();
                for (int i = 0; i < value.getSchema().getNumColumns(); ++i) {
                    assertTrue(value.getColumnValue(i) == null);
                }
                ++numRows;
            }
        }
        assertEquals(1, numRows);
    } finally {
        reader.close();
    }
}
From source file:com.cloudera.recordservice.mapreduce.MapReduceTest.java
License:Apache License
@Test
public void testCountStar() throws IOException, InterruptedException {
    Configuration config = new Configuration();
    TextInputFormat.TextRecordReader reader = new TextInputFormat.TextRecordReader();
    try {
        RecordServiceConfig.setInputQuery(config, "select count(*) from tpch.nation");
        List<InputSplit> splits = PlanUtil.getSplits(config, new Credentials()).splits;

        int numRows = 0;
        for (InputSplit split : splits) {
            reader.initialize(split,
                    new TaskAttemptContextImpl(new JobConf(config), new TaskAttemptID()));
            while (reader.nextKeyValue()) {
                ++numRows;
            }
        }
        assertEquals(25, numRows);
    } finally {
        reader.close();
    }
}
From source file:com.hadoop.mapreduce.TestLzoTextInputFormat.java
License:Open Source License
/**
 * Generate random data, compress it, index and md5 hash the data.
 * Then read it all back and md5 that too, to verify that it all went ok.
 *
 * @param testWithIndex Should we index or not?
 * @param charsToOutput How many characters of random data should we output.
 * @throws IOException
 * @throws NoSuchAlgorithmException
 * @throws InterruptedException
 */
private void runTest(boolean testWithIndex, int charsToOutput)
        throws IOException, NoSuchAlgorithmException, InterruptedException {
    if (!GPLNativeCodeLoader.isNativeCodeLoaded()) {
        LOG.warn("Cannot run this test without the native lzo libraries");
        return;
    }

    Configuration conf = new Configuration();
    conf.setLong("fs.local.block.size", charsToOutput / 2);
    // reducing block size to force a split of the tiny file
    conf.set("io.compression.codecs", LzopCodec.class.getName());
    FileSystem localFs = FileSystem.getLocal(conf);
    localFs.delete(outputDir, true);
    localFs.mkdirs(outputDir);

    Job job = new Job(conf);
    TextOutputFormat.setCompressOutput(job, true);
    TextOutputFormat.setOutputCompressorClass(job, LzopCodec.class);
    TextOutputFormat.setOutputPath(job, outputDir);

    TaskAttemptContext attemptContext = new TaskAttemptContextImpl(job.getConfiguration(),
            new TaskAttemptID("123", 0, TaskType.REDUCE, 1, 2));

    // create some input data
    byte[] expectedMd5 = createTestInput(outputDir, localFs, attemptContext, charsToOutput);

    if (testWithIndex) {
        Path lzoFile = new Path(outputDir, lzoFileName);
        LzoTextInputFormat.createIndex(localFs, lzoFile);
    }

    LzoTextInputFormat inputFormat = new LzoTextInputFormat();
    TextInputFormat.setInputPaths(job, outputDir);

    List<InputSplit> is = inputFormat.getSplits(job);
    // verify we have the right number of lzo chunks
    if (testWithIndex && OUTPUT_BIG == charsToOutput) {
        assertEquals(3, is.size());
    } else {
        assertEquals(1, is.size());
    }

    // let's read it all and calculate the md5 hash
    for (InputSplit inputSplit : is) {
        RecordReader<LongWritable, Text> rr = inputFormat.createRecordReader(inputSplit, attemptContext);
        rr.initialize(inputSplit, attemptContext);
        while (rr.nextKeyValue()) {
            Text value = rr.getCurrentValue();
            md5.update(value.getBytes(), 0, value.getLength());
        }
        rr.close();
    }

    localFs.close();
    assertTrue(Arrays.equals(expectedMd5, md5.digest()));
}
From source file:com.knewton.mapreduce.SSTableColumnRecordReaderTest.java
License:Apache License
private TaskAttemptContext getTaskAttemptContext() {
    conf.set(PropertyConstants.COLUMN_COMPARATOR.txt, LongType.class.getName());
    conf.set(PropertyConstants.PARTITIONER.txt, RandomPartitioner.class.getName());
    return new TaskAttemptContextImpl(conf, attemptId);
}
From source file:com.knewton.mapreduce.SSTableRecordReaderTest.java
License:Apache License
private TaskAttemptContext getTaskAttemptContext(boolean setColumnComparator, boolean setPartitioner,
        boolean setSubComparator) throws Exception {
    if (setColumnComparator) {
        SSTableInputFormat.setComparatorClass(LongType.class.getName(), job);
    }
    if (setPartitioner) {
        SSTableInputFormat.setPartitionerClass(RandomPartitioner.class.getName(), job);
    }
    if (setSubComparator) {
        SSTableInputFormat.setSubComparatorClass(LongType.class.getName(), job);
    }
    return new TaskAttemptContextImpl(conf, attemptId);
}
From source file:com.marcolotz.lung.debug.InputTester.java
License:Creative Commons License
/***
 * Method used for locally testing the record reader and the input format. It
 * generates an input split from the local file system file.
 *
 * @param filePath
 */
public void localTest(String filePath) {
    DICOM image;

    Configuration testConf = new Configuration(false);

    /* Reads the local file system */
    testConf.set("fs.default.name", "file:///");

    File testFile = new File(filePath);

    Path path = new Path(testFile.getAbsoluteFile().toURI());
    FileSplit split = new FileSplit(path, 0, testFile.length(), null);

    InputFormat<NullWritable, BytesWritable> inputFormat = ReflectionUtils
            .newInstance(WholeFileInputFormat.class, testConf);
    TaskAttemptContext context = new TaskAttemptContextImpl(testConf, new TaskAttemptID());

    try {
        RecordReader<NullWritable, BytesWritable> reader = inputFormat.createRecordReader(split, context);
        while (reader.nextKeyValue()) {
            /* get the bytes array */
            BytesWritable inputBytesWritable = (BytesWritable) reader.getCurrentValue();
            byte[] inputContent = inputBytesWritable.getBytes();

            /* Check for Correct value */
            // generateLocalOutput("path/to/output");

            InputStream is = new ByteArrayInputStream(inputContent);

            image = new DICOM(is);
            image.run("Dicom Test");

            /* Prints the bytes as an ImagePlus image */
            ImageViewer debug = new ImageViewer();
            debug.setImage(image);
        }
    } catch (Exception e) {
    }
}
From source file:com.moz.fiji.mapreduce.platform.CDH5FijiMRBridge.java
License:Apache License
/** {@inheritDoc} */
@Override
public TaskAttemptContext newTaskAttemptContext(Configuration conf, TaskAttemptID id) {
    // In CDH4+, TaskAttemptContext and its implementation are separated.
    return new TaskAttemptContextImpl(conf, id);
}
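A hedged sketch of how such a bridge is typically consumed (the direct instantiation of CDH5FijiMRBridge is assumed here for illustration): caller code names only the TaskAttemptContext interface, which is the point of the bridge, since Hadoop 1 shipped TaskAttemptContext as a class while Hadoop 2 / CDH4+ split it into an interface backed by TaskAttemptContextImpl.

// Hypothetical caller: only the interface type appears in user code, so the
// concrete context class can differ across Hadoop versions behind the bridge.
CDH5FijiMRBridge bridge = new CDH5FijiMRBridge(); // assumed to be directly constructible
TaskAttemptContext context = bridge.newTaskAttemptContext(new Configuration(), new TaskAttemptID());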