Usage examples for org.apache.hadoop.mapreduce.OutputFormat#getOutputCommitter(TaskAttemptContext)
public abstract OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException, InterruptedException;
From source file:co.cask.cdap.internal.app.runtime.batch.dataset.output.MultipleOutputsMainOutputWrapper.java
License:Apache License
@Override public synchronized OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException, InterruptedException { // return a MultipleOutputsCommitter that commits for the root output format as well as all delegate outputformats if (committer == null) { Map<String, OutputCommitter> committers = new HashMap<>(); for (String name : MultipleOutputs.getNamedOutputsList(context)) { Class<? extends OutputFormat> namedOutputFormatClass = MultipleOutputs .getNamedOutputFormatClass(context, name); TaskAttemptContext namedContext = MultipleOutputs.getNamedTaskContext(context, name); OutputFormat outputFormat = new InstantiatorFactory(false).get(TypeToken.of(namedOutputFormatClass)) .create();//from w ww . ja va 2 s . c o m committers.put(name, outputFormat.getOutputCommitter(namedContext)); } committer = new MultipleOutputsCommitter(committers); } return committer; }
From source file:com.asakusafw.runtime.mapreduce.simple.SimpleJobRunner.java
License:Apache License
/**
 * Executes the whole job in-process: sets up the job-level committer, runs the map
 * phase (and, when reduce tasks are configured, the shuffle and reduce phases), then
 * commits the job; on any failure the job is aborted instead.
 */
private void runJob(Job job) throws ClassNotFoundException, IOException, InterruptedException {
    assert job.getJobID() != null;
    Configuration conf = job.getConfiguration();
    TaskID taskId = newMapTaskId(job.getJobID(), 0);
    OutputFormat<?, ?> output = ReflectionUtils.newInstance(job.getOutputFormatClass(), conf);
    OutputCommitter committer =
            output.getOutputCommitter(newTaskAttemptContext(conf, newTaskAttemptId(taskId, 0)));
    boolean completed = false;
    committer.setupJob(job);
    try {
        if (job.getNumReduceTasks() == 0) {
            // Map-only job: no intermediate sorter is required.
            runMap(job, null);
        } else {
            try (KeyValueSorter<?, ?> sorter =
                    createSorter(job, job.getMapOutputKeyClass(), job.getMapOutputValueClass())) {
                runMap(job, sorter);
                runReduce(job, sorter);
            }
        }
        committer.commitJob(job);
        completed = true;
    } finally {
        if (!completed) {
            try {
                committer.abortJob(job, State.FAILED);
            } catch (IOException e) {
                LOG.error(MessageFormat.format("error occurred while aborting job: {0} ({1})",
                        job.getJobID(), job.getJobName()), e);
            }
        }
    }
}
From source file:com.asakusafw.runtime.mapreduce.simple.SimpleJobRunner.java
License:Apache License
/**
 * Runs the map phase: one map task attempt per input split, executed sequentially.
 * Each split gets a fresh Mapper and OutputFormat instance; map output goes into the
 * supplied sorter for map-reduce jobs, or straight to the job's RecordWriter for
 * map-only jobs ({@code sorter == null}). Each task is committed on success and
 * aborted on failure.
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
private void runMap(Job job, KeyValueSorter<?, ?> sorter)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = job.getConfiguration();
    InputFormat<?, ?> input = ReflectionUtils.newInstance(job.getInputFormatClass(), conf);
    List<InputSplit> splits = input.getSplits(job);
    int serial = 1;
    for (InputSplit split : splits) {
        TaskAttemptID id = newTaskAttemptId(newMapTaskId(job.getJobID(), serial++), 0);
        Mapper<?, ?, ?, ?> mapper = ReflectionUtils.newInstance(job.getMapperClass(), conf);
        if (LOG.isDebugEnabled()) {
            LOG.debug(MessageFormat.format("starting mapper: {0}@{1} ({2}bytes)", //$NON-NLS-1$
                    mapper.getClass().getName(), id, split.getLength()));
        }
        TaskAttemptContext context = newTaskAttemptContext(conf, id);
        // Always obtain a fresh OutputFormat object: OutputFormat.getOutputCommitter() may be cached.
        OutputFormat<?, ?> output = ReflectionUtils.newInstance(job.getOutputFormatClass(), conf);
        OutputCommitter committer = output.getOutputCommitter(context);
        committer.setupTask(context);
        boolean done = false;
        try (RecordReader<?, ?> reader = input.createRecordReader(split, newTaskAttemptContext(conf, id))) {
            RecordWriter<?, ?> writer;
            if (sorter == null) {
                // Map-only: write directly through the job's output format.
                writer = output.getRecordWriter(newTaskAttemptContext(conf, id));
            } else {
                // Map-reduce: collect map output into the sorter for the reduce phase.
                writer = new ShuffleWriter(sorter);
            }
            try {
                Mapper.Context c = newMapperContext(conf, id, reader, writer, committer, split);
                reader.initialize(split, c);
                mapper.run(c);
            } finally {
                writer.close(newTaskAttemptContext(conf, id));
            }
            doCommitTask(context, committer);
            done = true;
        } finally {
            if (!done) {
                doAbortTask(context, committer);
            }
        }
    }
}
From source file:com.asakusafw.runtime.mapreduce.simple.SimpleJobRunner.java
License:Apache License
/**
 * Runs the single reduce task: drains the sorted map output from {@code sorter}
 * through the configured Reducer and writes results via the job's OutputFormat.
 * The task is committed on success and aborted on any failure.
 *
 * @param job the job whose reducer and output format are executed
 * @param sorter holds the sorted map-phase output to feed into the reducer
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
private void runReduce(Job job, KeyValueSorter<?, ?> sorter)
        throws ClassNotFoundException, IOException, InterruptedException {
    Configuration conf = job.getConfiguration();
    OutputFormat<?, ?> output = ReflectionUtils.newInstance(job.getOutputFormatClass(), conf);
    TaskAttemptID id = newTaskAttemptId(newReduceTaskId(job.getJobID(), 1), 0);
    Reducer<?, ?, ?, ?> reducer = ReflectionUtils.newInstance(job.getReducerClass(), conf);
    if (LOG.isDebugEnabled()) {
        LOG.debug(MessageFormat.format("starting reducer: {0}@{1} ({2}records, {3}bytes)", //$NON-NLS-1$
                reducer.getClass().getName(), id, sorter.getRecordCount(), sorter.getSizeInBytes()));
    }
    TaskAttemptContext context = newTaskAttemptContext(conf, id);
    OutputCommitter committer = output.getOutputCommitter(context);
    committer.setupTask(context);
    boolean succeed = false;
    try {
        ShuffleReader reader = new ShuffleReader(sorter, new Progress());
        try {
            RecordWriter<?, ?> writer = output.getRecordWriter(newTaskAttemptContext(conf, id));
            try {
                Reducer.Context c = newReducerContext(conf, id, reader, sorter.getKeyClass(),
                        sorter.getValueClass(), writer, committer, (RawComparator) job.getGroupingComparator());
                reducer.run(c);
            } finally {
                writer.close(newTaskAttemptContext(conf, id));
            }
        } finally {
            try {
                reader.close();
            } catch (IOException e) {
                // Best-effort close: the commit/abort handling below must still run.
                // (Fixed: the original message read "while reducer mapper input".)
                LOG.warn(MessageFormat.format("error occurred while closing reducer input: {0} ({1})", id,
                        job.getJobName()), e);
            }
        }
        doCommitTask(context, committer);
        succeed = true;
    } finally {
        if (!succeed) {
            doAbortTask(context, committer);
        }
    }
}
From source file:com.datasalt.pangool.tuplemr.mapred.lib.output.PangoolMultipleOutputs.java
License:Apache License
public synchronized RecordWriter getRecordWriter(String baseFileName) throws IOException, InterruptedException { // Look for record-writer in the cache OutputContext context = outputContexts.get(baseFileName); // If not in cache, create a new one if (context == null) { context = new OutputContext(); OutputFormat mainOutputFormat; try {//from ww w. j av a 2 s . c o m mainOutputFormat = ((OutputFormat) ReflectionUtils.newInstance(this.context.getOutputFormatClass(), this.context.getConfiguration())); } catch (ClassNotFoundException e1) { throw new RuntimeException(e1); } ProxyOutputCommitter baseOutputCommitter = ((ProxyOutputCommitter) mainOutputFormat .getOutputCommitter(this.context)); // The trick is to create a new Job for each output Job job = new Job(this.context.getConfiguration()); job.setOutputKeyClass(getNamedOutputKeyClass(this.context, baseFileName)); job.setOutputValueClass(getNamedOutputValueClass(this.context, baseFileName)); // Check possible specific context for the output setSpecificNamedOutputContext(this.context.getConfiguration(), job, baseFileName); TaskAttemptContext taskContext; try { taskContext = TaskAttemptContextFactory.get(job.getConfiguration(), this.context.getTaskAttemptID()); } catch (Exception e) { throw new IOException(e); } // First we change the output dir for the new OutputFormat that we will // create // We put it inside the main output work path -> in case the Job fails, // everything will be discarded taskContext.getConfiguration().set("mapred.output.dir", baseOutputCommitter.getBaseDir() + "/" + baseFileName); // This is for Hadoop 2.0 : taskContext.getConfiguration().set("mapreduce.output.fileoutputformat.outputdir", baseOutputCommitter.getBaseDir() + "/" + baseFileName); context.taskAttemptContext = taskContext; // Load the OutputFormat instance OutputFormat outputFormat = InstancesDistributor.loadInstance( context.taskAttemptContext.getConfiguration(), OutputFormat.class, getNamedOutputFormatInstanceFile(this.context, 
baseFileName), true); // We have to create a JobContext for meeting the contract of the // OutputFormat JobContext jobContext; try { jobContext = JobContextFactory.get(taskContext.getConfiguration(), taskContext.getJobID()); } catch (Exception e) { throw new IOException(e); } context.jobContext = jobContext; // The contract of the OutputFormat is to check the output specs outputFormat.checkOutputSpecs(jobContext); // We get the output committer so we can call it later context.outputCommitter = outputFormat.getOutputCommitter(taskContext); // Save the RecordWriter to cache it context.recordWriter = outputFormat.getRecordWriter(taskContext); // if counters are enabled, wrap the writer with context // to increment counters if (countersEnabled) { context.recordWriter = new RecordWriterWithCounter(context.recordWriter, baseFileName, this.context); } outputContexts.put(baseFileName, context); } return context.recordWriter; }
From source file:com.marklogic.contentpump.LocalJobRunner.java
License:Apache License
/** * Run the job. Get the input splits, create map tasks and submit it to * the thread pool if there is one; otherwise, runs the the task one by * one./*www . j a va 2 s.c o m*/ * * @param <INKEY> * @param <INVALUE> * @param <OUTKEY> * @param <OUTVALUE> * @throws Exception */ @SuppressWarnings("unchecked") public <INKEY, INVALUE, OUTKEY, OUTVALUE, T extends org.apache.hadoop.mapreduce.InputSplit> void run() throws Exception { Configuration conf = job.getConfiguration(); InputFormat<INKEY, INVALUE> inputFormat = (InputFormat<INKEY, INVALUE>) ReflectionUtils .newInstance(job.getInputFormatClass(), conf); List<InputSplit> splits = inputFormat.getSplits(job); T[] array = (T[]) splits.toArray(new org.apache.hadoop.mapreduce.InputSplit[splits.size()]); // sort the splits into order based on size, so that the biggest // goes first Arrays.sort(array, new SplitLengthComparator()); OutputFormat<OUTKEY, OUTVALUE> outputFormat = (OutputFormat<OUTKEY, OUTVALUE>) ReflectionUtils .newInstance(job.getOutputFormatClass(), conf); Class<? 
extends Mapper<?, ?, ?, ?>> mapperClass = job.getMapperClass(); Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE> mapper = (Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE>) ReflectionUtils .newInstance(mapperClass, conf); try { outputFormat.checkOutputSpecs(job); } catch (Exception ex) { if (LOG.isDebugEnabled()) { LOG.debug("Error checking output specification: ", ex); } else { LOG.error("Error checking output specification: "); LOG.error(ex.getMessage()); } return; } conf = job.getConfiguration(); progress = new AtomicInteger[splits.size()]; for (int i = 0; i < splits.size(); i++) { progress[i] = new AtomicInteger(); } Monitor monitor = new Monitor(); monitor.start(); reporter = new ContentPumpReporter(); List<Future<Object>> taskList = new ArrayList<Future<Object>>(); for (int i = 0; i < array.length; i++) { InputSplit split = array[i]; if (pool != null) { LocalMapTask<INKEY, INVALUE, OUTKEY, OUTVALUE> task = new LocalMapTask<INKEY, INVALUE, OUTKEY, OUTVALUE>( inputFormat, outputFormat, conf, i, split, reporter, progress[i]); availableThreads = assignThreads(i, array.length); Class<? extends Mapper<?, ?, ?, ?>> runtimeMapperClass = job.getMapperClass(); if (availableThreads > 1 && availableThreads != threadsPerSplit) { // possible runtime adjustment if (runtimeMapperClass != (Class) MultithreadedMapper.class) { runtimeMapperClass = (Class<? 
extends Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE>>) cmd .getRuntimeMapperClass(job, mapperClass, threadsPerSplit, availableThreads); } if (runtimeMapperClass != mapperClass) { task.setMapperClass(runtimeMapperClass); } if (runtimeMapperClass == (Class) MultithreadedMapper.class) { task.setThreadCount(availableThreads); if (LOG.isDebugEnabled()) { LOG.debug("Thread Count for Split#" + i + " : " + availableThreads); } } } if (runtimeMapperClass == (Class) MultithreadedMapper.class) { synchronized (pool) { taskList.add(pool.submit(task)); pool.wait(); } } else { pool.submit(task); } } else { // single-threaded JobID jid = new JobID(); TaskID taskId = new TaskID(jid.getJtIdentifier(), jid.getId(), TaskType.MAP, i); TaskAttemptID taskAttemptId = new TaskAttemptID(taskId, 0); TaskAttemptContext context = ReflectionUtil.createTaskAttemptContext(conf, taskAttemptId); RecordReader<INKEY, INVALUE> reader = inputFormat.createRecordReader(split, context); RecordWriter<OUTKEY, OUTVALUE> writer = outputFormat.getRecordWriter(context); OutputCommitter committer = outputFormat.getOutputCommitter(context); TrackingRecordReader trackingReader = new TrackingRecordReader(reader, progress[i]); Mapper.Context mapperContext = ReflectionUtil.createMapperContext(mapper, conf, taskAttemptId, trackingReader, writer, committer, reporter, split); trackingReader.initialize(split, mapperContext); // no thread pool (only 1 thread specified) Class<? 
extends Mapper<?, ?, ?, ?>> mapClass = job.getMapperClass(); mapperContext.getConfiguration().setClass(CONF_MAPREDUCE_JOB_MAP_CLASS, mapClass, Mapper.class); mapper = (Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE>) ReflectionUtils.newInstance(mapClass, mapperContext.getConfiguration()); mapper.run(mapperContext); trackingReader.close(); writer.close(mapperContext); committer.commitTask(context); } } // wait till all tasks are done if (pool != null) { for (Future<Object> f : taskList) { f.get(); } pool.shutdown(); while (!pool.awaitTermination(1, TimeUnit.DAYS)) ; jobComplete.set(true); } monitor.interrupt(); monitor.join(1000); // report counters Iterator<CounterGroup> groupIt = reporter.counters.iterator(); while (groupIt.hasNext()) { CounterGroup group = groupIt.next(); LOG.info(group.getDisplayName() + ": "); Iterator<Counter> counterIt = group.iterator(); while (counterIt.hasNext()) { Counter counter = counterIt.next(); LOG.info(counter.getDisplayName() + ": " + counter.getValue()); } } LOG.info("Total execution time: " + (System.currentTimeMillis() - startTime) / 1000 + " sec"); }
From source file:com.mortardata.pig.storage.TestDynamoDBStorage.java
License:Apache License
@Test public void testSingleRow() throws IOException, InterruptedException { // test specific constants String tableName = "mortar_test_foo_table"; String awsAccessKeyId = "XXXXXXXXXXXXX"; String awsSecretKey = "YYYYYYYYYYYYYY"; Long writeCapacityUnits = 50L; Double consumedCapacityUnits = 7.0D; String location = "s3://mortar-example-output-data/unused"; String signature = "thealias_" + location + "_com.mortardata.pig.storage.DynamoDBStorage('" + tableName + "','" + awsAccessKeyId + "','" + awsSecretKey + "')"; ResourceSchema schema = new ResourceSchema(Utils.getSchemaFromString( "my_field:int,my_float_field:float,my_str_field:chararray,my_null_field:chararray,my_empty_string_field:chararray")); String mapOrReduce = "reduce"; int numSlotsForStore = 3; int numTasksForStore = 20; String hashPrimaryKeyName = "my_field"; Collection<KeySchemaElement> keySchemaElements = Arrays .asList(new KeySchemaElement(hashPrimaryKeyName, KeyType.HASH)); Collection<AttributeDefinition> attributeDefinitions = Arrays .asList(new AttributeDefinition(hashPrimaryKeyName, ScalarAttributeType.N)); // mock dynamo client AmazonDynamoDBClient dynamo = mock(AmazonDynamoDBClient.class); DescribeTableResult describeResult = new DescribeTableResult().withTable(new TableDescription() .withProvisionedThroughput(/* ww w. j av a 2s . 
c om*/ new ProvisionedThroughputDescription().withWriteCapacityUnits(writeCapacityUnits)) .withKeySchema(keySchemaElements).withAttributeDefinitions(attributeDefinitions)); when(dynamo.describeTable(any(DescribeTableRequest.class))).thenReturn(describeResult); Map<String, List<WriteRequest>> unprocessedItems = Maps.newHashMap(); BatchWriteItemResult batchWriteItemResult = new BatchWriteItemResult() .withUnprocessedItems(unprocessedItems).withConsumedCapacity( new ConsumedCapacity().withTableName(tableName).withCapacityUnits(consumedCapacityUnits)); ArgumentCaptor<BatchWriteItemRequest> batchWriteItemRequestCaptor = ArgumentCaptor .forClass(BatchWriteItemRequest.class); when(dynamo.batchWriteItem(batchWriteItemRequestCaptor.capture())).thenReturn(batchWriteItemResult); // mock Hadoop interaction HadoopJobInfo hadoopJobInfo = mock(HadoopJobInfo.class); when(hadoopJobInfo.getMapOrReduce()).thenReturn(mapOrReduce); when(hadoopJobInfo.getNumSlotsForStore()).thenReturn(numSlotsForStore); when(hadoopJobInfo.getNumTasksForStore()).thenReturn(numTasksForStore); when(hadoopJobInfo.getJobConfiguration()).thenReturn(new Configuration()); // front end DynamoDBStorage storage = new DynamoDBStorage(tableName, awsAccessKeyId, awsSecretKey, dynamo, hadoopJobInfo); storage.setStoreFuncUDFContextSignature(signature); storage.checkSchema(schema); storage.setStoreLocation(location, null); Assert.assertNotNull(storage.getOutputFormat()); // simulate back end storage.setStoreFuncUDFContextSignature(signature); @SuppressWarnings("rawtypes") OutputFormat outputFormat = storage.getOutputFormat(); Assert.assertNotNull(outputFormat); storage.setStoreLocation(location, null); storage.prepareToWrite(null); Tuple tuple = TupleFactory.getInstance().newTuple(5); tuple.set(0, new Integer(3)); tuple.set(1, new Float(4.3)); tuple.set(2, "my_string_here"); tuple.set(3, null); tuple.set(4, ""); storage.putNext(tuple); outputFormat.getOutputCommitter(null).commitTask(null); // write throughput pct 
[default 0.5] * writeCapacityUnits / min(numSlotsForStore, numTasksForStore) Assert.assertEquals(8, storage.getMaxWriteCapacity()); // ensure that we received the item to save out List<BatchWriteItemRequest> bwrirs = batchWriteItemRequestCaptor.getAllValues(); Assert.assertEquals(1, bwrirs.size()); List<WriteRequest> writeRequests = bwrirs.get(0).getRequestItems().get(tableName); Assert.assertEquals(1, writeRequests.size()); Map<String, AttributeValue> item = writeRequests.get(0).getPutRequest().getItem(); Assert.assertEquals(new AttributeValue().withN("3"), item.get(hashPrimaryKeyName)); Assert.assertEquals(new AttributeValue().withN("4.3"), item.get("my_float_field")); Assert.assertNull(item.get("my_null_field")); Assert.assertNull(item.get("my_empty_string_field")); }
From source file:com.samsung.px.pig.storage.TestDynamoDBStorage.java
License:Apache License
@Test public void testSingleRow() throws IOException, InterruptedException { // test specific constants String tableName = "mortar_test_foo_table"; String awsAccessKeyId = "XXXXXXXXXXXXX"; String awsSecretKey = "YYYYYYYYYYYYYY"; Long writeCapacityUnits = 50L; Double consumedCapacityUnits = 7.0D; String location = "s3://mortar-example-output-data/unused"; String signature = "thealias_" + location + "_com.mortardata.pig.storage.DynamoDBStorage('" + tableName + "','" + awsAccessKeyId + "','" + awsSecretKey + "')"; ResourceSchema schema = new ResourceSchema(Utils.getSchemaFromString( "my_field:int,my_float_field:float,my_str_field:chararray,my_null_field:chararray,my_empty_string_field:chararray")); String mapOrReduce = "reduce"; int numSlotsForStore = 3; int numTasksForStore = 20; String hashPrimaryKeyName = "my_field"; // mock dynamo client AmazonDynamoDBClient dynamo = mock(AmazonDynamoDBClient.class); DescribeTableResult describeResult = new DescribeTableResult().withTable(new TableDescription() .withProvisionedThroughput(//from w ww . 
j a v a 2s .c o m new ProvisionedThroughputDescription().withWriteCapacityUnits(writeCapacityUnits)) .withKeySchema(new KeySchema().withHashKeyElement(new KeySchemaElement() .withAttributeName(hashPrimaryKeyName).withAttributeType(ScalarAttributeType.N)))); when(dynamo.describeTable(any(DescribeTableRequest.class))).thenReturn(describeResult); Map<String, List<WriteRequest>> unprocessedItems = Maps.newHashMap(); Map<String, BatchWriteResponse> reponses = Maps.newHashMap(); reponses.put(tableName, new BatchWriteResponse().withConsumedCapacityUnits(consumedCapacityUnits)); BatchWriteItemResult batchWriteItemResult = new BatchWriteItemResult() .withUnprocessedItems(unprocessedItems).withResponses(reponses); ArgumentCaptor<BatchWriteItemRequest> batchWriteItemRequestCaptor = ArgumentCaptor .forClass(BatchWriteItemRequest.class); when(dynamo.batchWriteItem(batchWriteItemRequestCaptor.capture())).thenReturn(batchWriteItemResult); // mock Hadoop interaction HadoopJobInfo hadoopJobInfo = mock(HadoopJobInfo.class); when(hadoopJobInfo.getMapOrReduce()).thenReturn(mapOrReduce); when(hadoopJobInfo.getNumSlotsForStore()).thenReturn(numSlotsForStore); when(hadoopJobInfo.getNumTasksForStore()).thenReturn(numTasksForStore); when(hadoopJobInfo.getJobConfiguration()).thenReturn(new Configuration()); // front end DynamoDBStorage storage = new DynamoDBStorage(tableName, awsAccessKeyId, awsSecretKey, dynamo, hadoopJobInfo); storage.setStoreFuncUDFContextSignature(signature); storage.checkSchema(schema); storage.setStoreLocation(location, null); Assert.assertNotNull(storage.getOutputFormat()); // simulate back end storage.setStoreFuncUDFContextSignature(signature); @SuppressWarnings("rawtypes") OutputFormat outputFormat = storage.getOutputFormat(); Assert.assertNotNull(outputFormat); storage.setStoreLocation(location, null); storage.prepareToWrite(null); Tuple tuple = TupleFactory.getInstance().newTuple(5); tuple.set(0, new Integer(3)); tuple.set(1, new Float(4.3)); tuple.set(2, 
"my_string_here"); tuple.set(3, null); tuple.set(4, ""); storage.putNext(tuple); outputFormat.getOutputCommitter(null).commitTask(null); // write throughput pct [default 0.5] * writeCapacityUnits / min(numSlotsForStore, numTasksForStore) Assert.assertEquals(8, storage.getMaxWriteCapacity()); // ensure that we received the item to save out List<BatchWriteItemRequest> bwrirs = batchWriteItemRequestCaptor.getAllValues(); Assert.assertEquals(1, bwrirs.size()); List<WriteRequest> writeRequests = bwrirs.get(0).getRequestItems().get(tableName); Assert.assertEquals(1, writeRequests.size()); Map<String, AttributeValue> item = writeRequests.get(0).getPutRequest().getItem(); Assert.assertEquals(new AttributeValue().withN("3"), item.get(hashPrimaryKeyName)); Assert.assertEquals(new AttributeValue().withN("4.3"), item.get("my_float_field")); Assert.assertNull(item.get("my_null_field")); Assert.assertNull(item.get("my_empty_string_field")); }
From source file:com.scaleoutsoftware.soss.hserver.hadoop.MapperWrapperMapreduce.java
License:Apache License
/** * Runs mapper for the single split./*w w w .j a va2 s. c o m*/ * * @param mapOutputAccumulator mapOutputAccumulator to use * @param split split ot run on */ @Override @SuppressWarnings("unchecked") public void runSplit(MapOutputAccumulator<OUTKEY, OUTVALUE> mapOutputAccumulator, Object split, int splitIndex) throws IOException, ClassNotFoundException, InterruptedException { TaskAttemptID taskAttemptId = hadoopVersionSpecificCode.createTaskAttemptId(jobId, true, splitIndex); //Setup task ID info TaskAttemptContext taskContext = hadoopVersionSpecificCode.createTaskAttemptContext(configuration, taskAttemptId); InputFormat inputFormat = ReflectionUtils.newInstance(jobContext.getInputFormatClass(), configuration); //Create RecordReader org.apache.hadoop.mapreduce.RecordReader<INKEY, INVALUE> input = inputFormat .createRecordReader((InputSplit) split, taskContext); //Make a mapper org.apache.hadoop.mapreduce.Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE> mapper; try { mapper = (org.apache.hadoop.mapreduce.Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE>) mapperConstructor .newInstance(); } catch (Exception e) { throw new RuntimeException(e); } org.apache.hadoop.mapreduce.RecordWriter output; OutputCommitter committer = null; if (mapOnlyJob) { OutputFormat outputFormat = ReflectionUtils.newInstance(jobContext.getOutputFormatClass(), configuration); output = (org.apache.hadoop.mapreduce.RecordWriter<OUTKEY, OUTVALUE>) outputFormat .getRecordWriter(taskContext); committer = outputFormat.getOutputCommitter(taskContext); committer.setupTask(taskContext); } else { output = new MapOutputCollector<OUTKEY, OUTVALUE>(mapOutputAccumulator); } input.initialize((InputSplit) split, taskContext); org.apache.hadoop.mapreduce.Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE>.Context mapperContext = hadoopVersionSpecificCode .getMapperContext(configuration, taskAttemptId, input, output); mapper.run(mapperContext); input.close(); output.close(mapperContext); if (mapOnlyJob && committer != null) { 
committer.commitTask(taskContext); } }
From source file:com.scaleoutsoftware.soss.hserver.hadoop.ReducerWrapperMapreduce.java
License:Apache License
/**
 * Wires up a mapreduce-API Reducer invocation for a single hServer partition:
 * creates the task identity, the output writer and committer, the key/value
 * serializers, and the grid reader that feeds the reducer, then builds the
 * reducer Context.
 */
public ReducerWrapperMapreduce(HServerInvocationParameters invocationParameters, int hadoopPartition, int appId,
        int region, boolean sort) throws IOException, ClassNotFoundException, InterruptedException {
    this.invocationParameters = invocationParameters;
    Configuration configuration = (Configuration) invocationParameters.getConfiguration();
    hadoopVersionSpecificCode = HadoopVersionSpecificCode.getInstance(invocationParameters.getHadoopVersion(),
            configuration);
    JobID jobID = (JobID) invocationParameters.getJobId();
    // Set up the task identity.
    TaskAttemptID id = hadoopVersionSpecificCode.createTaskAttemptId(jobID, false, hadoopPartition);
    JobContext jobContext = hadoopVersionSpecificCode.createJobContext(new JobConf(configuration), jobID);
    taskContext = hadoopVersionSpecificCode.createTaskAttemptContext(configuration, id);
    reducer = (org.apache.hadoop.mapreduce.Reducer<INKEY, INVALUE, OUTKEY, OUTVALUE>) ReflectionUtils
            .newInstance(jobContext.getReducerClass(), configuration);
    OutputFormat outputFormat = ReflectionUtils.newInstance(jobContext.getOutputFormatClass(), configuration);
    recordWriter = (org.apache.hadoop.mapreduce.RecordWriter<OUTKEY, OUTVALUE>) outputFormat
            .getRecordWriter(taskContext);
    committer = outputFormat.getOutputCommitter(taskContext);
    committer.setupTask(taskContext);
    Class<INKEY> keyClass = (Class<INKEY>) jobContext.getMapOutputKeyClass();
    WritableSerializerDeserializer<INKEY> firstKeySerializer = new WritableSerializerDeserializer<INKEY>(
            keyClass, null);
    WritableSerializerDeserializer<INKEY> secondKeySerializer = new WritableSerializerDeserializer<INKEY>(
            keyClass, null);
    Class<INVALUE> valueClass = (Class<INVALUE>) jobContext.getMapOutputValueClass();
    WritableSerializerDeserializer<INVALUE> valueSerializer = new WritableSerializerDeserializer<INVALUE>(
            valueClass, null);
    // Reader parameters for pulling the partition's map output out of the data grid.
    DataGridReaderParameters<INKEY, INVALUE> params = new DataGridReaderParameters<INKEY, INVALUE>(region, appId,
            HServerParameters.getSetting(REDUCE_USEMEMORYMAPPEDFILES, configuration) > 0, firstKeySerializer,
            valueSerializer, invocationParameters.getSerializationMode(), secondKeySerializer, keyClass,
            valueClass, sort, HServerParameters.getSetting(REDUCE_CHUNKSTOREADAHEAD, configuration),
            1024 * HServerParameters.getSetting(REDUCE_INPUTCHUNKSIZE_KB, configuration),
            HServerParameters.getSetting(REDUCE_CHUNKREADTIMEOUT, configuration));
    DataGridChunkedCollectionReader<INKEY, INVALUE> transport = DataGridChunkedCollectionReader
            .getGridReader(params);
    context = hadoopVersionSpecificCode.getReducerContext(configuration, id, committer, recordWriter, transport,
            null);
}