List of usage examples for org.apache.hadoop.mapreduce OutputFormat checkOutputSpecs
public abstract void checkOutputSpecs(JobContext context) throws IOException, InterruptedException;
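The framework instantiates the configured OutputFormat and calls checkOutputSpecs during job submission, before any task runs, so implementations should fail fast on misconfigured output. As a minimal, self-contained sketch of running the same check by hand from a driver (the pattern most of the examples below follow) — the class name OutputSpecCheckExample and the /tmp/example-out path are illustrative, not taken from any example here:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.ReflectionUtils;

public class OutputSpecCheckExample {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "spec-check");
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileOutputFormat.setOutputPath(job, new Path("/tmp/example-out")); // illustrative path

        // Instantiate the configured OutputFormat reflectively, just as the
        // framework does at submission time, and validate the output spec.
        // FileOutputFormat-based formats throw if the path is unset or the
        // directory already exists.
        OutputFormat<?, ?> outputFormat =
                ReflectionUtils.newInstance(job.getOutputFormatClass(), job.getConfiguration());
        try {
            outputFormat.checkOutputSpecs(job); // Job implements JobContext
            System.out.println("Output specification is valid.");
        } catch (IOException e) {
            System.err.println("Invalid output specification: " + e.getMessage());
        }
    }
}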
From source file: co.cask.cdap.internal.app.runtime.batch.dataset.output.MultipleOutputsMainOutputWrapper.java
License: Apache License
@Override
public void checkOutputSpecs(JobContext context) throws IOException, InterruptedException {
    for (String name : MultipleOutputs.getNamedOutputsList(context)) {
        Class<? extends OutputFormat> namedOutputFormatClass =
                MultipleOutputs.getNamedOutputFormatClass(context, name);
        OutputFormat outputFormat =
                new InstantiatorFactory(false).get(TypeToken.of(namedOutputFormatClass)).create();
        JobContext namedContext = MultipleOutputs.getNamedJobContext(context, name);
        outputFormat.checkOutputSpecs(namedContext);
    }
}
From source file: com.datasalt.pangool.tuplemr.mapred.lib.output.PangoolMultipleOutputs.java
License: Apache License
public synchronized RecordWriter getRecordWriter(String baseFileName)
        throws IOException, InterruptedException {
    // Look for record-writer in the cache
    OutputContext context = outputContexts.get(baseFileName);
    // If not in cache, create a new one
    if (context == null) {
        context = new OutputContext();
        OutputFormat mainOutputFormat;
        try {
            mainOutputFormat = ((OutputFormat) ReflectionUtils.newInstance(
                    this.context.getOutputFormatClass(), this.context.getConfiguration()));
        } catch (ClassNotFoundException e1) {
            throw new RuntimeException(e1);
        }
        ProxyOutputCommitter baseOutputCommitter =
                ((ProxyOutputCommitter) mainOutputFormat.getOutputCommitter(this.context));
        // The trick is to create a new Job for each output
        Job job = new Job(this.context.getConfiguration());
        job.setOutputKeyClass(getNamedOutputKeyClass(this.context, baseFileName));
        job.setOutputValueClass(getNamedOutputValueClass(this.context, baseFileName));
        // Check possible specific context for the output
        setSpecificNamedOutputContext(this.context.getConfiguration(), job, baseFileName);
        TaskAttemptContext taskContext;
        try {
            taskContext = TaskAttemptContextFactory.get(job.getConfiguration(),
                    this.context.getTaskAttemptID());
        } catch (Exception e) {
            throw new IOException(e);
        }
        // First we change the output dir for the new OutputFormat that we will create.
        // We put it inside the main output work path -> in case the Job fails,
        // everything will be discarded.
        taskContext.getConfiguration().set("mapred.output.dir",
                baseOutputCommitter.getBaseDir() + "/" + baseFileName);
        // This is for Hadoop 2.0:
        taskContext.getConfiguration().set("mapreduce.output.fileoutputformat.outputdir",
                baseOutputCommitter.getBaseDir() + "/" + baseFileName);
        context.taskAttemptContext = taskContext;
        // Load the OutputFormat instance
        OutputFormat outputFormat = InstancesDistributor.loadInstance(
                context.taskAttemptContext.getConfiguration(), OutputFormat.class,
                getNamedOutputFormatInstanceFile(this.context, baseFileName), true);
        // We have to create a JobContext for meeting the contract of the OutputFormat
        JobContext jobContext;
        try {
            jobContext = JobContextFactory.get(taskContext.getConfiguration(),
                    taskContext.getJobID());
        } catch (Exception e) {
            throw new IOException(e);
        }
        context.jobContext = jobContext;
        // The contract of the OutputFormat is to check the output specs
        outputFormat.checkOutputSpecs(jobContext);
        // We get the output committer so we can call it later
        context.outputCommitter = outputFormat.getOutputCommitter(taskContext);
        // Save the RecordWriter to cache it
        context.recordWriter = outputFormat.getRecordWriter(taskContext);
        // If counters are enabled, wrap the writer with context to increment counters
        if (countersEnabled) {
            context.recordWriter =
                    new RecordWriterWithCounter(context.recordWriter, baseFileName, this.context);
        }
        outputContexts.put(baseFileName, context);
    }
    return context.recordWriter;
}
From source file: com.inmobi.conduit.distcp.tools.mapred.TestCopyOutputFormat.java
License: Apache License
@Test
public void testCheckOutputSpecs() {
    try {
        OutputFormat outputFormat = new CopyOutputFormat();
        Configuration conf = new Configuration();
        Job job = new Job(conf);
        JobID jobID = new JobID("200707121733", 1);

        try {
            JobContext context = Mockito.mock(JobContext.class);
            Mockito.when(context.getConfiguration()).thenReturn(job.getConfiguration());
            Mockito.when(context.getJobID()).thenReturn(jobID);
            outputFormat.checkOutputSpecs(context);
            Assert.fail("No checking for invalid work/commit path");
        } catch (IllegalStateException ignore) {
        }

        CopyOutputFormat.setWorkingDirectory(job, new Path("/tmp/work"));
        try {
            JobContext context = Mockito.mock(JobContext.class);
            Mockito.when(context.getConfiguration()).thenReturn(job.getConfiguration());
            Mockito.when(context.getJobID()).thenReturn(jobID);
            outputFormat.checkOutputSpecs(context);
            Assert.fail("No checking for invalid commit path");
        } catch (IllegalStateException ignore) {
        }

        job.getConfiguration().set(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH, "");
        CopyOutputFormat.setCommitDirectory(job, new Path("/tmp/commit"));
        try {
            JobContext context = Mockito.mock(JobContext.class);
            Mockito.when(context.getConfiguration()).thenReturn(job.getConfiguration());
            Mockito.when(context.getJobID()).thenReturn(jobID);
            outputFormat.checkOutputSpecs(context);
            Assert.fail("No checking for invalid work path");
        } catch (IllegalStateException ignore) {
        }

        CopyOutputFormat.setWorkingDirectory(job, new Path("/tmp/work"));
        CopyOutputFormat.setCommitDirectory(job, new Path("/tmp/commit"));
        try {
            JobContext context = Mockito.mock(JobContext.class);
            Mockito.when(context.getConfiguration()).thenReturn(job.getConfiguration());
            Mockito.when(context.getJobID()).thenReturn(jobID);
            outputFormat.checkOutputSpecs(context);
        } catch (IllegalStateException ignore) {
            ignore.printStackTrace();
            Assert.fail("Output spec check failed.");
        }
    } catch (IOException e) {
        LOG.error("Exception encountered while testing checkoutput specs", e);
        Assert.fail("Checkoutput Spec failure");
    } catch (InterruptedException e) {
        LOG.error("Exception encountered while testing checkoutput specs", e);
        Assert.fail("Checkoutput Spec failure");
    }
}
From source file: com.marklogic.contentpump.LocalJobRunner.java
License: Apache License
/**
 * Run the job. Get the input splits, create map tasks and submit them to
 * the thread pool if there is one; otherwise, run the tasks one by one.
 *
 * @param <INKEY>
 * @param <INVALUE>
 * @param <OUTKEY>
 * @param <OUTVALUE>
 * @throws Exception
 */
@SuppressWarnings("unchecked")
public <INKEY, INVALUE, OUTKEY, OUTVALUE, T extends org.apache.hadoop.mapreduce.InputSplit> void run()
        throws Exception {
    Configuration conf = job.getConfiguration();
    InputFormat<INKEY, INVALUE> inputFormat = (InputFormat<INKEY, INVALUE>) ReflectionUtils
            .newInstance(job.getInputFormatClass(), conf);
    List<InputSplit> splits = inputFormat.getSplits(job);
    T[] array = (T[]) splits.toArray(new org.apache.hadoop.mapreduce.InputSplit[splits.size()]);
    // Sort the splits into order based on size, so that the biggest goes first
    Arrays.sort(array, new SplitLengthComparator());
    OutputFormat<OUTKEY, OUTVALUE> outputFormat = (OutputFormat<OUTKEY, OUTVALUE>) ReflectionUtils
            .newInstance(job.getOutputFormatClass(), conf);
    Class<? extends Mapper<?, ?, ?, ?>> mapperClass = job.getMapperClass();
    Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE> mapper = (Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE>) ReflectionUtils
            .newInstance(mapperClass, conf);
    try {
        outputFormat.checkOutputSpecs(job);
    } catch (Exception ex) {
        if (LOG.isDebugEnabled()) {
            LOG.debug("Error checking output specification: ", ex);
        } else {
            LOG.error("Error checking output specification: ");
            LOG.error(ex.getMessage());
        }
        return;
    }
    conf = job.getConfiguration();
    progress = new AtomicInteger[splits.size()];
    for (int i = 0; i < splits.size(); i++) {
        progress[i] = new AtomicInteger();
    }
    Monitor monitor = new Monitor();
    monitor.start();
    reporter = new ContentPumpReporter();
    List<Future<Object>> taskList = new ArrayList<Future<Object>>();
    for (int i = 0; i < array.length; i++) {
        InputSplit split = array[i];
        if (pool != null) {
            LocalMapTask<INKEY, INVALUE, OUTKEY, OUTVALUE> task =
                    new LocalMapTask<INKEY, INVALUE, OUTKEY, OUTVALUE>(
                            inputFormat, outputFormat, conf, i, split, reporter, progress[i]);
            availableThreads = assignThreads(i, array.length);
            Class<? extends Mapper<?, ?, ?, ?>> runtimeMapperClass = job.getMapperClass();
            if (availableThreads > 1 && availableThreads != threadsPerSplit) {
                // possible runtime adjustment
                if (runtimeMapperClass != (Class) MultithreadedMapper.class) {
                    runtimeMapperClass = (Class<? extends Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE>>) cmd
                            .getRuntimeMapperClass(job, mapperClass, threadsPerSplit, availableThreads);
                }
                if (runtimeMapperClass != mapperClass) {
                    task.setMapperClass(runtimeMapperClass);
                }
                if (runtimeMapperClass == (Class) MultithreadedMapper.class) {
                    task.setThreadCount(availableThreads);
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Thread Count for Split#" + i + " : " + availableThreads);
                    }
                }
            }
            if (runtimeMapperClass == (Class) MultithreadedMapper.class) {
                synchronized (pool) {
                    taskList.add(pool.submit(task));
                    pool.wait();
                }
            } else {
                pool.submit(task);
            }
        } else { // single-threaded
            JobID jid = new JobID();
            TaskID taskId = new TaskID(jid.getJtIdentifier(), jid.getId(), TaskType.MAP, i);
            TaskAttemptID taskAttemptId = new TaskAttemptID(taskId, 0);
            TaskAttemptContext context = ReflectionUtil.createTaskAttemptContext(conf, taskAttemptId);
            RecordReader<INKEY, INVALUE> reader = inputFormat.createRecordReader(split, context);
            RecordWriter<OUTKEY, OUTVALUE> writer = outputFormat.getRecordWriter(context);
            OutputCommitter committer = outputFormat.getOutputCommitter(context);
            TrackingRecordReader trackingReader = new TrackingRecordReader(reader, progress[i]);
            Mapper.Context mapperContext = ReflectionUtil.createMapperContext(mapper, conf,
                    taskAttemptId, trackingReader, writer, committer, reporter, split);
            trackingReader.initialize(split, mapperContext);
            // no thread pool (only 1 thread specified)
            Class<? extends Mapper<?, ?, ?, ?>> mapClass = job.getMapperClass();
            mapperContext.getConfiguration().setClass(CONF_MAPREDUCE_JOB_MAP_CLASS, mapClass,
                    Mapper.class);
            mapper = (Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE>) ReflectionUtils.newInstance(mapClass,
                    mapperContext.getConfiguration());
            mapper.run(mapperContext);
            trackingReader.close();
            writer.close(mapperContext);
            committer.commitTask(context);
        }
    }
    // wait till all tasks are done
    if (pool != null) {
        for (Future<Object> f : taskList) {
            f.get();
        }
        pool.shutdown();
        while (!pool.awaitTermination(1, TimeUnit.DAYS));
        jobComplete.set(true);
    }
    monitor.interrupt();
    monitor.join(1000);
    // report counters
    Iterator<CounterGroup> groupIt = reporter.counters.iterator();
    while (groupIt.hasNext()) {
        CounterGroup group = groupIt.next();
        LOG.info(group.getDisplayName() + ": ");
        Iterator<Counter> counterIt = group.iterator();
        while (counterIt.hasNext()) {
            Counter counter = counterIt.next();
            LOG.info(counter.getDisplayName() + ": " + counter.getValue());
        }
    }
    LOG.info("Total execution time: " + (System.currentTimeMillis() - startTime) / 1000 + " sec");
}
From source file: com.scaleoutsoftware.soss.hserver.JobScheduler.java
License: Apache License
/**
 * Runs the map-reduce job on ScaleOut hServer.
 *
 * @param job  the job to run
 * @param grid invocation grid to run the job
 */
@SuppressWarnings("unchecked")
void runJob(HServerJob job, InvocationGrid grid)
        throws IOException, InterruptedException, ClassNotFoundException {
    // Initialize user credential in advance
    long time = System.currentTimeMillis();
    CreateUserCredentials.run(grid);
    String hadoopVersion = VersionInfo.getVersion();
    try {
        // Check output specs before running the job
        OutputFormat outputFormat =
                ReflectionUtils.newInstance(job.getOutputFormatClass(), job.getConfiguration());
        outputFormat.checkOutputSpecs(job);
        org.apache.hadoop.mapreduce.OutputCommitter outputCommitter =
                createOutputCommitter(true, job.getJobID(), job.getConfiguration());
        // Clear all temporary objects
        DataAccessor.clearObjects(job.getAppId());
        // Calculate the partition layout
        com.scaleoutsoftware.soss.client.util.HostToPartitionsMapping hostNameToPartition =
                com.scaleoutsoftware.soss.client.util.HostToPartitionsMapping.getCurrent();
        List<InetAddress> hostAddresses = new ArrayList<InetAddress>(hostNameToPartition.getHosts());
        // Generate a mapping of Hadoop partitions to SOSS regions, so they are
        // equally distributed across hosts
        int numHosts = hostAddresses.size();
        int numberOfSlotsPerNode = Math.max(
                grid != null ? grid.getMaxNumberOfCores() : Runtime.getRuntime().availableProcessors(), 1);
        // Set the number of splits to the number of cores
        if (GridInputFormat.class.isAssignableFrom(job.getInputFormatClass())) {
            int numberOfSplits = HServerParameters.getSetting(MAP_SPLITS_PER_CORE, job.getConfiguration())
                    * numHosts * numberOfSlotsPerNode;
            GridInputFormat.setSuggestedNumberOfSplits(job,
                    Math.min(numberOfSplits, HServerConstants.MAX_MAP_REDUCE_TASKS));
        }
        // Generate the split-to-hostname map
        InputFormat inputFormat =
                ReflectionUtils.newInstance(job.getInputFormatClass(), job.getConfiguration());
        List<InputSplit> splitList = inputFormat.getSplits(job);
        Map<InetAddress, List<Integer>> splitToHostAddress =
                assignSplitsToHost(splitList, hostAddresses, null);
        // Choose the optimal number of reducers for GridOutputFormat
        if (GridOutputFormat.class.isAssignableFrom(job.getOutputFormatClass())) {
            job.setNumReduceTasks(numHosts * numberOfSlotsPerNode);
            job.setSortEnabled(false);
        }
        int[] partitionMapping = hostNameToPartition.generateEvenItemDistribution(job.getNumReduceTasks());
        // Generate invocation parameters
        Class<? extends InputSplit> splitType = splitList.size() > 0 ? splitList.get(0).getClass() : null;
        HadoopInvocationParameters hadoopParameters =
                new HadoopInvocationParameters(job.getConfiguration(), job.getJobID(), false);
        HServerInvocationParameters parameters = new HServerInvocationParameters(hadoopParameters,
                job.getAppId(), partitionMapping, hostNameToPartition, numberOfSlotsPerNode, splitType,
                splitList, splitToHostAddress, false, job.getSortEnabled(), hadoopVersion,
                job.getJobParameter(), SerializationMode.DEFAULT);

        StringBuilder stringBuilder = new StringBuilder();
        stringBuilder.append("Splits created:\n");
        for (InetAddress address : splitToHostAddress.keySet()) {
            stringBuilder.append("Host ");
            stringBuilder.append(address);
            stringBuilder.append(" has ");
            stringBuilder.append(splitToHostAddress.get(address).size());
            stringBuilder.append(" splits.\n");
        }
        System.out.println(stringBuilder.toString());

        System.out.println("Job initialization completed in " + (System.currentTimeMillis() - time) + " ms.");
        time = System.currentTimeMillis();

        InvokeResult<MapperResult> mapInvokeResult = MessagingHelper.invoke(grid,
                RunMapper.MapperInvokable.class, parameters, TimeSpan.INFINITE_TIMEOUT.getSeconds());
        if (mapInvokeResult.getErrors() != null && mapInvokeResult.getErrors().size() > 0) {
            throw new IOException("Map invocation failed.", mapInvokeResult.getErrors().get(0));
        }
        System.out.println("Map invocation done in " + (System.currentTimeMillis() - time) + " ms.");
        time = System.currentTimeMillis();

        MapperResult resultObject = mapInvokeResult.getResult();
        if (resultObject == null || mapInvokeResult.getNumFailed() != 0) {
            throw new IOException("Mapper invocation failed. Num failed = " + mapInvokeResult.getNumFailed());
        }
        if (resultObject.getNumberOfSplitsProcessed() != splitList.size()) {
            throw new IOException("Number of splits does not match the number of invocations. Nsplits = "
                    + splitList.size() + ", Ninvokes =" + resultObject.getNumberOfSplitsProcessed());
        }

        if (partitionMapping.length > 0) {
            // Run the reduce step
            InvokeResult<Integer> reduceInvokeResult = MessagingHelper.invoke(grid, ReduceInvokable.class,
                    job.getAppId(), TimeSpan.INFINITE_TIMEOUT.getSeconds());
            System.out.println("Reduce invocation done in " + (System.currentTimeMillis() - time) + " ms.");
            DataAccessor.clearObjects(job.getAppId()); // clear all temporary objects
            if (reduceInvokeResult.getErrors() != null && reduceInvokeResult.getErrors().size() > 0) {
                throw new IOException("Reduce invocation failed.", reduceInvokeResult.getErrors().get(0));
            }
            if (reduceInvokeResult.getNumFailed() != 0) {
                throw new IOException("Reduce invocation failed.");
            }
            if (reduceInvokeResult.getResult() != partitionMapping.length) {
                throw new IOException("Not all partitions were reduced. Expected = " + partitionMapping.length
                        + " Actual = " + reduceInvokeResult.getResult());
            }
        }
        outputCommitter.commitJob(job);
    } catch (StateServerException e) {
        throw new IOException("ScaleOut hServer access error.", e);
    }
}
From source file: org.apache.hcatalog.mapreduce.MultiOutputFormat.java
License: Apache License
@Override
public void checkOutputSpecs(JobContext context) throws IOException, InterruptedException {
    for (String alias : getOutputFormatAliases(context)) {
        LOGGER.debug("Calling checkOutputSpecs for alias: " + alias);
        JobContext aliasContext = getJobContext(alias, context);
        OutputFormat<?, ?> outputFormat = getOutputFormatInstance(aliasContext);
        outputFormat.checkOutputSpecs(aliasContext);
        // Copy credentials and any new config added back to JobContext
        context.getCredentials().addAll(aliasContext.getCredentials());
        setAliasConf(alias, context, aliasContext);
    }
}
From source file: org.apache.ignite.internal.processors.hadoop.impl.v2.HadoopV2SetupTask.java
License: Apache License
/** {@inheritDoc} */
@SuppressWarnings("ConstantConditions")
@Override
protected void run0(HadoopV2TaskContext taskCtx) throws IgniteCheckedException {
    try {
        JobContextImpl jobCtx = taskCtx.jobContext();
        OutputFormat outputFormat = getOutputFormat(jobCtx);
        outputFormat.checkOutputSpecs(jobCtx);
        OutputCommitter committer = outputFormat.getOutputCommitter(hadoopContext());
        if (committer != null)
            committer.setupJob(jobCtx);
    } catch (ClassNotFoundException | IOException e) {
        throw new IgniteCheckedException(e);
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
        throw new IgniteInterruptedCheckedException(e);
    }
}
From source file: org.apache.ignite.internal.processors.hadoop.v2.GridHadoopV2SetupTask.java
License: Apache License
/** {@inheritDoc} */
@SuppressWarnings("ConstantConditions")
@Override
protected void run0(GridHadoopV2TaskContext taskCtx) throws IgniteCheckedException {
    try {
        JobContextImpl jobCtx = taskCtx.jobContext();
        OutputFormat outputFormat = getOutputFormat(jobCtx);
        outputFormat.checkOutputSpecs(jobCtx);
        OutputCommitter committer = outputFormat.getOutputCommitter(hadoopContext());
        if (committer != null)
            committer.setupJob(jobCtx);
    } catch (ClassNotFoundException | IOException e) {
        throw new IgniteCheckedException(e);
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
        throw new IgniteInterruptedCheckedException(e);
    }
}
From source file: org.apache.parquet.pig.PerfTest2.java
License: Apache License
public static void write(String out)
        throws IOException, ParserException, InterruptedException, ExecException {
    {
        StringBuilder schemaString = new StringBuilder("a0: chararray");
        for (int i = 1; i < COLUMN_COUNT; i++) {
            schemaString.append(", a" + i + ": chararray");
        }
        String location = out;
        String schema = schemaString.toString();
        StoreFuncInterface storer = new ParquetStorer();
        Job job = new Job(conf);
        storer.setStoreFuncUDFContextSignature("sig");
        String absPath = storer.relToAbsPathForStoreLocation(location,
                new Path(new File(".").getAbsoluteFile().toURI()));
        storer.setStoreLocation(absPath, job);
        storer.checkSchema(new ResourceSchema(Utils.getSchemaFromString(schema)));
        @SuppressWarnings("unchecked") // that's how the base class is defined
        OutputFormat<Void, Tuple> outputFormat = storer.getOutputFormat();
        // it's ContextUtil.getConfiguration(job) and not just conf!
        JobContext jobContext = ContextUtil.newJobContext(ContextUtil.getConfiguration(job),
                new JobID("jt", jobid++));
        outputFormat.checkOutputSpecs(jobContext);
        if (schema != null) {
            ResourceSchema resourceSchema = new ResourceSchema(Utils.getSchemaFromString(schema));
            storer.checkSchema(resourceSchema);
            if (storer instanceof StoreMetadata) {
                ((StoreMetadata) storer).storeSchema(resourceSchema, absPath, job);
            }
        }
        TaskAttemptContext taskAttemptContext = ContextUtil.newTaskAttemptContext(
                ContextUtil.getConfiguration(job), new TaskAttemptID("jt", jobid, true, 1, 0));
        RecordWriter<Void, Tuple> recordWriter = outputFormat.getRecordWriter(taskAttemptContext);
        storer.prepareToWrite(recordWriter);
        for (int i = 0; i < ROW_COUNT; i++) {
            Tuple tuple = TupleFactory.getInstance().newTuple(COLUMN_COUNT);
            for (int j = 0; j < COLUMN_COUNT; j++) {
                tuple.set(j, "a" + i + "_" + j);
            }
            storer.putNext(tuple);
        }
        recordWriter.close(taskAttemptContext);
        OutputCommitter outputCommitter = outputFormat.getOutputCommitter(taskAttemptContext);
        outputCommitter.commitTask(taskAttemptContext);
        outputCommitter.commitJob(jobContext);
    }
}
From source file: org.apache.pig.backend.hadoop.executionengine.fetch.FetchPOStoreImpl.java
License: Apache License
@Override
public StoreFuncInterface createStoreFunc(POStore store) throws IOException {
    Configuration conf = ConfigurationUtil.toConfiguration(pc.getProperties());
    StoreFuncInterface storeFunc = store.getStoreFunc();
    JobContext jc = HadoopShims.createJobContext(conf, new JobID());
    OutputFormat<?, ?> outputFormat = storeFunc.getOutputFormat();
    PigOutputFormat.setLocation(jc, store);
    context = HadoopShims.createTaskAttemptContext(conf, HadoopShims.getNewTaskAttemptID());
    PigOutputFormat.setLocation(context, store);
    try {
        outputFormat.checkOutputSpecs(jc);
    } catch (InterruptedException e) {
        throw new IOException(e);
    }
    try {
        outputCommitter = outputFormat.getOutputCommitter(context);
        outputCommitter.setupJob(jc);
        outputCommitter.setupTask(context);
        writer = outputFormat.getRecordWriter(context);
    } catch (InterruptedException e) {
        throw new IOException(e);
    }
    storeFunc.prepareToWrite(writer);
    return storeFunc;
}