Usage examples for org.apache.hadoop.mapreduce.JobContext.getConfiguration()
public Configuration getConfiguration();
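For orientation, here is a minimal sketch of the pattern common to all the examples below: read job-scoped settings from the JobContext's Configuration. The class and property names in this sketch are hypothetical, chosen for illustration only; they do not come from the sources below.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.JobContext;

public class ConfigurationAccessExample {
    // Reads a job-level setting from the JobContext. The property name
    // "example.input.path" is hypothetical.
    static String resolveInputPath(JobContext context) {
        Configuration conf = context.getConfiguration();
        // get(key, default) returns the default when the property is unset
        return conf.get("example.input.path", "/tmp/input");
    }
}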
From source file:co.cask.cdap.internal.app.runtime.batch.dataset.output.MultipleOutputs.java
License:Apache License
private static Job getNamedJob(JobContext context, String namedOutput) throws IOException {
    // The following trick leverages the instantiation of a record writer via
    // the job, thus supporting arbitrary output formats.
    Job job = Job.getInstance(context.getConfiguration());
    job.setOutputFormatClass(getNamedOutputFormatClass(context, namedOutput));
    job.setOutputKeyClass(getNamedOutputKeyClass(context, namedOutput));
    job.setOutputValueClass(getNamedOutputValueClass(context, namedOutput));

    Configuration conf = job.getConfiguration();
    Map<String, String> namedConfigurations = ConfigurationUtil
        .getNamedConfigurations(context.getConfiguration(), computePrefixName(namedOutput));
    ConfigurationUtil.setAll(namedConfigurations, conf);
    return job;
}
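This example relies on CDAP's ConfigurationUtil to scope properties by a per-output prefix. As a rough illustration of that idea only (not CDAP's actual implementation), prefixed keys can be projected out of a Configuration like this:

import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;

public class PrefixedConfExample {
    // Collects entries whose keys start with the given prefix, stripping the
    // prefix. This sketches the assumed behavior of CDAP's
    // ConfigurationUtil.getNamedConfigurations; it is not the real code.
    static Map<String, String> byPrefix(Configuration conf, String prefix) {
        Map<String, String> scoped = new HashMap<>();
        // Configuration is Iterable<Map.Entry<String, String>>
        for (Map.Entry<String, String> entry : conf) {
            if (entry.getKey().startsWith(prefix)) {
                scoped.put(entry.getKey().substring(prefix.length()), entry.getValue());
            }
        }
        return scoped;
    }
}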
From source file:co.cask.cdap.internal.app.runtime.batch.dataset.output.MultipleOutputsMainOutputWrapper.java
License:Apache License
private OutputFormat<K, V> getRootOutputFormat(JobContext context) {
    if (innerFormat == null) {
        Configuration conf = context.getConfiguration();
        @SuppressWarnings("unchecked")
        Class<OutputFormat<K, V>> c = (Class<OutputFormat<K, V>>) conf.getClass(ROOT_OUTPUT_FORMAT,
            FileOutputFormat.class);
        try {
            innerFormat = c.newInstance();
        } catch (InstantiationException | IllegalAccessException e) {
            throw new RuntimeException(e);
        }
    }
    return innerFormat;
}
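On the configuring side, a delegate format class like this is typically registered with Configuration.setClass so that conf.getClass can resolve it later. A minimal sketch, assuming a key of your own choosing (the wrapper above uses its own private ROOT_OUTPUT_FORMAT constant):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class DelegateFormatConfig {
    // Hypothetical key name, for illustration only
    static final String ROOT_OUTPUT_FORMAT = "example.root.output.format.class";

    static void configure(Configuration conf) {
        // setClass stores the class name; getClass(...) later loads it via the job's classloader
        conf.setClass(ROOT_OUTPUT_FORMAT, TextOutputFormat.class, OutputFormat.class);
    }
}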
From source file:co.cask.cdap.internal.app.runtime.batch.dataset.partitioned.DynamicPartitioningOutputCommitter.java
License:Apache License
@Override
public void commitJob(JobContext context) throws IOException {
    Configuration configuration = context.getConfiguration();
    MapReduceClassLoader classLoader = MapReduceClassLoader.getFromConfiguration(configuration);
    BasicMapReduceTaskContext taskContext = classLoader.getTaskContextProvider().get(this.taskContext);

    String outputDatasetName = configuration.get(Constants.Dataset.Partitioned.HCONF_ATTR_OUTPUT_DATASET);
    PartitionedFileSet outputDataset = taskContext.getDataset(outputDatasetName);
    Partitioning partitioning = outputDataset.getPartitioning();

    Set<PartitionKey> partitionsToAdd = new HashSet<>();
    Set<String> relativePaths = new HashSet<>();
    // Go over all files in the temporary directory and keep track of partitions to add for them
    FileStatus[] allCommittedTaskPaths = getAllCommittedTaskPaths(context);
    for (FileStatus committedTaskPath : allCommittedTaskPaths) {
        FileSystem fs = committedTaskPath.getPath().getFileSystem(configuration);
        RemoteIterator<LocatedFileStatus> fileIter = fs.listFiles(committedTaskPath.getPath(), true);
        while (fileIter.hasNext()) {
            Path path = fileIter.next().getPath();
            String relativePath = getRelative(committedTaskPath.getPath(), path);

            int lastPathSepIdx = relativePath.lastIndexOf(Path.SEPARATOR);
            if (lastPathSepIdx == -1) {
                // this shouldn't happen because each relative path should consist of at least
                // one partition key and the output file name
                LOG.warn("Skipping path '{}'. Its relative path '{}' has fewer than two parts", path,
                    relativePath);
                continue;
            }
            // relativePath = "../key1/key2/part-m-00000"
            // relativeDir  = "../key1/key2"
            // fileName     = "part-m-00000"
            String relativeDir = relativePath.substring(0, lastPathSepIdx);
            String fileName = relativePath.substring(lastPathSepIdx + 1);

            Path finalDir = new Path(FileOutputFormat.getOutputPath(context), relativeDir);
            Path finalPath = new Path(finalDir, fileName);
            if (fs.exists(finalPath)) {
                throw new FileAlreadyExistsException("Final output path " + finalPath + " already exists");
            }
            PartitionKey partitionKey = getPartitionKey(partitioning, relativeDir);
            partitionsToAdd.add(partitionKey);
            relativePaths.add(relativeDir);
        }
    }

    // We need to copy to the parent of the FileOutputFormat's outputDir, since we added a
    // _temporary_jobId suffix to the original outputDir.
    Path finalOutput = FileOutputFormat.getOutputPath(context);
    FileSystem fs = finalOutput.getFileSystem(configuration);
    for (FileStatus stat : getAllCommittedTaskPaths(context)) {
        mergePaths(fs, stat, finalOutput);
    }

    // compute the metadata to be written to every output partition
    Map<String, String> metadata = ConfigurationUtil.getNamedConfigurations(this.taskContext.getConfiguration(),
        PartitionedFileSetArguments.OUTPUT_PARTITION_METADATA_PREFIX);

    // create all the necessary partitions
    for (PartitionKey partitionKey : partitionsToAdd) {
        PartitionOutput partitionOutput = outputDataset.getPartitionOutput(partitionKey);
        partitionOutput.setMetadata(metadata);
        partitionOutput.addPartition();
    }

    // close the TaskContext, which flushes dataset operations
    try {
        taskContext.flushOperations();
    } catch (Exception e) {
        Throwables.propagateIfPossible(e, IOException.class);
        throw new IOException(e);
    }

    // delete the job-specific _temporary folder and create a _done file in the output folder
    cleanupJob(context);

    // mark all the final output paths with a _SUCCESS file, if configured to do so (default = true)
    if (configuration.getBoolean(SUCCESSFUL_JOB_OUTPUT_DIR_MARKER, true)) {
        for (String relativePath : relativePaths) {
            Path pathToMark = new Path(finalOutput, relativePath);
            Path markerPath = new Path(pathToMark, SUCCEEDED_FILE_NAME);
            fs.createNewFile(markerPath);
        }
    }
}
From source file:co.cask.cdap.internal.app.runtime.batch.dataset.partitioned.DynamicPartitioningOutputCommitter.java
License:Apache License
@Override
public void cleanupJob(JobContext context) throws IOException {
    FileSystem fs = jobSpecificOutputPath.getFileSystem(context.getConfiguration());
    fs.delete(jobSpecificOutputPath, true);
}
From source file:co.cask.cdap.internal.app.runtime.batch.dataset.partitioned.DynamicPartitioningOutputCommitter.java
License:Apache License
/**
 * Get a list of all paths where output from committed tasks is stored.
 *
 * @param context the context of the current job
 * @return the list of these Paths/FileStatuses
 * @throws IOException
 */
private FileStatus[] getAllCommittedTaskPaths(JobContext context) throws IOException {
    Path jobAttemptPath = getJobAttemptPath(context);
    FileSystem fs = jobAttemptPath.getFileSystem(context.getConfiguration());
    return fs.listStatus(jobAttemptPath, new CommittedTaskFilter());
}
From source file:co.cask.cdap.internal.app.runtime.batch.dataset.partitioned.DynamicPartitioningOutputFormat.java
License:Apache License
@Override
public void checkOutputSpecs(JobContext job) throws IOException {
    // Ensure that the output directory is set
    Path outDir = getOutputPath(job);
    if (outDir == null) {
        throw new InvalidJobConfException("Output directory not set.");
    }

    // get delegation token for outDir's file system
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), new Path[] { outDir }, job.getConfiguration());

    // we permit multiple jobs writing to the same output directory. We handle this by each one
    // writing to distinct paths within that directory. See createJobSpecificPath method and usages of it.

    // additionally check that the output dataset and dynamic partitioner class names have been set in conf
    if (job.getConfiguration().get(Constants.Dataset.Partitioned.HCONF_ATTR_OUTPUT_DATASET) == null) {
        throw new InvalidJobConfException("The job configuration does not contain required property: "
            + Constants.Dataset.Partitioned.HCONF_ATTR_OUTPUT_DATASET);
    }

    Class<? extends DynamicPartitioner> partitionerClass = job.getConfiguration().getClass(
        PartitionedFileSetArguments.DYNAMIC_PARTITIONER_CLASS_NAME, null, DynamicPartitioner.class);
    if (partitionerClass == null) {
        throw new InvalidJobConfException("The job configuration does not contain required property: "
            + PartitionedFileSetArguments.DYNAMIC_PARTITIONER_CLASS_NAME);
    }

    Class<? extends FileOutputFormat> delegateOutputFormatClass = job.getConfiguration().getClass(
        Constants.Dataset.Partitioned.HCONF_ATTR_OUTPUT_FORMAT_CLASS_NAME, null, FileOutputFormat.class);
    if (delegateOutputFormatClass == null) {
        throw new InvalidJobConfException("The job configuration does not contain required property: "
            + Constants.Dataset.Partitioned.HCONF_ATTR_OUTPUT_FORMAT_CLASS_NAME);
    }
}
From source file:co.cask.cdap.internal.app.runtime.spark.dataset.SparkDatasetInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(final JobContext context) throws IOException, InterruptedException {
    ExecutionSparkContext sparkContext = SparkContextProvider.getSparkContext();
    Configuration configuration = context.getConfiguration();
    Map<String, String> arguments = GSON.fromJson(configuration.get(INPUT_DATASET_ARGS), ARGS_TYPE);
    BatchReadable<?, ?> batchReadable = sparkContext.getBatchReadable(configuration.get(INPUT_DATASET_NAME),
        arguments);

    List<Split> splits = batchReadable.getSplits();
    List<InputSplit> list = new ArrayList<>(splits.size());
    for (Split split : splits) {
        list.add(new DataSetInputSplit(split));
    }
    return list;
}
From source file:co.cask.hydrator.plugin.batch.CopybookInputFormat.java
License:Apache License
@Override
protected boolean isSplitable(JobContext context, Path file) {
    Configuration conf = context.getConfiguration();
    Path path = new Path(conf.get(COPYBOOK_INPUTFORMAT_DATA_HDFS_PATH));
    final CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(path);
    // splittable unless the file is compressed with a non-splittable codec
    return codec == null || codec instanceof SplittableCompressionCodec;
}
From source file:co.cask.hydrator.plugin.batchSource.KafkaInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
    Gson gson = new Gson();
    List<KafkaRequest> finalRequests = gson.fromJson(context.getConfiguration().get(KAFKA_REQUEST), LIST_TYPE);
    List<InputSplit> kafkaSplits = new ArrayList<>();
    for (KafkaRequest r : finalRequests) {
        KafkaSplit split = new KafkaSplit(r);
        kafkaSplits.add(split);
    }
    return kafkaSplits;
}
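Both the Spark and the Kafka examples stash structured arguments in the Configuration as a JSON string and parse them back with Gson on the task side. A minimal sketch of that round-trip, with a hypothetical key and type token mirroring KAFKA_REQUEST / LIST_TYPE above:

import java.lang.reflect.Type;
import java.util.Map;

import com.google.gson.Gson;
import com.google.gson.reflect.TypeToken;
import org.apache.hadoop.conf.Configuration;

public class JsonConfExample {
    private static final Gson GSON = new Gson();
    // Hypothetical key and type, for illustration only
    private static final String ARGS_KEY = "example.dataset.args";
    private static final Type ARGS_TYPE = new TypeToken<Map<String, String>>() { }.getType();

    static void write(Configuration conf, Map<String, String> args) {
        conf.set(ARGS_KEY, GSON.toJson(args));  // serialize into a single string property
    }

    static Map<String, String> read(Configuration conf) {
        return GSON.fromJson(conf.get(ARGS_KEY), ARGS_TYPE);  // parse back on the task side
    }
}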
From source file:co.nubetech.apache.hadoop.DataDrivenDBInputFormat.java
License:Apache License
/** {@inheritDoc} */
public List<InputSplit> getSplits(JobContext job) throws IOException {
    int targetNumTasks = job.getConfiguration().getInt(MRJobConfig.NUM_MAPS, 1);
    if (1 == targetNumTasks) {
        // There's no need to run a bounding vals query; just return a split
        // that separates nothing. This can be considerably more optimal for
        // a large table with no index.
        List<InputSplit> singletonSplit = new ArrayList<InputSplit>();
        singletonSplit.add(new DataDrivenDBInputSplit("1=1", "1=1"));
        return singletonSplit;
    }

    ResultSet results = null;
    Statement statement = null;
    Connection connection = getConnection();
    try {
        statement = connection.createStatement();

        results = statement.executeQuery(getBoundingValsQuery());
        results.next();

        // Based on the type of the results, use a different mechanism
        // for interpolating split points (i.e., numeric splits, text
        // splits, dates, etc.)
        int sqlDataType = results.getMetaData().getColumnType(1);
        DBSplitter splitter = getSplitter(sqlDataType);
        if (null == splitter) {
            throw new IOException("Unknown SQL data type: " + sqlDataType);
        }

        return splitter.split(job.getConfiguration(), results, getDBConf().getInputOrderBy());
    } catch (SQLException e) {
        throw new IOException(e.getMessage());
    } finally {
        // More-or-less ignore SQL exceptions here, but log in case we need it.
        try {
            if (null != results) {
                results.close();
            }
        } catch (SQLException se) {
            LOG.debug("SQLException closing resultset: " + se.toString());
        }

        try {
            if (null != statement) {
                statement.close();
            }
        } catch (SQLException se) {
            LOG.debug("SQLException closing statement: " + se.toString());
        }

        try {
            connection.commit();
            closeConnection();
        } catch (SQLException se) {
            LOG.debug("SQLException committing split transaction: " + se.toString());
        }
    }
}
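The bounding-values idea behind DBSplitter: query the min and max of the split column, then carve that range into disjoint WHERE clauses, one per split. A toy sketch of the numeric case, assuming a long-valued column (this is not Hadoop's actual IntegerSplitter):

import java.util.ArrayList;
import java.util.List;

public class RangeSplitExample {
    // Divides [min, max] into roughly numSplits disjoint WHERE clauses over the given column.
    static List<String> split(String col, long min, long max, int numSplits) {
        List<String> clauses = new ArrayList<>();
        long step = Math.max(1, (max - min) / numSplits);
        for (long lo = min; lo <= max; lo += step) {
            long hi = Math.min(max, lo + step - 1);
            clauses.add(col + " >= " + lo + " AND " + col + " <= " + hi);
        }
        return clauses;
    }
}

For example, split("id", 1, 10, 3) yields clauses covering 1-3, 4-6, 7-9, and 10-10; together they partition the full range with no overlap.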