Example usage for org.apache.hadoop.mapreduce JobContext getConfiguration

List of usage examples for org.apache.hadoop.mapreduce JobContext getConfiguration

Introduction

On this page you can find example usage of org.apache.hadoop.mapreduce.JobContext.getConfiguration().

Prototype

public Configuration getConfiguration();

Document

Return the configuration for the job.
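
Before the collected examples, here is a minimal sketch of the typical pattern: getConfiguration() gives an InputFormat or OutputFormat access to the job's Configuration so it can read its own settings. The class MinimalTextInputFormat and the property name custom.input.check below are illustrative assumptions and do not appear in any of the source files listed on this page.

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class MinimalTextInputFormat extends TextInputFormat {

    @Override
    public List<InputSplit> getSplits(JobContext context) throws IOException {
        // JobContext.getConfiguration() exposes the job's Configuration, which is
        // where an InputFormat typically reads its own settings.
        Configuration conf = context.getConfiguration();
        // "custom.input.check" is an illustrative property name, not a Hadoop constant.
        boolean failOnEmpty = conf.getBoolean("custom.input.check", false);
        List<InputSplit> splits = super.getSplits(context);
        if (failOnEmpty && splits.isEmpty()) {
            throw new IOException("No input splits were produced for this job");
        }
        return splits;
    }
}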

Usage

From source file: co.cask.cdap.internal.app.runtime.batch.dataset.output.MultipleOutputs.java

License: Apache License

private static Job getNamedJob(JobContext context, String namedOutput) throws IOException {
    // The following trick leverages the instantiation of a record writer via
    // the job thus supporting arbitrary output formats.
    Job job = Job.getInstance(context.getConfiguration());
    job.setOutputFormatClass(getNamedOutputFormatClass(context, namedOutput));
    job.setOutputKeyClass(getNamedOutputKeyClass(context, namedOutput));
    job.setOutputValueClass(getNamedOutputValueClass(context, namedOutput));

    Configuration conf = job.getConfiguration();
    Map<String, String> namedConfigurations = ConfigurationUtil
            .getNamedConfigurations(context.getConfiguration(), computePrefixName(namedOutput));
    ConfigurationUtil.setAll(namedConfigurations, conf);
    return job;
}

From source file: co.cask.cdap.internal.app.runtime.batch.dataset.output.MultipleOutputsMainOutputWrapper.java

License: Apache License

private OutputFormat<K, V> getRootOutputFormat(JobContext context) {
    if (innerFormat == null) {
        Configuration conf = context.getConfiguration();
        @SuppressWarnings("unchecked")
        Class<OutputFormat<K, V>> c = (Class<OutputFormat<K, V>>) conf.getClass(ROOT_OUTPUT_FORMAT,
                FileOutputFormat.class);
        try {
            innerFormat = c.newInstance();
        } catch (InstantiationException | IllegalAccessException e) {
            throw new RuntimeException(e);
        }
    }
    return innerFormat;
}

From source file: co.cask.cdap.internal.app.runtime.batch.dataset.partitioned.DynamicPartitioningOutputCommitter.java

License: Apache License

@Override
public void commitJob(JobContext context) throws IOException {
    Configuration configuration = context.getConfiguration();
    MapReduceClassLoader classLoader = MapReduceClassLoader.getFromConfiguration(configuration);
    BasicMapReduceTaskContext taskContext = classLoader.getTaskContextProvider().get(this.taskContext);

    String outputDatasetName = configuration.get(Constants.Dataset.Partitioned.HCONF_ATTR_OUTPUT_DATASET);
    PartitionedFileSet outputDataset = taskContext.getDataset(outputDatasetName);
    Partitioning partitioning = outputDataset.getPartitioning();

    Set<PartitionKey> partitionsToAdd = new HashSet<>();
    Set<String> relativePaths = new HashSet<>();
    // Go over all files in the temporary directory and keep track of partitions to add for them
    FileStatus[] allCommittedTaskPaths = getAllCommittedTaskPaths(context);
    for (FileStatus committedTaskPath : allCommittedTaskPaths) {
        FileSystem fs = committedTaskPath.getPath().getFileSystem(configuration);
        RemoteIterator<LocatedFileStatus> fileIter = fs.listFiles(committedTaskPath.getPath(), true);
        while (fileIter.hasNext()) {
            Path path = fileIter.next().getPath();
            String relativePath = getRelative(committedTaskPath.getPath(), path);

            int lastPathSepIdx = relativePath.lastIndexOf(Path.SEPARATOR);
            if (lastPathSepIdx == -1) {
                // this shouldn't happen because each relative path should consist of at least one partition key and
                // the output file name
                LOG.warn("Skipping path '{}'. It's relative path '{}' has fewer than two parts", path,
                        relativePath);//from  w  w w . j  av a 2 s .co  m
                continue;
            }
            // relativePath = "../key1/key2/part-m-00000"
            // relativeDir = "../key1/key2"
            // fileName = "part-m-00000"
            String relativeDir = relativePath.substring(0, lastPathSepIdx);
            String fileName = relativePath.substring(lastPathSepIdx + 1);

            Path finalDir = new Path(FileOutputFormat.getOutputPath(context), relativeDir);
            Path finalPath = new Path(finalDir, fileName);
            if (fs.exists(finalPath)) {
                throw new FileAlreadyExistsException("Final output path " + finalPath + " already exists");
            }
            PartitionKey partitionKey = getPartitionKey(partitioning, relativeDir);
            partitionsToAdd.add(partitionKey);
            relativePaths.add(relativeDir);
        }
    }

    // We need to copy to the parent of the FileOutputFormat's outputDir, since we added a _temporary_jobId suffix to
    // the original outputDir.
    Path finalOutput = FileOutputFormat.getOutputPath(context);
    FileSystem fs = finalOutput.getFileSystem(configuration);
    for (FileStatus stat : getAllCommittedTaskPaths(context)) {
        mergePaths(fs, stat, finalOutput);
    }

    // compute the metadata to be written to every output partition
    Map<String, String> metadata = ConfigurationUtil.getNamedConfigurations(this.taskContext.getConfiguration(),
            PartitionedFileSetArguments.OUTPUT_PARTITION_METADATA_PREFIX);

    // create all the necessary partitions
    for (PartitionKey partitionKey : partitionsToAdd) {
        PartitionOutput partitionOutput = outputDataset.getPartitionOutput(partitionKey);
        partitionOutput.setMetadata(metadata);
        partitionOutput.addPartition();
    }

    // close the TaskContext, which flushes dataset operations
    try {
        taskContext.flushOperations();
    } catch (Exception e) {
        Throwables.propagateIfPossible(e, IOException.class);
        throw new IOException(e);
    }

    // delete the job-specific _temporary folder and create a _done file in the o/p folder
    cleanupJob(context);

    // mark all the final output paths with a _SUCCESS file, if configured to do so (default = true)
    if (configuration.getBoolean(SUCCESSFUL_JOB_OUTPUT_DIR_MARKER, true)) {
        for (String relativePath : relativePaths) {
            Path pathToMark = new Path(finalOutput, relativePath);
            Path markerPath = new Path(pathToMark, SUCCEEDED_FILE_NAME);
            fs.createNewFile(markerPath);
        }
    }
}

From source file: co.cask.cdap.internal.app.runtime.batch.dataset.partitioned.DynamicPartitioningOutputCommitter.java

License: Apache License

@Override
public void cleanupJob(JobContext context) throws IOException {
    FileSystem fs = jobSpecificOutputPath.getFileSystem(context.getConfiguration());
    fs.delete(jobSpecificOutputPath, true);
}

From source file: co.cask.cdap.internal.app.runtime.batch.dataset.partitioned.DynamicPartitioningOutputCommitter.java

License: Apache License

/**
 * Get a list of all paths where output from committed tasks are stored.
 * @param context the context of the current job
 * @return the list of these Paths/FileStatuses.
 * @throws IOException
 */
private FileStatus[] getAllCommittedTaskPaths(JobContext context) throws IOException {
    Path jobAttemptPath = getJobAttemptPath(context);
    FileSystem fs = jobAttemptPath.getFileSystem(context.getConfiguration());
    return fs.listStatus(jobAttemptPath, new CommittedTaskFilter());
}

From source file: co.cask.cdap.internal.app.runtime.batch.dataset.partitioned.DynamicPartitioningOutputFormat.java

License: Apache License

@Override
public void checkOutputSpecs(JobContext job) throws IOException {
    // Ensure that the output directory is set and not already there
    Path outDir = getOutputPath(job);
    if (outDir == null) {
        throw new InvalidJobConfException("Output directory not set.");
    }

    // get delegation token for outDir's file system
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), new Path[] { outDir }, job.getConfiguration());

    // we permit multiple jobs writing to the same output directory. We handle this by each one writing to distinct
    // paths within that directory. See createJobSpecificPath method and usages of it.

    // additionally check that output dataset and dynamic partitioner class name has been set in conf
    if (job.getConfiguration().get(Constants.Dataset.Partitioned.HCONF_ATTR_OUTPUT_DATASET) == null) {
        throw new InvalidJobConfException("The job configuration does not contain required property: "
                + Constants.Dataset.Partitioned.HCONF_ATTR_OUTPUT_DATASET);
    }

    Class<? extends DynamicPartitioner> partitionerClass = job.getConfiguration().getClass(
            PartitionedFileSetArguments.DYNAMIC_PARTITIONER_CLASS_NAME, null, DynamicPartitioner.class);
    if (partitionerClass == null) {
        throw new InvalidJobConfException("The job configuration does not contain required property: "
                + PartitionedFileSetArguments.DYNAMIC_PARTITIONER_CLASS_NAME);
    }

    Class<? extends FileOutputFormat> delegateOutputFormatClass = job.getConfiguration().getClass(
            Constants.Dataset.Partitioned.HCONF_ATTR_OUTPUT_FORMAT_CLASS_NAME, null, FileOutputFormat.class);
    if (delegateOutputFormatClass == null) {
        throw new InvalidJobConfException("The job configuration does not contain required property: "
                + Constants.Dataset.Partitioned.HCONF_ATTR_OUTPUT_FORMAT_CLASS_NAME);
    }
}

From source file: co.cask.cdap.internal.app.runtime.spark.dataset.SparkDatasetInputFormat.java

License: Apache License

@Override
public List<InputSplit> getSplits(final JobContext context) throws IOException, InterruptedException {
    ExecutionSparkContext sparkContext = SparkContextProvider.getSparkContext();

    Configuration configuration = context.getConfiguration();
    Map<String, String> arguments = GSON.fromJson(configuration.get(INPUT_DATASET_ARGS), ARGS_TYPE);
    BatchReadable<?, ?> batchReadable = sparkContext.getBatchReadable(configuration.get(INPUT_DATASET_NAME),
            arguments);

    List<Split> splits = batchReadable.getSplits();
    List<InputSplit> list = new ArrayList<>(splits.size());
    for (Split split : splits) {
        list.add(new DataSetInputSplit(split));
    }
    return list;
}

From source file: co.cask.hydrator.plugin.batch.CopybookInputFormat.java

License: Apache License

@Override
protected boolean isSplitable(JobContext context, Path file) {
    Configuration conf = context.getConfiguration();
    Path path = new Path(conf.get(COPYBOOK_INPUTFORMAT_DATA_HDFS_PATH));
    final CompressionCodec codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(path);
    return (null == codec) ? true : codec instanceof SplittableCompressionCodec;
}

From source file: co.cask.hydrator.plugin.batchSource.KafkaInputFormat.java

License: Apache License

@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
    Gson gson = new Gson();
    List<KafkaRequest> finalRequests = gson.fromJson(context.getConfiguration().get(KAFKA_REQUEST), LIST_TYPE);
    List<InputSplit> kafkaSplits = new ArrayList<>();

    for (KafkaRequest r : finalRequests) {
        KafkaSplit split = new KafkaSplit(r);
        kafkaSplits.add(split);
    }

    return kafkaSplits;
}

From source file: co.nubetech.apache.hadoop.DataDrivenDBInputFormat.java

License: Apache License

/** {@inheritDoc} */
public List<InputSplit> getSplits(JobContext job) throws IOException {

    int targetNumTasks = job.getConfiguration().getInt(MRJobConfig.NUM_MAPS, 1);
    if (1 == targetNumTasks) {
        // There's no need to run a bounding vals query; just return a split
        // that separates nothing. This can be considerably more optimal for
        // a large table with no index.
        List<InputSplit> singletonSplit = new ArrayList<InputSplit>();
        singletonSplit.add(new DataDrivenDBInputSplit("1=1", "1=1"));
        return singletonSplit;
    }

    ResultSet results = null;
    Statement statement = null;
    Connection connection = getConnection();
    try {
        statement = connection.createStatement();

        results = statement.executeQuery(getBoundingValsQuery());
        results.next();

        // Based on the type of the results, use a different mechanism
        // for interpolating split points (i.e., numeric splits, text splits,
        // dates, etc.)
        int sqlDataType = results.getMetaData().getColumnType(1);
        DBSplitter splitter = getSplitter(sqlDataType);
        if (null == splitter) {
            throw new IOException("Unknown SQL data type: " + sqlDataType);
        }

        return splitter.split(job.getConfiguration(), results, getDBConf().getInputOrderBy());
    } catch (SQLException e) {
        throw new IOException(e.getMessage());
    } finally {
        // More-or-less ignore SQL exceptions here, but log in case we need
        // it.
        try {
            if (null != results) {
                results.close();
            }
        } catch (SQLException se) {
            LOG.debug("SQLException closing resultset: " + se.toString());
        }

        try {
            if (null != statement) {
                statement.close();
            }
        } catch (SQLException se) {
            LOG.debug("SQLException closing statement: " + se.toString());
        }

        try {
            connection.commit();
            closeConnection();
        } catch (SQLException se) {
            LOG.debug("SQLException committing split transaction: " + se.toString());
        }
    }
}