Example usage for org.apache.hadoop.mapreduce MRJobConfig OUTPUT_FORMAT_CLASS_ATTR

List of usage examples for org.apache.hadoop.mapreduce MRJobConfig OUTPUT_FORMAT_CLASS_ATTR

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce MRJobConfig OUTPUT_FORMAT_CLASS_ATTR, collected from open source projects.

Prototype

String OUTPUT_FORMAT_CLASS_ATTR

OUTPUT_FORMAT_CLASS_ATTR is the configuration key ("mapreduce.job.outputformat.class") under which the new MapReduce API records the fully qualified class name of the job's OutputFormat implementation.

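Before the project examples, here is a minimal standalone sketch (the class name OutputFormatAttrDemo and the choice of TextOutputFormat are illustrative, not taken from the projects below): it stores an OutputFormat class under this attribute on a plain Configuration, which is what Job.setOutputFormatClass does internally, and reads it back.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class OutputFormatAttrDemo {
    public static void main(String[] args) {
        Configuration conf = new Configuration();

        // Store the OutputFormat class under the attribute; this is what
        // Job.setOutputFormatClass(...) does internally.
        conf.setClass(MRJobConfig.OUTPUT_FORMAT_CLASS_ATTR, TextOutputFormat.class, OutputFormat.class);

        // Consumers (the MapReduce framework, Spark's saveAsNewAPIHadoopDataset, ...)
        // resolve the class back from the same key.
        Class<? extends OutputFormat> cls =
                conf.getClass(MRJobConfig.OUTPUT_FORMAT_CLASS_ATTR, null, OutputFormat.class);
        System.out.println("Configured output format: " + cls.getName());
    }
}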

Usage

From source file:co.cask.cdap.etl.batch.spark.SparkBatchSinkFactory.java

License:Apache License

<K, V> void writeFromRDD(JavaPairRDD<K, V> rdd, JavaSparkExecutionContext sec, String sinkName,
        Class<K> keyClass, Class<V> valueClass) {
    Set<String> outputNames = sinkOutputs.get(sinkName);
    if (outputNames == null || outputNames.size() == 0) {
        // should never happen if validation happened correctly at pipeline configure time
        throw new IllegalArgumentException(
                sinkName + " has no outputs. " + "Please check that the sink calls addOutput at some point.");
    }

    for (String outputName : outputNames) {
        OutputFormatProvider outputFormatProvider = outputFormatProviders.get(outputName);
        if (outputFormatProvider != null) {
            Configuration hConf = new Configuration();
            hConf.clear();
            for (Map.Entry<String, String> entry : outputFormatProvider.getOutputFormatConfiguration()
                    .entrySet()) {
                hConf.set(entry.getKey(), entry.getValue());
            }
            hConf.set(MRJobConfig.OUTPUT_FORMAT_CLASS_ATTR, outputFormatProvider.getOutputFormatClassName());
            rdd.saveAsNewAPIHadoopDataset(hConf);
        }

        DatasetInfo datasetInfo = datasetInfos.get(outputName);
        if (datasetInfo != null) {
            sec.saveAsDataset(rdd, datasetInfo.getDatasetName(), datasetInfo.getDatasetArgs());
        }
    }
}

From source file:co.cask.cdap.etl.spark.batch.SparkBatchSinkFactory.java

License:Apache License

public <K, V> void writeFromRDD(JavaPairRDD<K, V> rdd, JavaSparkExecutionContext sec, String sinkName,
        Class<K> keyClass, Class<V> valueClass) {
    Set<String> outputNames = sinkOutputs.get(sinkName);
    if (outputNames == null || outputNames.isEmpty()) {
        // should never happen if validation happened correctly at pipeline configure time
        throw new IllegalArgumentException(
                sinkName + " has no outputs. " + "Please check that the sink calls addOutput at some point.");
    }

    for (String outputName : outputNames) {
        OutputFormatProvider outputFormatProvider = outputFormatProviders.get(outputName);
        if (outputFormatProvider != null) {
            Configuration hConf = new Configuration();
            hConf.clear();
            for (Map.Entry<String, String> entry : outputFormatProvider.getOutputFormatConfiguration()
                    .entrySet()) {
                hConf.set(entry.getKey(), entry.getValue());
            }
            hConf.set(MRJobConfig.OUTPUT_FORMAT_CLASS_ATTR, outputFormatProvider.getOutputFormatClassName());
            rdd.saveAsNewAPIHadoopDataset(hConf);
        }

        DatasetInfo datasetInfo = datasetInfos.get(outputName);
        if (datasetInfo != null) {
            sec.saveAsDataset(rdd, datasetInfo.getDatasetName(), datasetInfo.getDatasetArgs());
        }
    }
}
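
Both SparkBatchSinkFactory variants above rely on Spark resolving the output format from this attribute when saveAsNewAPIHadoopDataset is called. The following is a rough standalone illustration of that pattern using the public Spark Java API (the app name, sample data, and /tmp output path are placeholders, not part of the CDAP code):

import java.util.Arrays;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class SaveRddWithOutputFormatAttr {
    public static void main(String[] args) throws Exception {
        JavaSparkContext jsc = new JavaSparkContext(
                new SparkConf().setAppName("output-format-attr-demo").setMaster("local[*]"));

        JavaPairRDD<Text, IntWritable> rdd = jsc.parallelize(Arrays.asList("a", "bb", "ccc"))
                .mapToPair(s -> new Tuple2<Text, IntWritable>(new Text(s), new IntWritable(s.length())));

        // Job.setOutputFormatClass(...) records TextOutputFormat under
        // MRJobConfig.OUTPUT_FORMAT_CLASS_ATTR in the job's Configuration.
        Job job = Job.getInstance();
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        TextOutputFormat.setOutputPath(job, new Path("/tmp/output-format-attr-demo")); // placeholder path

        // Spark resolves the output format from that attribute when writing the dataset.
        rdd.saveAsNewAPIHadoopDataset(job.getConfiguration());

        jsc.stop();
    }
}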

From source file:co.cask.cdap.internal.app.runtime.spark.AbstractSparkContext.java

License:Apache License

/**
 * Sets the output {@link Dataset} with splits in the {@link Configuration}
 *
 * @param datasetName the name of the {@link Dataset} to write to
 * @return updated {@link Configuration}
 */
Configuration setOutputDataset(String datasetName) {
    Configuration hConf = new Configuration(getHConf());
    hConf.set(SparkDatasetOutputFormat.HCONF_ATTR_OUTPUT_DATASET, datasetName);
    hConf.setClass(MRJobConfig.OUTPUT_FORMAT_CLASS_ATTR, SparkDatasetOutputFormat.class, OutputFormat.class);
    return hConf;
}

From source file:co.cask.cdap.internal.app.runtime.spark.JavaSparkFacade.java

License:Apache License

@SuppressWarnings("unchecked")
@Override
public <R, K, V> void saveAsDataset(R rdd, Class<? extends OutputFormat> outputFormatClass, Class<K> keyClass,
        Class<V> valueClass, Configuration hConf) {
    Preconditions.checkArgument(rdd instanceof JavaPairRDD, "RDD class %s is not a subclass of %s",
            rdd.getClass().getName(), JavaPairRDD.class.getName());
    hConf.set(MRJobConfig.OUTPUT_FORMAT_CLASS_ATTR, outputFormatClass.getName());
    ((JavaPairRDD<K, V>) rdd).saveAsNewAPIHadoopDataset(hConf);
}

From source file:co.cask.cdap.internal.app.runtime.spark.ScalaSparkFacade.java

License:Apache License

@SuppressWarnings("unchecked")
@Override
public <R, K, V> void saveAsDataset(R rdd, Class<? extends OutputFormat> outputFormatClass, Class<K> keyClass,
        Class<V> valueClass, Configuration hConf) {
    Preconditions.checkArgument(rdd instanceof RDD, "RDD class %s is not a subclass of %s",
            rdd.getClass().getName(), RDD.class.getName());

    hConf.set(MRJobConfig.OUTPUT_FORMAT_CLASS_ATTR, outputFormatClass.getName());

    ClassTag<K> kClassTag = ClassTag$.MODULE$.apply(keyClass);
    ClassTag<V> vClassTag = ClassTag$.MODULE$.apply(valueClass);

    PairRDDFunctions<K, V> pairRDD = new PairRDDFunctions<K, V>((RDD<Tuple2<K, V>>) rdd, kClassTag, vClassTag,
            null);
    pairRDD.saveAsNewAPIHadoopDataset(hConf);
}

From source file:org.apache.beam.sdk.io.hadoop.format.HadoopFormatIOTest.java

License:Apache License

private static Configuration loadTestConfiguration(Class<?> outputFormatClassName, Class<?> keyClass,
        Class<?> valueClass) {
    Configuration conf = new Configuration();
    conf.setClass(MRJobConfig.OUTPUT_FORMAT_CLASS_ATTR, outputFormatClassName, OutputFormat.class);
    conf.setClass(MRJobConfig.OUTPUT_KEY_CLASS, keyClass, Object.class);
    conf.setClass(MRJobConfig.OUTPUT_VALUE_CLASS, valueClass, Object.class);
    conf.setInt(MRJobConfig.NUM_REDUCES, REDUCERS_COUNT);
    conf.set(MRJobConfig.ID, String.valueOf(1));
    return conf;
}

From source file:org.apache.beam.sdk.io.hadoop.format.HadoopFormats.java

License:Apache License

/**
 * Returns instance of {@link OutputFormat} by class name stored in the configuration under key
 * {@link MRJobConfig#OUTPUT_FORMAT_CLASS_ATTR}.
 *
 * @param conf Hadoop configuration
 * @param <KeyT> KeyType of output format
 * @param <ValueT> ValueType of output format
 * @return OutputFormatter
 * @throws IllegalArgumentException if particular key was not found in the config or Formatter was
 *     unable to construct.
 */
@SuppressWarnings("unchecked")
static <KeyT, ValueT> OutputFormat<KeyT, ValueT> createOutputFormatFromConfig(Configuration conf)
        throws IllegalArgumentException {
    return (OutputFormat<KeyT, ValueT>) createInstanceFromConfig(conf, MRJobConfig.OUTPUT_FORMAT_CLASS_ATTR,
            null, OutputFormat.class);
}
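
The helper above delegates to createInstanceFromConfig, which is not shown on this page. For context, a rough equivalent using only Hadoop utilities could look like the following sketch (the class and method names are illustrative, not Beam's internals, and the real helper has additional error handling):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.util.ReflectionUtils;

public class OutputFormatLookup {

    /** Illustrative helper: instantiate the OutputFormat named under OUTPUT_FORMAT_CLASS_ATTR. */
    @SuppressWarnings("unchecked")
    static <K, V> OutputFormat<K, V> outputFormatFromConf(Configuration conf) {
        Class<? extends OutputFormat> cls =
                conf.getClass(MRJobConfig.OUTPUT_FORMAT_CLASS_ATTR, null, OutputFormat.class);
        if (cls == null) {
            throw new IllegalArgumentException(
                    MRJobConfig.OUTPUT_FORMAT_CLASS_ATTR + " is not set in the configuration");
        }
        // ReflectionUtils also injects the Configuration if the class is Configurable.
        return (OutputFormat<K, V>) ReflectionUtils.newInstance(cls, conf);
    }
}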

From source file:org.apache.ignite.internal.processors.hadoop.GridHadoopUtils.java

License:Apache License

/**
 * Creates JobInfo from hadoop configuration.
 *
 * @param cfg Hadoop configuration.
 * @return Job info.
 * @throws IgniteCheckedException If failed.
 */
public static GridHadoopDefaultJobInfo createJobInfo(Configuration cfg) throws IgniteCheckedException {
    JobConf jobConf = new JobConf(cfg);

    boolean hasCombiner = jobConf.get("mapred.combiner.class") != null
            || jobConf.get(MRJobConfig.COMBINE_CLASS_ATTR) != null;

    int numReduces = jobConf.getNumReduceTasks();

    jobConf.setBooleanIfUnset("mapred.mapper.new-api", jobConf.get(OLD_MAP_CLASS_ATTR) == null);

    if (jobConf.getUseNewMapper()) {
        String mode = "new map API";

        ensureNotSet(jobConf, "mapred.input.format.class", mode);
        ensureNotSet(jobConf, OLD_MAP_CLASS_ATTR, mode);

        if (numReduces != 0)
            ensureNotSet(jobConf, "mapred.partitioner.class", mode);
        else
            ensureNotSet(jobConf, "mapred.output.format.class", mode);
    } else {
        String mode = "map compatibility";

        ensureNotSet(jobConf, MRJobConfig.INPUT_FORMAT_CLASS_ATTR, mode);
        ensureNotSet(jobConf, MRJobConfig.MAP_CLASS_ATTR, mode);

        if (numReduces != 0)
            ensureNotSet(jobConf, MRJobConfig.PARTITIONER_CLASS_ATTR, mode);
        else
            ensureNotSet(jobConf, MRJobConfig.OUTPUT_FORMAT_CLASS_ATTR, mode);
    }

    if (numReduces != 0) {
        jobConf.setBooleanIfUnset("mapred.reducer.new-api", jobConf.get(OLD_REDUCE_CLASS_ATTR) == null);

        if (jobConf.getUseNewReducer()) {
            String mode = "new reduce API";

            ensureNotSet(jobConf, "mapred.output.format.class", mode);
            ensureNotSet(jobConf, OLD_REDUCE_CLASS_ATTR, mode);
        } else {
            String mode = "reduce compatibility";

            ensureNotSet(jobConf, MRJobConfig.OUTPUT_FORMAT_CLASS_ATTR, mode);
            ensureNotSet(jobConf, MRJobConfig.REDUCE_CLASS_ATTR, mode);
        }
    }

    Map<String, String> props = new HashMap<>();

    for (Map.Entry<String, String> entry : jobConf)
        props.put(entry.getKey(), entry.getValue());

    return new GridHadoopDefaultJobInfo(jobConf.getJobName(), jobConf.getUser(), hasCombiner, numReduces,
            props);
}

From source file:org.apache.ignite.internal.processors.hadoop.HadoopUtils.java

License:Apache License

/**
 * Creates JobInfo from hadoop configuration.
 *
 * @param cfg Hadoop configuration.
 * @return Job info.
 * @throws IgniteCheckedException If failed.
 */
public static HadoopDefaultJobInfo createJobInfo(Configuration cfg) throws IgniteCheckedException {
    JobConf jobConf = new JobConf(cfg);

    boolean hasCombiner = jobConf.get("mapred.combiner.class") != null
            || jobConf.get(MRJobConfig.COMBINE_CLASS_ATTR) != null;

    int numReduces = jobConf.getNumReduceTasks();

    jobConf.setBooleanIfUnset("mapred.mapper.new-api", jobConf.get(OLD_MAP_CLASS_ATTR) == null);

    if (jobConf.getUseNewMapper()) {
        String mode = "new map API";

        ensureNotSet(jobConf, "mapred.input.format.class", mode);
        ensureNotSet(jobConf, OLD_MAP_CLASS_ATTR, mode);

        if (numReduces != 0)
            ensureNotSet(jobConf, "mapred.partitioner.class", mode);
        else
            ensureNotSet(jobConf, "mapred.output.format.class", mode);
    } else {
        String mode = "map compatibility";

        ensureNotSet(jobConf, MRJobConfig.INPUT_FORMAT_CLASS_ATTR, mode);
        ensureNotSet(jobConf, MRJobConfig.MAP_CLASS_ATTR, mode);

        if (numReduces != 0)
            ensureNotSet(jobConf, MRJobConfig.PARTITIONER_CLASS_ATTR, mode);
        else
            ensureNotSet(jobConf, MRJobConfig.OUTPUT_FORMAT_CLASS_ATTR, mode);
    }

    if (numReduces != 0) {
        jobConf.setBooleanIfUnset("mapred.reducer.new-api", jobConf.get(OLD_REDUCE_CLASS_ATTR) == null);

        if (jobConf.getUseNewReducer()) {
            String mode = "new reduce API";

            ensureNotSet(jobConf, "mapred.output.format.class", mode);
            ensureNotSet(jobConf, OLD_REDUCE_CLASS_ATTR, mode);
        } else {
            String mode = "reduce compatibility";

            ensureNotSet(jobConf, MRJobConfig.OUTPUT_FORMAT_CLASS_ATTR, mode);
            ensureNotSet(jobConf, MRJobConfig.REDUCE_CLASS_ATTR, mode);
        }
    }

    Map<String, String> props = new HashMap<>();

    for (Map.Entry<String, String> entry : jobConf)
        props.put(entry.getKey(), entry.getValue());

    return new HadoopDefaultJobInfo(jobConf.getJobName(), jobConf.getUser(), hasCombiner, numReduces, props);
}
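
Both Ignite helpers above enforce that the old-API key "mapred.output.format.class" and the new-API key MRJobConfig.OUTPUT_FORMAT_CLASS_ATTR are not mixed. A simplified, illustrative version of just that check (ignoring the reducer-count handling in the real code, with a hypothetical helper name) might look like:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.MRJobConfig;

public class OutputFormatApiCheck {

    /** Hypothetical helper: reject configurations that mix old- and new-API output format keys. */
    static void checkOutputFormatKeys(Configuration conf) {
        boolean newApi = conf.getBoolean("mapred.reducer.new-api", false);

        String oldKey = "mapred.output.format.class";
        String newKey = MRJobConfig.OUTPUT_FORMAT_CLASS_ATTR; // "mapreduce.job.outputformat.class"

        if (newApi && conf.get(oldKey) != null) {
            throw new IllegalArgumentException(oldKey + " must not be set when using the new reduce API");
        }
        if (!newApi && conf.get(newKey) != null) {
            throw new IllegalArgumentException(newKey + " must not be set when using the old reduce API");
        }
    }
}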

From source file:org.apache.tez.mapreduce.examples.MRRSleepJob.java

License:Apache License

public DAG createDAG(FileSystem remoteFs, Configuration conf, Path remoteStagingDir, int numMapper,
        int numReducer, int iReduceStagesCount, int numIReducer, long mapSleepTime, int mapSleepCount,
        long reduceSleepTime, int reduceSleepCount, long iReduceSleepTime, int iReduceSleepCount,
        boolean writeSplitsToDFS, boolean generateSplitsInAM) throws IOException, YarnException {

    Configuration mapStageConf = new JobConf(conf);
    mapStageConf.setInt(MRJobConfig.NUM_MAPS, numMapper);
    mapStageConf.setLong(MAP_SLEEP_TIME, mapSleepTime);
    mapStageConf.setLong(REDUCE_SLEEP_TIME, reduceSleepTime);
    mapStageConf.setLong(IREDUCE_SLEEP_TIME, iReduceSleepTime);
    mapStageConf.setInt(MAP_SLEEP_COUNT, mapSleepCount);
    mapStageConf.setInt(REDUCE_SLEEP_COUNT, reduceSleepCount);
    mapStageConf.setInt(IREDUCE_SLEEP_COUNT, iReduceSleepCount);
    mapStageConf.setInt(IREDUCE_STAGES_COUNT, iReduceStagesCount);
    mapStageConf.setInt(IREDUCE_TASKS_COUNT, numIReducer);
    mapStageConf.set(MRJobConfig.MAP_CLASS_ATTR, SleepMapper.class.getName());
    mapStageConf.set(MRJobConfig.INPUT_FORMAT_CLASS_ATTR, SleepInputFormat.class.getName());
    if (numIReducer == 0 && numReducer == 0) {
        mapStageConf.set(MRJobConfig.OUTPUT_FORMAT_CLASS_ATTR, NullOutputFormat.class.getName());
    }

    MRHelpers.translateMRConfToTez(mapStageConf);

    Configuration[] intermediateReduceStageConfs = null;
    if (iReduceStagesCount > 0 && numIReducer > 0) {
        intermediateReduceStageConfs = new JobConf[iReduceStagesCount];
        for (int i = 1; i <= iReduceStagesCount; ++i) {
            JobConf iReduceStageConf = new JobConf(conf);
            iReduceStageConf.setLong(MRRSleepJob.REDUCE_SLEEP_TIME, iReduceSleepTime);
            iReduceStageConf.setInt(MRRSleepJob.REDUCE_SLEEP_COUNT, iReduceSleepCount);
            iReduceStageConf.setInt(MRJobConfig.NUM_REDUCES, numIReducer);
            iReduceStageConf.set(MRJobConfig.REDUCE_CLASS_ATTR, ISleepReducer.class.getName());
            iReduceStageConf.set(MRJobConfig.MAP_OUTPUT_KEY_CLASS, IntWritable.class.getName());
            iReduceStageConf.set(MRJobConfig.MAP_OUTPUT_VALUE_CLASS, IntWritable.class.getName());
            iReduceStageConf.set(MRJobConfig.PARTITIONER_CLASS_ATTR, MRRSleepJobPartitioner.class.getName());

            MRHelpers.translateMRConfToTez(iReduceStageConf);
            intermediateReduceStageConfs[i - 1] = iReduceStageConf;
        }
    }

    Configuration finalReduceConf = null;
    if (numReducer > 0) {
        finalReduceConf = new JobConf(conf);
        finalReduceConf.setLong(MRRSleepJob.REDUCE_SLEEP_TIME, reduceSleepTime);
        finalReduceConf.setInt(MRRSleepJob.REDUCE_SLEEP_COUNT, reduceSleepCount);
        finalReduceConf.setInt(MRJobConfig.NUM_REDUCES, numReducer);
        finalReduceConf.set(MRJobConfig.REDUCE_CLASS_ATTR, SleepReducer.class.getName());
        finalReduceConf.set(MRJobConfig.MAP_OUTPUT_KEY_CLASS, IntWritable.class.getName());
        finalReduceConf.set(MRJobConfig.MAP_OUTPUT_VALUE_CLASS, IntWritable.class.getName());
        finalReduceConf.set(MRJobConfig.OUTPUT_FORMAT_CLASS_ATTR, NullOutputFormat.class.getName());

        MRHelpers.translateMRConfToTez(finalReduceConf);
    }

    MRHelpers.configureMRApiUsage(mapStageConf);
    if (iReduceStagesCount > 0 && numIReducer > 0) {
        for (int i = 0; i < iReduceStagesCount; ++i) {
            MRHelpers.configureMRApiUsage(intermediateReduceStageConfs[i]);
        }
    }
    if (numReducer > 0) {
        MRHelpers.configureMRApiUsage(finalReduceConf);
    }

    DataSourceDescriptor dataSource = null;
    if (!generateSplitsInAM && writeSplitsToDFS) {

        LOG.info("Writing splits to DFS");
        dataSource = MRInputHelpers.configureMRInputWithLegacySplitGeneration(mapStageConf, remoteStagingDir,
                true);
    } else {
        dataSource = MRInputLegacy.createConfigBuilder(mapStageConf, SleepInputFormat.class)
                .generateSplitsInAM(generateSplitsInAM).build();
    }

    DAG dag = DAG.create("MRRSleepJob");
    String jarPath = ClassUtil.findContainingJar(getClass());
    if (jarPath == null) {
        throw new TezUncheckedException(
                "Could not find any jar containing" + " MRRSleepJob.class in the classpath");
    }
    Path remoteJarPath = remoteFs.makeQualified(new Path(remoteStagingDir, "dag_job.jar"));
    remoteFs.copyFromLocalFile(new Path(jarPath), remoteJarPath);
    FileStatus jarFileStatus = remoteFs.getFileStatus(remoteJarPath);

    TokenCache.obtainTokensForNamenodes(this.credentials, new Path[] { remoteJarPath }, mapStageConf);

    Map<String, LocalResource> commonLocalResources = new HashMap<String, LocalResource>();
    LocalResource dagJarLocalRsrc = LocalResource.newInstance(ConverterUtils.getYarnUrlFromPath(remoteJarPath),
            LocalResourceType.FILE, LocalResourceVisibility.APPLICATION, jarFileStatus.getLen(),
            jarFileStatus.getModificationTime());
    commonLocalResources.put("dag_job.jar", dagJarLocalRsrc);

    List<Vertex> vertices = new ArrayList<Vertex>();

    UserPayload mapUserPayload = TezUtils.createUserPayloadFromConf(mapStageConf);
    int numTasks = generateSplitsInAM ? -1 : numMapper;

    Map<String, String> mapEnv = Maps.newHashMap();
    MRHelpers.updateEnvBasedOnMRTaskEnv(mapStageConf, mapEnv, true);
    Map<String, String> reduceEnv = Maps.newHashMap();
    MRHelpers.updateEnvBasedOnMRTaskEnv(mapStageConf, reduceEnv, false);

    Vertex mapVertex = Vertex.create("map",
            ProcessorDescriptor.create(MapProcessor.class.getName()).setUserPayload(mapUserPayload), numTasks,
            MRHelpers.getResourceForMRMapper(mapStageConf));
    mapVertex.addTaskLocalFiles(commonLocalResources).addDataSource("MRInput", dataSource)
            .setTaskLaunchCmdOpts(MRHelpers.getJavaOptsForMRMapper(mapStageConf)).setTaskEnvironment(mapEnv);
    vertices.add(mapVertex);

    if (iReduceStagesCount > 0 && numIReducer > 0) {
        for (int i = 0; i < iReduceStagesCount; ++i) {
            Configuration iconf = intermediateReduceStageConfs[i];
            UserPayload iReduceUserPayload = TezUtils.createUserPayloadFromConf(iconf);
            Vertex ivertex = Vertex.create("ireduce" + (i + 1),
                    ProcessorDescriptor.create(ReduceProcessor.class.getName())
                            .setUserPayload(iReduceUserPayload),
                    numIReducer, MRHelpers.getResourceForMRReducer(intermediateReduceStageConfs[i]));
            ivertex.addTaskLocalFiles(commonLocalResources)
                    .setTaskLaunchCmdOpts(MRHelpers.getJavaOptsForMRReducer(intermediateReduceStageConfs[i]))
                    .setTaskEnvironment(reduceEnv);
            vertices.add(ivertex);
        }
    }

    Vertex finalReduceVertex = null;
    if (numReducer > 0) {
        UserPayload reducePayload = TezUtils.createUserPayloadFromConf(finalReduceConf);
        finalReduceVertex = Vertex.create("reduce",
                ProcessorDescriptor.create(ReduceProcessor.class.getName()).setUserPayload(reducePayload),
                numReducer, MRHelpers.getResourceForMRReducer(finalReduceConf));
        finalReduceVertex.addTaskLocalFiles(commonLocalResources)
                .addDataSink("MROutput",
                        MROutputLegacy.createConfigBuilder(finalReduceConf, NullOutputFormat.class).build())
                .setTaskLaunchCmdOpts(MRHelpers.getJavaOptsForMRReducer(finalReduceConf))
                .setTaskEnvironment(reduceEnv);
        vertices.add(finalReduceVertex);
    } else {
        // Map only job
        mapVertex.addDataSink("MROutput",
                MROutputLegacy.createConfigBuilder(mapStageConf, NullOutputFormat.class).build());
    }

    Map<String, String> partitionerConf = Maps.newHashMap();
    partitionerConf.put(MRJobConfig.PARTITIONER_CLASS_ATTR, MRRSleepJobPartitioner.class.getName());
    OrderedPartitionedKVEdgeConfig edgeConf = OrderedPartitionedKVEdgeConfig
            .newBuilder(IntWritable.class.getName(), IntWritable.class.getName(),
                    HashPartitioner.class.getName(), partitionerConf)
            .configureInput().useLegacyInput().done().build();

    for (int i = 0; i < vertices.size(); ++i) {
        dag.addVertex(vertices.get(i));
        if (i != 0) {
            dag.addEdge(
                    Edge.create(vertices.get(i - 1), vertices.get(i), edgeConf.createDefaultEdgeProperty()));
        }
    }

    return dag;
}