Example usage of org.apache.hadoop.mapred.JobConf.setOutputValueClass

Introduction

This page collects example usages of org.apache.hadoop.mapred.JobConf.setOutputValueClass.

Prototype

public void setOutputValueClass(Class<?> theClass)

Source Link

Document

Set the value class for job outputs.

Usage

From source file:Business.DataJoin.java

/**
 * Configures and runs the "Data Join" job.
 *
 * <p>The input and output locations default to {@code src/inFiles/} and
 * {@code src/outFiles/OutputOne} under the directory derived from the code location of
 * {@code MapReduceOne} (with {@code /build/classes} stripped). They are replaced by
 * {@code args[1]} and {@code args[2]} when at least three arguments are supplied.
 *
 * @param args command-line arguments; {@code args[1]} is the input path and {@code args[2]}
 *             the output path when both are present
 * @return {@code 0} once {@code JobClient.runJob} has returned
 * @throws Exception if the job cannot be configured or run
 */
@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    JobConf job = new JobConf(conf, DataJoin.class);

    // NOTE(review): the code location is resolved through MapReduceOne rather than DataJoin --
    // confirm this is intended and not a copy/paste leftover.
    final File f = new File(MapReduceOne.class.getProtectionDomain().getCodeSource().getLocation().getPath());
    final String projectRoot = f.getAbsolutePath().replace("/build/classes", "");
    String inFiles = projectRoot + "/src/inFiles/";
    String outFiles = projectRoot + "/src/outFiles/OutputOne";
    // Use the arguments instead if provided. Both args[1] and args[2] are read, so at least
    // three arguments are required; the previous "> 1" check let a two-element array through
    // and failed with ArrayIndexOutOfBoundsException on args[2].
    if (args.length > 2) {
        inFiles = args[1];
        outFiles = args[2];
    }
    Path in = new Path(inFiles);
    Path out = new Path(outFiles);
    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);

    job.setJobName("Data Join");
    job.setMapperClass(MapClass.class);
    job.setReducerClass(ReduceClass.class);

    job.setInputFormat(TextInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(TaggedWritable.class);
    // Key and value are written separated by a comma.
    job.set("mapred.textoutputformat.separator", ",");

    JobClient.runJob(job);
    return 0;
}

From source file:ca.etsmtl.lasi.hbasewikipedialoader.HBaseWikipediaLoader.java

License:Apache License

/**
 * Sets up the actual job./*  www. j a v a 2 s  .  c o  m*/
 * 
 * @param conf
 *          The current configuration.
 * @param args
 *          The command line parameters.
 * @return The newly created job.
 * @throws IOException
 *           When setting up the job fails.
 */
public static JobConf createSubmittableJob(HBaseConfiguration conf, String[] args) throws IOException {
    JobConf jobConf = new JobConf(conf, HBaseWikipediaLoader.class);
    jobConf.setJobName(NAME);

    // Stream stuff
    jobConf.set("stream.recordreader.class", "org.apache.hadoop.streaming.StreamXmlRecordReader");
    jobConf.set("stream.recordreader.begin", "<page>");
    jobConf.set("stream.recordreader.end", "</page>");

    jobConf.setSpeculativeExecution(false);

    jobConf.setMapOutputKeyClass(ImmutableBytesWritable.class);
    jobConf.setMapOutputValueClass(BatchUpdate.class);

    jobConf.setMapperClass(Map.class);

    jobConf.setNumReduceTasks(0);

    jobConf.setInputFormat(StreamInputFormat.class);
    jobConf.setOutputFormat(TableOutputFormat.class);
    jobConf.set(TableOutputFormat.OUTPUT_TABLE, TABLE);
    jobConf.setOutputKeyClass(ImmutableBytesWritable.class);
    jobConf.setOutputValueClass(BatchUpdate.class);

    StreamInputFormat.setInputPaths(jobConf, new Path(args[0]));
    FileOutputFormat.setOutputPath(jobConf, new Path("/tmp/" + NAME + "-" + System.currentTimeMillis()));

    return jobConf;

}

From source file:cascading.flow.FlowStep.java

License:Open Source License

/**
 * Builds the {@link JobConf} used to run this step.
 *
 * <p>The configuration is created from {@code parentConf} when one is given, otherwise from
 * scratch. Step properties are applied first, followed by the job name, output classes,
 * mapper/reducer classes, serializations, and the source, sink and trap initialisation. Task
 * counts, comparators and the partitioner are then chosen based on {@code getGroup()}. The step
 * id and the Base64-serialized step are written last.
 *
 * <p>Statement order matters: several later calls deliberately replace values set earlier
 * (for example the output key comparator, the partitioner and the map output key class).
 *
 * @param parentConf configuration to build on, or {@code null} to start from a new {@code JobConf}
 * @return the configured job configuration
 * @throws IOException declared by this method; NOTE(review): presumably raised by the init
 *           helpers or by serializing the step -- confirm
 */
protected JobConf getJobConf(JobConf parentConf) throws IOException {
    JobConf conf = parentConf == null ? new JobConf() : new JobConf(parentConf);

    // set values first so they can't break things downstream
    if (hasProperties()) {
        for (Map.Entry entry : getProperties().entrySet())
            conf.set(entry.getKey().toString(), entry.getValue().toString());
    }

    // disable warning
    conf.setBoolean("mapred.used.genericoptionsparser", true);

    conf.setJobName(getStepName());

    conf.setOutputKeyClass(Tuple.class);
    conf.setOutputValueClass(Tuple.class);

    conf.setMapperClass(FlowMapper.class);
    conf.setReducerClass(FlowReducer.class);

    // set for use by the shuffling phase
    TupleSerialization.setSerializations(conf);

    initFromSources(conf);

    initFromSink(conf);

    initFromTraps(conf);

    // a non-zero sink part count controls the reducer count when a group exists,
    // otherwise the mapper count
    if (sink.getScheme().getNumSinkParts() != 0) {
        // if no reducer, set num map tasks to control parts
        if (getGroup() != null)
            conf.setNumReduceTasks(sink.getScheme().getNumSinkParts());
        else
            conf.setNumMapTasks(sink.getScheme().getNumSinkParts());
    }

    // default key comparator; may be replaced in the group handling below
    conf.setOutputKeyComparatorClass(TupleComparator.class);

    if (getGroup() == null) {
        conf.setNumReduceTasks(0); // disable reducers
    } else {
        // must set map output defaults when performing a reduce
        conf.setMapOutputKeyClass(Tuple.class);
        conf.setMapOutputValueClass(Tuple.class);

        // handles the case the groupby sort should be reversed
        if (getGroup().isSortReversed())
            conf.setOutputKeyComparatorClass(ReverseTupleComparator.class);

        addComparators(conf, "cascading.group.comparator", getGroup().getGroupingSelectors());

        if (getGroup().isGroupBy())
            addComparators(conf, "cascading.sort.comparator", getGroup().getSortingSelectors());

        // groups that are not a GroupBy get the co-grouping partitioner, IndexTuple map output
        // classes and the co-grouping comparators
        if (!getGroup().isGroupBy()) {
            conf.setPartitionerClass(CoGroupingPartitioner.class);
            conf.setMapOutputKeyClass(IndexTuple.class); // allows groups to be sorted by index
            conf.setMapOutputValueClass(IndexTuple.class);
            conf.setOutputKeyComparatorClass(IndexTupleCoGroupingComparator.class); // sorts by group, then by index
            conf.setOutputValueGroupingComparator(CoGroupingComparator.class);
        }

        // sorted groups: replaces the partitioner, map output key class and comparators
        // chosen above
        if (getGroup().isSorted()) {
            conf.setPartitionerClass(GroupingPartitioner.class);
            conf.setMapOutputKeyClass(TuplePair.class);

            if (getGroup().isSortReversed())
                conf.setOutputKeyComparatorClass(ReverseGroupingSortingComparator.class);
            else
                conf.setOutputKeyComparatorClass(GroupingSortingComparator.class);

            // no need to supply a reverse comparator, only equality is checked
            conf.setOutputValueGroupingComparator(GroupingComparator.class);
        }
    }

    // perform last so init above will pass to tasks
    conf.setInt("cascading.flow.step.id", id);
    conf.set("cascading.flow.step", Util.serializeBase64(this));

    return conf;
}

From source file:cascading.flow.hadoop.HadoopFlowStep.java

License:Open Source License

/**
 * Creates the fully initialised {@link JobConf} for this step.
 *
 * <p>The configuration starts from a copy of {@code parentConfig} made with
 * {@code HadoopUtil.copyJobConf}, or from a new {@code JobConf} when {@code parentConfig} is
 * {@code null}. It then receives the job name, output classes, map runner and reducer classes,
 * serializations, source/sink/trap/step-config initialisation, task counts, comparators and
 * partitioner, version and step identifiers, and finally the packed map and reduce node state.
 *
 * <p>Statement order matters: several later calls deliberately replace values set earlier
 * (for example the partitioner, the output key comparator and the map output key class).
 *
 * <p>A {@code FlowException} is thrown when a group exists, the sink scheme reports zero sink
 * parts, and neither the configured reduce task count nor
 * {@code FlowRuntimeProps.GATHER_PARTITIONS} yields a non-zero value.
 *
 * @param flowProcess  process handed to the source, sink and trap initialisation
 * @param parentConfig configuration to copy, or {@code null} to start from a new {@code JobConf}
 * @return the configured job configuration
 */
public JobConf createInitializedConfig(FlowProcess<JobConf> flowProcess, JobConf parentConfig) {
    JobConf conf = parentConfig == null ? new JobConf() : HadoopUtil.copyJobConf(parentConfig);

    // disable warning
    conf.setBoolean("mapred.used.genericoptionsparser", true);

    conf.setJobName(getStepDisplayName(conf.getInt("cascading.display.id.truncate", Util.ID_LENGTH)));

    conf.setOutputKeyClass(Tuple.class);
    conf.setOutputValueClass(Tuple.class);

    conf.setMapRunnerClass(FlowMapper.class);
    conf.setReducerClass(FlowReducer.class);

    // set for use by the shuffling phase
    TupleSerialization.setSerializations(conf);

    initFromSources(flowProcess, conf);

    initFromSink(flowProcess, conf);

    initFromTraps(flowProcess, conf);

    initFromStepConfigDef(conf);

    int numSinkParts = getSink().getScheme().getNumSinkParts();

    if (numSinkParts != 0) {
        // if no reducer, set num map tasks to control parts
        if (getGroup() != null)
            conf.setNumReduceTasks(numSinkParts);
        else
            conf.setNumMapTasks(numSinkParts);
    } else if (getGroup() != null) {
        // no sink part count: use the already configured reduce task count, falling back to
        // the gather partitions property, and fail if both are zero
        int gatherPartitions = conf.getNumReduceTasks();

        if (gatherPartitions == 0)
            gatherPartitions = conf.getInt(FlowRuntimeProps.GATHER_PARTITIONS, 0);

        if (gatherPartitions == 0)
            throw new FlowException(getName(),
                    "a default number of gather partitions must be set, see FlowRuntimeProps");

        conf.setNumReduceTasks(gatherPartitions);
    }

    // default key comparator; may be replaced in the group handling below
    conf.setOutputKeyComparatorClass(TupleComparator.class);

    if (getGroup() == null) {
        conf.setNumReduceTasks(0); // disable reducers
    } else {
        // must set map output defaults when performing a reduce
        conf.setMapOutputKeyClass(Tuple.class);
        conf.setMapOutputValueClass(Tuple.class);
        conf.setPartitionerClass(GroupingPartitioner.class);

        // handles the case the groupby sort should be reversed
        if (getGroup().isSortReversed())
            conf.setOutputKeyComparatorClass(ReverseTupleComparator.class);

        addComparators(conf, "cascading.group.comparator", getGroup().getKeySelectors(), this, getGroup());

        if (getGroup().isGroupBy())
            addComparators(conf, "cascading.sort.comparator", getGroup().getSortingSelectors(), this,
                    getGroup());

        // groups that are not a GroupBy get the co-grouping partitioner, IndexTuple map output
        // classes and the co-grouping comparators
        if (!getGroup().isGroupBy()) {
            conf.setPartitionerClass(CoGroupingPartitioner.class);
            conf.setMapOutputKeyClass(IndexTuple.class); // allows groups to be sorted by index
            conf.setMapOutputValueClass(IndexTuple.class);
            conf.setOutputKeyComparatorClass(IndexTupleCoGroupingComparator.class); // sorts by group, then by index
            conf.setOutputValueGroupingComparator(CoGroupingComparator.class);
        }

        // sorted groups: replaces the partitioner, map output key class and comparators
        // chosen above
        if (getGroup().isSorted()) {
            conf.setPartitionerClass(GroupingSortingPartitioner.class);
            conf.setMapOutputKeyClass(TuplePair.class);

            if (getGroup().isSortReversed())
                conf.setOutputKeyComparatorClass(ReverseGroupingSortingComparator.class);
            else
                conf.setOutputKeyComparatorClass(GroupingSortingComparator.class);

            // no need to supply a reverse comparator, only equality is checked
            conf.setOutputValueGroupingComparator(GroupingComparator.class);
        }
    }

    // perform last so init above will pass to tasks
    String versionString = Version.getRelease();

    if (versionString != null)
        conf.set("cascading.version", versionString);

    conf.set(CASCADING_FLOW_STEP_ID, getID());
    conf.set("cascading.flow.step.num", Integer.toString(getOrdinal()));

    HadoopUtil.setIsInflow(conf);

    // the first node in topological order is packed as the map state, the second (if any)
    // as the reduce state
    Iterator<FlowNode> iterator = getFlowNodeGraph().getTopologicalIterator();

    String mapState = pack(iterator.next(), conf);
    // NOTE(review): reduceState.length() below relies on pack(...) returning a non-null string
    // even when it is given a null node -- confirm.
    String reduceState = pack(iterator.hasNext() ? iterator.next() : null, conf);

    // hadoop 20.2 doesn't like dist cache when using local mode
    int maxSize = Short.MAX_VALUE;

    int length = mapState.length() + reduceState.length();

    // small state (or local mode) is stored directly in the configuration; otherwise it is
    // handed to HadoopMRUtil.writeStateToDistCache and the returned value is stored under
    // the ".path" keys
    if (isHadoopLocalMode(conf) || length < maxSize) // seems safe
    {
        conf.set("cascading.flow.step.node.map", mapState);

        if (!Util.isEmpty(reduceState))
            conf.set("cascading.flow.step.node.reduce", reduceState);
    } else {
        conf.set("cascading.flow.step.node.map.path",
                HadoopMRUtil.writeStateToDistCache(conf, getID(), "map", mapState));

        if (!Util.isEmpty(reduceState))
            conf.set("cascading.flow.step.node.reduce.path",
                    HadoopMRUtil.writeStateToDistCache(conf, getID(), "reduce", reduceState));
    }

    return conf;
}

From source file:cascading.flow.hadoop.MapReduceFlowPlatformTest.java

License:Open Source License

/**
 * Wraps an identity map/reduce job in a {@code MapReduceFlow}, runs it, and validates both the
 * input file and the job output with {@code validateLength(..., 10)}.
 */
@Test
public void testFlow() throws IOException {
    getPlatform().copyFromLocal(inputFileApache);

    JobConf platformConf = (JobConf) ((BaseHadoopPlatform) getPlatform()).getConfiguration();

    JobConf mrConf = new JobConf(platformConf);
    mrConf.setJobName("mrflow");

    // identity mapper and reducer over text input and text output
    mrConf.setInputFormat(TextInputFormat.class);
    mrConf.setOutputFormat(TextOutputFormat.class);
    mrConf.setMapperClass(IdentityMapper.class);
    mrConf.setReducerClass(IdentityReducer.class);
    mrConf.setOutputKeyClass(LongWritable.class);
    mrConf.setOutputValueClass(Text.class);

    Path sourcePath = new Path(inputFileApache);
    FileInputFormat.setInputPaths(mrConf, sourcePath);

    String flowOutput = getOutputPath("flowTest");
    Path sinkPath = new Path(flowOutput);
    FileOutputFormat.setOutputPath(mrConf, sinkPath);

    Flow mrFlow = new MapReduceFlow("mrflow", mrConf, true);

    // the input must validate before the flow runs
    Hfs sourceTap = new Hfs(new TextLine(), inputFileApache);
    validateLength(sourceTap.openForRead(new HadoopFlowProcess(platformConf)), 10);

    mrFlow.complete();

    // and the output must validate the same way afterwards
    Hfs sinkTap = new Hfs(new TextLine(), flowOutput);
    validateLength(sinkTap.openForRead(new HadoopFlowProcess(platformConf)), 10);
}

From source file:cascading.flow.hadoop.MapReduceFlowPlatformTest.java

License:Open Source License

/**
 * Connects two Cascading flows and three raw MapReduce jobs into a single {@code Cascade},
 * handing them to the connector out of order, and validates the final output with
 * {@code validateLength(..., 10)}.
 *
 * <p>Data moves through the paths in this order: input file, flow4, flow5, flow1, flow2, flow3.
 */
@Test
public void testCascade() throws IOException {
    getPlatform().copyFromLocal(inputFileApache);

    // Two regular cascading flows produce the input of the first MapReduceFlow.
    Tap apacheSource = new Hfs(new TextLine(new Fields("offset", "line")), remove(inputFileApache, false));
    String flow4Path = getOutputPath("flow4");
    Tap flow4Sink = new Hfs(new TextLine(new Fields("offset", "line")), remove(flow4Path, true),
            SinkMode.REPLACE);
    Flow cascadingFlowOne = getPlatform().getFlowConnector(getProperties()).connect(apacheSource, flow4Sink,
            new Pipe("first-flow"));

    String flow5Path = getOutputPath("flow5");
    Tap flow5Sink = new Hfs(new TextLine(new Fields("offset", "line")), remove(flow5Path, true),
            SinkMode.REPLACE);
    Flow cascadingFlowTwo = getPlatform().getFlowConnector(getProperties()).connect(flow4Sink, flow5Sink,
            new Pipe("second-flow"));

    JobConf baseConf = HadoopPlanner.createJobConf(getProperties());

    // first raw job (old mapred API): flow5 -> flow1
    JobConf mrOneConf = new JobConf(baseConf);
    mrOneConf.setJobName("first-mr");
    mrOneConf.setInputFormat(TextInputFormat.class);
    mrOneConf.setOutputFormat(TextOutputFormat.class);
    mrOneConf.setMapperClass(IdentityMapper.class);
    mrOneConf.setReducerClass(IdentityReducer.class);
    mrOneConf.setOutputKeyClass(LongWritable.class);
    mrOneConf.setOutputValueClass(Text.class);

    FileInputFormat.setInputPaths(mrOneConf, new Path(remove(flow5Path, true)));
    String flow1Path = getOutputPath("flow1");
    FileOutputFormat.setOutputPath(mrOneConf, new Path(remove(flow1Path, true)));

    Flow mrFlowOne = new MapReduceFlow(mrOneConf, true);

    // second raw job (old mapred API): flow1 -> flow2
    JobConf mrTwoConf = new JobConf(baseConf);
    mrTwoConf.setJobName("second-mr");
    mrTwoConf.setInputFormat(TextInputFormat.class);
    mrTwoConf.setOutputFormat(TextOutputFormat.class);
    mrTwoConf.setMapperClass(IdentityMapper.class);
    mrTwoConf.setReducerClass(IdentityReducer.class);
    mrTwoConf.setOutputKeyClass(LongWritable.class);
    mrTwoConf.setOutputValueClass(Text.class);

    FileInputFormat.setInputPaths(mrTwoConf, new Path(remove(flow1Path, true)));
    String flow2Path = getOutputPath("flow2");
    FileOutputFormat.setOutputPath(mrTwoConf, new Path(remove(flow2Path, true)));

    Flow mrFlowTwo = new MapReduceFlow(mrTwoConf, true);

    // third raw job (new mapreduce API): flow2 -> flow3
    Job mrThreeJob = new Job(baseConf);
    mrThreeJob.setJobName("third-mr");

    mrThreeJob.setOutputKeyClass(LongWritable.class);
    mrThreeJob.setOutputValueClass(Text.class);

    mrThreeJob.setMapperClass(org.apache.hadoop.mapreduce.Mapper.class);
    mrThreeJob.setReducerClass(org.apache.hadoop.mapreduce.Reducer.class);

    mrThreeJob.setInputFormatClass(org.apache.hadoop.mapreduce.lib.input.TextInputFormat.class);
    mrThreeJob.setOutputFormatClass(org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.class);
    mrThreeJob.getConfiguration().set("mapred.mapper.new-api", "true");
    mrThreeJob.getConfiguration().set("mapred.reducer.new-api", "true");

    org.apache.hadoop.mapreduce.lib.input.FileInputFormat.addInputPath(mrThreeJob,
            new Path(remove(flow2Path, true)));
    String flow3Path = getOutputPath("flow3");
    org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.setOutputPath(mrThreeJob,
            new Path(remove(flow3Path, true)));

    Flow mrFlowThree = new MapReduceFlow(new JobConf(mrThreeJob.getConfiguration()), true);

    CascadeConnector connector = new CascadeConnector();

    // the flows are deliberately passed out of order
    Cascade cascade = connector.connect(cascadingFlowOne, cascadingFlowTwo, mrFlowThree, mrFlowOne,
            mrFlowTwo);

    cascade.complete();

    Hfs finalSink = new Hfs(new TextLine(), flow3Path);
    validateLength(finalSink.openForRead(new HadoopFlowProcess(baseConf)), 10);
}

From source file:cascading.flow.MapReduceFlowTest.java

License:Open Source License

/**
 * Wraps an identity map/reduce job in a {@code MapReduceFlow} and validates the flow's source
 * before the run and its sink after the run with {@code validateLength(..., 10)}.
 *
 * <p>Fails immediately when the input data file does not exist locally.
 */
public void testFlow() throws IOException {
    File dataFile = new File(inputFileApache);
    if (!dataFile.exists()) {
        fail("data file not found");
    }

    copyFromLocal(inputFileApache);

    JobConf baseConf = MultiMapReducePlanner.getJobConf(getProperties());

    JobConf mrConf = new JobConf(baseConf);
    mrConf.setJobName("mrflow");

    // identity mapper and reducer over text input and text output
    mrConf.setInputFormat(TextInputFormat.class);
    mrConf.setOutputFormat(TextOutputFormat.class);
    mrConf.setMapperClass(IdentityMapper.class);
    mrConf.setReducerClass(IdentityReducer.class);
    mrConf.setOutputKeyClass(LongWritable.class);
    mrConf.setOutputValueClass(Text.class);

    Path sourcePath = new Path(inputFileApache);
    FileInputFormat.setInputPaths(mrConf, sourcePath);
    Path sinkPath = new Path(outputPath1);
    FileOutputFormat.setOutputPath(mrConf, sinkPath);

    Flow mrFlow = new MapReduceFlow("mrflow", mrConf, true);

    validateLength(mrFlow.openSource(), 10);

    mrFlow.complete();

    validateLength(mrFlow.openSink(), 10);
}

From source file:cascading.flow.MapReduceFlowTest.java

License:Open Source License

/**
 * Connects two Cascading flows and three identity MapReduce jobs into a single {@code Cascade},
 * handing them to the connector out of order, and validates the sink of the third job with
 * {@code validateLength(..., 10)}.
 *
 * <p>Data moves through the paths in this order: input file, outputPath4, outputPath5,
 * outputPath1, outputPath2, outputPath3. Fails immediately when the input data file does not
 * exist locally.
 */
public void testCascade() throws IOException {
    File dataFile = new File(inputFileApache);
    if (!dataFile.exists()) {
        fail("data file not found");
    }

    copyFromLocal(inputFileApache);

    // Two regular cascading flows produce the input of the first MapReduceFlow.
    Tap apacheSource = new Hfs(new TextLine(new Fields("offset", "line")), remove(inputFileApache, false));
    Tap firstSink = new Hfs(new TextLine(new Fields("offset", "line")), remove(outputPath4, true), true);
    Flow cascadingFlowOne = new FlowConnector(getProperties()).connect(apacheSource, firstSink,
            new Pipe("first-flow"));

    Tap secondSink = new Hfs(new TextLine(new Fields("offset", "line")), remove(outputPath5, true), true);
    Flow cascadingFlowTwo = new FlowConnector(getProperties()).connect(firstSink, secondSink,
            new Pipe("second-flow"));

    JobConf baseConf = MultiMapReducePlanner.getJobConf(getProperties());

    // first raw job: outputPath5 -> outputPath1
    JobConf mrOneConf = new JobConf(baseConf);
    mrOneConf.setJobName("first-mr");
    mrOneConf.setInputFormat(TextInputFormat.class);
    mrOneConf.setOutputFormat(TextOutputFormat.class);
    mrOneConf.setMapperClass(IdentityMapper.class);
    mrOneConf.setReducerClass(IdentityReducer.class);
    mrOneConf.setOutputKeyClass(LongWritable.class);
    mrOneConf.setOutputValueClass(Text.class);

    FileInputFormat.setInputPaths(mrOneConf, new Path(remove(outputPath5, true)));
    FileOutputFormat.setOutputPath(mrOneConf, new Path(remove(outputPath1, true)));

    Flow mrFlowOne = new MapReduceFlow(mrOneConf, true);

    // second raw job: outputPath1 -> outputPath2
    JobConf mrTwoConf = new JobConf(baseConf);
    mrTwoConf.setJobName("second-mr");
    mrTwoConf.setInputFormat(TextInputFormat.class);
    mrTwoConf.setOutputFormat(TextOutputFormat.class);
    mrTwoConf.setMapperClass(IdentityMapper.class);
    mrTwoConf.setReducerClass(IdentityReducer.class);
    mrTwoConf.setOutputKeyClass(LongWritable.class);
    mrTwoConf.setOutputValueClass(Text.class);

    FileInputFormat.setInputPaths(mrTwoConf, new Path(remove(outputPath1, true)));
    FileOutputFormat.setOutputPath(mrTwoConf, new Path(remove(outputPath2, true)));

    Flow mrFlowTwo = new MapReduceFlow(mrTwoConf, true);

    // third raw job: outputPath2 -> outputPath3
    JobConf mrThreeConf = new JobConf(baseConf);
    mrThreeConf.setJobName("third-mr");
    mrThreeConf.setInputFormat(TextInputFormat.class);
    mrThreeConf.setOutputFormat(TextOutputFormat.class);
    mrThreeConf.setMapperClass(IdentityMapper.class);
    mrThreeConf.setReducerClass(IdentityReducer.class);
    mrThreeConf.setOutputKeyClass(LongWritable.class);
    mrThreeConf.setOutputValueClass(Text.class);

    FileInputFormat.setInputPaths(mrThreeConf, new Path(remove(outputPath2, true)));
    FileOutputFormat.setOutputPath(mrThreeConf, new Path(remove(outputPath3, true)));

    Flow mrFlowThree = new MapReduceFlow(mrThreeConf, true);

    CascadeConnector connector = new CascadeConnector();

    // the flows are deliberately passed out of order
    Cascade cascade = connector.connect(cascadingFlowOne, cascadingFlowTwo, mrFlowThree, mrFlowOne,
            mrFlowTwo);

    //    cascade.writeDOT( "mrcascade.dot" );

    cascade.complete();

    validateLength(mrFlowThree.openSink(), 10);
}

From source file:cascading.hive.ORCFile.java

License:Apache License

/**
 * Configures {@code conf} for sinking: sets {@code OrcOutputFormat} as the output format,
 * {@code NullWritable} as the output key class and {@code OrcSerde.OrcSerdeRow} as the output
 * value class.
 *
 * @param flowProcess the current flow process (not used here)
 * @param tap         the tap being written to (not used here)
 * @param conf        the job configuration to update
 */
@Override
public void sinkConfInit(FlowProcess<JobConf> flowProcess, Tap<JobConf, RecordReader, OutputCollector> tap,
        JobConf conf) {
    conf.setOutputFormat(OrcOutputFormat.class);

    // key/value types emitted to the output format
    conf.setOutputKeyClass(NullWritable.class);
    conf.setOutputValueClass(OrcSerde.OrcSerdeRow.class);
}

From source file:cascading.hive.RCFile.java

License:Apache License

/**
 * Configures {@code conf} for sinking: sets {@code RCFileOutputFormat} as the output format,
 * {@code WritableComparable} as the output key class and {@code BytesRefArrayWritable} as the
 * output value class, and records the number of sink fields under
 * {@code HiveProps.HIVE_COLUMN_NUMBER}.
 *
 * @param flowProcess the current flow process (not used here)
 * @param tap         the tap being written to (not used here)
 * @param conf        the job configuration to update
 */
@Override
public void sinkConfInit(FlowProcess<JobConf> flowProcess, Tap<JobConf, RecordReader, OutputCollector> tap,
        JobConf conf) {
    conf.setOutputFormat(RCFileOutputFormat.class);

    // key/value types emitted to the output format
    conf.setOutputKeyClass(WritableComparable.class);
    conf.setOutputValueClass(BytesRefArrayWritable.class);

    // the column count is taken from the size of the sink fields
    String columnCount = String.valueOf(getSinkFields().size());
    conf.set(HiveProps.HIVE_COLUMN_NUMBER, columnCount);
}