Usage examples for org.apache.hadoop.mapred.JobConf.setOutputValueClass
public void setOutputValueClass(Class<?> theClass)
From source file:Business.DataJoin.java
@Override public int run(String[] args) throws Exception { Configuration conf = getConf(); JobConf job = new JobConf(conf, DataJoin.class); final File f = new File(MapReduceOne.class.getProtectionDomain().getCodeSource().getLocation().getPath()); String inFiles = f.getAbsolutePath().replace("/build/classes", "") + "/src/inFiles/"; String outFiles = f.getAbsolutePath().replace("/build/classes", "") + "/src/outFiles/OutputOne"; //use the arguments instead if provided. if (args.length > 1) { inFiles = args[1];// w w w. j av a 2 s . c o m outFiles = args[2]; } Path in = new Path(inFiles); Path out = new Path(outFiles); FileInputFormat.setInputPaths(job, in); FileOutputFormat.setOutputPath(job, out); job.setJobName("Data Join"); job.setMapperClass(MapClass.class); job.setReducerClass(ReduceClass.class); job.setInputFormat(TextInputFormat.class); job.setOutputFormat(TextOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(TaggedWritable.class); job.set("mapred.textoutputformat.separator", ","); JobClient.runJob(job); return 0; }
From source file:ca.etsmtl.lasi.hbasewikipedialoader.HBaseWikipediaLoader.java
License:Apache License
/** * Sets up the actual job./* www. j a v a 2 s . c o m*/ * * @param conf * The current configuration. * @param args * The command line parameters. * @return The newly created job. * @throws IOException * When setting up the job fails. */ public static JobConf createSubmittableJob(HBaseConfiguration conf, String[] args) throws IOException { JobConf jobConf = new JobConf(conf, HBaseWikipediaLoader.class); jobConf.setJobName(NAME); // Stream stuff jobConf.set("stream.recordreader.class", "org.apache.hadoop.streaming.StreamXmlRecordReader"); jobConf.set("stream.recordreader.begin", "<page>"); jobConf.set("stream.recordreader.end", "</page>"); jobConf.setSpeculativeExecution(false); jobConf.setMapOutputKeyClass(ImmutableBytesWritable.class); jobConf.setMapOutputValueClass(BatchUpdate.class); jobConf.setMapperClass(Map.class); jobConf.setNumReduceTasks(0); jobConf.setInputFormat(StreamInputFormat.class); jobConf.setOutputFormat(TableOutputFormat.class); jobConf.set(TableOutputFormat.OUTPUT_TABLE, TABLE); jobConf.setOutputKeyClass(ImmutableBytesWritable.class); jobConf.setOutputValueClass(BatchUpdate.class); StreamInputFormat.setInputPaths(jobConf, new Path(args[0])); FileOutputFormat.setOutputPath(jobConf, new Path("/tmp/" + NAME + "-" + System.currentTimeMillis())); return jobConf; }
From source file:cascading.flow.FlowStep.java
License:Open Source License
protected JobConf getJobConf(JobConf parentConf) throws IOException { JobConf conf = parentConf == null ? new JobConf() : new JobConf(parentConf); // set values first so they can't break things downstream if (hasProperties()) { for (Map.Entry entry : getProperties().entrySet()) conf.set(entry.getKey().toString(), entry.getValue().toString()); }/*w ww. ja v a 2s .c o m*/ // disable warning conf.setBoolean("mapred.used.genericoptionsparser", true); conf.setJobName(getStepName()); conf.setOutputKeyClass(Tuple.class); conf.setOutputValueClass(Tuple.class); conf.setMapperClass(FlowMapper.class); conf.setReducerClass(FlowReducer.class); // set for use by the shuffling phase TupleSerialization.setSerializations(conf); initFromSources(conf); initFromSink(conf); initFromTraps(conf); if (sink.getScheme().getNumSinkParts() != 0) { // if no reducer, set num map tasks to control parts if (getGroup() != null) conf.setNumReduceTasks(sink.getScheme().getNumSinkParts()); else conf.setNumMapTasks(sink.getScheme().getNumSinkParts()); } conf.setOutputKeyComparatorClass(TupleComparator.class); if (getGroup() == null) { conf.setNumReduceTasks(0); // disable reducers } else { // must set map output defaults when performing a reduce conf.setMapOutputKeyClass(Tuple.class); conf.setMapOutputValueClass(Tuple.class); // handles the case the groupby sort should be reversed if (getGroup().isSortReversed()) conf.setOutputKeyComparatorClass(ReverseTupleComparator.class); addComparators(conf, "cascading.group.comparator", getGroup().getGroupingSelectors()); if (getGroup().isGroupBy()) addComparators(conf, "cascading.sort.comparator", getGroup().getSortingSelectors()); if (!getGroup().isGroupBy()) { conf.setPartitionerClass(CoGroupingPartitioner.class); conf.setMapOutputKeyClass(IndexTuple.class); // allows groups to be sorted by index conf.setMapOutputValueClass(IndexTuple.class); conf.setOutputKeyComparatorClass(IndexTupleCoGroupingComparator.class); // sorts by group, then by index 
conf.setOutputValueGroupingComparator(CoGroupingComparator.class); } if (getGroup().isSorted()) { conf.setPartitionerClass(GroupingPartitioner.class); conf.setMapOutputKeyClass(TuplePair.class); if (getGroup().isSortReversed()) conf.setOutputKeyComparatorClass(ReverseGroupingSortingComparator.class); else conf.setOutputKeyComparatorClass(GroupingSortingComparator.class); // no need to supply a reverse comparator, only equality is checked conf.setOutputValueGroupingComparator(GroupingComparator.class); } } // perform last so init above will pass to tasks conf.setInt("cascading.flow.step.id", id); conf.set("cascading.flow.step", Util.serializeBase64(this)); return conf; }
From source file:cascading.flow.hadoop.HadoopFlowStep.java
License:Open Source License
public JobConf createInitializedConfig(FlowProcess<JobConf> flowProcess, JobConf parentConfig) { JobConf conf = parentConfig == null ? new JobConf() : HadoopUtil.copyJobConf(parentConfig); // disable warning conf.setBoolean("mapred.used.genericoptionsparser", true); conf.setJobName(getStepDisplayName(conf.getInt("cascading.display.id.truncate", Util.ID_LENGTH))); conf.setOutputKeyClass(Tuple.class); conf.setOutputValueClass(Tuple.class); conf.setMapRunnerClass(FlowMapper.class); conf.setReducerClass(FlowReducer.class); // set for use by the shuffling phase TupleSerialization.setSerializations(conf); initFromSources(flowProcess, conf);// w ww. j av a 2 s .co m initFromSink(flowProcess, conf); initFromTraps(flowProcess, conf); initFromStepConfigDef(conf); int numSinkParts = getSink().getScheme().getNumSinkParts(); if (numSinkParts != 0) { // if no reducer, set num map tasks to control parts if (getGroup() != null) conf.setNumReduceTasks(numSinkParts); else conf.setNumMapTasks(numSinkParts); } else if (getGroup() != null) { int gatherPartitions = conf.getNumReduceTasks(); if (gatherPartitions == 0) gatherPartitions = conf.getInt(FlowRuntimeProps.GATHER_PARTITIONS, 0); if (gatherPartitions == 0) throw new FlowException(getName(), "a default number of gather partitions must be set, see FlowRuntimeProps"); conf.setNumReduceTasks(gatherPartitions); } conf.setOutputKeyComparatorClass(TupleComparator.class); if (getGroup() == null) { conf.setNumReduceTasks(0); // disable reducers } else { // must set map output defaults when performing a reduce conf.setMapOutputKeyClass(Tuple.class); conf.setMapOutputValueClass(Tuple.class); conf.setPartitionerClass(GroupingPartitioner.class); // handles the case the groupby sort should be reversed if (getGroup().isSortReversed()) conf.setOutputKeyComparatorClass(ReverseTupleComparator.class); addComparators(conf, "cascading.group.comparator", getGroup().getKeySelectors(), this, getGroup()); if (getGroup().isGroupBy()) addComparators(conf, 
"cascading.sort.comparator", getGroup().getSortingSelectors(), this, getGroup()); if (!getGroup().isGroupBy()) { conf.setPartitionerClass(CoGroupingPartitioner.class); conf.setMapOutputKeyClass(IndexTuple.class); // allows groups to be sorted by index conf.setMapOutputValueClass(IndexTuple.class); conf.setOutputKeyComparatorClass(IndexTupleCoGroupingComparator.class); // sorts by group, then by index conf.setOutputValueGroupingComparator(CoGroupingComparator.class); } if (getGroup().isSorted()) { conf.setPartitionerClass(GroupingSortingPartitioner.class); conf.setMapOutputKeyClass(TuplePair.class); if (getGroup().isSortReversed()) conf.setOutputKeyComparatorClass(ReverseGroupingSortingComparator.class); else conf.setOutputKeyComparatorClass(GroupingSortingComparator.class); // no need to supply a reverse comparator, only equality is checked conf.setOutputValueGroupingComparator(GroupingComparator.class); } } // perform last so init above will pass to tasks String versionString = Version.getRelease(); if (versionString != null) conf.set("cascading.version", versionString); conf.set(CASCADING_FLOW_STEP_ID, getID()); conf.set("cascading.flow.step.num", Integer.toString(getOrdinal())); HadoopUtil.setIsInflow(conf); Iterator<FlowNode> iterator = getFlowNodeGraph().getTopologicalIterator(); String mapState = pack(iterator.next(), conf); String reduceState = pack(iterator.hasNext() ? 
iterator.next() : null, conf); // hadoop 20.2 doesn't like dist cache when using local mode int maxSize = Short.MAX_VALUE; int length = mapState.length() + reduceState.length(); if (isHadoopLocalMode(conf) || length < maxSize) // seems safe { conf.set("cascading.flow.step.node.map", mapState); if (!Util.isEmpty(reduceState)) conf.set("cascading.flow.step.node.reduce", reduceState); } else { conf.set("cascading.flow.step.node.map.path", HadoopMRUtil.writeStateToDistCache(conf, getID(), "map", mapState)); if (!Util.isEmpty(reduceState)) conf.set("cascading.flow.step.node.reduce.path", HadoopMRUtil.writeStateToDistCache(conf, getID(), "reduce", reduceState)); } return conf; }
From source file:cascading.flow.hadoop.MapReduceFlowPlatformTest.java
License:Open Source License
@Test public void testFlow() throws IOException { getPlatform().copyFromLocal(inputFileApache); JobConf defaultConf = (JobConf) ((BaseHadoopPlatform) getPlatform()).getConfiguration(); JobConf conf = new JobConf(defaultConf); conf.setJobName("mrflow"); conf.setOutputKeyClass(LongWritable.class); conf.setOutputValueClass(Text.class); conf.setMapperClass(IdentityMapper.class); conf.setReducerClass(IdentityReducer.class); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); FileInputFormat.setInputPaths(conf, new Path(inputFileApache)); String outputPath = getOutputPath("flowTest"); FileOutputFormat.setOutputPath(conf, new Path(outputPath)); Flow flow = new MapReduceFlow("mrflow", conf, true); validateLength(new Hfs(new TextLine(), inputFileApache).openForRead(new HadoopFlowProcess(defaultConf)), 10);// w w w . j a v a2s . c o m flow.complete(); validateLength(new Hfs(new TextLine(), outputPath).openForRead(new HadoopFlowProcess(defaultConf)), 10); }
From source file:cascading.flow.hadoop.MapReduceFlowPlatformTest.java
License:Open Source License
@Test public void testCascade() throws IOException { getPlatform().copyFromLocal(inputFileApache); // Setup two standard cascading flows that will generate the input for the first MapReduceFlow Tap source1 = new Hfs(new TextLine(new Fields("offset", "line")), remove(inputFileApache, false)); String sinkPath4 = getOutputPath("flow4"); Tap sink1 = new Hfs(new TextLine(new Fields("offset", "line")), remove(sinkPath4, true), SinkMode.REPLACE); Flow firstFlow = getPlatform().getFlowConnector(getProperties()).connect(source1, sink1, new Pipe("first-flow")); String sinkPath5 = getOutputPath("flow5"); Tap sink2 = new Hfs(new TextLine(new Fields("offset", "line")), remove(sinkPath5, true), SinkMode.REPLACE); Flow secondFlow = getPlatform().getFlowConnector(getProperties()).connect(sink1, sink2, new Pipe("second-flow")); JobConf defaultConf = HadoopPlanner.createJobConf(getProperties()); JobConf firstConf = new JobConf(defaultConf); firstConf.setJobName("first-mr"); firstConf.setOutputKeyClass(LongWritable.class); firstConf.setOutputValueClass(Text.class); firstConf.setMapperClass(IdentityMapper.class); firstConf.setReducerClass(IdentityReducer.class); firstConf.setInputFormat(TextInputFormat.class); firstConf.setOutputFormat(TextOutputFormat.class); FileInputFormat.setInputPaths(firstConf, new Path(remove(sinkPath5, true))); String sinkPath1 = getOutputPath("flow1"); FileOutputFormat.setOutputPath(firstConf, new Path(remove(sinkPath1, true))); Flow firstMR = new MapReduceFlow(firstConf, true); JobConf secondConf = new JobConf(defaultConf); secondConf.setJobName("second-mr"); secondConf.setOutputKeyClass(LongWritable.class); secondConf.setOutputValueClass(Text.class); secondConf.setMapperClass(IdentityMapper.class); secondConf.setReducerClass(IdentityReducer.class); secondConf.setInputFormat(TextInputFormat.class); secondConf.setOutputFormat(TextOutputFormat.class); FileInputFormat.setInputPaths(secondConf, new Path(remove(sinkPath1, true))); String sinkPath2 = 
getOutputPath("flow2"); FileOutputFormat.setOutputPath(secondConf, new Path(remove(sinkPath2, true))); Flow secondMR = new MapReduceFlow(secondConf, true); Job job = new Job(defaultConf); job.setJobName("third-mr"); job.setOutputKeyClass(LongWritable.class); job.setOutputValueClass(Text.class); job.setMapperClass(org.apache.hadoop.mapreduce.Mapper.class); job.setReducerClass(org.apache.hadoop.mapreduce.Reducer.class); job.setInputFormatClass(org.apache.hadoop.mapreduce.lib.input.TextInputFormat.class); job.setOutputFormatClass(org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.class); job.getConfiguration().set("mapred.mapper.new-api", "true"); job.getConfiguration().set("mapred.reducer.new-api", "true"); org.apache.hadoop.mapreduce.lib.input.FileInputFormat.addInputPath(job, new Path(remove(sinkPath2, true))); String sinkPath3 = getOutputPath("flow3"); org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.setOutputPath(job, new Path(remove(sinkPath3, true))); Flow thirdMR = new MapReduceFlow(new JobConf(job.getConfiguration()), true); CascadeConnector cascadeConnector = new CascadeConnector(); // pass out of order Cascade cascade = cascadeConnector.connect(firstFlow, secondFlow, thirdMR, firstMR, secondMR); cascade.complete();//from www .jav a 2 s . c om validateLength(new Hfs(new TextLine(), sinkPath3).openForRead(new HadoopFlowProcess(defaultConf)), 10); }
From source file:cascading.flow.MapReduceFlowTest.java
License:Open Source License
public void testFlow() throws IOException { if (!new File(inputFileApache).exists()) fail("data file not found"); copyFromLocal(inputFileApache);//from ww w . j av a 2 s . c o m JobConf defaultConf = MultiMapReducePlanner.getJobConf(getProperties()); JobConf conf = new JobConf(defaultConf); conf.setJobName("mrflow"); conf.setOutputKeyClass(LongWritable.class); conf.setOutputValueClass(Text.class); conf.setMapperClass(IdentityMapper.class); conf.setReducerClass(IdentityReducer.class); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); FileInputFormat.setInputPaths(conf, new Path(inputFileApache)); FileOutputFormat.setOutputPath(conf, new Path(outputPath1)); Flow flow = new MapReduceFlow("mrflow", conf, true); validateLength(flow.openSource(), 10); flow.complete(); validateLength(flow.openSink(), 10); }
From source file:cascading.flow.MapReduceFlowTest.java
License:Open Source License
public void testCascade() throws IOException { if (!new File(inputFileApache).exists()) fail("data file not found"); copyFromLocal(inputFileApache);// www.ja v a 2 s . c o m // Setup two standard cascading flows that will generate the input for the first MapReduceFlow Tap source1 = new Hfs(new TextLine(new Fields("offset", "line")), remove(inputFileApache, false)); Tap sink1 = new Hfs(new TextLine(new Fields("offset", "line")), remove(outputPath4, true), true); Flow firstFlow = new FlowConnector(getProperties()).connect(source1, sink1, new Pipe("first-flow")); Tap sink2 = new Hfs(new TextLine(new Fields("offset", "line")), remove(outputPath5, true), true); Flow secondFlow = new FlowConnector(getProperties()).connect(sink1, sink2, new Pipe("second-flow")); JobConf defaultConf = MultiMapReducePlanner.getJobConf(getProperties()); JobConf firstConf = new JobConf(defaultConf); firstConf.setJobName("first-mr"); firstConf.setOutputKeyClass(LongWritable.class); firstConf.setOutputValueClass(Text.class); firstConf.setMapperClass(IdentityMapper.class); firstConf.setReducerClass(IdentityReducer.class); firstConf.setInputFormat(TextInputFormat.class); firstConf.setOutputFormat(TextOutputFormat.class); FileInputFormat.setInputPaths(firstConf, new Path(remove(outputPath5, true))); FileOutputFormat.setOutputPath(firstConf, new Path(remove(outputPath1, true))); Flow firstMR = new MapReduceFlow(firstConf, true); JobConf secondConf = new JobConf(defaultConf); secondConf.setJobName("second-mr"); secondConf.setOutputKeyClass(LongWritable.class); secondConf.setOutputValueClass(Text.class); secondConf.setMapperClass(IdentityMapper.class); secondConf.setReducerClass(IdentityReducer.class); secondConf.setInputFormat(TextInputFormat.class); secondConf.setOutputFormat(TextOutputFormat.class); FileInputFormat.setInputPaths(secondConf, new Path(remove(outputPath1, true))); FileOutputFormat.setOutputPath(secondConf, new Path(remove(outputPath2, true))); Flow secondMR = new 
MapReduceFlow(secondConf, true); JobConf thirdConf = new JobConf(defaultConf); thirdConf.setJobName("third-mr"); thirdConf.setOutputKeyClass(LongWritable.class); thirdConf.setOutputValueClass(Text.class); thirdConf.setMapperClass(IdentityMapper.class); thirdConf.setReducerClass(IdentityReducer.class); thirdConf.setInputFormat(TextInputFormat.class); thirdConf.setOutputFormat(TextOutputFormat.class); FileInputFormat.setInputPaths(thirdConf, new Path(remove(outputPath2, true))); FileOutputFormat.setOutputPath(thirdConf, new Path(remove(outputPath3, true))); Flow thirdMR = new MapReduceFlow(thirdConf, true); CascadeConnector cascadeConnector = new CascadeConnector(); // pass out of order Cascade cascade = cascadeConnector.connect(firstFlow, secondFlow, thirdMR, firstMR, secondMR); // cascade.writeDOT( "mrcascade.dot" ); cascade.complete(); validateLength(thirdMR.openSink(), 10); }
From source file:cascading.hive.ORCFile.java
License:Apache License
/**
 * Prepares {@code conf} for writing through this scheme: output goes through
 * {@code OrcOutputFormat} with {@code NullWritable} keys and {@code OrcSerde.OrcSerdeRow}
 * values.
 *
 * @param flowProcess the current flow process (not used here)
 * @param tap the tap being written to (not used here)
 * @param conf the job configuration to update
 */
@Override
public void sinkConfInit(FlowProcess<JobConf> flowProcess, Tap<JobConf, RecordReader, OutputCollector> tap,
        JobConf conf) {
    conf.setOutputFormat(OrcOutputFormat.class);
    conf.setOutputKeyClass(NullWritable.class);
    conf.setOutputValueClass(OrcSerde.OrcSerdeRow.class);
}
From source file:cascading.hive.RCFile.java
License:Apache License
/**
 * Prepares {@code conf} for writing through this scheme: output goes through
 * {@code RCFileOutputFormat} with {@code WritableComparable} keys and
 * {@code BytesRefArrayWritable} values, and the number of sink fields is stored under
 * {@code HiveProps.HIVE_COLUMN_NUMBER}.
 *
 * @param flowProcess the current flow process (not used here)
 * @param tap the tap being written to (not used here)
 * @param conf the job configuration to update
 */
@Override
public void sinkConfInit(FlowProcess<JobConf> flowProcess, Tap<JobConf, RecordReader, OutputCollector> tap,
        JobConf conf) {
    conf.setOutputFormat(RCFileOutputFormat.class);
    conf.setOutputKeyClass(WritableComparable.class);
    conf.setOutputValueClass(BytesRefArrayWritable.class);

    final String columnCount = String.valueOf(getSinkFields().size());
    conf.set(HiveProps.HIVE_COLUMN_NUMBER, columnCount);
}