List of usage examples for org.apache.hadoop.mapred.JobConf#setInputFormat
public void setInputFormat(Class<? extends InputFormat> theClass)
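setInputFormat selects the old-API (org.apache.hadoop.mapred) InputFormat implementation that determines how the job's input is split and read. Before the project-specific examples below, here is a minimal self-contained sketch of a typical call site; the class name and argument paths are hypothetical, not taken from any of the sources:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

public class SetInputFormatSketch {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(SetInputFormatSketch.class);
        conf.setJobName("set-input-format-sketch");
        // The call this page documents: pick the InputFormat that splits and
        // reads the input (TextInputFormat yields LongWritable offset, Text line)
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);
        // The old API's default IdentityMapper/IdentityReducer copy records through
        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));
        JobClient.runJob(conf);
    }
}

The examples that follow show the same call wiring in streaming, Avro, JDBC, and Hive-backed input formats.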
From source file: ca.etsmtl.lasi.hbasewikipedialoader.HBaseWikipediaLoader.java
License: Apache License

/**
 * Sets up the actual job.
 *
 * @param conf The current configuration.
 * @param args The command line parameters.
 * @return The newly created job.
 * @throws IOException When setting up the job fails.
 */
public static JobConf createSubmittableJob(HBaseConfiguration conf, String[] args) throws IOException {
    JobConf jobConf = new JobConf(conf, HBaseWikipediaLoader.class);
    jobConf.setJobName(NAME);

    // Hadoop Streaming record reader setup: treat each <page>...</page> element
    // of the Wikipedia XML dump as a single input record
    jobConf.set("stream.recordreader.class", "org.apache.hadoop.streaming.StreamXmlRecordReader");
    jobConf.set("stream.recordreader.begin", "<page>");
    jobConf.set("stream.recordreader.end", "</page>");

    jobConf.setSpeculativeExecution(false);
    jobConf.setMapOutputKeyClass(ImmutableBytesWritable.class);
    jobConf.setMapOutputValueClass(BatchUpdate.class);
    jobConf.setMapperClass(Map.class);
    jobConf.setNumReduceTasks(0);
    jobConf.setInputFormat(StreamInputFormat.class);
    jobConf.setOutputFormat(TableOutputFormat.class);
    jobConf.set(TableOutputFormat.OUTPUT_TABLE, TABLE);
    jobConf.setOutputKeyClass(ImmutableBytesWritable.class);
    jobConf.setOutputValueClass(BatchUpdate.class);
    StreamInputFormat.setInputPaths(jobConf, new Path(args[0]));
    FileOutputFormat.setOutputPath(jobConf, new Path("/tmp/" + NAME + "-" + System.currentTimeMillis()));
    return jobConf;
}
From source file: cascading.avro.AvroScheme.java
License: Apache License

/**
 * sourceConfInit is called by Cascading to set up the sources. This happens on the client
 * side before the job is distributed. There is a check for the presence of a schema, and
 * if none has been provided the data is peeked at to get one. After the schema check, the
 * conf object is given the options that Avro needs.
 *
 * @param flowProcess The Cascading FlowProcess object. Should be passed in by Cascading automatically.
 * @param tap         The Cascading Tap object. Should be passed in by Cascading automatically.
 * @param conf        The Hadoop JobConf object. This is passed in by Cascading automatically.
 * @throws RuntimeException If no schema is present; this halts the entire process.
 */
@Override
public void sourceConfInit(FlowProcess<JobConf> flowProcess, Tap<JobConf, RecordReader, OutputCollector> tap,
        JobConf conf) {
    retrieveSourceFields(flowProcess, tap);
    // Set the input schema and input format
    conf.set(AvroJob.INPUT_SCHEMA, schema.toString());
    conf.setInputFormat(AvroInputFormat.class);
    // Add AvroSerialization to io.serializations
    addAvroSerializations(conf);
}
From source file: cascading.dbmigrate.hadoop.DBInputFormat.java
License: Open Source License

public static void setInput(JobConf job, int numChunks, String databaseDriver, String username, String pwd,
        String dburl, String tableName, String pkColumn, Long minId, Long maxId, String... columnNames) {
    job.setInputFormat(DBInputFormat.class);
    DBConfiguration dbConf = new DBConfiguration(job);
    dbConf.configureDB(databaseDriver, dburl, username, pwd);
    if (minId != null) {
        dbConf.setMinId(minId.longValue());
    }
    if (maxId != null) {
        dbConf.setMaxId(maxId.longValue());
    }
    dbConf.setInputTableName(tableName);
    dbConf.setInputColumnNames(columnNames);
    dbConf.setPrimaryKeyColumn(pkColumn);
    dbConf.setNumChunks(numChunks);
}
From source file: cascading.flow.hadoop.MapReduceFlowPlatformTest.java
License: Open Source License

@Test
public void testFlow() throws IOException {
    getPlatform().copyFromLocal(inputFileApache);

    JobConf defaultConf = (JobConf) ((BaseHadoopPlatform) getPlatform()).getConfiguration();
    JobConf conf = new JobConf(defaultConf);
    conf.setJobName("mrflow");
    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapperClass(IdentityMapper.class);
    conf.setReducerClass(IdentityReducer.class);
    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(inputFileApache));
    String outputPath = getOutputPath("flowTest");
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    Flow flow = new MapReduceFlow("mrflow", conf, true);

    validateLength(new Hfs(new TextLine(), inputFileApache).openForRead(new HadoopFlowProcess(defaultConf)), 10);

    flow.complete();

    validateLength(new Hfs(new TextLine(), outputPath).openForRead(new HadoopFlowProcess(defaultConf)), 10);
}
From source file: cascading.flow.hadoop.MapReduceFlowPlatformTest.java
License: Open Source License

@Test
public void testCascade() throws IOException {
    getPlatform().copyFromLocal(inputFileApache);

    // Set up two standard Cascading flows that will generate the input for the first MapReduceFlow
    Tap source1 = new Hfs(new TextLine(new Fields("offset", "line")), remove(inputFileApache, false));
    String sinkPath4 = getOutputPath("flow4");
    Tap sink1 = new Hfs(new TextLine(new Fields("offset", "line")), remove(sinkPath4, true), SinkMode.REPLACE);
    Flow firstFlow = getPlatform().getFlowConnector(getProperties()).connect(source1, sink1, new Pipe("first-flow"));

    String sinkPath5 = getOutputPath("flow5");
    Tap sink2 = new Hfs(new TextLine(new Fields("offset", "line")), remove(sinkPath5, true), SinkMode.REPLACE);
    Flow secondFlow = getPlatform().getFlowConnector(getProperties()).connect(sink1, sink2, new Pipe("second-flow"));

    JobConf defaultConf = HadoopPlanner.createJobConf(getProperties());

    JobConf firstConf = new JobConf(defaultConf);
    firstConf.setJobName("first-mr");
    firstConf.setOutputKeyClass(LongWritable.class);
    firstConf.setOutputValueClass(Text.class);
    firstConf.setMapperClass(IdentityMapper.class);
    firstConf.setReducerClass(IdentityReducer.class);
    firstConf.setInputFormat(TextInputFormat.class);
    firstConf.setOutputFormat(TextOutputFormat.class);
    FileInputFormat.setInputPaths(firstConf, new Path(remove(sinkPath5, true)));
    String sinkPath1 = getOutputPath("flow1");
    FileOutputFormat.setOutputPath(firstConf, new Path(remove(sinkPath1, true)));

    Flow firstMR = new MapReduceFlow(firstConf, true);

    JobConf secondConf = new JobConf(defaultConf);
    secondConf.setJobName("second-mr");
    secondConf.setOutputKeyClass(LongWritable.class);
    secondConf.setOutputValueClass(Text.class);
    secondConf.setMapperClass(IdentityMapper.class);
    secondConf.setReducerClass(IdentityReducer.class);
    secondConf.setInputFormat(TextInputFormat.class);
    secondConf.setOutputFormat(TextOutputFormat.class);
    FileInputFormat.setInputPaths(secondConf, new Path(remove(sinkPath1, true)));
    String sinkPath2 = getOutputPath("flow2");
    FileOutputFormat.setOutputPath(secondConf, new Path(remove(sinkPath2, true)));

    Flow secondMR = new MapReduceFlow(secondConf, true);

    // The third job is configured through the new (org.apache.hadoop.mapreduce) API
    Job job = new Job(defaultConf);
    job.setJobName("third-mr");
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);
    job.setMapperClass(org.apache.hadoop.mapreduce.Mapper.class);
    job.setReducerClass(org.apache.hadoop.mapreduce.Reducer.class);
    job.setInputFormatClass(org.apache.hadoop.mapreduce.lib.input.TextInputFormat.class);
    job.setOutputFormatClass(org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.class);
    job.getConfiguration().set("mapred.mapper.new-api", "true");
    job.getConfiguration().set("mapred.reducer.new-api", "true");
    org.apache.hadoop.mapreduce.lib.input.FileInputFormat.addInputPath(job, new Path(remove(sinkPath2, true)));
    String sinkPath3 = getOutputPath("flow3");
    org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.setOutputPath(job, new Path(remove(sinkPath3, true)));

    Flow thirdMR = new MapReduceFlow(new JobConf(job.getConfiguration()), true);

    CascadeConnector cascadeConnector = new CascadeConnector();

    // Pass the flows out of order; the cascade planner resolves the dependencies
    Cascade cascade = cascadeConnector.connect(firstFlow, secondFlow, thirdMR, firstMR, secondMR);

    cascade.complete();

    validateLength(new Hfs(new TextLine(), sinkPath3).openForRead(new HadoopFlowProcess(defaultConf)), 10);
}
From source file: cascading.flow.MapReduceFlowTest.java
License: Open Source License

public void testFlow() throws IOException {
    if (!new File(inputFileApache).exists())
        fail("data file not found");

    copyFromLocal(inputFileApache);

    JobConf defaultConf = MultiMapReducePlanner.getJobConf(getProperties());
    JobConf conf = new JobConf(defaultConf);
    conf.setJobName("mrflow");
    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapperClass(IdentityMapper.class);
    conf.setReducerClass(IdentityReducer.class);
    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    FileInputFormat.setInputPaths(conf, new Path(inputFileApache));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath1));

    Flow flow = new MapReduceFlow("mrflow", conf, true);

    validateLength(flow.openSource(), 10);
    flow.complete();
    validateLength(flow.openSink(), 10);
}
From source file: cascading.flow.MapReduceFlowTest.java
License: Open Source License

public void testCascade() throws IOException {
    if (!new File(inputFileApache).exists())
        fail("data file not found");

    copyFromLocal(inputFileApache);

    // Set up two standard Cascading flows that will generate the input for the first MapReduceFlow
    Tap source1 = new Hfs(new TextLine(new Fields("offset", "line")), remove(inputFileApache, false));
    Tap sink1 = new Hfs(new TextLine(new Fields("offset", "line")), remove(outputPath4, true), true);
    Flow firstFlow = new FlowConnector(getProperties()).connect(source1, sink1, new Pipe("first-flow"));

    Tap sink2 = new Hfs(new TextLine(new Fields("offset", "line")), remove(outputPath5, true), true);
    Flow secondFlow = new FlowConnector(getProperties()).connect(sink1, sink2, new Pipe("second-flow"));

    JobConf defaultConf = MultiMapReducePlanner.getJobConf(getProperties());

    JobConf firstConf = new JobConf(defaultConf);
    firstConf.setJobName("first-mr");
    firstConf.setOutputKeyClass(LongWritable.class);
    firstConf.setOutputValueClass(Text.class);
    firstConf.setMapperClass(IdentityMapper.class);
    firstConf.setReducerClass(IdentityReducer.class);
    firstConf.setInputFormat(TextInputFormat.class);
    firstConf.setOutputFormat(TextOutputFormat.class);
    FileInputFormat.setInputPaths(firstConf, new Path(remove(outputPath5, true)));
    FileOutputFormat.setOutputPath(firstConf, new Path(remove(outputPath1, true)));

    Flow firstMR = new MapReduceFlow(firstConf, true);

    JobConf secondConf = new JobConf(defaultConf);
    secondConf.setJobName("second-mr");
    secondConf.setOutputKeyClass(LongWritable.class);
    secondConf.setOutputValueClass(Text.class);
    secondConf.setMapperClass(IdentityMapper.class);
    secondConf.setReducerClass(IdentityReducer.class);
    secondConf.setInputFormat(TextInputFormat.class);
    secondConf.setOutputFormat(TextOutputFormat.class);
    FileInputFormat.setInputPaths(secondConf, new Path(remove(outputPath1, true)));
    FileOutputFormat.setOutputPath(secondConf, new Path(remove(outputPath2, true)));

    Flow secondMR = new MapReduceFlow(secondConf, true);

    JobConf thirdConf = new JobConf(defaultConf);
    thirdConf.setJobName("third-mr");
    thirdConf.setOutputKeyClass(LongWritable.class);
    thirdConf.setOutputValueClass(Text.class);
    thirdConf.setMapperClass(IdentityMapper.class);
    thirdConf.setReducerClass(IdentityReducer.class);
    thirdConf.setInputFormat(TextInputFormat.class);
    thirdConf.setOutputFormat(TextOutputFormat.class);
    FileInputFormat.setInputPaths(thirdConf, new Path(remove(outputPath2, true)));
    FileOutputFormat.setOutputPath(thirdConf, new Path(remove(outputPath3, true)));

    Flow thirdMR = new MapReduceFlow(thirdConf, true);

    CascadeConnector cascadeConnector = new CascadeConnector();

    // Pass the flows out of order
    Cascade cascade = cascadeConnector.connect(firstFlow, secondFlow, thirdMR, firstMR, secondMR);

    // cascade.writeDOT( "mrcascade.dot" );

    cascade.complete();

    validateLength(thirdMR.openSink(), 10);
}
From source file: cascading.hcatalog.HCatScheme.java
License: Apache License

@Override
public void sourceConfInit(FlowProcess<JobConf> flowProcess, Tap<JobConf, RecordReader, OutputCollector> tap,
        JobConf conf) {
    conf.setInputFormat(inputFormat);
    createSerDe(conf);
}
From source file: cascading.hive.ORCFile.java
License: Apache License

@Override
public void sourceConfInit(FlowProcess<JobConf> flowProcess, Tap<JobConf, RecordReader, OutputCollector> tap,
        JobConf conf) {
    conf.setInputFormat(OrcInputFormat.class);
    if (selectedColIds != null) {
        conf.set(HiveProps.HIVE_SELECTD_COLUMN_IDS, selectedColIds);
        conf.set(HiveProps.HIVE_READ_ALL_COLUMNS, "false");
    }
}
From source file: cascading.hive.RCFile.java
License: Apache License

@Override
public void sourceConfInit(FlowProcess<JobConf> flowProcess, Tap<JobConf, RecordReader, OutputCollector> tap,
        JobConf conf) {
    conf.setInputFormat(RCFileInputFormat.class);
    if (selectedColIds != null) {
        conf.set(HiveProps.HIVE_SELECTD_COLUMN_IDS, selectedColIds);
    }
}
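The Hive-backed examples above all follow the same shape: point setInputFormat at the storage format's reader, then pass format-specific options through the conf. As a closing illustration, a hedged generic sketch (not from any of the source files above; paths and record types are assumptions) that reads Hadoop SequenceFiles with the stock old-API format:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

public class SequenceFileReadSketch {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(SequenceFileReadSketch.class);
        conf.setJobName("sequencefile-read-sketch");
        // SequenceFileInputFormat reads key/value types from each file's header
        conf.setInputFormat(SequenceFileInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);
        // Assumed record types; these must match the SequenceFile contents
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);
        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));
        JobClient.runJob(conf);
    }
}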