List of usage examples for org.apache.hadoop.mapred JobConf setInputFormat
public void setInputFormat(Class<? extends InputFormat> theClass)
From source file:com.mh2c.WikipediaDumpLoaderDriver.java
License:Apache License
@Override public int run(String[] args) throws Exception { // arg checks JobConf conf = new JobConf(getClass()); conf.setJobName("WP dump loader"); // Set the mapper class, but skip the reduce phase conf.setMapperClass(WikipediaDumpLoaderMapper.class); conf.setNumReduceTasks(0);/*from www . j a va2 s .co m*/ // The object key/value pairs are text conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); // Stream XML into the job conf.setInputFormat(StreamInputFormat.class); StreamInputFormat.addInputPath(conf, new Path(args[0])); // Use the XML record reader, with each page as one record conf.set("stream.recordreader.class", "org.apache.hadoop.streaming.StreamXmlRecordReader"); conf.set("stream.recordreader.begin", "<page>"); conf.set("stream.recordreader.end", "</page>"); // Emit sequence files conf.setOutputFormat(SequenceFileOutputFormat.class); SequenceFileOutputFormat.setOutputPath(conf, new Path(args[1])); JobClient.runJob(conf); return 0; }
From source file:com.mh2c.WikipediaWordCountDriver.java
License:Apache License
@Override public int run(String[] args) throws Exception { // arg checks JobConf conf = new JobConf(getClass()); conf.setJobName("WP word count"); // Set the mapper and reducer classes, and use the reducer as a combiner conf.setMapperClass(WikipediaWordCountMapper.class); conf.setReducerClass(WikipediaWordCountReducer.class); conf.setCombinerClass(WikipediaWordCountReducer.class); // The object key/value pairs are text words and integer counts conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(IntWritable.class); // Read in sequence files conf.setInputFormat(SequenceFileInputFormat.class); SequenceFileInputFormat.addInputPath(conf, new Path(args[0])); // Emit ordinary text files conf.setOutputFormat(TextOutputFormat.class); TextOutputFormat.setOutputPath(conf, new Path(args[1])); JobClient.runJob(conf);// w w w.j a v a 2 s. c o m return 0; }
From source file:com.mongodb.hadoop.examples.treasury.TreasuryYieldXMLConfigV2.java
License:Apache License
public int run(final String[] args) throws Exception { final Configuration conf = getConf(); final JobConf job = new JobConf(conf); job.setReducerClass(TreasuryYieldReducerV2.class); job.setMapperClass(TreasuryYieldMapperV2.class); job.setOutputFormat(MongoOutputFormat.class); job.setOutputKeyClass(MongoConfigUtil.getOutputKey(conf)); job.setOutputValueClass(MongoConfigUtil.getOutputValue(conf)); job.setMapOutputKeyClass(MongoConfigUtil.getMapperOutputKey(conf)); job.setMapOutputValueClass(MongoConfigUtil.getMapperOutputValue(conf)); job.setInputFormat(MongoInputFormat.class); JobClient.runJob(job);// w w w. ja va 2s .c o m return 0; }
From source file:com.mongodb.hadoop.util.MongoTool.java
License:Apache License
private int runMapredJob(final Configuration conf) { final JobConf job = new JobConf(conf, getClass()); /**/*from w ww. j a v a2s . c om*/ * Any arguments specified with -D <property>=<value> * on the CLI will be picked up and set here * They override any XML level values * Note that -D<space> is important - no space will * not work as it gets picked up by Java itself */ // TODO - Do we need to set job name somehow more specifically? // This may or may not be correct/sane job.setJarByClass(getClass()); final Class<? extends org.apache.hadoop.mapred.Mapper> mapper = MapredMongoConfigUtil.getMapper(conf); LOG.debug("Mapper Class: " + mapper); LOG.debug("Input URI: " + conf.get(MapredMongoConfigUtil.INPUT_URI)); job.setMapperClass(mapper); Class<? extends org.apache.hadoop.mapred.Reducer> combiner = MapredMongoConfigUtil.getCombiner(conf); if (combiner != null) { job.setCombinerClass(combiner); } job.setReducerClass(MapredMongoConfigUtil.getReducer(conf)); job.setOutputFormat(MapredMongoConfigUtil.getOutputFormat(conf)); job.setOutputKeyClass(MapredMongoConfigUtil.getOutputKey(conf)); job.setOutputValueClass(MapredMongoConfigUtil.getOutputValue(conf)); job.setInputFormat(MapredMongoConfigUtil.getInputFormat(conf)); Class mapOutputKeyClass = MapredMongoConfigUtil.getMapperOutputKey(conf); Class mapOutputValueClass = MapredMongoConfigUtil.getMapperOutputValue(conf); if (mapOutputKeyClass != null) { job.setMapOutputKeyClass(mapOutputKeyClass); } if (mapOutputValueClass != null) { job.setMapOutputValueClass(mapOutputValueClass); } /** * Determines if the job will run verbosely e.g. print debug output * Only works with foreground jobs */ final boolean verbose = MapredMongoConfigUtil.isJobVerbose(conf); /** * Run job in foreground aka wait for completion or background? 
*/ final boolean background = MapredMongoConfigUtil.isJobBackground(conf); try { RunningJob runningJob = JobClient.runJob(job); if (background) { LOG.info("Setting up and running MapReduce job in background."); return 0; } else { LOG.info("Setting up and running MapReduce job in foreground, will wait for results. {Verbose? " + verbose + "}"); runningJob.waitForCompletion(); return 0; } } catch (final Exception e) { LOG.error("Exception while executing job... ", e); return 1; } }
From source file:com.moz.fiji.express.flow.framework.MapredInputFormatWrapper.java
License:Apache License
/**
 * Installs {@link MapredInputFormatWrapper} as the job's input format and
 * records the real {@link InputFormat} class in the job configuration under
 * {@code CLASS_CONF_KEY}.
 *
 * <p>The stored class is read back on the remote tasks so that the actual
 * input format can be instantiated there.
 *
 * @param realInputFormatClass the input format class the wrapper stands in for
 * @param jobConf              the job configuration to update
 */
public static void setInputFormat(Class<?> realInputFormatClass, JobConf jobConf) {
    // The job itself only ever sees the wrapper.
    jobConf.setInputFormat(MapredInputFormatWrapper.class);

    // Remember which class is being wrapped.
    HadoopUtils.setClassConf(jobConf, CLASS_CONF_KEY, realInputFormatClass);
}
From source file:com.mycompany.app.TestStagingDirectoryPermissions.java
License:Apache License
@Test public void perms() throws IOException, InterruptedException { MiniDFSCluster minidfs = null;//from ww w .jav a2 s. co m FileSystem fs = null; MiniMRClientCluster minimr = null; try { Configuration conf = new Configuration(true); conf.set("fs.permission.umask-mode", "0077"); minidfs = new MiniDFSCluster.Builder(conf).build(); minidfs.waitActive(); fs = minidfs.getFileSystem(); conf.set(FileSystem.FS_DEFAULT_NAME_KEY, fs.getUri().toString()); Path p = path("/in"); fs.mkdirs(p); FSDataOutputStream os = fs.create(new Path(p, "input.txt")); os.write("hello!".getBytes("UTF-8")); os.close(); String user = UserGroupInformation.getCurrentUser().getUserName(); Path home = new Path("/User/" + user); fs.mkdirs(home); minimr = MiniMRClientClusterFactory.create(this.getClass(), 1, conf); JobConf job = new JobConf(minimr.getConfig()); job.setJobName("PermsTest"); JobClient client = new JobClient(job); FileInputFormat.addInputPath(job, p); FileOutputFormat.setOutputPath(job, path("/out")); job.setInputFormat(TextInputFormat.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); job.setMapperClass(MySleepMapper.class); job.setNumReduceTasks(1); RunningJob submittedJob = client.submitJob(job); // Sleep for a bit to let localization finish System.out.println("Sleeping..."); Thread.sleep(3 * 1000l); System.out.println("Done sleeping..."); assertFalse(UserGroupInformation.isSecurityEnabled()); Path stagingRoot = path("/tmp/hadoop-yarn/staging/" + user + "/.staging/"); assertTrue(fs.exists(stagingRoot)); assertEquals(1, fs.listStatus(stagingRoot).length); Path staging = fs.listStatus(stagingRoot)[0].getPath(); Path jobXml = path(staging + "/job.xml"); assertTrue(fs.exists(jobXml)); FileStatus fileStatus = fs.getFileStatus(jobXml); System.out.println("job.xml permission = " + fileStatus.getPermission()); 
assertTrue(fileStatus.getPermission().getOtherAction().implies(FsAction.READ)); assertTrue(fileStatus.getPermission().getGroupAction().implies(FsAction.READ)); submittedJob.waitForCompletion(); } finally { if (minimr != null) { minimr.stop(); } if (fs != null) { fs.close(); } if (minidfs != null) { minidfs.shutdown(true); } } }
From source file:com.mycompany.mavenproject1.App.java
public static void main(String[] args) throws IOException { // give time to attach debugger try {//from www. j a va 2 s . c om Thread.sleep(8000); } catch (InterruptedException ex) { Logger.getLogger(App.class.getName()).log(Level.SEVERE, null, ex); } JobConf conf = new JobConf(App.class); // purge existing output file FileSystem fs = FileSystem.get(conf); fs.delete(new Path(args[1]), true); // delete file, true for recursive conf.setJobName("wordcount"); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(IntWritable.class); conf.setMapperClass(Map.class); conf.setCombinerClass(Reduce.class); conf.setReducerClass(Reduce.class); conf.setInputFormat(WholeFileInputFormat.class); // conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); FileInputFormat.setInputPaths(conf, new Path(args[0])); FileOutputFormat.setOutputPath(conf, new Path(args[1])); JobClient.runJob(conf); }
From source file:com.mycompany.MyHadoopSamples1.TransposeJob.java
License:Apache License
/**
 * Builds the configuration for a matrix transpose job without running it.
 *
 * <p>The job name is built from the paths as passed in; the input and output
 * paths registered on the job are first qualified against the file system
 * obtained from the job configuration.
 *
 * @param initialConf      base configuration the job configuration is created from
 * @param matrixInputPath  path of the matrix to transpose
 * @param matrixOutputPath path the transposed matrix is written to
 * @param numInputRows     value stored under {@code NUM_ROWS_KEY}
 * @return the populated job configuration
 * @throws IOException if the file system cannot be obtained
 */
public static Configuration buildTransposeJobConf(Configuration initialConf, Path matrixInputPath,
        Path matrixOutputPath, int numInputRows) throws IOException {
    JobConf job = new JobConf(initialConf, TransposeJob.class);
    job.setJobName("TransposeJob: " + matrixInputPath + " transpose -> " + matrixOutputPath);

    FileSystem fileSystem = FileSystem.get(job);
    Path qualifiedInput = fileSystem.makeQualified(matrixInputPath);
    Path qualifiedOutput = fileSystem.makeQualified(matrixOutputPath);
    job.setInt(NUM_ROWS_KEY, numInputRows);

    // Input side: sequence files under the qualified input path.
    FileInputFormat.addInputPath(job, qualifiedInput);
    job.setInputFormat(SequenceFileInputFormat.class);

    // Output location, echoed to stdout.
    FileOutputFormat.setOutputPath(job, qualifiedOutput);
    System.out.println("OUTPUT --> " + qualifiedOutput.toString());

    // Map stage.
    job.setMapperClass(TransposeMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);

    // Combine and reduce stages.
    job.setCombinerClass(MergeVectorsCombiner.class);
    job.setReducerClass(MergeVectorsReducer.class);

    // Output side: sequence files of (IntWritable, VectorWritable).
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    return job;
}
From source file:com.mycompany.wordcount.WCMain.java
@Override public int run(String[] args) throws Exception { //throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates. JobConf conf = new JobConf(WCMain.class); conf.setJobName("WordCount"); // key value//from w ww . j a v a 2s. co m conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(IntWritable.class); // mapper and reducer conf.setMapperClass(WCMapper.class); conf.setReducerClass(WCReducer.class); // input output format conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); FileInputFormat.addInputPath(conf, new Path(args[0])); FileOutputFormat.setOutputPath(conf, new Path(args[1])); JobClient.runJob(conf); return 0; }
From source file:com.ostor.dedup.hadoop.DedupStorHadoopCreateObjectsMapReduce.java
License:Open Source License
/**
 * Entry point for the "dedup-create-objects" job: configures log4j, builds
 * the job configuration, derives the working paths from {@code args[0]} and
 * runs the job.
 *
 * @param args {@code args[0]} is the parent path under which the objects-tmp,
 *             segments and objects locations are resolved
 * @throws Exception if configuring or running the job fails
 */
public static void main(String[] args) throws Exception {
    System.out.println("NOTE: Setting up logs from conf file - " + DedupStor.DEFAULT_LOG4J_FILE);
    PropertyConfigurator.configure(DedupStor.DEFAULT_LOG4J_FILE);

    JobConf job = new JobConf(DedupStorHadoopCreateObjectsMapReduce.class);
    job.setJobName("dedup-create-objects");

    // Map output: (Text, DedupObjectSegmentWritable); final output: (Text, Text).
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(DedupObjectSegmentWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(DedupStorHadoopCreateObjectsMapper.class);
    job.setReducerClass(DedupStorHadoopCreateObjectsReducer.class);

    job.setInputFormat(TextInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);

    // All locations are children of args[0]. The job input and the objects
    // tmp path are both built from the same OBJECTS_TMP_PATH constant.
    String parentDir = args[0];
    Path inputPath = new Path(parentDir, DedupStorHadoopUtils.DEFAULT_DEDUP_STOR_HADOOP_OBJECTS_TMP_PATH);
    Path segmentStorPath = new Path(parentDir,
            DedupStorHadoopUtils.DEFAULT_DEDUP_STOR_HADOOP_SEGMENTS_LOC_SUFFIX);
    Path objectStorPath = new Path(parentDir,
            DedupStorHadoopUtils.DEFAULT_DEDUP_STOR_HADOOP_OBJECTS_LOC_SUFFIX);
    Path objectMapPath = new Path(parentDir, DedupStorHadoopUtils.DEFAULT_DEDUP_STOR_HADOOP_OBJECTS_TMP_PATH);

    // Publish the locations through the job configuration.
    job.set(DedupStorHadoopUtils.HADOOP_CONF_SEGMENTS_STOR_PATH_KEY, segmentStorPath.toString());
    job.set(DedupStorHadoopUtils.HADOOP_CONF_OBJECTS_STOR_PATH_KEY, objectStorPath.toString());
    job.set(DedupStorHadoopUtils.HADOOP_CONF_OBJECTS_TMP_PATH_KEY, objectMapPath.toString());

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, objectStorPath);

    JobClient.runJob(job);
}