List of usage examples for org.apache.hadoop.mapred.JobConf.setInputFormat
public void setInputFormat(Class<? extends InputFormat> theClass)
From source file:edu.umd.cloud9.collection.wikipedia.WikipediaForwardIndexBuilder.java
License:Apache License
@SuppressWarnings("static-access") @Override//from w w w .j a v a2 s . c o m public int run(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input").create(INPUT_OPTION)); options.addOption( OptionBuilder.withArgName("path").hasArg().withDescription("index file").create(INDEX_FILE_OPTION)); options.addOption(OptionBuilder .withArgName("en|sv|nl|de|fr|ru|it|es|vi|pl|ja|pt|zh|uk|ca|fa|no|fi|id|ar|sr|ko|hi|zh_yue|cs|tr") .hasArg().withDescription("two-letter or six-letter language code").create(LANGUAGE_OPTION)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(INDEX_FILE_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } Path inputPath = new Path(cmdline.getOptionValue(INPUT_OPTION)); String indexFile = cmdline.getOptionValue(INDEX_FILE_OPTION); String tmpPath = "tmp-" + WikipediaForwardIndexBuilder.class.getSimpleName() + "-" + RANDOM.nextInt(10000); if (!inputPath.isAbsolute()) { System.err.println("Error: " + INPUT_OPTION + " must be an absolute path!"); return -1; } String language = null; if (cmdline.hasOption(LANGUAGE_OPTION)) { language = cmdline.getOptionValue(LANGUAGE_OPTION); if (!(language.length() == 2 || language.length() == 6)) { System.err.println("Error: \"" + language + "\" unknown language!"); return -1; } } JobConf conf = new JobConf(getConf(), WikipediaForwardIndexBuilder.class); FileSystem fs = FileSystem.get(conf); LOG.info("Tool name: " + this.getClass().getName()); LOG.info(" - input path: " + inputPath); LOG.info(" - index file: " + indexFile); LOG.info(" - language: " + language); 
LOG.info("Note: This tool only works on block-compressed SequenceFiles!"); conf.setJobName(String.format("BuildWikipediaForwardIndex[%s: %s, %s: %s, %s: %s]", INPUT_OPTION, inputPath, INDEX_FILE_OPTION, indexFile, LANGUAGE_OPTION, language)); conf.setNumReduceTasks(1); FileInputFormat.setInputPaths(conf, inputPath); FileOutputFormat.setOutputPath(conf, new Path(tmpPath)); FileOutputFormat.setCompressOutput(conf, false); if (language != null) { conf.set("wiki.language", language); } conf.setInputFormat(NoSplitSequenceFileInputFormat.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(Text.class); conf.setMapRunnerClass(MyMapRunner.class); conf.setReducerClass(IdentityReducer.class); // Delete the output directory if it exists already. fs.delete(new Path(tmpPath), true); RunningJob job = JobClient.runJob(conf); Counters counters = job.getCounters(); int blocks = (int) counters.getCounter(Blocks.Total); LOG.info("number of blocks: " + blocks); LOG.info("Writing index file..."); LineReader reader = new LineReader(fs.open(new Path(tmpPath + "/part-00000"))); FSDataOutputStream out = fs.create(new Path(indexFile), true); out.writeUTF(edu.umd.cloud9.collection.wikipedia.WikipediaForwardIndex.class.getCanonicalName()); out.writeUTF(inputPath.toString()); out.writeInt(blocks); int cnt = 0; Text line = new Text(); while (reader.readLine(line) > 0) { String[] arr = line.toString().split("\\s+"); int docno = Integer.parseInt(arr[0]); int offset = Integer.parseInt(arr[1]); short fileno = Short.parseShort(arr[2]); out.writeInt(docno); out.writeInt(offset); out.writeShort(fileno); cnt++; if (cnt % 100000 == 0) { LOG.info(cnt + " blocks written"); } } reader.close(); out.close(); if (cnt != blocks) { throw new RuntimeException("Error: mismatch in block count!"); } // Clean up. fs.delete(new Path(tmpPath), true); return 0; }
From source file:edu.umd.cloud9.demo.DemoWordCountJSON.java
License:Apache License
/** * Runs this tool.//from w ww . j ava2 s .c o m */ public int run(String[] args) throws Exception { if (args.length != 4) { printUsage(); return -1; } String inputPath = args[0]; String outputPath = args[1]; int numMapTasks = Integer.parseInt(args[2]); int numReduceTasks = Integer.parseInt(args[3]); sLogger.info("Tool: DemoWordCountJSON"); sLogger.info(" - input path: " + inputPath); sLogger.info(" - output path: " + outputPath); sLogger.info(" - number of mappers: " + numMapTasks); sLogger.info(" - number of reducers: " + numReduceTasks); JobConf conf = new JobConf(DemoWordCountTuple1.class); conf.setJobName("DemoWordCountJSON"); conf.setNumMapTasks(numMapTasks); conf.setNumReduceTasks(numReduceTasks); FileInputFormat.setInputPaths(conf, new Path(inputPath)); FileOutputFormat.setOutputPath(conf, new Path(outputPath)); FileOutputFormat.setCompressOutput(conf, false); conf.setInputFormat(SequenceFileInputFormat.class); conf.setOutputKeyClass(MyKey.class); conf.setOutputValueClass(IntWritable.class); conf.setOutputFormat(SequenceFileOutputFormat.class); conf.setMapperClass(MyMapper.class); conf.setCombinerClass(MyReducer.class); conf.setReducerClass(MyReducer.class); // Delete the output directory if it exists already Path outputDir = new Path(outputPath); FileSystem.get(conf).delete(outputDir, true); long startTime = System.currentTimeMillis(); JobClient.runJob(conf); sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); return 0; }
From source file:edu.umd.cloud9.demo.DemoWordCountTuple1.java
License:Apache License
/** * Runs this tool.//from w ww . j a v a 2s . co m */ public int run(String[] args) throws Exception { if (args.length != 4) { printUsage(); return -1; } String inputPath = args[0]; String outputPath = args[1]; int numMapTasks = Integer.parseInt(args[2]); int numReduceTasks = Integer.parseInt(args[3]); sLogger.info("Tool: DemoWordCountTuple1"); sLogger.info(" - input path: " + inputPath); sLogger.info(" - output path: " + outputPath); sLogger.info(" - number of mappers: " + numMapTasks); sLogger.info(" - number of reducers: " + numReduceTasks); JobConf conf = new JobConf(DemoWordCountTuple1.class); conf.setJobName("DemoWordCountTuple1"); conf.setNumMapTasks(numMapTasks); conf.setNumReduceTasks(numReduceTasks); FileInputFormat.setInputPaths(conf, new Path(inputPath)); FileOutputFormat.setOutputPath(conf, new Path(outputPath)); FileOutputFormat.setCompressOutput(conf, false); conf.setInputFormat(SequenceFileInputFormat.class); conf.setOutputKeyClass(Tuple.class); conf.setOutputValueClass(IntWritable.class); conf.setOutputFormat(SequenceFileOutputFormat.class); conf.setMapperClass(MyMapper.class); conf.setCombinerClass(MyReducer.class); conf.setReducerClass(MyReducer.class); // Delete the output directory if it exists already Path outputDir = new Path(outputPath); FileSystem.get(conf).delete(outputDir, true); long startTime = System.currentTimeMillis(); JobClient.runJob(conf); sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); return 0; }
From source file:edu.umd.cloud9.demo.DemoWordCountTuple2.java
License:Apache License
/** * Runs this tool./*from w w w. j av a 2s . c o m*/ */ public int run(String[] args) throws Exception { if (args.length != 4) { printUsage(); return -1; } String inputPath = args[0]; String outputPath = args[1]; int numMapTasks = Integer.parseInt(args[2]); int numReduceTasks = Integer.parseInt(args[3]); sLogger.info("Tool: DemoWordCountTuple2"); sLogger.info(" - input path: " + inputPath); sLogger.info(" - output path: " + outputPath); sLogger.info(" - number of mappers: " + numMapTasks); sLogger.info(" - number of reducers: " + numReduceTasks); JobConf conf = new JobConf(DemoWordCountTuple2.class); conf.setJobName("DemoWordCountTuple2"); conf.setNumMapTasks(numMapTasks); conf.setNumReduceTasks(numReduceTasks); FileInputFormat.setInputPaths(conf, new Path(inputPath)); FileOutputFormat.setOutputPath(conf, new Path(outputPath)); FileOutputFormat.setCompressOutput(conf, false); conf.setInputFormat(SequenceFileInputFormat.class); conf.setOutputKeyClass(Tuple.class); conf.setOutputValueClass(IntWritable.class); conf.setOutputFormat(SequenceFileOutputFormat.class); conf.setMapperClass(MapClass.class); conf.setCombinerClass(ReduceClass.class); conf.setReducerClass(ReduceClass.class); // Delete the output directory if it exists already Path outputDir = new Path(outputPath); FileSystem.get(conf).delete(outputDir, true); long startTime = System.currentTimeMillis(); JobClient.runJob(conf); sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); return 0; }
From source file:edu.umd.cloud9.example.simple.DemoMapredNullInput.java
License:Apache License
/**
 * Entry point for the demo: submits a job with ten map tasks and no reduce tasks that uses
 * {@code NullInputFormat} and {@code NullOutputFormat} around {@code MyMapper}.
 *
 * @param args command-line arguments (unused)
 * @throws IOException propagated from {@code JobClient.runJob}
 */
public static void main(String[] args) throws IOException {
    final JobConf job = new JobConf(DemoMapredNullInput.class);

    // What runs, and the formats on either side of it.
    job.setMapperClass(MyMapper.class);
    job.setInputFormat(NullInputFormat.class);
    job.setOutputFormat(NullOutputFormat.class);

    // Ten map tasks, no reduce phase.
    job.setNumReduceTasks(0);
    job.setNumMapTasks(10);

    job.setJobName("DemoMapredNullInput");

    JobClient.runJob(job);
}
From source file:edu.umd.cloud9.example.simple.DemoNullInput.java
License:Apache License
/**
 * Entry point for the demo: submits a job with ten map tasks and no reduce tasks that uses
 * {@code NullInputFormat} and {@code NullOutputFormat} around {@code MyMapper}.
 *
 * @param args command-line arguments (unused)
 * @throws IOException propagated from {@code JobClient.runJob}
 */
public static void main(String[] args) throws IOException {
    final JobConf job = new JobConf(DemoNullInput.class);

    // What runs, and the formats on either side of it.
    job.setMapperClass(MyMapper.class);
    job.setInputFormat(NullInputFormat.class);
    job.setOutputFormat(NullOutputFormat.class);

    // Ten map tasks, no reduce phase.
    job.setNumReduceTasks(0);
    job.setNumMapTasks(10);

    job.setJobName("DemoNullInput");

    JobClient.runJob(job);
}
From source file:edu.umd.cloud9.io.benchmark.HadoopSortRandomPairsOfInts.java
License:Apache License
/** * Runs this benchmark.//from www.j a v a2 s. c o m */ public static void main(String[] args) throws IOException { String inputPath = "random-pairs.seq"; String outputPath = "random-pairs.sorted"; int numMapTasks = 1; int numReduceTasks = 1; JobConf conf = new JobConf(HadoopSortRandomPairsOfInts.class); conf.setJobName("SortRandomPairsOfInts"); conf.setNumMapTasks(numMapTasks); conf.setNumReduceTasks(numReduceTasks); FileInputFormat.setInputPaths(conf, new Path(inputPath)); FileOutputFormat.setOutputPath(conf, new Path(outputPath)); FileOutputFormat.setCompressOutput(conf, false); conf.setInputFormat(SequenceFileInputFormat.class); conf.setOutputKeyClass(PairOfInts.class); conf.setOutputValueClass(IntWritable.class); conf.setOutputFormat(TextOutputFormat.class); conf.setMapperClass(IdentityMapper.class); conf.setCombinerClass(IdentityReducer.class); conf.setReducerClass(IdentityReducer.class); // Delete the output directory if it exists already Path outputDir = new Path(outputPath); FileSystem.get(conf).delete(outputDir, true); long startTime; double duration; startTime = System.currentTimeMillis(); JobClient.runJob(conf); duration = (System.currentTimeMillis() - startTime) / 1000.0; System.out.println("Job took " + duration + " seconds"); }
From source file:edu.umd.cloud9.pagerank.BuildPageRankRecords.java
License:Apache License
/** * Runs this tool.//from www .j a va 2s.c o m */ public int run(String[] args) throws Exception { if (args.length != 3) { printUsage(); return -1; } String inputPath = args[0]; String outputPath = args[1]; int n = Integer.parseInt(args[2]); sLogger.info("Tool name: BuildPageRankRecords"); sLogger.info(" - inputDir: " + inputPath); sLogger.info(" - outputDir: " + outputPath); sLogger.info(" - numNodes: " + n); JobConf conf = new JobConf(BuildPageRankRecords.class); conf.setJobName("PackageLinkGraph"); conf.setNumMapTasks(1); conf.setNumReduceTasks(0); conf.setInt("NodeCnt", n); conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024); TextInputFormat.addInputPath(conf, new Path(inputPath)); SequenceFileOutputFormat.setOutputPath(conf, new Path(outputPath)); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(SequenceFileOutputFormat.class); conf.setMapOutputKeyClass(IntWritable.class); conf.setMapOutputValueClass(PageRankNode.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(PageRankNode.class); conf.setMapperClass(MyMapper.class); conf.setReducerClass(IdentityReducer.class); // delete the output directory if it exists already FileSystem.get(conf).delete(new Path(outputPath), true); JobClient.runJob(conf); return 0; }
From source file:edu.umd.cloud9.pagerank.FindMaxPageRankNodes.java
License:Apache License
/** * Runs this tool./* www. j a va 2 s . co m*/ */ public int run(String[] args) throws Exception { if (args.length != 3) { printUsage(); return -1; } String inputPath = args[0]; String outputPath = args[1]; int n = Integer.parseInt(args[2]); sLogger.info("Tool name: FindMaxPageRankNodes"); sLogger.info(" - input: " + inputPath); sLogger.info(" - output: " + outputPath); sLogger.info(" - n: " + n); JobConf conf = new JobConf(FindMaxPageRankNodes.class); conf.setJobName("FindMaxPageRankNodes"); conf.setNumMapTasks(1); conf.setNumReduceTasks(1); conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024); conf.setInt("n", n); FileInputFormat.addInputPath(conf, new Path(inputPath)); FileOutputFormat.setOutputPath(conf, new Path(outputPath)); conf.setInputFormat(SequenceFileInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); conf.setMapOutputKeyClass(IntWritable.class); conf.setMapOutputValueClass(FloatWritable.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(FloatWritable.class); conf.setMapperClass(MyMapper.class); conf.setReducerClass(MyReducer.class); // delete the output directory if it exists already FileSystem.get(conf).delete(new Path(outputPath), true); JobClient.runJob(conf); return 0; }
From source file:edu.umd.cloud9.pagerank.PartitionGraph.java
License:Apache License
/**
 * Configures and launches the graph-partitioning job, which reads and writes SequenceFiles of
 * ({@code IntWritable}, {@code PageRankNode}) records using one reduce task per partition.
 *
 * @param args exactly five arguments: input path, output path, number of partitions, an integer
 *     flag (non-zero selects {@code RangePartitioner}), and the node count (stored in the job
 *     configuration under {@code "NodeCount"})
 * @return 0 once the job has run, or -1 (after printing usage) when the argument count is not five
 * @throws IOException if the job or a file-system operation fails
 */
public int run(String[] args) throws IOException {
    if (args.length != 5) {
        printUsage();
        return -1;
    }

    final String source = args[0];
    final String target = args[1];
    final int partitions = Integer.parseInt(args[2]);
    final boolean rangePartitioned = Integer.parseInt(args[3]) != 0;
    final int totalNodes = Integer.parseInt(args[4]);

    sLogger.info("Tool name: PartitionGraph");
    sLogger.info(" - inputDir: " + source);
    sLogger.info(" - outputDir: " + target);
    sLogger.info(" - numPartitions: " + partitions);
    sLogger.info(" - useRange?: " + rangePartitioned);
    sLogger.info(" - nodeCnt: " + totalNodes);

    final Path targetDir = new Path(target);

    final JobConf job = new JobConf(PartitionGraph.class);
    job.setJobName("Partition Graph " + partitions);

    // Processing classes; the partitioner is only overridden when range partitioning is requested.
    job.setMapperClass(MapClass.class);
    job.setReducerClass(ReduceClass.class);
    if (rangePartitioned) {
        job.setPartitionerClass(RangePartitioner.class);
    }

    // One reducer per partition, plus tuning parameters.
    job.setNumReduceTasks(partitions);
    job.setSpeculativeExecution(false);
    job.setInt("mapred.min.split.size", 1024 * 1024 * 1024);
    job.set("mapred.child.java.opts", "-Xmx2048m");
    job.setInt("NodeCount", totalNodes);

    // SequenceFile in, SequenceFile out.
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    FileInputFormat.setInputPaths(job, new Path(source));
    FileOutputFormat.setOutputPath(job, targetDir);

    // Intermediate and final records share the same key/value types.
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(PageRankNode.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(PageRankNode.class);

    // Remove any stale output so the job can write to a fresh directory.
    FileSystem.get(job).delete(targetDir, true);

    JobClient.runJob(job);

    return 0;
}