List of usage examples for org.apache.hadoop.mapreduce Job setOutputFormatClass
public void setOutputFormatClass(Class<? extends OutputFormat> cls) throws IllegalStateException
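For orientation, here is a minimal sketch of how this method is typically wired into a driver. The class name, job name, and argument handling below are illustrative placeholders and are not taken from any of the examples that follow; the identity Mapper/Reducer are used only to keep the sketch self-contained. Note that setOutputFormatClass must be called before the job is submitted; once the job is running it throws IllegalStateException.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

// Hypothetical driver class; a sketch of the call pattern, not a definitive implementation.
public class OutputFormatDemo {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "output-format-demo");
        job.setJarByClass(OutputFormatDemo.class);

        // Identity mapper and reducer: with the default TextInputFormat the
        // output pairs are (LongWritable offset, Text line).
        job.setMapperClass(Mapper.class);
        job.setReducerClass(Reducer.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));   // input path from the command line
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // output path from the command line

        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        // Must be set before submission; afterwards the call throws IllegalStateException.
        job.setOutputFormatClass(TextOutputFormat.class);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}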
From source file:ca.uwaterloo.cs.bigdata2017w.assignment0.PerfectX.java
License:Apache License
/**
 * Runs this tool.
 */
@Override
public int run(String[] argv) throws Exception {
    final Args args = new Args();
    CmdLineParser parser = new CmdLineParser(args, ParserProperties.defaults().withUsageWidth(100));

    try {
        parser.parseArgument(argv);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        parser.printUsage(System.err);
        return -1;
    }

    LOG.info("Tool: " + PerfectX.class.getSimpleName());
    LOG.info(" - input path: " + args.input);
    LOG.info(" - output path: " + args.output);
    LOG.info(" - number of reducers: " + args.numReducers);
    LOG.info(" - use in-mapper combining: " + args.imc);

    Configuration conf = getConf();
    Job job = Job.getInstance(conf);
    job.setJobName(PerfectX.class.getSimpleName());
    job.setJarByClass(PerfectX.class);

    job.setNumReduceTasks(args.numReducers);

    FileInputFormat.setInputPaths(job, new Path(args.input));
    FileOutputFormat.setOutputPath(job, new Path(args.output));

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapperClass(args.imc ? MyMapperIMC.class : MyMapper.class);
    job.setCombinerClass(MyReducer.class);
    job.setReducerClass(MyReducer.class);

    // Delete the output directory if it exists already.
    Path outputDir = new Path(args.output);
    FileSystem.get(conf).delete(outputDir, true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}
From source file:ca.uwaterloo.cs.bigdata2017w.assignment0.WordCount.java
License:Apache License
/**
 * Runs this tool.
 */
@Override
public int run(String[] argv) throws Exception {
    final Args args = new Args();
    CmdLineParser parser = new CmdLineParser(args, ParserProperties.defaults().withUsageWidth(100));

    try {
        parser.parseArgument(argv);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        parser.printUsage(System.err);
        return -1;
    }

    LOG.info("Tool: " + WordCount.class.getSimpleName());
    LOG.info(" - input path: " + args.input);
    LOG.info(" - output path: " + args.output);
    LOG.info(" - number of reducers: " + args.numReducers);
    LOG.info(" - use in-mapper combining: " + args.imc);

    Configuration conf = getConf();
    Job job = Job.getInstance(conf);
    job.setJobName(WordCount.class.getSimpleName());
    job.setJarByClass(WordCount.class);

    job.setNumReduceTasks(args.numReducers);

    FileInputFormat.setInputPaths(job, new Path(args.input));
    FileOutputFormat.setOutputPath(job, new Path(args.output));

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapperClass(args.imc ? MyMapperIMC.class : MyMapper.class);
    job.setCombinerClass(MyReducer.class);
    job.setReducerClass(MyReducer.class);

    // Delete the output directory if it exists already.
    Path outputDir = new Path(args.output);
    FileSystem.get(conf).delete(outputDir, true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}
From source file:ca.uwaterloo.cs.bigdata2017w.assignment4.BuildPersonalizedPageRankRecords.java
License:Apache License
/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(
            OptionBuilder.withArgName("num").hasArg().withDescription("number of nodes").create(NUM_NODES));
    options.addOption(
            OptionBuilder.withArgName("sources").hasArg().withDescription("source nodes").create(SOURCES));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();

    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT) || !cmdline.hasOption(NUM_NODES)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int n = Integer.parseInt(cmdline.getOptionValue(NUM_NODES));
    String sourcesString = cmdline.getOptionValue(SOURCES);
    String[] sources = sourcesString.split(",");
    for (int i = 0; i < sources.length; i++) {
        sources[i] = sources[i].trim();
    }

    LOG.info("Tool name: " + BuildPersonalizedPageRankRecords.class.getSimpleName());
    LOG.info(" - inputDir: " + inputPath);
    LOG.info(" - outputDir: " + outputPath);
    LOG.info(" - numNodes: " + n);
    LOG.info(" - use sources: " + sourcesString);

    Configuration conf = getConf();
    conf.setInt(NODE_CNT_FIELD, n);
    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);
    conf.setStrings(SOURCES, sources);

    Job job = Job.getInstance(conf);
    job.setJobName(BuildPersonalizedPageRankRecords.class.getSimpleName() + ":" + inputPath);
    job.setJarByClass(BuildPersonalizedPageRankRecords.class);

    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(PageRankNode.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(PageRankNode.class);

    job.setMapperClass(MyMapper.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    job.waitForCompletion(true);

    return 0;
}
From source file:cascading.flow.hadoop.MapReduceFlowPlatformTest.java
License:Open Source License
@Test
public void testCascade() throws IOException {
    getPlatform().copyFromLocal(inputFileApache);

    // Set up two standard Cascading flows that will generate the input for the first MapReduceFlow.
    Tap source1 = new Hfs(new TextLine(new Fields("offset", "line")), remove(inputFileApache, false));
    String sinkPath4 = getOutputPath("flow4");
    Tap sink1 = new Hfs(new TextLine(new Fields("offset", "line")), remove(sinkPath4, true), SinkMode.REPLACE);
    Flow firstFlow = getPlatform().getFlowConnector(getProperties()).connect(source1, sink1, new Pipe("first-flow"));

    String sinkPath5 = getOutputPath("flow5");
    Tap sink2 = new Hfs(new TextLine(new Fields("offset", "line")), remove(sinkPath5, true), SinkMode.REPLACE);
    Flow secondFlow = getPlatform().getFlowConnector(getProperties()).connect(sink1, sink2, new Pipe("second-flow"));

    JobConf defaultConf = HadoopPlanner.createJobConf(getProperties());

    JobConf firstConf = new JobConf(defaultConf);
    firstConf.setJobName("first-mr");

    firstConf.setOutputKeyClass(LongWritable.class);
    firstConf.setOutputValueClass(Text.class);

    firstConf.setMapperClass(IdentityMapper.class);
    firstConf.setReducerClass(IdentityReducer.class);

    firstConf.setInputFormat(TextInputFormat.class);
    firstConf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(firstConf, new Path(remove(sinkPath5, true)));
    String sinkPath1 = getOutputPath("flow1");
    FileOutputFormat.setOutputPath(firstConf, new Path(remove(sinkPath1, true)));

    Flow firstMR = new MapReduceFlow(firstConf, true);

    JobConf secondConf = new JobConf(defaultConf);
    secondConf.setJobName("second-mr");

    secondConf.setOutputKeyClass(LongWritable.class);
    secondConf.setOutputValueClass(Text.class);

    secondConf.setMapperClass(IdentityMapper.class);
    secondConf.setReducerClass(IdentityReducer.class);

    secondConf.setInputFormat(TextInputFormat.class);
    secondConf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(secondConf, new Path(remove(sinkPath1, true)));
    String sinkPath2 = getOutputPath("flow2");
    FileOutputFormat.setOutputPath(secondConf, new Path(remove(sinkPath2, true)));

    Flow secondMR = new MapReduceFlow(secondConf, true);

    Job job = new Job(defaultConf);
    job.setJobName("third-mr");

    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(org.apache.hadoop.mapreduce.Mapper.class);
    job.setReducerClass(org.apache.hadoop.mapreduce.Reducer.class);

    job.setInputFormatClass(org.apache.hadoop.mapreduce.lib.input.TextInputFormat.class);
    job.setOutputFormatClass(org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.class);
    job.getConfiguration().set("mapred.mapper.new-api", "true");
    job.getConfiguration().set("mapred.reducer.new-api", "true");

    org.apache.hadoop.mapreduce.lib.input.FileInputFormat.addInputPath(job, new Path(remove(sinkPath2, true)));
    String sinkPath3 = getOutputPath("flow3");
    org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.setOutputPath(job, new Path(remove(sinkPath3, true)));

    Flow thirdMR = new MapReduceFlow(new JobConf(job.getConfiguration()), true);

    CascadeConnector cascadeConnector = new CascadeConnector();

    // pass out of order
    Cascade cascade = cascadeConnector.connect(firstFlow, secondFlow, thirdMR, firstMR, secondMR);

    cascade.complete();

    validateLength(new Hfs(new TextLine(), sinkPath3).openForRead(new HadoopFlowProcess(defaultConf)), 10);
}
From source file:cassandra_mapreduce.MapReduceCassandraDB.java
License:GNU General Public License
public int run(String[] args) throws Exception {
    String columnName = "value";
    getConf().set(CONF_COLUMN_NAME, columnName);
    getConf().set("mapred.job.tracker", args[0] + ":8021");

    Job job = new Job(getConf(), "Phase1");
    job.setJarByClass(MapReduceCassandraDB.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setReducerClass(ReducerToCassandra.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(ByteBuffer.class);
    job.setOutputValueClass(List.class);
    job.setInputFormatClass(ColumnFamilyInputFormat.class);
    job.setOutputFormatClass(ColumnFamilyOutputFormat.class);

    ConfigHelper.setRangeBatchSize(job.getConfiguration(), 800);
    ConfigHelper.setOutputColumnFamily(job.getConfiguration(), KEYSPACE, OUTPUT_COLUMN_FAMILY);
    ConfigHelper.setRpcPort(job.getConfiguration(), "9160");
    ConfigHelper.setInitialAddress(job.getConfiguration(), args[0]);
    ConfigHelper.setPartitioner(job.getConfiguration(), "org.apache.cassandra.dht.RandomPartitioner");
    ConfigHelper.setInputColumnFamily(job.getConfiguration(), KEYSPACE, COLUMN_FAMILY);
    SlicePredicate predicate = new SlicePredicate()
            .setColumn_names(Arrays.asList(ByteBuffer.wrap(columnName.getBytes())));
    ConfigHelper.setInputSlicePredicate(job.getConfiguration(), predicate);

    job.waitForCompletion(true);

    // Phase 2
    Job job2 = new Job(getConf(), "Phase2");
    job2.setJarByClass(MapReduceCassandraDB.class);
    job2.setMapperClass(Mapper2.class);
    job2.setReducerClass(Reducer2.class);
    job2.setMapOutputKeyClass(Text.class);
    job2.setMapOutputValueClass(IntWritable.class);
    job2.setOutputKeyClass(ByteBuffer.class);
    job2.setOutputValueClass(List.class);
    job2.setInputFormatClass(ColumnFamilyInputFormat.class);
    job2.setOutputFormatClass(ColumnFamilyOutputFormat.class);

    ConfigHelper.setOutputColumnFamily(job2.getConfiguration(), KEYSPACE, OUTPUT_COLUMN_FAMILY2);
    ConfigHelper.setRpcPort(job2.getConfiguration(), "9160");
    ConfigHelper.setInitialAddress(job2.getConfiguration(), args[0]);
    ConfigHelper.setPartitioner(job2.getConfiguration(), "org.apache.cassandra.dht.RandomPartitioner");
    ConfigHelper.setInputColumnFamily(job2.getConfiguration(), KEYSPACE, OUTPUT_COLUMN_FAMILY);
    SlicePredicate predicate2 = new SlicePredicate()
            .setColumn_names(Arrays.asList(ByteBuffer.wrap(columnName.getBytes())));
    ConfigHelper.setInputSlicePredicate(job2.getConfiguration(), predicate2);

    job2.waitForCompletion(true);

    // Disabled single-phase variant kept from the original source:
    // job.setCombinerClass(IntSumReducer.class);
    // job.setReducerClass(IntSumReducer.class);
    // job.setOutputKeyClass(Text.class);
    // job.setOutputValueClass(Text.class);
    //
    // job.setInputFormatClass(ColumnFamilyInputFormat.class);
    // FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH_PREFIX));
    //
    // ConfigHelper.setRpcPort(job.getConfiguration(), "9160");
    // ConfigHelper.setInitialAddress(job.getConfiguration(), args[0]);
    // ConfigHelper.setPartitioner(job.getConfiguration(), "org.apache.cassandra.dht.RandomPartitioner");
    // ConfigHelper.setInputColumnFamily(job.getConfiguration(), KEYSPACE, COLUMN_FAMILY);
    // SlicePredicate predicate = new SlicePredicate().setColumn_names(Arrays.asList(ByteBuffer.wrap(columnName.getBytes())));
    // ConfigHelper.setInputSlicePredicate(job.getConfiguration(), predicate);
    //
    // job.waitForCompletion(true);

    return 0;
}
From source file:cc.slda.AnnotateDocuments.java
License:Apache License
/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers")
            .create(NUM_REDUCERS));
    options.addOption(OptionBuilder.withArgName(PCUTOFF).hasArg()
            .withDescription("probability of topic assignment").create(PCUTOFF));
    options.addOption(OptionBuilder.withArgName(INDEX).hasArg()
            .withDescription("path to data directory containing term and title indices").create(INDEX));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();

    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT) || !cmdline.hasOption(INDEX)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String indexPath = cmdline.getOptionValue(INDEX);
    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS)) : 1;

    float cutoff = 0.9f;
    if (cmdline.hasOption(PCUTOFF)) {
        cutoff = Float.parseFloat(cmdline.getOptionValue(PCUTOFF));
    }

    LOG.info("Tool: " + AnnotateDocuments.class.getSimpleName());
    LOG.info(" - indices path: " + indexPath);
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - number of reducers: " + reduceTasks);
    LOG.info(" - log(probCutoff): " + Math.log(cutoff));

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    Job job = Job.getInstance(conf);
    job.setJobName(AnnotateDocuments.class.getSimpleName());
    job.setJarByClass(AnnotateDocuments.class);

    String termIndex = indexPath + Path.SEPARATOR + TERM;
    String titleIndex = indexPath + Path.SEPARATOR + TITLE;

    Path termIndexPath = new Path(termIndex);
    Path titleIndexPath = new Path(titleIndex);

    Preconditions.checkArgument(fs.exists(termIndexPath), "Missing term index files... " + termIndexPath);
    DistributedCache.addCacheFile(termIndexPath.toUri(), job.getConfiguration());
    Preconditions.checkArgument(fs.exists(titleIndexPath), "Missing title index files... " + titleIndexPath);
    DistributedCache.addCacheFile(titleIndexPath.toUri(), job.getConfiguration());

    job.setNumReduceTasks(reduceTasks);
    conf.setFloat(PCUTOFF, cutoff);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(HMapSIW.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(HMapSIW.class);

    job.setMapperClass(MyMapper.class);

    // Delete the output directory if it exists already.
    Path outputDir = new Path(outputPath);
    FileSystem.get(conf).delete(outputDir, true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}
From source file:cgl.hadoop.apps.runner.DataAnalysis.java
License:Open Source License
/**
 * Launch the MapReduce computation.
 * This method first removes any previous working directories and creates a new one.
 * Then the data (file names) is copied to this new directory and the
 * MapReduce (map-only though) computation is launched.
 * @param numReduceTasks - Number of reduce tasks = 0.
 * @param programDir - The directory where the Cap3 program is.
 * @param execName - Name of the executable.
 * @param dataDir - Directory where the data is located.
 * @param outputDir - Output directory to place the output.
 * @param cmdArgs - These are the command line arguments to the Cap3 program.
 * @throws Exception - Throws any exception occurs in this program.
 */
void launch(int numReduceTasks, String programDir, String execName, String workingDir,
        String databaseArchive, String databaseName, String dataDir, String outputDir, String cmdArgs)
        throws Exception {

    Configuration conf = new Configuration();
    Job job = new Job(conf, execName);

    // First get the file system handler, delete any previous files, add the
    // files and write the data to it, then pass its name as a parameter to job
    Path hdMainDir = new Path(outputDir);
    FileSystem fs = FileSystem.get(conf);
    fs.delete(hdMainDir, true);

    Path hdOutDir = new Path(hdMainDir, "out");

    // Starting the data analysis.
    Configuration jc = job.getConfiguration();
    jc.set(WORKING_DIR, workingDir);
    jc.set(EXECUTABLE, execName);
    jc.set(PROGRAM_DIR, programDir); // this is the name of the executable archive
    jc.set(DB_ARCHIVE, databaseArchive);
    jc.set(DB_NAME, databaseName);
    jc.set(PARAMETERS, cmdArgs);
    jc.set(OUTPUT_DIR, outputDir);

    // using distributed cache
    // flush it
    //DistributedCache.releaseCache(new URI(programDir), jc);
    //DistributedCache.releaseCache(new URI(databaseArchive), jc);
    //DistributedCache.purgeCache(jc);

    // re-put the data into cache
    long startTime = System.currentTimeMillis();
    //DistributedCache.addCacheArchive(new URI(databaseArchive), jc);
    DistributedCache.addCacheArchive(new URI(programDir), jc);
    System.out.println(
            "Add Distributed Cache in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    FileInputFormat.setInputPaths(job, dataDir);
    FileOutputFormat.setOutputPath(job, hdOutDir);

    job.setJarByClass(DataAnalysis.class);
    job.setMapperClass(RunnerMap.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormatClass(DataFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setNumReduceTasks(numReduceTasks);

    startTime = System.currentTimeMillis();
    int exitStatus = job.waitForCompletion(true) ? 0 : 1;
    System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    // clean the cache
    System.exit(exitStatus);
}
From source file:chapter7.src.InputDriver.java
License:Apache License
public static void runJob(Path input, Path output, String vectorClassName, Configuration config)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = config;
    conf.set("vector.implementation.class.name", vectorClassName);

    Job job = new Job(conf, "Input Driver running over input: " + input);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapperClass(InputMapper.class);
    job.setNumReduceTasks(0);
    job.setJarByClass(InputDriver.class);

    FileInputFormat.addInputPath(job, input);
    FileOutputFormat.setOutputPath(job, output);

    job.waitForCompletion(true);
}
From source file:cityhub.CityHub.java
@Override
public int run(String[] strings) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "ReduceJoin");
    job.setJarByClass(CityHub.class);

    MultipleInputs.addInputPath(job, new Path(strings[0]), TextInputFormat.class, JoinMapper1.class);
    MultipleInputs.addInputPath(job, new Path(strings[1]), TextInputFormat.class, JoinMapper2.class);
    job.getConfiguration().set("join.type", "innerjoin");

    job.setReducerClass(JoinReducer.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, new Path(strings[2]));
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    boolean complete = job.waitForCompletion(true);

    Configuration conf1 = new Configuration();
    Job job2 = Job.getInstance(conf1, "chaining");
    if (complete) {
        job2.setJarByClass(CityHub.class);

        MultipleInputs.addInputPath(job2, new Path(strings[2]), TextInputFormat.class, JoinMapper3.class);
        MultipleInputs.addInputPath(job2, new Path(strings[3]), TextInputFormat.class, JoinMapper4.class);
        job2.getConfiguration().set("join.type", "innerjoin");

        job2.setReducerClass(JoinReducer1.class);
        job2.setOutputFormatClass(TextOutputFormat.class);
        job2.setOutputKeyClass(Text.class);
        job2.setOutputValueClass(Text.class);
        TextOutputFormat.setOutputPath(job2, new Path(strings[4]));
    }

    boolean success = job2.waitForCompletion(true);
    return success ? 0 : 4;
}
From source file:clustering.similarity.ISimDriver.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.printf("usage: %s simpre_dir output_dir " + "[compression_or_not] [reduce_task_number]\n",
                getClass().getSimpleName());
        System.exit(1);
    }

    Configuration conf = getConf();
    conf = MapReduceUtils.initConf(conf);

    Job job = Job.getInstance(conf, "isim job");
    job.setJarByClass(ISimDriver.class);

    if (args.length > 2 && args[2].equals("0")) {
        FileInputFormat.addInputPath(job, new Path(args[0]));
        job.setInputFormatClass(KeyValueTextInputFormat.class);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
    } else {
        job.setInputFormatClass(SequenceFileAsTextInputFormat.class);
        SequenceFileInputFormat.addInputPath(job, new Path(args[0]));

        conf.setBoolean("mapreduce.map.output.compress", true);
        conf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.GzipCodec");

        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        SequenceFileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
        SequenceFileOutputFormat.setOutputCompressorClass(job, org.apache.hadoop.io.compress.GzipCodec.class);
        SequenceFileOutputFormat.setOutputPath(job, new Path(args[1]));
    }

    if (args.length > 3) {
        conf.setInt("reduce.num", Integer.valueOf(args[3]));
    } else {
        conf.setInt("reduce.num", 5);
    }

    job.setMapperClass(ISimMapper.class);
    job.setMapOutputKeyClass(IntIntTupleWritable.class);
    job.setMapOutputValueClass(DoubleWritable.class);

    job.setCombinerClass(ISimCombiner.class);
    job.setPartitionerClass(HashPartitioner.class);

    job.setNumReduceTasks(conf.getInt("reduce.num", 1));
    job.setReducerClass(ISimReducer.class);

    job.setOutputKeyClass(IntIntTupleWritable.class);
    job.setOutputValueClass(DoubleWritable.class);

    long starttime = System.currentTimeMillis();
    boolean complete = job.waitForCompletion(true);
    long endtime = System.currentTimeMillis();
    System.out.println("inverted similarity job finished in: " + (endtime - starttime) / 1000 + " seconds");

    return complete ? 0 : 1;
}