List of usage examples for org.apache.hadoop.mapred JobConf setOutputKeyClass
public void setOutputKeyClass(Class<?> theClass)
From source file:edu.ucsb.cs.partitioning.cosine.CosinePartitioning.java
License:Apache License
/** * Sets MapReduce input configurations for the core cosine partitioning job. *//*from ww w. j a va 2 s . c o m*/ public static JobConf setMapReduce(JobConf job, Class mapper, Class reducer) { job.setMapperClass(mapper); job.setMapOutputKeyClass(IntIntWritable.class); job.setMapOutputValueClass(IdFeatureWeightArrayWritable.class); job.setNumReduceTasks(job.getInt(Config.NUM_PARTITIONS_PROPERTY, Config.NUM_PARTITIONS_VALUE)); job.setReducerClass(reducer); job.setOutputKeyClass(IntIntWritable.class); job.setOutputValueClass(IdFeatureWeightArrayWritable.class); return job; }
From source file:edu.ucsb.cs.partitioning.lsh.LshPartitionMain.java
License:Apache License
public static void main(String args[]) throws ParseException, IOException { JobConf job = new JobConf(); job.setJarByClass(LshPartitionMain.class); job.setJobName(LshPartitionMain.class.getSimpleName()); GenericOptionsParser gop = new GenericOptionsParser(job, args); args = gop.getRemainingArgs();/*from ww w .j av a 2 s . c om*/ job.setMapperClass(LshMapper.class); job.setMapOutputKeyClass(IntArrayWritable.class); // signatures job.setMapOutputValueClass(LongWritable.class); // doc IDs job.setNumReduceTasks(job.getInt(NUM_REDUCERS_PROPERTY, NUM_REDUCERS_VALUE)); job.setReducerClass(LshReducer.class); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(Text.class); String inputDir = args[0]; if (inputDir == null) { throw new UnsupportedOperationException("ERROR: input directory not set."); } FileInputFormat.addInputPath(job, new Path(inputDir)); Path outputPath = new Path("lsh-jaccard-buckets"); FileOutputFormat.setOutputPath(job, outputPath); FileSystem.get(job).delete(outputPath, true); LshTable lshTable = new LshTable(job.getInt(K_PROPERTY, K_VALUE), job.getInt(L_PROPERTY, L_VALUE), 1024, job.getLong(NUM_FEATURES_PROPERTY, NUM_FEATURES_VALUE), job.getFloat(THRESHOLD_PROPERTY, THRESHOLD_VALUE)); writeLsh(job, outputPath.getFileSystem(job), lshTable); run(job); }
From source file:edu.ucsb.cs.preprocessing.sequence.SeqWriter.java
License:Apache License
/** * Runs a MR job with maps only to convert input directory of numeric valued * records to hadoop sequence format. It assumes a text input of format of * [id feature weight ..] to be the format of input. *//*from www . j ava 2s.c o m*/ public static void writeSequence() throws IOException { JobConf job = new JobConf(); job.setJobName("Convert text vectors to hadoop seqeunce "); job.setJarByClass(SeqWriter.class); job.setMapperClass(SeqMapper.class); job.setNumReduceTasks(0); job.setMapOutputKeyClass(LongWritable.class); job.setMapOutputValueClass(FeatureWeightArrayWritable.class); job.setOutputKeyClass(LongWritable.class); job.setOutputValueClass(FeatureWeightArrayWritable.class); job.setInputFormat(TextInputFormat.class); TextInputFormat.addInputPath(job, new Path(INPUT_DIR)); FileSystem.get(job).delete(new Path(HashPagesDriver.IDS_FILE2), true); Path outputPath = new Path(OUTPUT_DIR); FileSystem.get(job).delete(outputPath, true); job.setOutputFormat(SequenceFileOutputFormat.class); SequenceFileOutputFormat.setOutputPath(job, outputPath); JobSubmitter.run(job, "PREPROCESS", -1); }
From source file:edu.ucsb.cs.sort.length.LengthSortMain.java
License:Apache License
/** * Sets the job configurations including the mapper and reducer classes to * do the sorting based on vector lengths. *///from ww w . j av a 2 s . com public static void main(String[] args) throws IOException { JobConf job = new JobConf(); new GenericOptionsParser(job, args); job.setJobName(LengthSortMain.class.getSimpleName()); job.setJarByClass(LengthSortMain.class); job.setMapperClass(LengthSortMapper.class); job.setMapOutputKeyClass(FloatWritable.class); job.setMapOutputValueClass(IdFeatureWeightArrayWritable.class); job.setPartitionerClass(LengthRangePartitioner.class); job.setReducerClass(LengthSortReducer.class); job.setNumReduceTasks(job.getInt(SortDriver.NUM_REDUCE_PROPERTY, SortDriver.NUM_REDUCE_VALUE)); job.setOutputKeyClass(LongWritable.class); job.setOutputValueClass(FeatureWeightArrayWritable.class); // // set input & output // String inputDir = SortDriver.INPUT_DIR; if (inputDir == null) { throw new UnsupportedOperationException("ERROR: input path not set"); } job.setInputFormat(SequenceFileInputFormat.class); SequenceFileInputFormat.addInputPath(job, new Path(inputDir)); Path outputPath = new Path(SortDriver.OUTPUT_DIR); FileSystem.get(job).delete(outputPath, true); job.setOutputFormat(SequenceFileOutputFormat.class); FileOutputFormat.setOutputPath(job, outputPath); // // run // JobSubmitter.run(job, "Sort By Vector Lenghts", -1); }
From source file:edu.ucsb.cs.sort.maxw.MaxwSortMain.java
License:Apache License
/** * Main method sets the job configurations including the mapper and reducer * classes to do the sorting./* w w w.j av a 2 s. co m*/ */ public static void main(String[] args) throws IOException { JobConf job = new JobConf(); new GenericOptionsParser(job, args); // ToolRunner.printGenericCommandUsage(System.out); job.setJobName(MaxwSortMain.class.getSimpleName()); job.setJarByClass(MaxwSortMain.class); job.setMapperClass(MaxwSortMapper.class); job.setMapOutputKeyClass(FloatWritable.class); job.setMapOutputValueClass(IdFeatureWeightArrayWritable.class); job.setPartitionerClass(MaxwRangePartitioner.class); job.setReducerClass(MaxwSortReducer.class); job.setNumReduceTasks(job.getInt(SortDriver.NUM_REDUCE_PROPERTY, SortDriver.NUM_REDUCE_VALUE)); job.setOutputKeyClass(LongWritable.class); job.setOutputValueClass(FeatureWeightArrayWritable.class); // // set input & output // String inputDir = SortDriver.INPUT_DIR; if (inputDir == null) { throw new UnsupportedOperationException("ERROR: input path not set"); } job.setInputFormat(SequenceFileInputFormat.class); SequenceFileInputFormat.addInputPath(job, new Path(inputDir)); Path outputPath = new Path(SortDriver.OUTPUT_DIR); FileSystem.get(job).delete(outputPath, true); job.setOutputFormat(SequenceFileOutputFormat.class); FileOutputFormat.setOutputPath(job, outputPath); // // run // JobSubmitter.run(job, "Sort By infinity-Norm", -1); }
From source file:edu.ucsb.cs.sort.norm.NormSortMain.java
License:Apache License
/** * Main method sets the job configurations including the mapper and reducer * classes to do the sorting. Some of the produced partitions might be * merged later to reflect the number of partitions chosen by the user. *//* w ww . j a v a2 s. co m*/ public static void main(String[] args) throws IOException { JobConf job = new JobConf(); new GenericOptionsParser(job, args); job.setJobName("NormSort"); job.setJarByClass(NormSortMain.class); job.setMapperClass(NormSortMapper.class); job.setMapOutputKeyClass(FloatWritable.class); job.setMapOutputValueClass(IdFeatureWeightArrayWritable.class); job.setPartitionerClass(NormRangePartitioner.class); job.setReducerClass(NormSortReducer.class); job.setNumReduceTasks(job.getInt(SortDriver.NUM_REDUCE_PROPERTY, SortDriver.NUM_REDUCE_VALUE)); job.setOutputKeyClass(LongWritable.class); job.setOutputValueClass(FeatureWeightArrayWritable.class); // // set input & output // String inputDir = SortDriver.INPUT_DIR; if (inputDir == null) { throw new UnsupportedOperationException("ERROR: input path not set"); } job.setInputFormat(SequenceFileInputFormat.class); SequenceFileInputFormat.addInputPath(job, new Path(inputDir)); Path outputPath = new Path(SortDriver.OUTPUT_DIR); FileSystem.get(job).delete(outputPath, true); job.setOutputFormat(SequenceFileOutputFormat.class); FileOutputFormat.setOutputPath(job, outputPath); // // run // JobSubmitter.run(job, "Sort By p-norm", -1); }
From source file:edu.ucsb.cs.sort.signature.SigSortMain.java
License:Apache License
/** * Sets the job configurations including the mapper and reducer classes to * do the sorting based signatures.//from w w w . jav a 2 s . c om */ public static void main(String[] args) throws IOException { JobConf job = new JobConf(); new GenericOptionsParser(job, args); job.setJobName(SigSortMain.class.getSimpleName()); job.setJarByClass(SigSortMain.class); job.setMapperClass(SigSortMapper.class); job.setMapOutputKeyClass(BitSignature.class); job.setMapOutputValueClass(LongWritable.class); job.setPartitionerClass(SigRangePartitioner.class); job.setReducerClass(SigSortReducer.class); job.setNumReduceTasks(job.getInt(SortDriver.NUM_REDUCE_PROPERTY, SortDriver.NUM_REDUCE_VALUE)); job.setOutputKeyClass(LongWritable.class); job.setOutputValueClass(BitSignature.class); // // set input & output // String inputDir = SortDriver.INPUT_DIR; if (inputDir == null) { throw new UnsupportedOperationException("ERROR: input path not set"); } job.setInputFormat(SequenceFileInputFormat.class); SequenceFileInputFormat.addInputPath(job, new Path(inputDir)); Path outputPath = new Path(OUTPUT_PATH); FileSystem.get(job).delete(outputPath, true); job.setOutputFormat(SequenceFileOutputFormat.class); FileOutputFormat.setOutputPath(job, outputPath); // // run // JobSubmitter.run(job, "Sort By Signature Bytes", -1); }
From source file:edu.umd.cloud9.collection.aquaint2.DemoCountAquaint2Documents.java
License:Apache License
/** * Runs this tool.//from w w w . j a v a2 s .c o m */ public int run(String[] args) throws Exception { if (args.length != 4) { printUsage(); return -1; } String inputPath = args[0]; String outputPath = args[1]; String mappingFile = args[2]; int mapTasks = Integer.parseInt(args[3]); System.out.println("input dir: " + inputPath); System.out.println("output dir: " + outputPath); System.out.println("mapping file: " + mappingFile); System.out.println("number of mappers: " + mapTasks); JobConf conf = new JobConf(DemoCountAquaint2Documents.class); conf.setJobName("DemoCountAquaint2Documents"); conf.setNumMapTasks(mapTasks); conf.setNumReduceTasks(0); FileInputFormat.setInputPaths(conf, new Path(inputPath)); FileOutputFormat.setOutputPath(conf, new Path(outputPath)); FileOutputFormat.setCompressOutput(conf, false); conf.setInputFormat(Aquaint2DocumentInputFormatOld.class); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(IntWritable.class); conf.setMapperClass(MyMapper.class); // delete the output directory if it exists already FileSystem.get(conf).delete(new Path(outputPath), true); JobClient.runJob(conf); // clean up FileSystem.get(conf).delete(new Path(outputPath), true); return 0; }
From source file:edu.umd.cloud9.collection.aquaint2.NumberAquaint2Documents.java
License:Apache License
/** * Runs this tool.// w w w .j a va 2 s. c o m */ public int run(String[] args) throws Exception { if (args.length != 3) { printUsage(); return -1; } String inputPath = args[0]; String outputPath = args[1]; String outputFile = args[2]; int mapTasks = 10; LOG.info("Tool: " + NumberAquaint2Documents.class.getCanonicalName()); LOG.info(" - Input path: " + inputPath); LOG.info(" - Output path: " + outputPath); LOG.info(" - Output file: " + outputFile); JobConf conf = new JobConf(NumberAquaint2Documents.class); conf.setJobName(NumberAquaint2Documents.class.getSimpleName()); conf.setNumMapTasks(mapTasks); conf.setNumReduceTasks(1); FileInputFormat.setInputPaths(conf, new Path(inputPath)); FileOutputFormat.setOutputPath(conf, new Path(outputPath)); FileOutputFormat.setCompressOutput(conf, false); conf.setInputFormat(Aquaint2DocumentInputFormat.class); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(IntWritable.class); conf.setOutputFormat(TextOutputFormat.class); conf.setMapperClass(MyMapper.class); conf.setReducerClass(MyReducer.class); // Delete the output directory if it exists already. FileSystem.get(conf).delete(new Path(outputPath), true); JobClient.runJob(conf); Aquaint2DocnoMapping.writeDocnoData(new Path(outputPath + "/part-00000"), new Path(outputFile), FileSystem.get(conf)); return 0; }
From source file:edu.umd.cloud9.collection.clue.ClueWarcForwardIndexBuilder.java
License:Apache License
/** * Runs this tool.//from ww w .ja va2 s . c o m */ @SuppressWarnings("static-access") public int run(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("path").hasArg() .withDescription("(required) collection path (must be block-compressed SequenceFiles)") .create(COLLECTION_OPTION)); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) output index path") .create(INDEX_OPTION)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(COLLECTION_OPTION) || !cmdline.hasOption(INDEX_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } JobConf conf = new JobConf(getConf(), ClueWarcForwardIndexBuilder.class); FileSystem fs = FileSystem.get(conf); String collectionPath = cmdline.getOptionValue(COLLECTION_OPTION); String indexFile = cmdline.getOptionValue(INDEX_OPTION); LOG.info("Tool name: " + ClueWarcForwardIndexBuilder.class.getSimpleName()); LOG.info(" - collection path: " + collectionPath); LOG.info(" - index file: " + indexFile); LOG.info("Note: This tool only works on block-compressed SequenceFiles!"); Random random = new Random(); Path outputPath = new Path( "tmp-" + ClueWarcForwardIndexBuilder.class.getSimpleName() + "-" + random.nextInt(10000)); conf.setJobName(ClueWarcForwardIndexBuilder.class.getSimpleName() + ":" + collectionPath); conf.setNumMapTasks(100); conf.setNumReduceTasks(1); // Note, we have to add the files one by one, otherwise, SequenceFileInputFormat // thinks its a MapFile. for (FileStatus status : fs.listStatus(new Path(collectionPath))) { FileInputFormat.addInputPath(conf, status.getPath()); } FileOutputFormat.setOutputPath(conf, outputPath); FileOutputFormat.setCompressOutput(conf, false); conf.setInputFormat(NoSplitSequenceFileInputFormat.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(Text.class); conf.setMapRunnerClass(MyMapRunner.class); conf.setReducerClass(IdentityReducer.class); // delete the output directory if it exists already fs.delete(outputPath, true); RunningJob job = JobClient.runJob(conf); Counters counters = job.getCounters(); int blocks = (int) counters.findCounter(Blocks.Total).getCounter(); LOG.info("number of blocks: " + blocks); LOG.info("Writing index file..."); LineReader reader = new LineReader(fs.open(new Path(outputPath + "/part-00000"))); FSDataOutputStream out = fs.create(new Path(indexFile), true); out.writeUTF(ClueWarcForwardIndex.class.getCanonicalName()); out.writeUTF(collectionPath); out.writeInt(blocks); int cnt = 0; Text line = new Text(); while (reader.readLine(line) > 0) { String[] arr = line.toString().split("\\s+"); int docno = Integer.parseInt(arr[0]); int offset = Integer.parseInt(arr[1]); short fileno = Short.parseShort(arr[2]); out.writeInt(docno); out.writeInt(offset); out.writeShort(fileno); cnt++; if (cnt % 100000 == 0) { LOG.info(cnt + " blocks written"); } } reader.close(); out.close(); if (cnt != blocks) { throw new RuntimeException("Error: mismatch in block count!"); } fs.delete(outputPath, true); return 0; }