List of usage examples for org.apache.hadoop.mapreduce.Job#setSortComparatorClass
public void setSortComparatorClass(Class<? extends RawComparator> cls) throws IllegalStateException
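setSortComparatorClass controls the RawComparator used to order intermediate map output keys during the shuffle; calling it after the job has been submitted throws IllegalStateException. Before the project examples below, here is a minimal sketch of the usual pattern: subclass WritableComparator (which implements RawComparator) and register it on the job. The ReverseIntComparator name and the IntWritable key type are illustrative assumptions, not taken from any example on this page.

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

// Hypothetical comparator that reverses the natural IntWritable ordering.
public class ReverseIntComparator extends WritableComparator {
    protected ReverseIntComparator() {
        super(IntWritable.class, true); // true: create key instances for compare()
    }

    @Override
    @SuppressWarnings("rawtypes")
    public int compare(WritableComparable a, WritableComparable b) {
        return -super.compare(a, b); // negate the natural order to sort descending
    }
}

// Registration, before the job is submitted:
//   job.setMapOutputKeyClass(IntWritable.class);
//   job.setSortComparatorClass(ReverseIntComparator.class);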
From source file:ca.uwaterloo.iss4e.hadoop.pointperrow.CosineMain.java
License:Open Source License
public int run(String[] args) throws IOException {
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: ca.uwaterloo.iss4e.hadoop.pointperrow.CosineMain <input> <output>");
        System.exit(2);
    }
    Job job1 = new Job(conf, "CosineMain");
    job1.setJarByClass(CosineMain.class);
    job1.setMapperClass(AggregateReadingsMapper.class);
    job1.setMapOutputKeyClass(LongWritable.class);
    job1.setMapOutputValueClass(DoubleWritable.class);
    job1.setReducerClass(AggregateReadingsReducer.class);
    job1.setOutputKeyClass(LongWritable.class);
    job1.setOutputValueClass(Text.class);
    FileInputFormat.setInputDirRecursive(job1, true);
    FileInputFormat.setInputPaths(job1, new Path(otherArgs[0]));
    int lastIdx = otherArgs[0].lastIndexOf("/");
    String tempOutput = otherArgs[0].substring(0, lastIdx) + "/temp";
    FileOutputFormat.setOutputPath(job1, new Path(tempOutput));
    System.out.println("\nStarting Job-1 ...");
    final long startTime = System.currentTimeMillis();
    try {
        final long startTimeJob1 = System.currentTimeMillis();
        if (!job1.waitForCompletion(true)) {
            System.out.println("Job-1 failed.");
        } else {
            System.out.println("Duration of Job1 "
                    + ((System.currentTimeMillis() - startTimeJob1) / 1000.0) + " seconds.");
            final Job job2 = new Job(conf, "CosineMain Aggregate");
            job2.setJarByClass(CosineMain.class);
            job2.setInputFormatClass(CartesianInputFormat.class);
            CartesianInputFormat.setLeftInputInfo(job2, TextInputFormat.class, tempOutput);
            CartesianInputFormat.setRightInputInfo(job2, TextInputFormat.class, tempOutput);
            FileOutputFormat.setOutputPath(job2, new Path(otherArgs[1]));
            job2.setMapperClass(CartesianProductMapper.class);
            job2.setMapOutputKeyClass(DoubleWritable.class);
            job2.setMapOutputValueClass(Text.class);
            // Sort the DoubleWritable similarity keys in descending order.
            job2.setSortComparatorClass(DescendingKeyComparator.class);
            job2.setReducerClass(CartesianProductReducer.class);
            job2.setOutputKeyClass(Text.class);
            job2.setOutputValueClass(DoubleWritable.class);
            job2.setNumReduceTasks(10);
            final long startTimeJob2 = System.currentTimeMillis();
            System.out.println("\nStarting Job-2 ...");
            if (!job2.waitForCompletion(true)) {
                System.out.println("Job-2 failed.");
            } else {
                System.out.println("Duration of Job2: "
                        + ((System.currentTimeMillis() - startTimeJob2) / 1000.0) + " seconds.");
            }
        }
        FileSystem fs = FileSystem.get(conf);
        fs.delete(new Path(tempOutput), true);
    } catch (Exception e) {
        throw new RuntimeException(e);
    } finally {
        final double duration = (System.currentTimeMillis() - startTime) / 1000.0;
        System.out.println("Total Duration: " + duration + " seconds.");
    }
    return 0;
}
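The DescendingKeyComparator registered on job2 above is project code that is not shown in this snippet. A plausible minimal version for the DoubleWritable map output keys, reversing the natural ascending order so the largest cosine similarities reach the reducer first, could look like the following sketch (an assumption, not the project's actual source):

import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

// Sketch only: orders DoubleWritable keys from largest to smallest.
public class DescendingKeyComparator extends WritableComparator {
    protected DescendingKeyComparator() {
        super(DoubleWritable.class, true); // deserialize keys for compare()
    }

    @Override
    @SuppressWarnings("rawtypes")
    public int compare(WritableComparable a, WritableComparable b) {
        return -((DoubleWritable) a).compareTo((DoubleWritable) b);
    }
}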
From source file:ca.uwaterloo.iss4e.hadoop.pointperrow.CosineMain.java
License:Open Source License
public int run1(String[] args) throws IOException {
    if (args.length != 3) {
        System.err.println("Usage: java " + getClass().getName() + " <inputDir> <outDir> <ntasks>");
        ToolRunner.printGenericCommandUsage(System.err);
        return -1;
    }
    Configuration conf = getConf();
    final Job job2 = new Job(conf, "CosineMain cartesian product");
    job2.setJarByClass(CosineMain.class);
    job2.setInputFormatClass(CartesianInputFormat.class);
    CartesianInputFormat.setLeftInputInfo(job2, TextInputFormat.class, args[0]);
    CartesianInputFormat.setRightInputInfo(job2, TextInputFormat.class, args[0]);
    FileOutputFormat.setOutputPath(job2, new Path(args[1]));
    job2.setMapperClass(CartesianProductMapper.class);
    job2.setMapOutputKeyClass(DoubleWritable.class);
    job2.setMapOutputValueClass(Text.class);
    // Sort the DoubleWritable similarity keys in descending order.
    job2.setSortComparatorClass(DescendingKeyComparator.class);
    job2.setReducerClass(CartesianProductReducer.class);
    job2.setOutputKeyClass(Text.class);
    job2.setOutputValueClass(DoubleWritable.class);
    job2.setNumReduceTasks(Integer.parseInt(args[2]));
    System.out.println("\nStarting Job-2 ...");
    final long startTime = System.currentTimeMillis();
    try {
        if (!job2.waitForCompletion(true)) {
            System.out.println("Job-2 failed.");
            System.exit(1);
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    } finally {
        final double duration = (System.currentTimeMillis() - startTime) / 1000.0;
        System.out.println("Duration is " + duration + " seconds.");
    }
    return 0;
}
From source file:com.asakusafw.runtime.mapreduce.simple.SimpleJobRunnerTest.java
License:Apache License
/**
 * Test for map-reduce job.
 * @throws Exception if failed
 */
@Test
public void map_reduce() throws Exception {
    Job job = newJob();
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setMapperClass(WordCountMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);
    job.setSortComparatorClass(Text.Comparator.class);
    job.setGroupingComparatorClass(Text.Comparator.class);
    job.setReducerClass(WordCountReducer.class);
    job.setNumReduceTasks(1);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);
    File inputDir = folder.newFolder();
    File inputFile = new File(inputDir, "input.txt");
    write(inputFile, new String[] { "a b c d", "a a b c", "c", });
    File outputDir = folder.newFolder();
    outputDir.delete();
    FileInputFormat.setInputPaths(job, new Path(inputFile.toURI()));
    FileOutputFormat.setOutputPath(job, new Path(outputDir.toURI()));
    assertThat(new SimpleJobRunner().run(job), is(true));
    assertThat(toMap(read(outputDir)),
            is(map(new String[] { "a", "3", "b", "2", "c", "3", "d", "1", })));
}
From source file:com.asakusafw.runtime.stage.AbstractStageClient.java
License:Apache License
@SuppressWarnings("rawtypes") private void configureShuffle(Job job, VariableTable variables) { Class<? extends Reducer> reducer = getReducerClassOrNull(); if (reducer != null) { if (LOG.isDebugEnabled()) { LOG.debug(MessageFormat.format("Reducer: {0}", reducer.getName())); //$NON-NLS-1$ }/*w w w . ja v a 2 s. c o m*/ job.setReducerClass(reducer); } else { if (LOG.isDebugEnabled()) { LOG.debug("Reducer: N/A"); //$NON-NLS-1$ } job.setNumReduceTasks(0); return; } Class<? extends Writable> outputKeyClass = or(getShuffleKeyClassOrNull(), NullWritable.class); Class<? extends Writable> outputValueClass = or(getShuffleValueClassOrNull(), NullWritable.class); if (LOG.isDebugEnabled()) { LOG.debug(MessageFormat.format("Shuffle: key={0}, value={1}", //$NON-NLS-1$ outputKeyClass.getName(), outputValueClass.getName())); } job.setMapOutputKeyClass(outputKeyClass); job.setMapOutputValueClass(outputValueClass); Class<? extends Reducer> combiner = getCombinerClassOrNull(); if (combiner != null) { if (LOG.isDebugEnabled()) { LOG.debug(MessageFormat.format("Combiner: {0}", combiner.getName())); //$NON-NLS-1$ } job.setCombinerClass(combiner); } else { if (LOG.isDebugEnabled()) { LOG.debug("Combiner: N/A"); //$NON-NLS-1$ } } Class<? extends Partitioner> partitioner = getPartitionerClassOrNull(); if (partitioner != null) { if (LOG.isDebugEnabled()) { LOG.debug(MessageFormat.format("Partitioner: {0}", partitioner.getName())); //$NON-NLS-1$ } job.setPartitionerClass(partitioner); } else { if (LOG.isDebugEnabled()) { LOG.debug("Partitioner: DEFAULT"); //$NON-NLS-1$ } } Class<? extends RawComparator> groupingComparator = getGroupingComparatorClassOrNull(); if (groupingComparator != null) { if (LOG.isDebugEnabled()) { LOG.debug(MessageFormat.format("GroupingComparator: {0}", groupingComparator.getName())); //$NON-NLS-1$ } job.setGroupingComparatorClass(groupingComparator); } else { if (LOG.isDebugEnabled()) { LOG.debug("GroupingComparator: DEFAULT"); //$NON-NLS-1$ } } Class<? extends RawComparator> sortComparator = getSortComparatorClassOrNull(); if (sortComparator != null) { if (LOG.isDebugEnabled()) { LOG.debug(MessageFormat.format("SortComparator: {0}", sortComparator.getName())); //$NON-NLS-1$ } job.setSortComparatorClass(sortComparator); } else { if (LOG.isDebugEnabled()) { LOG.debug("SortComparator: DEFAULT"); //$NON-NLS-1$ } } }
From source file:com.asakusafw.thundergate.runtime.cache.mapreduce.CacheBuildClient.java
License:Apache License
private void updateMerge() throws IOException, InterruptedException {
    Job job = newJob();
    List<StageInput> inputList = new ArrayList<>();
    inputList.add(new StageInput(storage.getHeadContents("*").toString(),
            TemporaryInputFormat.class, MergeJoinBaseMapper.class));
    inputList.add(new StageInput(storage.getPatchContents("*").toString(),
            TemporaryInputFormat.class, MergeJoinPatchMapper.class));
    StageInputDriver.set(job, inputList);
    job.setInputFormatClass(StageInputFormat.class);
    job.setMapperClass(StageInputMapper.class);
    job.setMapOutputKeyClass(PatchApplyKey.class);
    job.setMapOutputValueClass(modelClass);
    // combiner may have no effect in normal cases
    job.setReducerClass(MergeJoinReducer.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(modelClass);
    job.setPartitionerClass(PatchApplyKey.Partitioner.class);
    job.setSortComparatorClass(PatchApplyKey.SortComparator.class);
    job.setGroupingComparatorClass(PatchApplyKey.GroupComparator.class);
    TemporaryOutputFormat.setOutputPath(job, getNextDirectory());
    job.setOutputFormatClass(TemporaryOutputFormat.class);
    job.getConfiguration().setClass("mapred.output.committer.class",
            LegacyBridgeOutputCommitter.class, org.apache.hadoop.mapred.OutputCommitter.class);
    LOG.info(MessageFormat.format("applying patch (merge join): {0} / {1} -> {2}",
            storage.getPatchContents("*"), storage.getHeadContents("*"), getNextContents()));
    try {
        boolean succeed = job.waitForCompletion(true);
        LOG.info(MessageFormat.format("applied patch (merge join): succeed={0}, {1} / {2} -> {3}",
                succeed, storage.getPatchContents("*"), storage.getHeadContents("*"), getNextContents()));
        if (succeed == false) {
            throw new IOException(MessageFormat.format(
                    "failed to apply patch (merge join): {0} / {1} -> {2}",
                    storage.getPatchContents("*"), storage.getHeadContents("*"), getNextContents()));
        }
    } catch (ClassNotFoundException e) {
        throw new IOException(e);
    }
    putMeta();
}
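The PatchApplyKey triple above (Partitioner, SortComparator, GroupComparator) is the standard secondary-sort wiring: partition and group on the logical record key so one reduce() call sees every record for that key, while the sort comparator additionally orders records within the group. PatchApplyKey's internals are not shown on this page; the following self-contained sketch with a hypothetical CompositeKey illustrates the pattern under that assumption:

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Partitioner;

// Hypothetical composite key: group on 'id', order within the group by 'sequence'.
public class CompositeKey implements WritableComparable<CompositeKey> {
    long id;
    long sequence;

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(id);
        out.writeLong(sequence);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        id = in.readLong();
        sequence = in.readLong();
    }

    @Override
    public int compareTo(CompositeKey o) {
        int c = Long.compare(id, o.id);
        return c != 0 ? c : Long.compare(sequence, o.sequence);
    }

    // Partition on id only, so all records of a group land on the same reducer.
    public static class Part extends Partitioner<CompositeKey, Object> {
        @Override
        public int getPartition(CompositeKey key, Object value, int numPartitions) {
            return (int) ((key.id & Long.MAX_VALUE) % numPartitions);
        }
    }

    // Full (id, sequence) ordering for the shuffle sort; delegates to compareTo.
    public static class SortComparator extends WritableComparator {
        protected SortComparator() {
            super(CompositeKey.class, true);
        }
    }

    // Group on id only, so one reduce() call receives the whole ordered group.
    public static class GroupComparator extends WritableComparator {
        protected GroupComparator() {
            super(CompositeKey.class, true);
        }

        @Override
        @SuppressWarnings("rawtypes")
        public int compare(WritableComparable a, WritableComparable b) {
            return Long.compare(((CompositeKey) a).id, ((CompositeKey) b).id);
        }
    }
}

// Wiring, analogous to the PatchApplyKey calls above:
//   job.setPartitionerClass(CompositeKey.Part.class);
//   job.setSortComparatorClass(CompositeKey.SortComparator.class);
//   job.setGroupingComparatorClass(CompositeKey.GroupComparator.class);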
From source file:com.bark.hadoop.lab3.PageRank.java
@Override
public int run(String args[]) {
    String tmp = "/tmp/" + new Date().getTime();
    // long timeStamp = new Date().getTime();
    try {
        /**
         * Job 1: Parse XML input and read title, links
         */
        Configuration conf = new Configuration();
        conf.set("xmlinput.start", "<page>");
        conf.set("xmlinput.end", "</page>");
        Job job = Job.getInstance(conf);
        job.setJarByClass(PageRank.class);
        // specify a mapper
        job.setMapperClass(RedLinkMapper.class);
        // specify a reducer
        job.setReducerClass(RedLinkReducer.class);
        // specify output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // specify input and output directories
        FileInputFormat.addInputPath(job, new Path(args[0]));
        job.setInputFormatClass(XmlInputFormat.class);
        FileOutputFormat.setOutputPath(job, new Path(args[1] + tmp + "/job1"));
        job.setOutputFormatClass(TextOutputFormat.class);
        job.waitForCompletion(true);
    } catch (InterruptedException | ClassNotFoundException | IOException ex) {
        Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
        System.err.println("Error during mapreduce job1.");
        return 2;
    }
    /**
     * Job 2: Adjacency outGraph
     */
    try {
        Configuration conf2 = new Configuration();
        Job job2 = Job.getInstance(conf2);
        job2.setJarByClass(PageRank.class);
        job2.setMapperClass(AdjMapper.class);
        job2.setReducerClass(AdjReducer.class);
        job2.setOutputKeyClass(Text.class);
        job2.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job2, new Path(args[1] + tmp + "/job1"));
        job2.setInputFormatClass(TextInputFormat.class);
        FileOutputFormat.setOutputPath(job2, new Path(args[1] + tmp + "/job2"));
        job2.setOutputFormatClass(TextOutputFormat.class);
        job2.waitForCompletion(true);
    } catch (InterruptedException | ClassNotFoundException | IOException ex) {
        Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
        System.err.println("Error during mapreduce job2.");
        return 2;
    }
    /**
     * Job 3: PageCount
     */
    try {
        Configuration conf3 = new Configuration();
        // Change output separator to "=" instead of the default \t for this job.
        conf3.set("mapreduce.output.textoutputformat.separator", "=");
        Job job3 = Job.getInstance(conf3);
        job3.setJarByClass(PageRank.class);
        job3.setMapperClass(PageCountMapper.class);
        job3.setReducerClass(PageCountReducer.class);
        job3.setOutputKeyClass(Text.class);
        job3.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job3, new Path(args[1] + tmp + "/job2"));
        job3.setInputFormatClass(TextInputFormat.class);
        FileOutputFormat.setOutputPath(job3, new Path(args[1] + tmp + "/job3"));
        job3.setOutputFormatClass(TextOutputFormat.class);
        job3.waitForCompletion(true);
    } catch (InterruptedException | ClassNotFoundException | IOException ex) {
        Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
        System.err.println("Error during mapreduce job3.");
        return 2;
    }
    /**
     * Job 4: PageRank, 8 iterations
     */
    for (int i = 1; i < 9; i++) {
        try {
            Configuration conf4 = new Configuration();
            // Read the number of nodes from the output of job 3 (pageCount).
            Path path = new Path(args[1] + tmp + "/job3");
            FileSystem fs = path.getFileSystem(conf4);
            RemoteIterator<LocatedFileStatus> ri = fs.listFiles(path, true);
            int n = 0;
            Pattern pt = Pattern.compile("(\\d+)");
            while (ri.hasNext()) {
                LocatedFileStatus lfs = ri.next();
                if (lfs.isFile() && n == 0) {
                    FSDataInputStream inputStream = fs.open(lfs.getPath());
                    BufferedReader br = new BufferedReader(new InputStreamReader(inputStream));
                    String s = null;
                    while ((s = br.readLine()) != null) {
                        Matcher mt = pt.matcher(s);
                        if (mt.find()) {
                            n = Integer.parseInt(mt.group(1));
                            break;
                        }
                    }
                }
            }
            // Done reading the number of nodes; expose it to the job under key "N".
            conf4.setInt("N", n);
            Job job4 = Job.getInstance(conf4);
            job4.setJarByClass(PageRank.class);
            job4.setMapperClass(PageRankMapper.class);
            job4.setReducerClass(PageRankReducer.class);
            job4.setOutputKeyClass(Text.class);
            job4.setOutputValueClass(Text.class);
            // The first iteration reads the adjacency graph; later ones read the previous iteration.
            if (i == 1) {
                FileInputFormat.addInputPath(job4, new Path(args[1] + tmp + "/job2"));
            } else {
                FileInputFormat.addInputPath(job4, new Path(args[1] + tmp + "/job4/" + (i - 1)));
            }
            job4.setInputFormatClass(TextInputFormat.class);
            FileOutputFormat.setOutputPath(job4, new Path(args[1] + tmp + "/job4/" + i));
            job4.setOutputFormatClass(TextOutputFormat.class);
            job4.waitForCompletion(true);
        } catch (InterruptedException | ClassNotFoundException | IOException ex) {
            Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
            System.err.println("Error during mapreduce job4.");
            return 2;
        }
    }
    /**
     * Job 5: Sort iteration 1 and iteration 8
     */
    int returnCode = 0;
    for (int i = 0; i < 2; i++) {
        try {
            Configuration conf5 = new Configuration();
            // Read the number of nodes from the output of job 3 (pageCount).
            Path path = new Path(args[1] + tmp + "/job3");
            FileSystem fs = path.getFileSystem(conf5);
            RemoteIterator<LocatedFileStatus> ri = fs.listFiles(path, true);
            int n = 0;
            Pattern pt = Pattern.compile("(\\d+)");
            while (ri.hasNext()) {
                LocatedFileStatus lfs = ri.next();
                if (lfs.isFile() && n == 0) {
                    FSDataInputStream inputStream = fs.open(lfs.getPath());
                    BufferedReader br = new BufferedReader(new InputStreamReader(inputStream));
                    String s = null;
                    while ((s = br.readLine()) != null) {
                        Matcher mt = pt.matcher(s);
                        if (mt.find()) {
                            n = Integer.parseInt(mt.group(1));
                            break;
                        }
                    }
                }
            }
            conf5.setInt("N", n);
            Job job5 = Job.getInstance(conf5);
            // One reducer only, so the result is a single globally sorted file.
            job5.setNumReduceTasks(1);
            job5.setSortComparatorClass(MyWritableComparator.class);
            job5.setJarByClass(PageRank.class);
            // specify a mapper
            job5.setMapperClass(SortMapper.class);
            job5.setMapOutputKeyClass(DoubleWritable.class);
            job5.setMapOutputValueClass(Text.class);
            // specify a reducer
            job5.setReducerClass(SortReducer.class);
            // specify output types
            job5.setOutputKeyClass(Text.class);
            job5.setOutputValueClass(DoubleWritable.class);
            // y is 1 on the first pass and 8 on the second: iterations 1 and 8.
            int y = 7 * i + 1;
            FileInputFormat.addInputPath(job5, new Path(args[1] + tmp + "/job4/" + y));
            job5.setInputFormatClass(TextInputFormat.class);
            FileOutputFormat.setOutputPath(job5, new Path(args[1] + tmp + "/job5/" + y));
            job5.setOutputFormatClass(TextOutputFormat.class);
            returnCode = job5.waitForCompletion(true) ? 0 : 1;
        } catch (InterruptedException | ClassNotFoundException | IOException ex) {
            Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
            System.err.println("Error during mapreduce job5.");
            return 2;
        }
    }
    /**
     * Copy necessary output files to args[1]
     */
    // Rename and copy OutLinkGraph.
    try {
        Configuration conf = new Configuration();
        Path outLinkGraph = new Path(args[1] + tmp + "/job2/part-r-00000");
        FileSystem outLinkGraphFS = outLinkGraph.getFileSystem(conf);
        Path output = new Path(args[1] + "/results/PageRank.outlink.out");
        FileSystem outputFS = output.getFileSystem(conf);
        org.apache.hadoop.fs.FileUtil.copy(outLinkGraphFS, outLinkGraph, outputFS, output, false, true, conf);
    } catch (IOException ex) {
        Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
        System.err.println("Error while copying results.");
        return 2;
    }
    // Rename and copy the total number of pages.
    try {
        Configuration conf = new Configuration();
        Path pageCount = new Path(args[1] + tmp + "/job3/part-r-00000");
        FileSystem pageCountFS = pageCount.getFileSystem(conf);
        Path output = new Path(args[1] + "/results/PageRank.n.out");
        FileSystem outputFS = output.getFileSystem(conf);
        org.apache.hadoop.fs.FileUtil.copy(pageCountFS, pageCount, outputFS, output, false, true, conf);
    } catch (IOException ex) {
        Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
        System.err.println("Error while copying results.");
        return 2;
    }
    // Rename and copy iteration 1.
    try {
        Configuration conf = new Configuration();
        Path iter1 = new Path(args[1] + tmp + "/job5/1/part-r-00000");
        FileSystem iter1FS = iter1.getFileSystem(conf);
        Path output = new Path(args[1] + "/results/PageRank.iter1.out");
        FileSystem outputFS = output.getFileSystem(conf);
        org.apache.hadoop.fs.FileUtil.copy(iter1FS, iter1, outputFS, output, false, true, conf);
    } catch (IOException ex) {
        Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
        System.err.println("Error while copying results.");
        return 2;
    }
    // Rename and copy iteration 8.
    try {
        Configuration conf = new Configuration();
        Path iter8 = new Path(args[1] + tmp + "/job5/8/part-r-00000");
        FileSystem iter8FS = iter8.getFileSystem(conf);
        Path output = new Path(args[1] + "/results/PageRank.iter8.out");
        FileSystem outputFS = output.getFileSystem(conf);
        org.apache.hadoop.fs.FileUtil.copy(iter8FS, iter8, outputFS, output, false, true, conf);
    } catch (IOException ex) {
        Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
        System.err.println("Error while copying results.");
        return 2;
    }
    return returnCode;
}
From source file:com.bigfishgames.biginsights.upsight.mapreduce.MapReduceAvroWordCount.java
License:Apache License
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println("Usage: AvroWordCount <input path> <output path>");
        return -1;
    }
    Job job = new Job(getConf());
    job.setJarByClass(MapReduceAvroWordCount.class);
    job.setJobName("wordcount");
    // We call setOutputSchema first so we can override the configuration
    // parameters it sets
    // AvroJob.setOutputKeySchema(job,
    //         Pair.getPairSchema(Schema.create(Type.STRING),
    //                 Schema.create(Type.NULL)));
    AvroJob.setOutputKeySchema(job, Event.getClassSchema());
    job.setOutputValueClass(NullWritable.class);
    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(MyAvroKeyOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setSortComparatorClass(Text.Comparator.class);
    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.waitForCompletion(true);
    return 0;
}
From source file:com.bizosys.hsearch.kv.indexing.KVIndexer.java
License:Apache License
private static int runJob(int jobTypeI, Job job, FieldMapping fm, String input, String output,
        int scannerCacheSize, String filter)
        throws IOException, InterruptedException, ClassNotFoundException {

    int jobStatus = -1;

    switch (jobTypeI) {
    case SF2HB: {
        IdSearchLog.l.info("Starting Job for SF2HB input field separator " + KVIndexer.FIELD_SEPARATOR
                + " using hbase table : " + fm.tableName + " and output folder " + output);
        FileInputFormat.addInputPath(job, new Path(input));
        job.setMapperClass(KVMapperFile.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setMapOutputKeyClass(TextPair.class);
        job.setMapOutputValueClass(Text.class);
        job.setReducerClass(KVReducerHBase.class);
        TableMapReduceUtil.initTableReducerJob(fm.tableName, KVReducerHBase.class, job);
        jobStatus = job.waitForCompletion(true) ? 0 : 1;
        return jobStatus;
    }

    case SF2HF: {
        // First creates a map file and then converts it to an HFile.
        // Create an intermediate dir for the map file output.
        String intermediateFolder = output + "_intermediate";
        Path intermediateOutputDir = new Path(intermediateFolder);
        IdSearchLog.l.info("Starting Job for SF2HF input field separator " + KVIndexer.FIELD_SEPARATOR
                + " using hbase table : " + fm.tableName + " and intermediate output folder "
                + intermediateFolder + " final output dir " + output);
        // Reset the output folder to the intermediate folder.
        Configuration conf = job.getConfiguration();
        conf.set(OUTPUT_FOLDER, intermediateFolder);
        int jobT = JobTypeMapping.get("SF2MF");
        jobStatus = runJob(jobT, job, fm, input, intermediateFolder, scannerCacheSize, filter);
        if (jobStatus == 0) {
            Configuration hfileConf = HBaseConfiguration.create();
            hfileConf.set(XML_FILE_PATH, conf.get(XML_FILE_PATH));
            Job hfileJob = Job.getInstance(hfileConf, "Creating Hfile");
            String dataInputPath = intermediateFolder + "/" + MapFile.DATA_FILE_NAME;
            jobT = JobTypeMapping.get("IMF2HF");
            jobStatus = runJob(jobT, hfileJob, fm, dataInputPath, output, scannerCacheSize, filter);
        }
        // Delete the intermediate dir and the empty _SUCCESS marker.
        FileSystem.get(conf).delete(intermediateOutputDir, true);
        FileSystem.get(conf).delete(new Path(output, "_SUCCESS"), true);
        return jobStatus;
    }

    case SF2MF: {
        IdSearchLog.l.info("Starting Job for SF2MF input field separator " + KVIndexer.FIELD_SEPARATOR
                + " using hbase table : " + fm.tableName + " and output folder " + output);
        FileInputFormat.addInputPath(job, new Path(input));
        job.setMapperClass(KVMapperFile.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setMapOutputKeyClass(TextPair.class);
        job.setMapOutputValueClass(Text.class);
        // Shuffle order is decided by the first element of the TextPair key only.
        job.setSortComparatorClass(TextPair.FirstComparator.class);
        job.setReducerClass(KVReducerMapFile.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(ImmutableBytesWritable.class);
        LazyOutputFormat.setOutputFormatClass(job, NullOutputFormat.class);
        jobStatus = job.waitForCompletion(true) ? 0 : 1;
        return jobStatus;
    }

    case MF2HB: {
        job.setMapperClass(KVMapperMapFile.class);
        job.setInputFormatClass(SequenceFileAsTextInputFormat.class);
        job.setMapOutputKeyClass(TextPair.class);
        job.setMapOutputValueClass(Text.class);
        SequenceFileAsTextInputFormat.addInputPath(job, new Path(input));
        job.setReducerClass(KVReducerHBase.class);
        TableMapReduceUtil.initTableReducerJob(fm.tableName, KVReducerHBase.class, job);
        jobStatus = job.waitForCompletion(true) ? 0 : 1;
        return jobStatus;
    }

    case MF2HF: {
        String intermediateFolder = output + "_intermediate";
        Path intermediateOutputDir = new Path(intermediateFolder);
        IdSearchLog.l.info("Starting Job for MF2HF input field separator " + KVIndexer.FIELD_SEPARATOR
                + " using hbase table : " + fm.tableName + " and intermediate output folder "
                + intermediateFolder + " final output dir " + output);
        // Reset the output folder to the intermediate folder.
        Configuration conf = job.getConfiguration();
        conf.set(OUTPUT_FOLDER, intermediateFolder);
        int jobT = JobTypeMapping.get("MF2MF");
        jobStatus = runJob(jobT, job, fm, input, intermediateFolder, scannerCacheSize, filter);
        if (jobStatus == 0) {
            Configuration hfileConf = HBaseConfiguration.create();
            hfileConf.set(XML_FILE_PATH, conf.get(XML_FILE_PATH));
            Job hfileJob = Job.getInstance(hfileConf, "Creating Hfile");
            String dataInputPath = intermediateFolder + "/" + MapFile.DATA_FILE_NAME;
            jobT = JobTypeMapping.get("IMF2HF");
            jobStatus = runJob(jobT, hfileJob, fm, dataInputPath, output, scannerCacheSize, filter);
        }
        // Delete the intermediate dir and the empty _SUCCESS marker.
        FileSystem.get(conf).delete(intermediateOutputDir, true);
        FileSystem.get(conf).delete(new Path(output, "_SUCCESS"), true);
        return jobStatus;
    }

    case MF2MF: {
        job.setMapperClass(KVMapperMapFile.class);
        job.setInputFormatClass(SequenceFileAsTextInputFormat.class);
        job.setMapOutputKeyClass(TextPair.class);
        job.setMapOutputValueClass(Text.class);
        SequenceFileAsTextInputFormat.addInputPath(job, new Path(input));
        job.setReducerClass(KVReducerMapFile.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(ImmutableBytesWritable.class);
        LazyOutputFormat.setOutputFormatClass(job, NullOutputFormat.class);
        jobStatus = job.waitForCompletion(true) ? 0 : 1;
        return jobStatus;
    }

    case HB2HB: {
        if (fm.tableName.equals(input)) {
            throw new IOException("Input table and index table can not be same");
        }
        Scan scan = new Scan();
        scan.setCaching(scannerCacheSize);
        scan.setCacheBlocks(false);
        scan.addFamily(fm.familyName.getBytes());
        if (null != filter) {
            if (filter.trim().length() > 0) {
                int index = filter.indexOf('=');
                scan.setFilter(new SingleColumnValueFilter(fm.familyName.getBytes(),
                        filter.substring(0, index).getBytes(), CompareOp.EQUAL,
                        filter.substring(index + 1).getBytes()));
            }
        }
        TableMapReduceUtil.initTableMapperJob(
                input,                // input table
                scan,                 // Scan instance to control CF and attribute selection
                KVMapperHBase.class,  // mapper class
                TextPair.class,       // mapper output key
                Text.class,           // mapper output value
                job);
        TableMapReduceUtil.initTableReducerJob(
                fm.tableName,         // output table
                KVReducerHBase.class, // reducer class
                job);
        jobStatus = job.waitForCompletion(true) ? 0 : 1;
        return jobStatus;
    }

    case HB2HF: {
        String intermediateFolder = output + "_intermediate";
        Path intermediateOutputDir = new Path(intermediateFolder);
        IdSearchLog.l.info("Starting Job for HB2HF input field separator " + KVIndexer.FIELD_SEPARATOR
                + " using hbase table : " + fm.tableName + " and intermediate output folder "
                + intermediateFolder + " final output dir " + output);
        // Reset the output folder to the intermediate folder.
        Configuration conf = job.getConfiguration();
        conf.set(OUTPUT_FOLDER, intermediateFolder);
        int jobT = JobTypeMapping.get("HB2MF");
        jobStatus = runJob(jobT, job, fm, input, intermediateFolder, scannerCacheSize, filter);
        if (jobStatus == 0) {
            Configuration hfileConf = HBaseConfiguration.create();
            hfileConf.set(XML_FILE_PATH, conf.get(XML_FILE_PATH));
            Job hfileJob = Job.getInstance(hfileConf, "Creating Hfile");
            String dataInputPath = intermediateFolder + "/" + MapFile.DATA_FILE_NAME;
            jobT = JobTypeMapping.get("IMF2HF");
            jobStatus = runJob(jobT, hfileJob, fm, dataInputPath, output, scannerCacheSize, filter);
        }
        // Delete the intermediate dir and the empty _SUCCESS marker.
        FileSystem.get(conf).delete(intermediateOutputDir, true);
        FileSystem.get(conf).delete(new Path(output, "_SUCCESS"), true);
        return jobStatus;
    }

    case HB2MF: {
        if (fm.tableName.equals(input)) {
            throw new IOException("Input table and index table can not be same");
        }
        Scan scan = new Scan();
        scan.setCaching(scannerCacheSize);
        scan.setCacheBlocks(false);
        scan.addFamily(fm.familyName.getBytes());
        if (null != filter) {
            if (filter.trim().length() > 0) {
                int index = filter.indexOf('=');
                scan.setFilter(new SingleColumnValueFilter(fm.familyName.getBytes(),
                        filter.substring(0, index).getBytes(), CompareOp.EQUAL,
                        filter.substring(index + 1).getBytes()));
            }
        }
        TableMapReduceUtil.initTableMapperJob(
                input,               // input table
                scan,                // Scan instance to control CF and attribute selection
                KVMapperHBase.class, // mapper class
                TextPair.class,      // mapper output key
                Text.class,          // mapper output value
                job);
        job.setReducerClass(KVReducerMapFile.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(ImmutableBytesWritable.class);
        LazyOutputFormat.setOutputFormatClass(job, NullOutputFormat.class);
        jobStatus = job.waitForCompletion(true) ? 0 : 1;
        return jobStatus;
    }

    case IMF2HF: {
        Path finalOutputDir = new Path(output);
        job.setJarByClass(KVIndexer.class);
        job.setMapperClass(KVMapperHFile.class);
        job.setInputFormatClass(SequenceFileInputFormat.class);
        SequenceFileInputFormat.addInputPath(job, new Path(input));
        FileOutputFormat.setOutputPath(job, finalOutputDir);
        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(KeyValue.class);
        HTable hTable = new HTable(job.getConfiguration(), fm.tableName);
        HFileOutputFormat.configureIncrementalLoad(job, hTable);
        jobStatus = job.waitForCompletion(true) ? 0 : 1;
        return jobStatus;
    }

    default:
        throw new IOException("Invalid Jobtype " + jobTypeI);
    }
}
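The SF2MF branch above sorts the shuffle with TextPair.FirstComparator, so only the first half of the composite key decides ordering. The real TextPair class is not shown on this page; assuming it serializes two Text fields and exposes a getFirst() accessor, a minimal sketch of such a nested comparator is:

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

// Sketch only: a comparator nested inside a hypothetical TextPair key class
// that compares just the first Text of the (first, second) pair.
public static class FirstComparator extends WritableComparator {
    protected FirstComparator() {
        super(TextPair.class, true);
    }

    @Override
    @SuppressWarnings("rawtypes")
    public int compare(WritableComparable a, WritableComparable b) {
        return ((TextPair) a).getFirst().compareTo(((TextPair) b).getFirst());
    }
}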
From source file:com.ci.backports.avro.mapreduce.AvroJob.java
License:Apache License
/** Configure a job's map output key schema. */
public static void setMapOutputKeySchema(Job job, Schema schema) {
    job.setMapOutputKeyClass(AvroKey.class);
    job.getConfiguration().set(KEY_MAP_OUTPUT_SCHEMA_CONFIG_FIELD, schema.toString());
    job.setGroupingComparatorClass(AvroKeyComparator.class);
    job.setSortComparatorClass(AvroKeyComparator.class);
    addAvroSerialization(job.getConfiguration());
}
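Avro keys have no usable byte ordering independent of their schema, which is why this helper installs AvroKeyComparator as both the sort and the grouping comparator. A brief usage sketch of this backported helper (the string key schema and class name are arbitrary illustrations, not from the project):

import org.apache.avro.Schema;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class AvroJobSetupExample {
    public static void main(String[] args) throws Exception {
        Job job = new Job(new Configuration());
        // After this call, AvroKey is the map output key class and
        // AvroKeyComparator serves as both sort and grouping comparator.
        AvroJob.setMapOutputKeySchema(job, Schema.create(Schema.Type.STRING));
    }
}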
From source file:com.cloudera.avro.MapReduceAvroWordCount.java
License:Apache License
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println("Usage: AvroWordCount <input path> <output path>");
        return -1;
    }
    Job job = new Job(getConf());
    job.setJarByClass(MapReduceAvroWordCount.class);
    job.setJobName("wordcount");
    // We call setOutputSchema first so we can override the configuration
    // parameters it sets
    AvroJob.setOutputKeySchema(job,
            Pair.getPairSchema(Schema.create(Type.STRING), Schema.create(Type.INT)));
    job.setOutputValueClass(NullWritable.class);
    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);
    job.setInputFormatClass(TextInputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setSortComparatorClass(Text.Comparator.class);
    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.waitForCompletion(true);
    return 0;
}