List of usage examples for org.apache.hadoop.mapreduce.Job#setSortComparatorClass
public void setSortComparatorClass(Class<? extends RawComparator> cls) throws IllegalStateException
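Defines the comparator that controls how the keys are sorted before they are passed to the Reducer. The examples below register project-specific comparators; as a minimal sketch of what such a class looks like (the LongWritable key type and the descending order here are illustrative assumptions, not taken from any example on this page), a sort comparator is usually built by extending WritableComparator:

// A minimal sketch: a sort comparator that orders LongWritable keys
// in descending order. Passing true to the super constructor tells
// WritableComparator to deserialize keys so compare() sees objects.
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

public class DescendingLongComparator extends WritableComparator {

    public DescendingLongComparator() {
        super(LongWritable.class, true); // true => create key instances for compare()
    }

    @Override
    @SuppressWarnings("rawtypes")
    public int compare(WritableComparable a, WritableComparable b) {
        // negate the natural order to sort largest-first
        return -((LongWritable) a).compareTo((LongWritable) b);
    }
}

// registered the same way as in the examples below:
//   job.setSortComparatorClass(DescendingLongComparator.class);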
From source file:ph.fingra.hadoop.mapred.parts.performance.UserSessionStatistic.java
License:Apache License
public Job createJob(Configuration conf, Path[] inputpaths, Path outputpath, int numreduce,
        FingraphConfig finconfig) throws IOException {

    conf.setBoolean("verbose", finconfig.getDebug().isDebug_show_verbose());
    conf.setBoolean("counter", finconfig.getDebug().isDebug_show_counter());

    Job job = new Job(conf);
    String jobName = "perform/usersession job";
    job.setJobName(jobName);

    job.setJarByClass(UserSessionStatistic.class);

    for (int i = 0; i < inputpaths.length; i++) {
        FileInputFormat.addInputPath(job, inputpaths[i]);
    }
    FileOutputFormat.setOutputPath(job, outputpath);

    job.setMapperClass(UserSessionMapper.class);
    job.setReducerClass(UserSessionReducer.class);

    job.setMapOutputKeyClass(UserSessionKey.class);
    job.setMapOutputValueClass(UserSessionEntity.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setPartitionerClass(UserSessionPartitioner.class);
    job.setSortComparatorClass(UserSessionSortComparator.class);
    job.setGroupingComparatorClass(UserSessionGroupComparator.class);

    job.setNumReduceTasks(numreduce);

    return job;
}
From source file:ph.fingra.hadoop.mapred.parts.performance.UserSessionStatistic.java
License:Apache License
public Job createHourJob(Configuration conf, Path[] inputpaths, Path outputpath, int numreduce,
        FingraphConfig finconfig, TargetDate targetdate) throws IOException {

    conf.setBoolean("verbose", finconfig.getDebug().isDebug_show_verbose());
    conf.setBoolean("counter", finconfig.getDebug().isDebug_show_counter());
    conf.set("hour", targetdate.getHour());

    Job job = new Job(conf);
    String jobName = "perform/usersession hour job";
    job.setJobName(jobName);

    job.setJarByClass(UserSessionStatistic.class);

    for (int i = 0; i < inputpaths.length; i++) {
        FileInputFormat.addInputPath(job, inputpaths[i]);
    }
    FileOutputFormat.setOutputPath(job, outputpath);

    job.setMapperClass(UserSessionHourMapper.class);
    job.setReducerClass(UserSessionHourReducer.class);

    job.setMapOutputKeyClass(UserSessionHourKey.class);
    job.setMapOutputValueClass(UserSessionHourEntity.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setPartitionerClass(UserSessionHourPartitioner.class);
    job.setSortComparatorClass(UserSessionHourSortComparator.class);
    job.setGroupingComparatorClass(UserSessionHourGroupComparator.class);

    job.setNumReduceTasks(numreduce);

    return job;
}
From source file:ph.fingra.hadoop.mapred.parts.prerole.PreTransform.java
License:Apache License
public Job createJob(Configuration conf, Path[] inputpaths, Path outputpath, int numreduce,
        FingraphConfig finconfig) throws IOException {

    conf.setBoolean("verbose", finconfig.getDebug().isDebug_show_verbose());
    conf.setBoolean("counter", finconfig.getDebug().isDebug_show_counter());

    Job job = new Job(conf);
    String jobName = "prerole/pretransform job";
    job.setJobName(jobName);

    job.setJarByClass(PreTransform.class);

    for (int i = 0; i < inputpaths.length; i++) {
        FileInputFormat.addInputPath(job, inputpaths[i]);
    }
    FileOutputFormat.setOutputPath(job, outputpath);

    job.setMapperClass(PreTransformMapper.class);
    job.setReducerClass(PreTransformReducer.class);

    job.setMapOutputKeyClass(TransformKey.class);
    job.setMapOutputValueClass(TransformContainer.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    job.setPartitionerClass(PreTransformPartitioner.class);
    job.setSortComparatorClass(PreTransformSortComparator.class);
    job.setGroupingComparatorClass(PreTransformGroupComparator.class);

    job.setNumReduceTasks(numreduce);

    return job;
}
From source file:SecondarySort.HashToAlternateWithSS.java
protected Job jobConfig() throws IOException {
    JobConf conf = new JobConf();
    Job job = new Job(conf, "iteration");
    job.setJarByClass(HashToAlternateWithSS.class);
    job.setReducerClass(ReduceSS.class);
    job.setPartitionerClass(LongPair.HPartitioner.class);
    job.setSortComparatorClass(LongPair.Comparator.class);
    job.setGroupingComparatorClass(LongPair.GroupComparator.class);
    job.setOutputKeyClass(LongPair.class);
    job.setOutputValueClass(Text.class);
    return job;
}
From source file:SecondarySort.HashToMinWithSS.java
protected Job jobConfig() throws IOException {
    JobConf conf = new JobConf();
    Job job = new Job(conf, "iteration");
    job.setJarByClass(HashToMinWithSS.class);
    job.setReducerClass(ReduceSS.class);
    job.setPartitionerClass(LongPair.HPartitioner.class);
    job.setSortComparatorClass(LongPair.Comparator.class);
    job.setGroupingComparatorClass(LongPair.GroupComparator.class);
    job.setOutputKeyClass(LongPair.class);
    job.setOutputValueClass(Text.class);
    return job;
}
From source file:SecondarySort.inputMaker.java
License:Open Source License
protected Job jobConfig() throws IOException {
    JobConf conf = new JobConf();
    Job job = new Job(conf, "iteration");
    job.setJarByClass(inputMaker.class);
    job.setMapperClass(MapMSS.class);
    job.setReducerClass(ReduceSS.class);
    job.setPartitionerClass(LongPair.HPartitioner.class);
    job.setSortComparatorClass(LongPair.Comparator.class);
    job.setGroupingComparatorClass(LongPair.GroupComparator.class);
    job.setOutputKeyClass(LongPair.class);
    job.setOutputValueClass(Text.class);
    return job;
}
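The three SecondarySort examples above all lean on a LongPair composite key whose nested HPartitioner, Comparator, and GroupComparator classes are not shown on this page. A hypothetical reconstruction of that pattern (field names and ordering are assumptions) would partition and group on the first long only while sorting on both, so each reduce() call sees its values ordered by the second long:

// Hypothetical reconstruction of the LongPair secondary-sort pattern
// assumed by the examples above: partition/group on 'first' only,
// sort on (first, second).
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Partitioner;

public class LongPair implements WritableComparable<LongPair> {
    private long first;
    private long second;

    public void write(DataOutput out) throws IOException {
        out.writeLong(first);
        out.writeLong(second);
    }

    public void readFields(DataInput in) throws IOException {
        first = in.readLong();
        second = in.readLong();
    }

    public int compareTo(LongPair o) {
        int c = Long.compare(first, o.first);
        return c != 0 ? c : Long.compare(second, o.second);
    }

    // route all keys with the same 'first' to the same reducer
    public static class HPartitioner extends Partitioner<LongPair, Text> {
        public int getPartition(LongPair key, Text value, int numPartitions) {
            return (int) ((key.first & Long.MAX_VALUE) % numPartitions);
        }
    }

    // full ordering for the shuffle sort: first, then second
    public static class Comparator extends WritableComparator {
        public Comparator() { super(LongPair.class, true); }
        @SuppressWarnings("rawtypes")
        public int compare(WritableComparable a, WritableComparable b) {
            return ((LongPair) a).compareTo((LongPair) b);
        }
    }

    // grouping for the reducer: keys are equal when 'first' matches
    public static class GroupComparator extends WritableComparator {
        public GroupComparator() { super(LongPair.class, true); }
        @SuppressWarnings("rawtypes")
        public int compare(WritableComparable a, WritableComparable b) {
            return Long.compare(((LongPair) a).first, ((LongPair) b).first);
        }
    }
}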
From source file:simsql.runtime.MRLoader.java
License:Apache License
public long run(String inputPath, String outputPath, short typeCode, Relation r, int sortAtt) {

    // make a directory for the relation
    Configuration conf = new Configuration();
    FileSystem dfs = null;

    try {
        dfs = FileSystem.get(conf);
    } catch (Exception e) {
        throw new RuntimeException("Cannot access HDFS!", e);
    }

    try {
        // if it exists, destroy it.
        Path path = new Path(outputPath);
        if (dfs.exists(path)) {
            dfs.delete(path, true);
        }
    } catch (Exception e) {
        throw new RuntimeException("Could not create the file to bulk load to!", e);
    }

    // find a file name
    String tempPath = null;
    if (inputPath.startsWith("hdfs:")) {
        tempPath = inputPath.replace("hdfs:", "");
    } else {
        tempPath = "/tempDataFile_" + r.getName();
        try {
            dfs.delete(new Path(tempPath), true);
        } catch (Exception e) {
            // ignore this.
        }

        // upload the text file
        try {
            dfs.copyFromLocalFile(false, true, new Path(inputPath), new Path(tempPath));
            dfs.deleteOnExit(new Path(tempPath));
        } catch (Exception e) {
            throw new RuntimeException("Failed to upload text file " + inputPath + " to HDFS!", e);
        }
    }

    // set up the new job's parameters.
    conf.setBoolean("mapred.compress.map.output", true);
    conf.set("mapred.map.output.compression.codec", RecordCompression.getCodecClass());
    conf.set("io.serializations",
            "simsql.runtime.RecordSerialization,simsql.runtime.RecordKeySerialization,org.apache.hadoop.io.serializer.WritableSerialization");

    conf.setInt("simsql.loader.numAtts", r.getAttributes().size());
    conf.setInt("simsql.loader.typeCode", (int) typeCode);
    conf.setInt("simsql.loader.sortAtt", sortAtt);

    String[] myStrings = new String[r.getAttributes().size()];
    int j = 0;
    for (simsql.compiler.Attribute a : r.getAttributes()) {
        myStrings[j++] = a.getPhysicalRealization().getClass().getName();
    }
    conf.setStrings("simsql.loader.types", myStrings);

    // create a job
    Job job;
    try {
        job = new Job(conf);
    } catch (Exception e) {
        throw new RuntimeException("Unable to create bulk loading job!", e);
    }

    // set the split size (number of mappers)
    long fSize = 0;
    if (inputPath.startsWith("hdfs")) {
        fSize = RelOp.getPathsTotalSize(new String[] { tempPath });
    } else {
        fSize = new File(inputPath).length();
    }
    FileInputFormat.setMinInputSplitSize(job, fSize / (long) numTasks);
    FileInputFormat.setMaxInputSplitSize(job, fSize / (long) numTasks);

    // and the number of reducers
    job.setNumReduceTasks(numTasks);

    // the mapper/reducer/jar
    job.setMapperClass(MRLoaderMapper.class);
    job.setReducerClass(MRLoaderReducer.class);
    job.setJarByClass(MRLoader.class);

    // I/O settings.
    job.setOutputFormatClass(RecordOutputFormat.class);
    job.setMapOutputKeyClass(RecordKey.class);
    job.setMapOutputValueClass(RecordWrapper.class);
    job.setOutputKeyClass(Nothing.class);
    job.setOutputValueClass(Record.class);

    try {
        FileInputFormat.setInputPaths(job, new Path(tempPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));
    } catch (Exception e) {
        throw new RuntimeException("Could not set job inputs/outputs", e);
    }

    job.setGroupingComparatorClass(RecordKeyGroupingComparator.class);
    job.setPartitionerClass(RecordPartitioner.class);
    job.setSortComparatorClass(RecordKeySortComparator.class);
    job.setJobName("MRLoader: " + inputPath + " ==> " + outputPath);

    // run it
    Counters counters;
    try {
        job.waitForCompletion(true);
        counters = job.getCounters();
    } catch (Exception e) {
        throw new RuntimeException("Could not set up bulk loader job!", e);
    }

    // now, delete all the empty part files
    try {
        // get a filesystem
        FileSystem ddfs = FileSystem.get(conf);
        Path outPath = new Path(outputPath);
        if (ddfs.exists(outPath) && ddfs.isDirectory(outPath)) {
            FileStatus fstatus[] = ddfs.listStatus(outPath, new TableFileFilter());
            for (FileStatus ff : fstatus) {
                if (ddfs.getContentSummary(ff.getPath()).getLength() <= 4) { // snappy leaves 4-byte long files around...
                    ddfs.delete(ff.getPath(), true);
                }
            }
        }
    } catch (Exception e) {
        // this isn't disastrous
    }

    // get the counter for the output of the mapper.
    Counter bytesCounter = counters.findCounter(OutputFileSerializer.Counters.BYTES_WRITTEN);
    return bytesCounter.getValue();
}
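The parameter type of setSortComparatorClass is RawComparator for a reason: during the shuffle's sort and merge phases, Hadoop can compare keys directly in their serialized form, without deserializing them, which is what a comparator like the RecordKeySortComparator above presumably does. As a generic sketch of that byte-level technique (using LongWritable's well-known 8-byte layout; for a custom key the offsets would differ), the raw compare overload is overridden instead:

// Sketch of a byte-level comparator: compares serialized LongWritable
// keys without deserializing them, avoiding object allocation during
// the sort/merge.
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.WritableComparator;

public class RawLongComparator extends WritableComparator {

    public RawLongComparator() {
        super(LongWritable.class); // no key instances needed; we compare bytes
    }

    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        // readLong is a WritableComparator helper for big-endian longs
        long x = readLong(b1, s1);
        long y = readLong(b2, s2);
        return Long.compare(x, y);
    }
}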
From source file:simsql.runtime.RelOp.java
License:Apache License
public boolean run(RuntimeParameter params, boolean verbose) {

    ExampleRuntimeParameter pp = (ExampleRuntimeParameter) params;

    // build the jar.
    String jarFile = buildJarFile(params);

    // Get the default configuration object
    Configuration conf = new Configuration();

    // set quiet mode on/off
    conf.setQuietMode(!verbose);

    /***
    conf.setBoolean("mapred.task.profile", true);
    conf.set("mapred.task.profile.params", "-agentlib:hprof=cpu=samples,"
        + "heap=sites,depth=8,force=n,thread=y,verbose=n,file=%s");
    ***/

    // tell it how to serialize and deserialize records and recordkeys
    conf.set("io.serializations", getSerializations());
    conf.setBoolean("mapred.compress.map.output", true);

    int ioSortMB = conf.getInt("io.sort.mb", 256);
    conf.set("mapred.map.child.java.opts",
            "-Xmx" + (getMemPerMapper(params) + ioSortMB) + "m -Xms" + (getMemPerMapper(params))
                    + "m -Duser.timezone='America/Chicago' -Djava.net.preferIPv4Stack=true -XX:CompileThreshold=10000 -XX:+DoEscapeAnalysis -XX:+UseNUMA -XX:-EliminateLocks -XX:+UseBiasedLocking -XX:+OptimizeStringConcat -XX:+UseFastAccessorMethods -XX:+UseConcMarkSweepGC -XX:+CMSIncrementalMode -XX:+CMSIncrementalPacing -XX:CMSIncrementalDutyCycleMin=0 -XX:+UseCompressedOops -XX:+AggressiveOpts -XX:-UseStringCache -XX:ErrorFile=/tmp/hs_err_pid%p.log");

    conf.set("mapred.reduce.child.java.opts",
            "-Xmx" + (getMemPerReducer(params) + ioSortMB) + "m -Xms" + (getMemPerMapper(params))
                    + "m -Duser.timezone='America/Chicago' -Djava.net.preferIPv4Stack=true -XX:CompileThreshold=10000 -XX:+DoEscapeAnalysis -XX:+UseNUMA -XX:-EliminateLocks -XX:+UseBiasedLocking -XX:+OptimizeStringConcat -XX:+UseFastAccessorMethods -XX:+UseConcMarkSweepGC -XX:+CMSIncrementalMode -XX:+CMSIncrementalPacing -XX:CMSIncrementalDutyCycleMin=0 -XX:+UseCompressedOops -XX:+AggressiveOpts -XX:-UseStringCache -XX:ErrorFile=/tmp/hs_err_pid%p.log");

    conf.setInt("simsql.input.numSplits", pp.getNumCPUs());
    conf.setInt("mapred.job.reuse.jvm.num.tasks", 1);
    // conf.setBoolean ("mapred.map.tasks.speculative.execution", false);
    // conf.setBoolean ("mapred.reduce.tasks.speculative.execution", false);

    // tell it to use the jar that we just created
    conf.set("mapred.jar", jarFile);

    // conf.set("tmpjars", "file:///usr/lib/hadoop-mapreduce/hadoop-mapreduce-client-core.jar");

    conf.setBoolean("mapred.output.compress", true);
    conf.setStrings("mapred.output.compression.type", new String[] { "RECORD" });

    // use snappy for the intermediate stuff
    conf.set("mapred.map.output.compression.codec", RecordCompression.getCodecClass());

    // do some additional operator-specific configurations
    setConfigurations(conf, params);

    // collect statistics for final relations always
    conf.setBoolean("simsql.collectStats", isFinal || collectStats);

    // figure out what file to map
    String[] inDirs = myInputNetwork.getPipelinedInputFiles();
    inDirs = excludeAnyWhoWillNotBeMapped(inDirs);
    String inSingleString = inDirs[0];
    conf.set("simsql.fileToMap", inSingleString);
    for (int i = 1; i < inDirs.length; i++) {
        inSingleString += "," + inDirs[i];
    }

    // create and name the job
    Job job;
    try {
        job = new Job(conf);
    } catch (Exception e) {
        throw new RuntimeException("Unable to create a new job!", e);
    }
    job.setJobName(getJobName());

    // set the map-reduce input and output types
    job.setMapOutputKeyClass(getMapOutputKeyClass());
    job.setMapOutputValueClass(getMapOutputValueClass());
    job.setOutputKeyClass(getOutputKeyClass());
    job.setOutputValueClass(getOutputValueClass());

    int numReducers = getNumReducers(params);

    job.setMapperClass(getMapperClass());
    job.setReducerClass(getReducerClass());

    // set the number of reducers
    job.setNumReduceTasks(numReducers);

    // set the input and the output formats... these extend FileInputFormat and FileOutputFormat
    job.setInputFormatClass(getInputFormatClass());
    job.setOutputFormatClass(getOutputFormatClass());

    // set the input and output paths
    try {
        System.out.println("input file: " + inSingleString);
        FileInputFormat.setInputPaths(job, inSingleString);
        FileInputFormat.setInputPathFilter(job, TableFileFilter.class);
        FileOutputFormat.setOutputPath(job, new Path(getOutput()));
    } catch (Exception e) {
        throw new RuntimeException("Unable to set up the input/output path for the job.", e);
    }

    // set the split size
    FileInputFormat.setMinInputSplitSize(job, getSplitSize(params));
    FileInputFormat.setMaxInputSplitSize(job, getSplitSize(params));

    // set the various sorting/grouping/mapping classes
    job.setGroupingComparatorClass(getGroupingComparatorClass());
    job.setPartitionerClass(getPartitionerClass());
    job.setSortComparatorClass(getSortComparatorClass());

    // and now, submit the job and wait for things to finish
    int exitCode;
    try {
        exitCode = job.waitForCompletion(verbose) ? 0 : 1;

        // get the output bytes counter.
        Counters c = job.getCounters();
        Counter mx = c.findCounter(OutputFileSerializer.Counters.BYTES_WRITTEN);

        // and use them to set the size of the output relation.
        if (myDB != null) {
            myDB.setTableSize(myDB.getTableName(getOutput()), mx.getValue());
            myDB.setNumAtts(myDB.getTableName(getOutput()), getOutputAttNames().length);
        }
    } catch (Exception e) {
        e.printStackTrace();
        throw new RuntimeException("Unable to run the job", e);
    }

    // now, delete all the empty part files
    try {
        // get a filesystem
        FileSystem dfs = FileSystem.get(conf);
        Path outPath = new Path(getOutput());
        if (dfs.exists(outPath) && dfs.isDirectory(outPath)) {
            FileStatus fstatus[] = dfs.listStatus(outPath, new TableFileFilter());
            for (FileStatus ff : fstatus) {
                if (dfs.getContentSummary(ff.getPath()).getLength() <= 4) { // snappy leaves 4-byte long files around...
                    dfs.delete(ff.getPath(), true);
                }
            }
        }
    } catch (Exception e) {
        // this isn't disastrous
    }

    return (exitCode == 0);
}
From source file:top10flight.Top25MoviesChaining.java
/**
 * @param args the command line arguments
 */
public static void main(String[] args)
        throws IOException, InterruptedException, ClassNotFoundException {

    // job 1
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "Job: 1, top 25 movie based on ratings");
    job.setJarByClass(Top25MoviesChaining.class);
    job.setMapperClass(FirstMapper.class);
    job.setReducerClass(FirstReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(DoubleWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[1]));
    Path firstJobOutput = new Path(args[2]);
    FileOutputFormat.setOutputPath(job, firstJobOutput);
    job.waitForCompletion(true);

    // job 2
    Job job2 = Job.getInstance(conf, "Job: 2, top 25 movie based on ratings");
    job2.setJarByClass(Top25MoviesChaining.class);
    job2.setMapperClass(SecondMapper.class);
    job2.setReducerClass(SecondReducer.class);
    job2.setSortComparatorClass(SortKeyComparator.class);
    job2.setMapOutputKeyClass(DoubleWritable.class);
    job2.setMapOutputValueClass(Text.class);
    job2.setOutputKeyClass(Text.class);
    job2.setOutputValueClass(DoubleWritable.class);
    FileInputFormat.addInputPath(job2, firstJobOutput);

    String timeStamp = new SimpleDateFormat("yyyy.MM.dd.HH.mm.ss", Locale.US)
            .format(new Timestamp(System.currentTimeMillis()));
    FileOutputFormat.setOutputPath(job2, new Path(args[2] + timeStamp));

    System.exit(job2.waitForCompletion(true) ? 0 : 1);
}
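The SortKeyComparator registered on job2 here (and in the next example) is not shown on this page. A plausible version of it, assuming descending order so the highest-rated entries reach the reducer first (the "top 25" job name suggests this, but it is an assumption), would look like the sketch below; the next example's variant would compare FloatWritable keys the same way:

// Plausible sketch of the SortKeyComparator used above (not shown on
// this page): descending order for DoubleWritable map-output keys.
// The descending direction is an assumption based on the job name.
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

public class SortKeyComparator extends WritableComparator {

    public SortKeyComparator() {
        super(DoubleWritable.class, true); // deserialize keys for compare()
    }

    @Override
    @SuppressWarnings("rawtypes")
    public int compare(WritableComparable a, WritableComparable b) {
        // reverse the natural order: largest rating first
        return -((DoubleWritable) a).compareTo((DoubleWritable) b);
    }
}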
From source file:top10_categories.Top10_Categories.java
/**
 * @param args the command line arguments
 */
public static void main(String[] args)
        throws IOException, InterruptedException, ClassNotFoundException {

    Configuration conf1 = new Configuration();
    Configuration conf = new Configuration();

    Path inputDir = new Path(args[0]);
    Path hdfsFile = new Path(args[1]);
    FileSystem hdfs = FileSystem.get(conf);
    FileSystem local = FileSystem.getLocal(conf);

    try {
        FileStatus[] inputFiles = local.listStatus(inputDir);
        FSDataOutputStream out = hdfs.create(hdfsFile);
        for (int i = 0; i < inputFiles.length; i++) {
            System.out.println(inputFiles[i].getPath().getName());
            FSDataInputStream in = local.open(inputFiles[i].getPath());
            byte[] buffer = new byte[256];
            int bytesRead = 0;
            while ((bytesRead = in.read(buffer)) > 0) {
                out.write(buffer, 0, bytesRead);
            }
            in.close();
        }
        out.close();
    } catch (IOException e) {
        e.printStackTrace();
    }

    Job job1 = Job.getInstance(conf1, "Chaining");
    job1.setJarByClass(Top10_Categories.class);
    job1.setMapperClass(Map1.class);
    job1.setMapOutputKeyClass(Text.class);
    job1.setMapOutputValueClass(FloatWritable.class);
    job1.setReducerClass(Reduce1.class);
    job1.setOutputKeyClass(Text.class);
    job1.setOutputValueClass(DoubleWritable.class);
    job1.setCombinerClass(Reduce1.class);
    FileInputFormat.addInputPath(job1, hdfsFile);
    FileOutputFormat.setOutputPath(job1, new Path(args[2]));
    boolean complete = job1.waitForCompletion(true);

    Configuration conf2 = new Configuration();
    Job job2 = Job.getInstance(conf2, "Chaining");
    if (complete) {
        job2.setJarByClass(Top10_Categories.class);
        job2.setMapperClass(Map2.class);
        job2.setMapOutputKeyClass(FloatWritable.class);
        job2.setMapOutputValueClass(Text.class);
        job2.setReducerClass(Reduce2.class);
        job2.setOutputKeyClass(Text.class);
        job2.setOutputValueClass(FloatWritable.class);
        job2.setSortComparatorClass(SortKeyComparator.class);
        job2.setNumReduceTasks(1);
        FileInputFormat.addInputPath(job2, new Path(args[2]));
        FileOutputFormat.setOutputPath(job2, new Path(args[3]));
        System.exit(job2.waitForCompletion(true) ? 0 : 1);
    }
}