List of usage examples for org.apache.hadoop.mapreduce.Job#setGroupingComparatorClass
public void setGroupingComparatorClass(Class<? extends RawComparator> cls) throws IllegalStateException
From source file:org.chombo.mr.MultiJoiner.java
License:Apache License
/**
 * Configures and runs the MultiJoiner map-reduce job.
 *
 * @param args args[0] = comma-separated input paths, args[1] = output path
 * @return 0 if the job completed successfully, 1 otherwise
 * @throws Exception if job configuration or execution fails
 */
@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    job.setJobName("MultiJoiner MR");
    job.setJarByClass(MultiJoiner.class);

    FileInputFormat.addInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    Utility.setConfiguration(job.getConfiguration());

    job.setMapperClass(MultiJoiner.JoinerMapper.class);
    job.setReducerClass(MultiJoiner.JoinerReducer.class);

    job.setMapOutputKeyClass(Tuple.class);
    job.setMapOutputValueClass(Tuple.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    // secondary sort: group and partition on the base portion of the composite key
    job.setGroupingComparatorClass(SecondarySort.TuplePairGroupComprator.class);
    job.setPartitionerClass(SecondarySort.TuplePairPartitioner.class);

    // job-specific reducer count; falls back to the global setting, then to 1
    int reducerCount = job.getConfiguration().getInt("muj.num.reducer", -1);
    if (reducerCount == -1) {
        reducerCount = job.getConfiguration().getInt("num.reducer", 1);
    }
    job.setNumReduceTasks(reducerCount);

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:org.chombo.mr.OutlierBasedDataValidation.java
License:Apache License
@Override public int run(String[] args) throws Exception { Job job = new Job(getConf()); String jobName = "Detecting invalid data as outliers"; job.setJobName(jobName);// w w w . j a v a 2s . c om job.setJarByClass(OutlierBasedDataValidation.class); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); Utility.setConfiguration(job.getConfiguration(), "chombo"); job.setMapperClass(OutlierBasedDataValidation.DataValidatorMapper.class); job.setReducerClass(OutlierBasedDataValidation.DataValidatorReducer.class); job.setMapOutputKeyClass(Tuple.class); job.setMapOutputValueClass(Tuple.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(Text.class); job.setGroupingComparatorClass(SecondarySort.TuplePairGroupComprator.class); job.setPartitionerClass(SecondarySort.TuplePairPartitioner.class); int numReducer = job.getConfiguration().getInt("obdv.num.reducer", -1); numReducer = -1 == numReducer ? job.getConfiguration().getInt("num.reducer", 1) : numReducer; job.setNumReduceTasks(numReducer); int status = job.waitForCompletion(true) ? 0 : 1; return status; }
From source file:org.chombo.mr.Projection.java
License:Apache License
@Override public int run(String[] args) throws Exception { Job job = new Job(getConf()); String jobName = "Projection and grouping MR"; job.setJobName(jobName);//from w ww . j a v a 2s . co m job.setJarByClass(Projection.class); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); Utility.setConfiguration(job.getConfiguration()); String operation = job.getConfiguration().get("projection.operation", "project"); if (operation.startsWith("grouping")) { //group by job.setMapperClass(Projection.ProjectionMapper.class); job.setReducerClass(Projection.ProjectionReducer.class); job.setMapOutputKeyClass(Tuple.class); job.setMapOutputValueClass(Text.class); int numReducer = job.getConfiguration().getInt("pro.num.reducer", -1); numReducer = -1 == numReducer ? job.getConfiguration().getInt("num.reducer", 1) : numReducer; job.setNumReduceTasks(numReducer); //order by boolean doOrderBy = job.getConfiguration().getInt("orderBy.field", -1) >= 0; if (doOrderBy) { job.setGroupingComparatorClass(SecondarySort.TuplePairGroupComprator.class); job.setPartitionerClass(SecondarySort.TupleTextPartitioner.class); } } else { //simple projection job.setMapperClass(Projection.SimpleProjectionMapper.class); } job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(Text.class); int status = job.waitForCompletion(true) ? 0 : 1; return status; }
From source file:org.chombo.mr.RecordSetBulkMutator.java
License:Apache License
@Override public int run(String[] args) throws Exception { Job job = new Job(getConf()); String jobName = "record set mutator MR"; job.setJobName(jobName);//from w w w . j a v a 2 s. c o m job.setJarByClass(RecordSetBulkMutator.class); FileInputFormat.addInputPaths(job, args[0]); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.setMapperClass(RecordSetBulkMutator.BulkMutatorMapper.class); job.setReducerClass(RecordSetBulkMutator.BulkMutatorReducer.class); job.setMapOutputKeyClass(Tuple.class); job.setMapOutputValueClass(Tuple.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(Text.class); job.setGroupingComparatorClass(SecondarySort.TuplePairGroupComprator.class); job.setPartitionerClass(SecondarySort.TuplePairPartitioner.class); Utility.setConfiguration(job.getConfiguration()); int numReducer = job.getConfiguration().getInt("rsb.num.reducer", -1); numReducer = -1 == numReducer ? job.getConfiguration().getInt("num.reducer", 1) : numReducer; job.setNumReduceTasks(numReducer); int status = job.waitForCompletion(true) ? 0 : 1; return status; }
From source file:org.chombo.mr.RecordSetModifier.java
License:Apache License
@Override public int run(String[] args) throws Exception { Job job = new Job(getConf()); String jobName = "record set modifier MR"; job.setJobName(jobName);//from w ww .ja v a2 s . c o m job.setJarByClass(RecordSetModifier.class); FileInputFormat.addInputPaths(job, args[0]); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.setMapperClass(RecordSetModifier.ModifierMapper.class); job.setReducerClass(RecordSetModifier.ModifierReducer.class); job.setMapOutputKeyClass(Tuple.class); job.setMapOutputValueClass(Tuple.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(Text.class); job.setGroupingComparatorClass(SecondarySort.TuplePairGroupComprator.class); job.setPartitionerClass(SecondarySort.TuplePairPartitioner.class); Utility.setConfiguration(job.getConfiguration()); int numReducer = job.getConfiguration().getInt("rsm.num.reducer", -1); numReducer = -1 == numReducer ? job.getConfiguration().getInt("num.reducer", 1) : numReducer; job.setNumReduceTasks(numReducer); int status = job.waitForCompletion(true) ? 0 : 1; return status; }
From source file:org.chombo.mr.TimeGapSequenceGenerator.java
License:Apache License
@Override public int run(String[] args) throws Exception { Job job = new Job(getConf()); String jobName = "Time sequence to time gap sequence conversion"; job.setJobName(jobName);//from ww w .java2s .c o m job.setJarByClass(TimeGapSequenceGenerator.class); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); Utility.setConfiguration(job.getConfiguration(), "chombo", true); job.setMapperClass(TimeGapSequenceGenerator.TimeGapMapper.class); job.setReducerClass(TimeGapSequenceGenerator.TimeGapReducer.class); job.setMapOutputKeyClass(Tuple.class); job.setMapOutputValueClass(Tuple.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(Text.class); job.setGroupingComparatorClass(SecondarySort.TuplePairGroupComprator.class); job.setPartitionerClass(SecondarySort.TuplePairPartitioner.class); int numReducer = job.getConfiguration().getInt("tgs.num.reducer", -1); numReducer = -1 == numReducer ? job.getConfiguration().getInt("num.reducer", 1) : numReducer; job.setNumReduceTasks(numReducer); int status = job.waitForCompletion(true) ? 0 : 1; return status; }
From source file:org.chombo.mr.TimeSequenceFilter.java
License:Apache License
/**
 * Configures and runs the time sequence filter map-reduce job.
 *
 * Reuses {@link TimeGapSequenceGenerator.TimeGapMapper} for the map side
 * and applies this class's {@code FilterReducer} on the reduce side.
 *
 * @param args args[0] = input path, args[1] = output path
 * @return 0 if the job completed successfully, 1 otherwise
 * @throws Exception if job configuration or execution fails
 */
@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    // fixed: job name was copy-pasted from TimeGapSequenceGenerator
    // ("Time sequence to time gap sequence conversion"); it now describes this job
    job.setJobName("Time sequence filter");
    job.setJarByClass(TimeSequenceFilter.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    Utility.setConfiguration(job.getConfiguration(), "chombo");

    // mapper is intentionally borrowed from TimeGapSequenceGenerator
    job.setMapperClass(TimeGapSequenceGenerator.TimeGapMapper.class);
    job.setReducerClass(TimeSequenceFilter.FilterReducer.class);

    job.setMapOutputKeyClass(Tuple.class);
    job.setMapOutputValueClass(Tuple.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    // secondary sort: group and partition on the base portion of the composite key
    job.setGroupingComparatorClass(SecondarySort.TuplePairGroupComprator.class);
    job.setPartitionerClass(SecondarySort.TuplePairPartitioner.class);

    // job-specific reducer count; falls back to the global setting, then to 1
    int reducerCount = job.getConfiguration().getInt("tsf.num.reducer", -1);
    if (reducerCount == -1) {
        reducerCount = job.getConfiguration().getInt("num.reducer", 1);
    }
    job.setNumReduceTasks(reducerCount);

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:org.chombo.mr.WeightedAverage.java
License:Apache License
@Override public int run(String[] args) throws Exception { Job job = new Job(getConf()); String jobName = "Weighted average calculating MR"; job.setJobName(jobName);/*from w ww. j av a2s . c om*/ job.setJarByClass(WeightedAverage.class); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.setMapperClass(WeightedAverage.AverageMapper.class); job.setReducerClass(WeightedAverage.AverageReducer.class); job.setMapOutputKeyClass(Tuple.class); job.setMapOutputValueClass(Tuple.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(Text.class); Utility.setConfiguration(job.getConfiguration()); if (job.getConfiguration().getInt("group.by.field", -1) >= 0) { //group by job.setGroupingComparatorClass(SecondarySort.TuplePairGroupComprator.class); job.setPartitionerClass(SecondarySort.TuplePairPartitioner.class); } int numReducer = job.getConfiguration().getInt("wea.num.reducer", -1); numReducer = -1 == numReducer ? job.getConfiguration().getInt("num.reducer", 1) : numReducer; job.setNumReduceTasks(numReducer); int status = job.waitForCompletion(true) ? 0 : 1; return status; }
From source file:org.freeeed.mr.FreeEedMR.java
License:Apache License
@Override public int run(String[] args) throws Exception { // inventory dir holds all package (zip) files resulting from stage String projectFileName = args[0]; String outputPath = args[1];/* w w w . j a va2s . co m*/ LOGGER.info("Running Hadoop job"); LOGGER.info("Input project file = " + projectFileName); LOGGER.info("Output path = " + outputPath); Stats.getInstance().setNumberMappers(projectFileName); ESIndex.getInstance().init(); // Hadoop configuration class Configuration configuration = getConf(); // No speculative execution! Do not process the same file twice configuration.set("mapred.reduce.tasks.speculative.execution", "false"); // TODO even in local mode, the first argument should not be the inventory // but write a complete project file instead Project project = Project.getCurrentProject(); if (project == null || project.isEmpty()) { // configure Hadoop input files System.out.println("Reading project file " + projectFileName); project = Project.loadFromFile(new File(projectFileName)); } project.setProperty(ParameterProcessing.OUTPUT_DIR_HADOOP, outputPath); // send complete project information to all mappers and reducers configuration.set(ParameterProcessing.PROJECT, project.toString()); Settings.load(); configuration.set(ParameterProcessing.SETTINGS_STR, Settings.getSettings().toString()); configuration.set(EmailProperties.PROPERTIES_FILE, Files.toString(new File(EmailProperties.PROPERTIES_FILE), Charset.defaultCharset())); Job job = new Job(configuration); job.setJarByClass(FreeEedMR.class); job.setJobName("FreeEedMR"); // Hadoop processes key-value pairs // job.setOutputKeyClass(Text.class); // job.setOutputValueClass(MapWritable.class); // set map and reduce classes job.setMapperClass(FreeEedMapper.class); job.setInputFormatClass(NLineInputFormat.class); job.setNumReduceTasks(0); // secondary sort for compound keys - this sorts the attachments job.setSortComparatorClass(KeyComparator.class); job.setGroupingComparatorClass(GroupComparator.class); // 
Hadoop TextInputFormat class // job.setInputFormatClass(TextInputFormat.class); // job.setOutputFormatClass(TextOutputFormat.class); LOGGER.debug("project.isEnvHadoop() = {} ", project.isEnvHadoop()); String inputPath = projectFileName; if (project.isEnvHadoop() || Settings.getSettings().isHadoopDebug()) { inputPath = formInputPath(project); } LOGGER.debug("Ready to run, inputPath = {}, outputPath = {}", inputPath, outputPath); FileInputFormat.setInputPaths(job, inputPath); FileOutputFormat.setOutputPath(job, new Path(outputPath)); if (Settings.getSettings().isHadoopDebug()) { if (new File(outputPath).exists()) { Util.deleteDirectory(new File(outputPath)); } } LOGGER.trace("Project"); LOGGER.trace(project.toString()); boolean success = job.waitForCompletion(true); ESIndex.getInstance().destroy(); if (project.isEnvHadoop() && project.isFsS3()) { transferResultsToS3(outputPath); } return success ? 0 : 1; }
From source file:org.imageterrier.indexers.hadoop.HadoopIndexer.java
License:Mozilla Public License
protected Job createJob(HadoopIndexerOptions options) throws IOException { final Job job = new Job(getConf()); job.setJobName("terrierIndexing"); if (options.getInputMode() == InputMode.QUANTISED_FEATURES) { job.setMapperClass(QFIndexerMapper.class); } else {/*from w ww . j av a 2s. c o m*/ if (options.shardPerThread) { job.setMapperClass(MultithreadedMapper.class); MultithreadedMapper.setMapperClass(job, MTImageIndexerMapper.class); MultithreadedMapper.setNumberOfThreads(job, options.getMultithread()); } else { job.setMapperClass(ImageIndexerMapper.class); } } // Load quantiser (if it exists), extract header, count codebook size if (options.getInputModeOptions().hasQuantiserFile()) { final String quantFile = options.getInputModeOptions().getQuantiserFile(); System.out.println("Loading codebook to see its size"); final SpatialClusters<?> quantiser = readClusters(options); System.out.println("Setting codebook size: " + quantiser.numClusters()); job.getConfiguration().setInt(QUANTISER_SIZE, quantiser.numClusters()); if (quantiser.numClusters() < options.getNumReducers()) options.setNumReducers(quantiser.numClusters()); } job.setReducerClass(IndexerReducer.class); FileOutputFormat.setOutputPath(job, options.getOutputPath()); job.setMapOutputKeyClass(NewSplitEmittedTerm.class); job.setMapOutputValueClass(MapEmittedPostingList.class); job.getConfiguration().setBoolean("indexing.hadoop.multiple.indices", options.isDocumentPartitionMode()); // if // (!job.getConfiguration().get("mapred.job.tracker").equals("local")) { // job.getConfiguration().set("mapred.map.output.compression.codec", // GzipCodec.class.getCanonicalName()); // job.getConfiguration().setBoolean("mapred.compress.map.output", // true); // } else { job.getConfiguration().setBoolean("mapred.compress.map.output", false); // } job.setInputFormatClass(PositionAwareSequenceFileInputFormat.class); // important job.setOutputFormatClass(SequenceFileOutputFormat.class); 
job.setSortComparatorClass(NewSplitEmittedTerm.SETRawComparatorTermSplitFlush.class); job.setGroupingComparatorClass(NewSplitEmittedTerm.SETRawComparatorTerm.class); job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false); SequenceFileInputFormat.setInputPaths(job, options.getInputPaths()); job.setNumReduceTasks(options.getNumReducers()); if (options.getNumReducers() > 1) { if (options.isDocumentPartitionMode()) { job.setPartitionerClass(NewSplitEmittedTerm.SETPartitioner.class); } else { // job.setPartitionerClass(NewSplitEmittedTerm.SETPartitionerLowercaseAlphaTerm.class); if (job.getConfiguration().getInt(QUANTISER_SIZE, -1) == -1) { job.setPartitionerClass(NewSplitEmittedTerm.SETPartitionerHashedTerm.class); } else { job.setPartitionerClass(NewSplitEmittedTerm.SETPartitionerCodebookAwareTerm.class); } } } else { // for JUnit tests, we seem to need to restore the original // partitioner class job.setPartitionerClass(HashPartitioner.class); } job.setJarByClass(this.getClass()); return job; }