List of usage examples for org.apache.hadoop.mapreduce.Job#setCombinerClass
public void setCombinerClass(Class<? extends Reducer> cls) throws IllegalStateException
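setCombinerClass registers a Reducer subclass to run map-side over intermediate output, pre-aggregating values locally before the shuffle; the IllegalStateException in the signature is raised if the job has already been submitted. Before the project-specific examples below, here is a minimal sketch of the common pattern they all share. The driver class name and argument-based paths are illustrative only, and TokenizerMapper/IntSumReducer stand in for the standard word-count mapper and reducer, assumed to be defined elsewhere. A reducer can double as a combiner only when its input and output key/value types both equal the map output types and its operation is associative and commutative, as summing counts is.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class CombinerSketch { // hypothetical driver class
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "word count with combiner");
        job.setJarByClass(CombinerSketch.class);
        job.setMapperClass(TokenizerMapper.class); // assumed: emits (word, 1) pairs
        // Combiner: same class as the reducer; cuts shuffle traffic by summing
        // counts on each map node first. Must be set before job submission.
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));   // placeholder input
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // placeholder output
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}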
From source file: com.mongodb.hadoop.examples.wordcount.split.WordCountSplitTest.java
License: Apache License
@Override
public int run(String[] args) throws Exception {
    final Configuration conf = getConf();
    boolean useQuery = false;
    for (int i = 0; i < args.length; i++) {
        final String argi = args[i];
        if (argi.equals("--use-query"))
            useQuery = true;
        else {
            throw new IllegalArgumentException(argi);
        }
    }

    if (useQuery) {
        // NOTE: must do this BEFORE the Job is created
        final MongoConfig mongo_conf = new MongoConfig(conf);
        com.mongodb.BasicDBObject query = new com.mongodb.BasicDBObject();
        query.put("num", new com.mongodb.BasicDBObject(Collections.singletonMap("$mod", new int[] { 2, 0 })));
        System.out.println(" --- setting query on num");
        mongo_conf.setQuery(query);
        System.out.println(" --- query is: " + mongo_conf.getQuery());
    }

    final com.mongodb.MongoURI outputUri = MongoConfigUtil.getOutputURI(conf);
    if (outputUri == null)
        throw new IllegalStateException("output uri is not set");
    if (MongoConfigUtil.getInputURI(conf) == null)
        throw new IllegalStateException("input uri is not set");

    final String outputCollectionName = outputUri.getCollection();

    final Job job = new Job(conf, "word count " + outputCollectionName);
    job.setJarByClass(WordCountSplitTest.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setInputFormatClass(MongoInputFormat.class);
    job.setOutputFormatClass(MongoOutputFormat.class);

    final long start = System.currentTimeMillis();
    System.out.println(" ----------------------- running test " + outputCollectionName + " --------------------");
    try {
        boolean result = job.waitForCompletion(true);
        System.out.println("job.waitForCompletion( true ) returned " + result);
    } catch (Exception e) {
        System.err.println("job.waitForCompletion( true ) threw Exception");
        e.printStackTrace();
        return 1;
    }
    final long end = System.currentTimeMillis();
    final float seconds = ((float) (end - start)) / 1000;
    java.text.NumberFormat nf = java.text.NumberFormat.getInstance();
    nf.setMaximumFractionDigits(3);
    System.out.println("finished run in " + nf.format(seconds) + " seconds");

    com.mongodb.Mongo m = new com.mongodb.Mongo(outputUri);
    com.mongodb.DB db = m.getDB(outputUri.getDatabase());
    com.mongodb.DBCollection coll = db.getCollection(outputCollectionName);
    com.mongodb.BasicDBObject query = new com.mongodb.BasicDBObject();
    query.put("_id", "the");
    com.mongodb.DBCursor cur = coll.find(query);
    if (!cur.hasNext())
        System.out.println("FAILURE: could not find count of 'the'");
    else
        System.out.println("'the' count: " + cur.next());

    return 0; // is the return value supposed to be the program exit code?
    // if (!result)
    //     System.exit(1);
}
From source file: com.mongodb.hadoop.examples.wordcount.split.WordCountSplitTest.java
License: Apache License
private final static void test(boolean useShards, boolean useChunks, Boolean slaveok, boolean useQuery)
        throws Exception {
    final Configuration conf = new Configuration();
    MongoConfigUtil.setInputURI(conf, "mongodb://localhost:30000/test.lines");
    conf.setBoolean(MongoConfigUtil.SPLITS_USE_SHARDS, useShards);
    conf.setBoolean(MongoConfigUtil.SPLITS_USE_CHUNKS, useChunks);

    if (useQuery) {
        // NOTE: must do this BEFORE the Job is created
        final MongoConfig mongo_conf = new MongoConfig(conf);
        com.mongodb.BasicDBObject query = new com.mongodb.BasicDBObject();
        query.put("num", new com.mongodb.BasicDBObject(Collections.singletonMap("$mod", new int[] { 2, 0 })));
        System.out.println(" --- setting query on num");
        mongo_conf.setQuery(query);
        System.out.println(" --- query is: " + mongo_conf.getQuery());
    }

    String output_table = null;
    if (useChunks) {
        if (useShards)
            output_table = "with_shards_and_chunks";
        else
            output_table = "with_chunks";
    } else {
        if (useShards)
            output_table = "with_shards";
        else
            output_table = "no_splits";
    }
    if (slaveok != null) {
        output_table += "_" + slaveok;
    }
    MongoConfigUtil.setOutputURI(conf, "mongodb://localhost:30000/test." + output_table);
    System.out.println("Conf: " + conf);

    final Job job = new Job(conf, "word count " + output_table);
    job.setJarByClass(WordCountSplitTest.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setInputFormatClass(MongoInputFormat.class);
    job.setOutputFormatClass(MongoOutputFormat.class);

    final long start = System.currentTimeMillis();
    System.out.println(" ----------------------- running test " + output_table + " --------------------");
    try {
        boolean result = job.waitForCompletion(true);
        System.out.println("job.waitForCompletion( true ) returned " + result);
    } catch (Exception e) {
        System.out.println("job.waitForCompletion( true ) threw Exception");
        e.printStackTrace();
    }
    final long end = System.currentTimeMillis();
    final float seconds = ((float) (end - start)) / 1000;
    java.text.NumberFormat nf = java.text.NumberFormat.getInstance();
    nf.setMaximumFractionDigits(3);
    System.out.println("finished run in " + nf.format(seconds) + " seconds");

    com.mongodb.Mongo m = new com.mongodb.Mongo(
            new com.mongodb.MongoURI("mongodb://localhost:30000/?slaveok=true"));
    com.mongodb.DB db = m.getDB("test");
    com.mongodb.DBCollection coll = db.getCollection(output_table);
    com.mongodb.BasicDBObject query = new com.mongodb.BasicDBObject();
    query.put("_id", "the");
    com.mongodb.DBCursor cur = coll.find(query);
    if (!cur.hasNext())
        System.out.println("FAILURE: could not find count of 'the'");
    else
        System.out.println("'the' count: " + cur.next());

    // if (!result)
    //     System.exit(1);
}
From source file: com.mongodb.hadoop.examples.wordcount.WordCount.java
License: Apache License
public static void main(String[] args) throws Exception {
    final Configuration conf = new Configuration();
    MongoConfigUtil.setInputURI(conf, "mongodb://localhost/test.in");
    MongoConfigUtil.setOutputURI(conf, "mongodb://localhost/test.out");
    System.out.println("Conf: " + conf);

    final Job job = new Job(conf, "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setInputFormatClass(MongoInputFormat.class);
    job.setOutputFormatClass(MongoOutputFormat.class);

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file: com.mongodb.hadoop.util.MongoTool.java
License: Apache License
private int runMapReduceJob(final Configuration conf) throws IOException {
    final Job job = Job.getInstance(conf, getJobName());
    /**
     * Any arguments specified with -D <property>=<value>
     * on the CLI will be picked up and set here.
     * They override any XML-level values.
     * Note that -D<space> is important - no space will
     * not work as it gets picked up by Java itself.
     */
    // TODO - Do we need to set job name somehow more specifically?
    // This may or may not be correct/sane
    job.setJarByClass(getClass());
    final Class<? extends Mapper> mapper = MongoConfigUtil.getMapper(conf);

    LOG.debug("Mapper Class: " + mapper);
    LOG.debug("Input URI: " + conf.get(MongoConfigUtil.INPUT_URI));
    job.setMapperClass(mapper);
    Class<? extends Reducer> combiner = MongoConfigUtil.getCombiner(conf);
    if (combiner != null) {
        job.setCombinerClass(combiner);
    }
    job.setReducerClass(MongoConfigUtil.getReducer(conf));

    job.setOutputFormatClass(MongoConfigUtil.getOutputFormat(conf));
    job.setOutputKeyClass(MongoConfigUtil.getOutputKey(conf));
    job.setOutputValueClass(MongoConfigUtil.getOutputValue(conf));
    job.setInputFormatClass(MongoConfigUtil.getInputFormat(conf));
    Class mapOutputKeyClass = MongoConfigUtil.getMapperOutputKey(conf);
    Class mapOutputValueClass = MongoConfigUtil.getMapperOutputValue(conf);

    if (mapOutputKeyClass != null) {
        job.setMapOutputKeyClass(mapOutputKeyClass);
    }
    if (mapOutputValueClass != null) {
        job.setMapOutputValueClass(mapOutputValueClass);
    }

    /**
     * Determines if the job will run verbosely, e.g. print debug output.
     * Only works with foreground jobs.
     */
    final boolean verbose = MongoConfigUtil.isJobVerbose(conf);
    /**
     * Run job in foreground (wait for completion) or background?
     */
    final boolean background = MongoConfigUtil.isJobBackground(conf);
    try {
        if (background) {
            LOG.info("Setting up and running MapReduce job in background.");
            job.submit();
            return 0;
        } else {
            LOG.info("Setting up and running MapReduce job in foreground, will wait for results. {Verbose? "
                    + verbose + "}");
            return job.waitForCompletion(true) ? 0 : 1;
        }
    } catch (final Exception e) {
        LOG.error("Exception while executing job... ", e);
        return 1;
    }
}
From source file: com.moz.fiji.mapreduce.framework.MapReduceJobBuilder.java
License: Apache License
/**
 * Configures the MapReduce combiner for the job.
 *
 * @param job The Hadoop MR job.
 */
protected void configureCombiner(Job job) {
    final FijiReducer<?, ?, ?, ?> combiner = getCombiner();
    if (null == combiner) {
        LOG.debug("No combiner provided.");
        return;
    }
    if (combiner instanceof Configurable) {
        ((Configurable) combiner).setConf(job.getConfiguration());
    }
    job.setCombinerClass(((Reducer<?, ?, ?, ?>) combiner).getClass());
}
From source file: com.mozilla.socorro.hadoop.CrashReportJob.java
License: LGPL
/**
 * @param args
 * @return
 * @throws IOException
 * @throws ParseException
 */
public static Job initJob(String jobName, Configuration conf, Class<?> mainClass,
        Class<? extends TableMapper> mapperClass, Class<? extends Reducer> combinerClass,
        Class<? extends Reducer> reducerClass, Map<byte[], byte[]> columns,
        Class<? extends WritableComparable> keyOut, Class<? extends Writable> valueOut, Path outputPath)
        throws IOException, ParseException {
    // Set both start/end time and start/stop row
    Calendar startCal = Calendar.getInstance();
    Calendar endCal = Calendar.getInstance();

    SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMdd");

    String startDateStr = conf.get(START_DATE);
    String endDateStr = conf.get(END_DATE);
    if (!StringUtils.isBlank(startDateStr)) {
        startCal.setTime(sdf.parse(startDateStr));
    }
    if (!StringUtils.isBlank(endDateStr)) {
        endCal.setTime(sdf.parse(endDateStr));
    }

    conf.setLong(START_TIME, startCal.getTimeInMillis());
    conf.setLong(END_TIME, DateUtil.getEndTimeAtResolution(endCal.getTimeInMillis(), Calendar.DATE));

    Job job = new Job(conf);
    job.setJobName(jobName);
    job.setJarByClass(mainClass);

    // input table configuration
    Scan[] scans = MultiScanTableMapReduceUtil.generateScans(startCal, endCal, columns, 100, false);
    MultiScanTableMapReduceUtil.initMultiScanTableMapperJob(TABLE_NAME_CRASH_REPORTS, scans, mapperClass,
            keyOut, valueOut, job);

    if (combinerClass != null) {
        job.setCombinerClass(combinerClass);
    }
    if (reducerClass != null) {
        job.setReducerClass(reducerClass);
    } else {
        job.setNumReduceTasks(0);
    }

    FileOutputFormat.setOutputPath(job, outputPath);

    return job;
}
From source file: com.mvdb.platform.scratch.action.WordCount.java
License: Apache License
public static void main(String[] args) throws Exception {
    logger.error("error1");
    logger.warn("warning1");
    logger.info("info1");
    logger.debug("debug1");
    logger.trace("trace1");
    ILoggerFactory lc = LoggerFactory.getILoggerFactory();
    System.err.println("lc:" + lc);
    // print logback's internal status
    // StatusPrinter.print(lc);

    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs != null) {
        for (String arg : otherArgs) {
            System.out.println(arg);
        }
    }
    if (otherArgs.length != 2) {
        System.err.println("Usage: wordcount <in> <out>");
        System.exit(2);
    }
    Job job = new Job(conf, "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file: com.mycompany.keywordsearch.KeywordSearch.java
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    conf.set(FileInputFormat.INPUT_DIR_RECURSIVE, String.valueOf(true));
    Path input = new Path(args[0]);
    Path output = new Path(args[1]);

    BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
    System.out.print("Keyword:\t");
    conf.set(KEYWORD, in.readLine());

    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(KeywordSearch.class);
    job.setInputFormatClass(TextInputFormatV2.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    clearOutput(conf, output);
    FileInputFormat.addInputPath(job, input);
    FileOutputFormat.setOutputPath(job, output);
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file: com.mycompany.searcher.Searcher.java
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
    System.out.print("Please input a keyword:\t");
    conf.set(KEYWORD, in.readLine());
    conf.set(MINIMUM, args[2]);

    Job job = Job.getInstance(conf, "keyword search");
    job.setJarByClass(Searcher.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    clearOutput(conf, new Path(args[1]));
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.waitForCompletion(true);

    TimeUnit.SECONDS.sleep(1);

    in = new BufferedReader(
            new InputStreamReader(FileSystem.get(conf).open(new Path(args[1] + "/part-r-00000")), "UTF-8"));
    String line;
    HashMap<String, Integer> map = new HashMap();
    while ((line = in.readLine()) != null) {
        StringTokenizer tok = new StringTokenizer(line);
        map.put(tok.nextToken(), new Integer(tok.nextToken()));
    }

    List<Map.Entry<String, Integer>> list = new ArrayList<Map.Entry<String, Integer>>(map.entrySet());
    Collections.sort(list, new Comparator<Map.Entry<String, Integer>>() {
        public int compare(Map.Entry<String, Integer> entry1, Map.Entry<String, Integer> entry2) {
            return (entry2.getValue() - entry1.getValue());
        }
    });

    for (Map.Entry<String, Integer> entry : list) {
        in = new BufferedReader(
                new InputStreamReader(FileSystem.get(conf).open(new Path(entry.getKey())), "UTF-8"));
        System.out.println("\n" + in.readLine());
        System.out.println("\n" + in.readLine() + ":" + entry.getValue() + "\n");
    }
}
From source file: com.netease.news.classifier.naivebayes.TrainNaiveBayesJob.java
License: Apache License
@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption(LABELS, "l", "comma-separated list of labels to include in training", false);
    addOption(buildOption(EXTRACT_LABELS, "el", "Extract the labels from the input", false, false, ""));
    addOption(ALPHA_I, "a", "smoothing parameter", String.valueOf(1.0f));
    addOption(buildOption(TRAIN_COMPLEMENTARY, "c", "train complementary?", false, false, String.valueOf(false)));
    addOption(LABEL_INDEX, "li", "The path to store the label index in", false);
    addOption(DefaultOptionCreator.overwriteOption().create());
    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }
    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
        HadoopUtil.delete(getConf(), getOutputPath());
        HadoopUtil.delete(getConf(), getTempPath());
    }
    Path labPath;
    String labPathStr = getOption(LABEL_INDEX);
    if (labPathStr != null) {
        labPath = new Path(labPathStr);
    } else {
        labPath = getTempPath(LABEL_INDEX);
    }
    long labelSize = createLabelIndex(labPath);
    float alphaI = Float.parseFloat(getOption(ALPHA_I));
    boolean trainComplementary = Boolean.parseBoolean(getOption(TRAIN_COMPLEMENTARY));

    HadoopUtil.setSerializations(getConf());
    HadoopUtil.cacheFiles(labPath, getConf());

    // add up all the vectors with the same labels, while mapping the labels into our index
    Job indexInstances = prepareJob(getInputPath(), getTempPath(SUMMED_OBSERVATIONS),
            SequenceFileInputFormat.class, IndexInstancesMapper.class, IntWritable.class, VectorWritable.class,
            VectorSumReducer.class, IntWritable.class, VectorWritable.class, SequenceFileOutputFormat.class);
    indexInstances.setCombinerClass(VectorSumReducer.class);
    boolean succeeded = indexInstances.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }

    // sum up all the weights from the previous step, per label and per feature
    Job weightSummer = prepareJob(getTempPath(SUMMED_OBSERVATIONS), getTempPath(WEIGHTS),
            SequenceFileInputFormat.class, WeightsMapper.class, Text.class, VectorWritable.class,
            VectorSumReducer.class, Text.class, VectorWritable.class, SequenceFileOutputFormat.class);
    weightSummer.getConfiguration().set(WeightsMapper.NUM_LABELS, String.valueOf(labelSize));
    weightSummer.setCombinerClass(VectorSumReducer.class);
    succeeded = weightSummer.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }

    // put the per-label and per-feature vectors into the cache
    HadoopUtil.cacheFiles(getTempPath(WEIGHTS), getConf());

    // calculate the Thetas, write out to LABEL_THETA_NORMALIZER vectors --
    // TODO: add reference here to the part of the Rennie paper that discusses this
    Job thetaSummer = prepareJob(getTempPath(SUMMED_OBSERVATIONS), getTempPath(THETAS),
            SequenceFileInputFormat.class, ThetaMapper.class, Text.class, VectorWritable.class,
            VectorSumReducer.class, Text.class, VectorWritable.class, SequenceFileOutputFormat.class);
    thetaSummer.setCombinerClass(VectorSumReducer.class);
    thetaSummer.getConfiguration().setFloat(ThetaMapper.ALPHA_I, alphaI);
    thetaSummer.getConfiguration().setBoolean(ThetaMapper.TRAIN_COMPLEMENTARY, trainComplementary);
    /* TODO(robinanil): Enable this when thetanormalization works.
    succeeded = thetaSummer.waitForCompletion(true);
    if (!succeeded) {
        return -1;
    }*/

    // validate our model and then write it out to the official output
    getConf().setFloat(ThetaMapper.ALPHA_I, alphaI);
    NaiveBayesModel naiveBayesModel = BayesUtils.readModelFromDir(getTempPath(), getConf());
    naiveBayesModel.validate();
    naiveBayesModel.serialize(getOutputPath(), getConf());

    return 0;
}