List of usage examples for org.apache.hadoop.mapreduce.Job#setNumReduceTasks
public void setNumReduceTasks(int tasks) throws IllegalStateException
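Before the per-project examples, a minimal self-contained sketch of the call (not taken from any of the source files below; the class name, input/output paths, and the reducer count of 4 are illustrative assumptions). A count of zero makes the job map-only; a positive count requests that many reduce tasks, and the call must happen before the job is submitted.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class SetNumReduceTasksExample {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "setNumReduceTasks-example");
        job.setJarByClass(SetNumReduceTasksExample.class);

        // The Hadoop base Mapper/Reducer classes are identity implementations,
        // used here only as placeholders.
        job.setMapperClass(Mapper.class);
        job.setReducerClass(Reducer.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        // Must be called before the job is submitted; calling it on a
        // submitted job throws IllegalStateException.
        job.setNumReduceTasks(4); // request four reduce tasks
        // job.setNumReduceTasks(0); // alternative: map-only job, no shuffle/sort

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}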
From source file:com.mapr.db.utils.ImportCSV_MR.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        System.out.println("MapR-DB JSON Tables - Import CSV"
                + "\nUsage:\n"
                + "\tParam 1: JSON Table Path (MapR-FS)\n"
                + "\tParam 2: Text File Path (Local-FS)\n"
                + "\tParam 3: Text File Delimiter (Local-FS)\n"
                + "\tParam 4: Schema File Path (Local-FS)\n");
        System.exit(-1);
    }

    outputTable = args[0].trim();
    inputDir = args[1].trim();
    delimiter = args[2].trim();
    schemaFile = args[3].trim();

    BasicConfigurator.configure();
    Logger.getRootLogger().setLevel(Level.ERROR);

    ImportCSV_MR imp = new ImportCSV_MR();
    imp.readSchema(schemaFile);
    imp.printSchema();

    Job job = Job.getInstance(conf, "ImportCSV_MR");
    job.setJarByClass(ImportCSV_MR.class);
    job.setMapperClass(MyMapper.class);

    conf = job.getConfiguration();
    conf.setStrings("io.serializations", new String[] { conf.get("io.serializations"),
            JSONDocumentSerialization.class.getName() });
    conf.set("countColumnsInSchema", String.valueOf(countColumnsInSchema));
    conf.set("delimiter", delimiter);
    conf.set("tablePath", outputTable);

    String[] valueTypes = valueTypesInSchema.toArray(new String[valueTypesInSchema.size()]);
    conf.setStrings("valueTypesInSchema", valueTypes);
    String[] columnNames = columnNamesInSchema.toArray(new String[columnNamesInSchema.size()]);
    conf.setStrings("columnNamesInSchema", columnNames);

    // Input format and input path
    FileInputFormat.addInputPath(job, new Path(inputDir));
    job.setInputFormatClass(TextInputFormat.class);

    // Mapper output record key and value classes
    job.setMapOutputKeyClass(ByteBufWritableComparable.class);
    job.setMapOutputValueClass(DBDocumentImpl.class);

    // Output format and output table
    conf.set("maprdb.mapred.outputtable", outputTable);
    job.setOutputFormatClass(TableOutputFormat.class);

    // Map-only job: mapper output is written straight to the table
    job.setNumReduceTasks(0);

    // Wait for completion once; calling waitForCompletion() a second time
    // on the same job would throw IllegalStateException.
    boolean isJobSuccessful = job.waitForCompletion(true);
    return isJobSuccessful ? 0 : 1;
}
From source file:com.mb.saas.bi.job.WordCountJob.java
License:Apache License
public static boolean runHadoopMapReduceJob() throws Exception {
    System.setProperty("HADOOP_USER_NAME", "hadoop");
    File jarFile = UploadResource.createTempJar("bin");
    ClassLoader classLoader = UploadResource.getClassLoader();
    Thread.currentThread().setContextClassLoader(classLoader);

    Configuration conf = new Configuration();
    conf.set("fs.defaultFS", "hdfs://mbcluster/");
    conf.set("dfs.nameservices", "mbcluster");
    conf.set("dfs.ha.namenodes.mbcluster", "ns1,ns2");
    conf.set("dfs.namenode.rpc-address.mbcluster.ns1", "master:4001");
    conf.set("dfs.namenode.rpc-address.mbcluster.ns2", "backup:4001");
    conf.set("dfs.client.failover.proxy.provider.mbcluster",
            "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider");

    Job job = new Job(conf, "word count");
    job.setJarByClass(WordCountJob.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // Prefer a prebuilt job jar if it exists on disk; otherwise fall back
    // to the temp jar created from the compiled classes.
    boolean isMapReduceJarSet = false;
    String hadoopMapReduceJar = "F:/henry_projects/mbHiveAnalyzer/t.jar";
    File file = new File(hadoopMapReduceJar);
    if (file.exists()) {
        ((JobConf) job.getConfiguration()).setJar(hadoopMapReduceJar);
        isMapReduceJarSet = true;
    }
    if (!isMapReduceJarSet && jarFile != null)
        ((JobConf) job.getConfiguration()).setJar(jarFile.getAbsolutePath());

    job.setNumReduceTasks(1);
    FileInputFormat.addInputPath(job, new Path("/input/wordcount.txt"));
    FileOutputFormat.setOutputPath(job, new Path("/output/001"));

    // Return the job result instead of calling System.exit(), which made the
    // original trailing "return true" unreachable.
    return job.waitForCompletion(true);
}
From source file:com.mb.saas.bi.job.WordCountJob.java
License:Apache License
public static void main(String[] args) throws Exception {
    System.setProperty("HADOOP_USER_NAME", "hadoop");
    File jarFile = UploadResource.createTempJar("bin");
    System.setProperty("hadoop.home.dir", "F:/hadoop");
    ClassLoader classLoader = UploadResource.getClassLoader();
    Thread.currentThread().setContextClassLoader(classLoader);

    Configuration conf = new Configuration();
    // conf.set("fs.defaultFS", "hdfs://slave1:4001");
    // conf.set("mapreduce.framework.name", "yarn");
    // conf.set("yarn.resourcemanager.address", "master:8032");
    // conf.set("yarn.resourcemanager.scheduler.address", "master:8030");
    conf.set("fs.defaultFS", "hdfs://mbcluster/");
    conf.set("dfs.nameservices", "mbcluster");
    conf.set("dfs.ha.namenodes.mbcluster", "ns1,ns2");
    conf.set("dfs.namenode.rpc-address.mbcluster.ns1", "master:4001");
    conf.set("dfs.namenode.rpc-address.mbcluster.ns2", "backup:4001");
    conf.set("dfs.client.failover.proxy.provider.mbcluster",
            "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider");
    conf.set("mapred.remote.os", "Linux");
    System.out.println(conf.get("mapred.remote.os"));
    // conf.set("mapreduce.job.reduces", "2");
    // conf.set("mapreduce.tasktracker.map.tasks.maximum", "8");
    // conf.set("mapreduce.input.fileinputformat.split.maxsize", "123");

    Job job = new Job(conf, "word count");
    job.setJarByClass(WordCountJob.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    if (jarFile != null)
        ((JobConf) job.getConfiguration()).setJar(jarFile.getAbsolutePath());

    // job.setMaxMapAttempts(2);
    job.setNumReduceTasks(1);

    FileInputFormat.addInputPath(job, new Path("/input/wordcount2.txt"));
    FileOutputFormat.setOutputPath(job, new Path("/output/001002"));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:com.metamx.druid.indexer.IndexGeneratorJob.java
License:Open Source License
public boolean run() {
    try {
        Job job = new Job(new Configuration(),
                String.format("%s-index-generator-%s", config.getDataSource(), config.getIntervals()));
        job.getConfiguration().set("io.sort.record.percent", "0.23");

        // Copy any "hadoop."-prefixed system properties into the job configuration.
        for (String propName : System.getProperties().stringPropertyNames()) {
            Configuration conf = job.getConfiguration();
            if (propName.startsWith("hadoop.")) {
                conf.set(propName.substring("hadoop.".length()), System.getProperty(propName));
            }
        }

        job.setInputFormatClass(TextInputFormat.class);
        job.setMapperClass(IndexGeneratorMapper.class);
        job.setMapOutputValueClass(Text.class);
        SortableBytes.useSortableBytesAsMapOutputKey(job);

        // One reduce task per output bucket.
        job.setNumReduceTasks(Iterables.size(config.getAllBuckets()));
        job.setPartitionerClass(IndexGeneratorPartitioner.class);
        job.setReducerClass(IndexGeneratorReducer.class);
        job.setOutputKeyClass(BytesWritable.class);
        job.setOutputValueClass(Text.class);
        job.setOutputFormatClass(IndexGeneratorOutputFormat.class);
        FileOutputFormat.setOutputPath(job, config.makeIntermediatePath());

        config.addInputPaths(job);
        config.intoConfiguration(job);
        job.setJarByClass(IndexGeneratorJob.class);

        job.submit();
        log.info("Job %s submitted, status available at %s", job.getJobName(), job.getTrackingURL());

        boolean success = job.waitForCompletion(true);

        Counter invalidRowCount = job.getCounters()
                .findCounter(HadoopDruidIndexerConfig.IndexJobCounters.INVALID_ROW_COUNTER);
        jobStats.setInvalidRowCount(invalidRowCount.getValue());

        return success;
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
From source file:com.ml.hadoop.nlp.DocumentProcessor.java
License:Apache License
/**
 * Converts the input documents into token arrays using {@link StringTuple}. The input documents
 * have to be in the {@link org.apache.hadoop.io.SequenceFile} format.
 *
 * @param input
 *          input directory of the documents in {@link org.apache.hadoop.io.SequenceFile} format
 * @param output
 *          output directory where the {@link StringTuple} token array of each document has to be created
 * @param analyzerClass
 *          the Lucene {@link Analyzer} for tokenizing the UTF-8 text
 */
public static void tokenizeDocuments(Path input, Class<? extends Analyzer> analyzerClass, Path output,
        Configuration baseConf) throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration(baseConf);
    // this conf parameter needs to be set to enable serialization of conf values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.set(ANALYZER_CLASS, analyzerClass.getName());

    Job job = new Job(conf);
    job.setJobName("DocumentProcessor::DocumentTokenizer: input-folder: " + input);
    job.setJarByClass(DocumentProcessor.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(StringTuple.class);
    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(SequenceFileTokenizerMapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setNumReduceTasks(0); // map-only: tokenization needs no reduce phase
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    HadoopUtil.delete(conf, output);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}
From source file:com.ML_Hadoop.K_meansClustering.K_meansClusteringMapReduce.java
public static void main(String[] args) throws Exception {
    int iteration = 0, num_of_iteration = 30;
    int feature_size = 2;
    int number_of_clusters = 2;
    FileSystem fs;

    do {
        Configuration conf = new Configuration();
        fs = FileSystem.get(conf);

        Job job = new Job(conf, "K_meansClusteringMapReduce");
        job.setJarByClass(K_meansClusteringMapReduce.class);
        conf = job.getConfiguration(); // This line is mandatory.

        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(FloatArrayWritable.class);

        job.setMapperClass(K_meansClusteringMap.class);
        job.setReducerClass(K_meansClusteringReduce.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        job.setNumReduceTasks(1); // set number of reducers to one

        FileInputFormat.addInputPath(job, new Path(args[0]));
        Path out = new Path(args[1]);
        if (fs.exists(out))
            fs.delete(out, true);
        FileOutputFormat.setOutputPath(job, out);

        number_of_clusters = Integer.parseInt(args[2]);
        num_of_iteration = Integer.parseInt(args[3]);
        feature_size = Integer.parseInt(args[4]);

        conf.setInt("number_of_clusters", number_of_clusters);
        conf.setInt("feature_size", feature_size);
        conf.setInt("current_iteration_num", iteration);

        try {
            job.waitForCompletion(true);
            iteration++;
        } catch (IOException e) {
            e.printStackTrace();
        }
    } while (iteration < num_of_iteration);
}
From source file:com.ML_Hadoop.MultipleLinearRegression.MultipleLinearRegressionMapReduce.java
public static void main(String[] args) throws Exception {
    String[] theta;
    int iteration = 0, num_of_iteration = 1;
    int feature_size = 0, input_data_size = 0;
    FileSystem fs;
    Float alpha = 0.1f;

    do {
        Configuration conf = new Configuration();
        fs = FileSystem.get(conf);

        Job job = new Job(conf, "LinearRegressionMapReduce");
        job.setJarByClass(MultipleLinearRegressionMapReduce.class);
        // the following line is needed for propagating "theta"
        conf = job.getConfiguration();

        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(FloatWritable.class);

        job.setMapperClass(MultipleLinearRegressionMap.class);
        job.setReducerClass(MultipleLinearRegressionReduce.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        job.setNumReduceTasks(1); // set mapred.reduce.tasks = 1 (only one reducer)

        FileInputFormat.addInputPath(job, new Path(args[0]));
        Path out = new Path(args[1]);
        if (fs.exists(out))
            fs.delete(out, true);
        FileOutputFormat.setOutputPath(job, out);

        alpha = Float.parseFloat(args[2]);
        num_of_iteration = Integer.parseInt(args[3]);
        feature_size = Integer.parseInt(args[4]);
        input_data_size = Integer.parseInt(args[5]);

        conf.setFloat("alpha", alpha);
        conf.setInt("feature_size", feature_size);
        conf.setInt("input_data_size", input_data_size);
        conf.setInt("iteration", iteration);

        theta = new String[feature_size];
        if (iteration == 0) { // first iteration: initialize theta to zeros
            for (int i = 0; i < theta.length; i++)
                theta[i] = "0.0";
            conf.setStrings("theta", theta);
        } else {
            // read theta computed by the previous iteration from HDFS
            try {
                String uri = "/user/hduser/theta.txt";
                fs = FileSystem.get(conf);
                BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(uri))));
                theta = br.readLine().split(",");
            } catch (Exception e) {
                e.printStackTrace();
            }
            conf.setStrings("theta", theta);
        }

        for (int i = 0; i < theta.length; i++)
            System.out.println("In MapReduce main function: theta[" + i + "] = " + theta[i]);

        try {
            job.waitForCompletion(true);
            iteration++;
        } catch (IOException e) {
            e.printStackTrace();
        }
    } while (iteration < num_of_iteration);
}
From source file:com.ML_Hadoop.NaiveBayesClassifier_Continuous_Features.NaiveBayesClassifierMapReduce_Continuous_Features.java
/**
 * @param args
 * @throws IOException
 * @throws ClassNotFoundException
 * @throws InterruptedException
 */
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    int number_of_classes = 1;
    int number_of_features = 1;

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    Job job = new Job(conf, "NaiveBayesClassifierMapReduce_Continuous_Features");
    job.setJarByClass(NaiveBayesClassifierMapReduce_Continuous_Features.class);
    conf = job.getConfiguration(); // This line is mandatory.

    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(FloatArrayWritable.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(MapArrayWritable.class);

    job.setMapperClass(NaiveBayesClassifierMap_Continuous_Features.class);
    job.setReducerClass(NaiveBayesClassifierReduce_Continuous_Features.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setNumReduceTasks(1);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    Path out = new Path(args[1]);
    if (fs.exists(out))
        fs.delete(out, true);
    FileOutputFormat.setOutputPath(job, out);

    number_of_classes = Integer.parseInt(args[2]);
    number_of_features = Integer.parseInt(args[3]);
    conf.setInt("number_of_classes", number_of_classes);
    conf.setInt("number_of_features", number_of_features);

    try {
        job.waitForCompletion(true);
    } catch (IOException e) {
        e.printStackTrace();
    }
}
From source file:com.moz.fiji.mapreduce.framework.MapReduceJobBuilder.java
License:Apache License
/**
 * Configures the MapReduce reducer for the job.
 *
 * @param job The Hadoop MR job.
 * @throws IOException If there is an error.
 */
protected void configureReducer(Job job) throws IOException {
    final FijiReducer<?, ?, ?, ?> reducer = getReducer();
    if (null == reducer) {
        LOG.info("No reducer provided. This will be a map-only job.");
        job.setNumReduceTasks(0);

        // Set the job output key/value classes based on what the map output key/value classes were,
        // since this is a map-only job.
        job.setOutputKeyClass(job.getMapOutputKeyClass());
        Schema mapOutputKeySchema = AvroJob.getMapOutputKeySchema(job.getConfiguration());
        if (null != mapOutputKeySchema) {
            AvroJob.setOutputKeySchema(job, mapOutputKeySchema);
        }
        job.setOutputValueClass(job.getMapOutputValueClass());
        Schema mapOutputValueSchema = AvroJob.getMapOutputValueSchema(job.getConfiguration());
        if (null != mapOutputValueSchema) {
            AvroJob.setOutputValueSchema(job, mapOutputValueSchema);
        }
        return;
    }

    if (reducer instanceof Configurable) {
        ((Configurable) reducer).setConf(job.getConfiguration());
    }
    job.setReducerClass(reducer.getClass());

    // Set output key class.
    Class<?> outputKeyClass = reducer.getOutputKeyClass();
    job.setOutputKeyClass(outputKeyClass);
    Schema outputKeyWriterSchema = AvroMapReduce.getAvroKeyWriterSchema(reducer);
    if (AvroKey.class.isAssignableFrom(outputKeyClass)) {
        if (null == outputKeyWriterSchema) {
            throw new JobConfigurationException("Using AvroKey output, but a writer schema was not provided. "
                    + "Did you forget to implement AvroKeyWriter in your FijiReducer?");
        }
        AvroJob.setOutputKeySchema(job, outputKeyWriterSchema);
    } else if (null != outputKeyWriterSchema) {
        throw new JobConfigurationException(
                reducer.getClass().getName() + ".getAvroKeyWriterSchema() returned a non-null Schema"
                        + " but the output key class was not AvroKey.");
    }

    // Set output value class.
    Class<?> outputValueClass = reducer.getOutputValueClass();
    job.setOutputValueClass(outputValueClass);
    Schema outputValueWriterSchema = AvroMapReduce.getAvroValueWriterSchema(reducer);
    if (AvroValue.class.isAssignableFrom(outputValueClass)) {
        if (null == outputValueWriterSchema) {
            throw new JobConfigurationException("Using AvroValue output, but a writer schema was not provided. "
                    + "Did you forget to implement AvroValueWriter in your FijiReducer?");
        }
        AvroJob.setOutputValueSchema(job, outputValueWriterSchema);
    } else if (null != outputValueWriterSchema) {
        throw new JobConfigurationException(
                reducer.getClass().getName() + ".getAvroValueWriterSchema() returned a non-null Schema"
                        + " but the output value class was not AvroValue.");
    }
}
From source file:com.moz.fiji.mapreduce.output.FijiTableMapReduceJobOutput.java
License:Apache License
/** {@inheritDoc} */
@Override
public void configure(Job job) throws IOException {
    // sets the Hadoop output format according to getOutputFormatClass()
    super.configure(job);

    final Configuration conf = job.getConfiguration();
    conf.set(FijiConfKeys.FIJI_OUTPUT_TABLE_URI, mTableURI.toString());

    job.setNumReduceTasks(getNumReduceTasks());

    // Add HBase dependency jars to the distributed cache so they appear on the task classpath:
    GenericTableMapReduceUtil.addAllDependencyJars(job);
}