List of usage examples for org.apache.hadoop.mapred JobConf setNumReduceTasks
public void setNumReduceTasks(int n)
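Before the per-project examples below, here is a minimal, hypothetical driver sketch (the class name NumReduceTasksExample and the argument handling are illustrative, not taken from any of the source files listed) showing the two typical ways setNumReduceTasks is used with the old org.apache.hadoop.mapred API: pass 0 for a map-only job, or an explicit count to control the number of reduce tasks.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

public class NumReduceTasksExample {
    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf(NumReduceTasksExample.class);
        job.setJobName("setNumReduceTasks example");

        // TextInputFormat produces LongWritable offsets and Text lines; with the
        // default identity mapper/reducer those types pass straight through.
        job.setInputFormat(TextInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        // A map-only job skips the shuffle and reduce phase entirely:
        //   job.setNumReduceTasks(0);
        // Here we instead request four reduce tasks, so the map output is
        // partitioned four ways and four output files are produced.
        job.setNumReduceTasks(4);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        JobClient.runJob(job);
    }
}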
From source file:org.archive.hadoop.jobs.ArchiveFileExtractor.java
License:Apache License
/**
 * Run the job.
 */
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        printUsage();
        return 1;
    }

    // Create a job configuration
    JobConf job = new JobConf(getConf());

    // Job name uses output dir to help identify it to the operator.
    job.setJobName("Archive File Extractor");

    // This is a map-only job, no reducers.
    job.setNumReduceTasks(0);

    // turn off speculative execution
    job.setBoolean("mapred.map.tasks.speculative.execution", false);

    // set timeout to a high value - 20 hours
    job.setInt("mapred.task.timeout", 72000000);

    // tolerate task exceptions
    job.setBoolean("soft", false);

    int arg = 0;
    int numMaps = 10;

    String DEFAULT_WARC_PATTERN = "software: %s Extractor\r\n" + "format: WARC File Format 1.0\r\n"
            + "conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf\r\n"
            + "publisher: Internet Archive\r\n" + "created: %s\r\n\r\n";

    String warcHeaderString = String.format(DEFAULT_WARC_PATTERN, IAUtils.COMMONS_VERSION,
            DateUtils.getLog17Date(System.currentTimeMillis()));

    while (arg < args.length - 1) {
        if (args[arg].equals("-soft")) {
            job.setBoolean("soft", true);
            arg++;
        } else if (args[arg].equals("-mappers")) {
            arg++;
            numMaps = Integer.parseInt(args[arg]);
            job.setNumMapTasks(numMaps);
            arg++;
        } else if (args[arg].equals("-timestamp14")) {
            arg++;
            String timestamp14 = DateUtils.get14DigitDate(DateUtils.parse14DigitDate(args[arg]));
            job.set("timestamp14", timestamp14);
            arg++;
        } else if (args[arg].equals("-warc-header-local-file")) {
            arg++;
            File f = new File(args[arg]);
            FileInputStream fis = new FileInputStream(f);
            warcHeaderString = IOUtils.toString(fis, "UTF-8");
            arg++;
        } else if (args[arg].equals("-hmacname")) {
            arg++;
            String hmacName = args[arg];
            job.set("hmacName", hmacName);
            arg++;
        } else if (args[arg].equals("-hmacsignature")) {
            arg++;
            String hmacSignature = args[arg];
            job.set("hmacSignature", hmacSignature);
            arg++;
        } else if (args[arg].equals("-timeout")) {
            arg++;
            int taskTimeout = Integer.parseInt(args[arg]);
            job.setInt("mapred.task.timeout", taskTimeout);
            arg++;
        } else if (args[arg].equals("-failpct")) {
            arg++;
            int failPct = Integer.parseInt(args[arg]);
            job.setInt("mapred.max.map.failures.percent", failPct);
            arg++;
        } else {
            break;
        }
    }

    job.set("warcHeaderString", warcHeaderString);

    if (args.length - 2 != arg) {
        printUsage();
        return 1;
    }

    Path inputPath = new Path(args[arg]);
    arg++;

    String outputDir = args[arg];
    arg++;
    job.set("outputDir", outputDir);
    Path outputPath = new Path(outputDir);

    job.setInputFormat(TextInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(ArchiveFileExtractorMapper.class);
    job.setJarByClass(ArchiveFileExtractor.class);

    TextInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    // Run the job!
    RunningJob rj = JobClient.runJob(job);
    if (!rj.isSuccessful()) {
        LOG.error("FAILED: " + rj.getID());
        return 2;
    }
    return 0;
}
From source file:org.archive.hadoop.jobs.CDXGenerator.java
License:Apache License
/**
 * Run the job.
 */
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        usage();
        return 1;
    }

    // Create a job configuration
    JobConf job = new JobConf(getConf());

    // Job name uses output dir to help identify it to the operator.
    job.setJobName("CDX Generator " + args[0]);

    // The inputs are a list of filenames, use the
    // FilenameInputFormat to pass them to the mappers.
    job.setInputFormat(FilenameInputFormat.class);

    // This is a map-only job, no reducers.
    job.setNumReduceTasks(0);

    // set timeout to a high value - 20 hours
    job.setInt("mapred.task.timeout", 72000000);

    // keep job running despite some failures in generating CDXs
    job.setBoolean("strictMode", false);

    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(CDXGeneratorMapper.class);
    job.setJarByClass(CDXGenerator.class);

    int arg = 0;
    if (args[arg].equals("-strictMode")) {
        job.setBoolean("strictMode", true);
        arg++;
    }

    String outputDir = args[arg];
    arg++;
    job.set("outputDir", outputDir);
    FileOutputFormat.setOutputPath(job, new Path(outputDir));

    boolean atLeastOneInput = false;
    for (int i = arg; i < args.length; i++) {
        FileSystem inputfs = FileSystem.get(new java.net.URI(args[i]), getConf());
        for (FileStatus status : inputfs.globStatus(new Path(args[i]))) {
            Path inputPath = status.getPath();
            atLeastOneInput = true;
            LOG.info("Add input path: " + inputPath);
            FileInputFormat.addInputPath(job, inputPath);
        }
    }

    if (!atLeastOneInput) {
        LOG.info("No input files to CDXGenerator.");
        return 0;
    }

    // Run the job!
    RunningJob rj = JobClient.runJob(job);
    if (!rj.isSuccessful()) {
        LOG.error("FAILED: " + rj.getID());
        return 2;
    }
    return 0;
}
From source file:org.archive.hadoop.jobs.WARCMetadataRecordGenerator.java
License:Apache License
/**
 * Run the job.
 */
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        usage();
        return 1;
    }

    // Create a job configuration
    JobConf job = new JobConf(getConf());

    // Job name uses output dir to help identify it to the operator.
    job.setJobName("WARCMetadataRecord Generator " + args[0]);

    // The inputs are a list of filenames, use the
    // FilenameInputFormat to pass them to the mappers.
    job.setInputFormat(FilenameInputFormat.class);

    // This is a map-only job, no reducers.
    job.setNumReduceTasks(0);

    // set timeout to a high value - 20 hours
    job.setInt("mapred.task.timeout", 72000000);

    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(WARCMetadataRecordGeneratorMapper.class);
    job.setJarByClass(WARCMetadataRecordGenerator.class);

    // extract outlinks by default
    job.set("outputType", "outlinks");

    int arg = 0;
    if (args[arg].equals("-hopinfo")) {
        job.set("outputType", "hopinfo");
        arg++;
    }

    String outputDir = args[arg];
    arg++;
    job.set("outputDir", outputDir);
    FileOutputFormat.setOutputPath(job, new Path(outputDir));

    boolean atLeastOneInput = false;
    for (int i = arg; i < args.length; i++) {
        FileSystem inputfs = FileSystem.get(new java.net.URI(args[i]), getConf());
        for (FileStatus status : inputfs.globStatus(new Path(args[i]))) {
            Path inputPath = status.getPath();
            atLeastOneInput = true;
            LOG.info("Add input path: " + inputPath);
            FileInputFormat.addInputPath(job, inputPath);
        }
    }

    if (!atLeastOneInput) {
        LOG.info("No input files to WARCMetadataRecordGenerator.");
        return 0;
    }

    // Run the job!
    RunningJob rj = JobClient.runJob(job);
    if (!rj.isSuccessful()) {
        LOG.error("FAILED: " + rj.getID());
        return 2;
    }
    return 0;
}
From source file:org.archive.hadoop.jobs.WATGenerator.java
License:Apache License
/**
 * Run the job.
 */
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        usage();
        return 1;
    }

    // Create a job configuration
    JobConf job = new JobConf(getConf());

    // Job name uses output dir to help identify it to the operator.
    job.setJobName("WAT Generator " + args[0]);

    // The inputs are a list of filenames, use the
    // FilenameInputFormat to pass them to the mappers.
    job.setInputFormat(FilenameInputFormat.class);

    // This is a map-only job, no reducers.
    job.setNumReduceTasks(0);

    // set timeout to a high value - 20 hours
    job.setInt("mapred.task.timeout", 72000000);

    // keep job running despite some failures in generating WATs
    job.setBoolean("strictMode", false);

    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(WATGeneratorMapper.class);
    job.setJarByClass(WATGenerator.class);

    int arg = 0;
    if (args[arg].equals("-strictMode")) {
        job.setBoolean("strictMode", true);
        arg++;
    }

    String outputDir = args[arg];
    arg++;
    job.set("outputDir", outputDir);
    FileOutputFormat.setOutputPath(job, new Path(outputDir));

    boolean atLeastOneInput = false;
    for (int i = arg; i < args.length; i++) {
        FileSystem inputfs = FileSystem.get(new java.net.URI(args[i]), getConf());
        for (FileStatus status : inputfs.globStatus(new Path(args[i]))) {
            Path inputPath = status.getPath();
            atLeastOneInput = true;
            LOG.info("Add input path: " + inputPath);
            FileInputFormat.addInputPath(job, inputPath);
        }
    }

    if (!atLeastOneInput) {
        LOG.info("No input files to WATGenerator.");
        return 0;
    }

    // Run the job!
    RunningJob rj = JobClient.runJob(job);
    if (!rj.isSuccessful()) {
        LOG.error("FAILED: " + rj.getID());
        return 2;
    }
    return 0;
}
From source file:org.archive.jbs.Parse.java
License:Apache License
/**
 * Run the job.
 */
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        usage();
        return 1;
    }

    FileSystem fs = FileSystem.get(getConf());

    // Create a job configuration
    JobConf job = new JobConf(getConf());

    // Job name uses output dir to help identify it to the operator.
    job.setJobName("jbs.Parse " + args[0]);

    // The inputs are a list of filenames, use the
    // FilenameInputFormat to pass them to the mappers.
    job.setInputFormat(FilenameInputFormat.class);

    // This is a map-only job, no reducers.
    job.setNumReduceTasks(0);

    // Use the Parse-specific output format.
    job.setOutputFormat(PerMapOutputFormat.class);

    // Use our ParseMapper, with output keys and values of type Text.
    job.setMapperClass(ParseMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // Configure the input and output paths, from the command-line.
    Path outputDir = new Path(args[0]);
    FileOutputFormat.setOutputPath(job, outputDir);

    boolean atLeastOneInput = false;
    for (int i = 1; i < args.length; i++) {
        FileSystem inputfs = FileSystem.get(new java.net.URI(args[i]), getConf());
        for (FileStatus status : inputfs.globStatus(new Path(args[i]))) {
            Path inputPath = status.getPath();
            Path outputPath = new Path(outputDir, inputPath.getName());
            if (fs.exists(outputPath)) {
                LOG.debug("Output path already exists: " + outputPath);
            } else {
                atLeastOneInput = true;
                LOG.info("Add input path: " + inputPath);
                FileInputFormat.addInputPath(job, inputPath);
            }
        }
    }

    if (!atLeastOneInput) {
        LOG.info("No input files to parse.");
        return 0;
    }

    // Run the job!
    RunningJob rj = JobClient.runJob(job);
    if (!rj.isSuccessful()) {
        LOG.error("FAILED: " + rj.getID());
        return 2;
    }
    return 0;
}
From source file:org.asayler.WikiTitleCount.java
License:Apache License
/**
 * The main driver for the wikititlecount map/reduce program.
 * Invoke this method to submit the map/reduce job.
 * @throws IOException When there are communication problems with the job tracker.
 */
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), WikiTitleCount.class);
    JobClient client = new JobClient(conf);
    ClusterStatus cluster = client.getClusterStatus();

    int num_maps = 1;
    int num_reducers = 1;

    conf.setJobName("wikititlecount");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(MapClass.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    /** Set default mappers */
    num_maps = (int) (cluster.getMaxMapTasks());

    /** Set default reducers */
    num_reducers = (int) (cluster.getMaxReduceTasks() * 0.9);

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            other_args.add(args[i]);
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }

    // Make sure there are exactly 2 parameters left.
    if (other_args.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + other_args.size() + " instead of 2.");
        return printUsage();
    }
    FileInputFormat.setInputPaths(conf, other_args.get(0));
    FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));

    /* Set mappers and reducers */
    conf.setNumMapTasks(num_maps);
    conf.setNumReduceTasks(num_reducers);

    JobClient.runJob(conf);
    return 0;
}
From source file:org.asayler.WikiTitleSort.java
License:Apache License
/**
 * The main driver for the wikititlesort map/reduce program.
 * Invoke this method to submit the map/reduce job.
 * @throws IOException When there are communication problems with the job tracker.
 */
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf(), WikiTitleSort.class);
    JobClient client = new JobClient(conf);
    ClusterStatus cluster = client.getClusterStatus();

    int num_maps = 1;
    final int num_reducers = 1;

    conf.setJobName("wikititlesort");

    conf.setMapperClass(MapClass.class);
    conf.setReducerClass(Reduce.class);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(Text.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    /** Set default mappers */
    num_maps = (int) (cluster.getMaxMapTasks());

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            other_args.add(args[i]);
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }

    // Make sure there are exactly 2 parameters left.
    if (other_args.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + other_args.size() + " instead of 2.");
        return printUsage();
    }
    FileInputFormat.setInputPaths(conf, other_args.get(0));
    FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1)));

    /* Set mappers and the single reducer */
    conf.setNumMapTasks(num_maps);
    conf.setNumReduceTasks(num_reducers);

    JobClient.runJob(conf);
    return 0;
}
From source file:org.cloudata.core.PerformanceTest.java
License:Apache License
private void runNIsMoreThanOne(final String cmd) throws IOException {
    checkTable();

    // Run a mapreduce job. Run as many maps as asked-for clients.
    // Before we start up the job, write out an input file with instructions
    // per client regarding which row they are to start on.
    Path inputDir = writeInputFile(this.conf);
    this.conf.set(EvaluationMapTask.CMD_KEY, cmd);
    JobConf job = new JobConf(this.conf, this.getClass());
    FileInputFormat.addInputPath(job, inputDir);
    job.setInputFormat(TextInputFormat.class);
    job.setJobName("Cloudata Performance Evaluation");
    job.setMapperClass(EvaluationMapTask.class);
    job.setMaxMapAttempts(1);
    job.setMaxReduceAttempts(1);
    job.setNumMapTasks(this.N * 10); // Ten maps per client.
    job.setNumReduceTasks(1);
    job.setOutputFormat(TextOutputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(inputDir, "outputs"));
    JobClient.runJob(job);
}
From source file:org.cloudata.core.tablet.backup.BackupBinaryJob.java
License:Apache License
public void runBackUp(String tableName, String outputPath) throws IOException {
    CloudataConf nconf = new CloudataConf();

    CloudataFileSystem fs = CloudataFileSystem.get(nconf);
    if (fs.exists(new GPath(outputPath))) {
        throw new IOException("Output path already exists:" + outputPath);
    }

    if (!CTable.existsTable(nconf, tableName)) {
        throw new IOException("No Table:" + tableName);
    }

    CTable ctable = CTable.openTable(nconf, tableName);

    String columns = "";
    for (String eachColumn : ctable.getTableSchema().getColumnsArray()) {
        columns += eachColumn + ",";
    }
    columns = columns.substring(0, columns.length() - 1);

    JobConf jobConf = new JobConf(BackupBinaryJob.class);
    jobConf.setMapperClass(BackupBinaryMap.class);
    jobConf.setInputFormat(BackupTabletInputFormat.class);
    jobConf.set(DefaultTabletInputFormat.INPUT_TABLE, tableName);
    jobConf.set(DefaultTabletInputFormat.INPUT_COLUMN_LIST, columns);

    FileOutputFormat.setOutputPath(jobConf, new Path(outputPath));
    jobConf.setMapOutputKeyClass(BytesWritable.class);
    jobConf.setMapOutputValueClass(BytesWritable.class);
    jobConf.setOutputFormat(SequenceFileOutputFormat.class);

    // map only
    jobConf.setNumReduceTasks(0);

    JobClient.runJob(jobConf);
}
From source file:org.cloudata.core.tablet.backup.BackupJob.java
License:Apache License
/**
 * mapreduce job
 * @param tableName
 * @param outputPath
 * @throws IOException
 */
public void runBackUp(String tableName, String outputPath) throws IOException {
    CloudataConf nconf = new CloudataConf();

    CloudataFileSystem fs = CloudataFileSystem.get(nconf);
    if (fs.exists(new GPath(outputPath))) {
        throw new IOException("Output path already exists:" + outputPath);
    }

    if (!CTable.existsTable(nconf, tableName)) {
        throw new IOException("No Table:" + tableName);
    }

    CTable ctable = CTable.openTable(nconf, tableName);

    String columns = "";
    for (String eachColumn : ctable.getTableSchema().getColumnsArray()) {
        columns += eachColumn + ",";
    }
    columns = columns.substring(0, columns.length() - 1);

    String jobName = tableName + " backup";

    JobConf jobConf = new JobConf(BackupJob.class);
    jobConf.setJobName(jobName);
    jobConf.setMapperClass(BackupMap.class);
    jobConf.setInputFormat(BackupTabletInputFormat.class);
    jobConf.set(DefaultTabletInputFormat.INPUT_TABLE, tableName);
    jobConf.set(DefaultTabletInputFormat.INPUT_COLUMN_LIST, columns);

    FileOutputFormat.setOutputPath(jobConf, new Path(outputPath));
    jobConf.set("mapred.textoutputformat.separator", ",");
    jobConf.setOutputFormat(TextOutputFormat.class);

    // map only
    jobConf.setNumReduceTasks(0);

    JobClient.runJob(jobConf);
}