List of usage examples for org.apache.hadoop.mapreduce Job setNumReduceTasks
public void setNumReduceTasks(int tasks) throws IllegalStateException
From source file:com.wipro.ats.bdre.dq.DQDriver.java
License:Apache License
@Override public int run(String[] arg) throws Exception { String processId = arg[0];/* www .j a va 2s . c o m*/ String sPath = arg[1]; String destDir = arg[2]; Properties props = new GetProperties().getProperties(processId, "dq"); LOGGER.debug("props=" + props); Configuration conf = getConf(); conf.set("dq.process.id", processId); Job job = Job.getInstance(conf); job.setJobName("Data Quality " + processId); job.setJarByClass(DQDriver.class); job.setMapperClass(DQMapper.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); //Reducer is not required job.setNumReduceTasks(0); job.setOutputKeyClass(Text.class); job.setOutputValueClass(NullWritable.class); Path inputFilePath = new Path(sPath); FileInputFormat.addInputPath(job, inputFilePath); FileOutputFormat.setOutputPath(job, removeIfExistAndSetOutputPath(conf, destDir)); MultipleOutputs.addNamedOutput(job, DQConstants.GOOD_RECORDS_FILE, TextOutputFormat.class, Text.class, NullWritable.class); MultipleOutputs.addNamedOutput(job, DQConstants.BAD_RECORDS_FILE, TextOutputFormat.class, Text.class, NullWritable.class); MultipleOutputs.addNamedOutput(job, DQConstants.FILE_REPORT_FILE, TextOutputFormat.class, Text.class, NullWritable.class); if (!job.waitForCompletion(true)) { return 1; } Path outputDir = new Path(destDir); FileSystem srcFs = outputDir.getFileSystem(getConf()); FileSystem destFs = outputDir.getFileSystem(getConf()); //Valid Records Path goodFilesSrcDir = new Path(destDir + "/" + DQConstants.INTERMEDIATE_GOOD_RECORD_OUTPUT_DIR); //Input and quality filtered file should have same name (but different path) Path goodDestFile = new Path(destDir + "/" + inputFilePath.getName()); if (srcFs.exists(goodFilesSrcDir)) { FileUtil.copyMerge(srcFs, goodFilesSrcDir, destFs, goodDestFile, true, conf, ""); } // Invalid Records Path badFilesSrcDir = new Path(destDir + "/" + DQConstants.INTERMEDIATE_BAD_RECORD_OUTPUT_DIR); Path badDestFile = new Path(destDir + "/" + 
DQConstants.BAD_RECORDS_FILE); if (srcFs.exists(badFilesSrcDir)) { FileUtil.copyMerge(srcFs, badFilesSrcDir, destFs, badDestFile, true, conf, ""); } // Preparing report aggregation job Job fileReportAggregationJob = Job.getInstance(conf); fileReportAggregationJob.setJobName("File Report Computing " + processId); fileReportAggregationJob.setJarByClass(DQMain.class); fileReportAggregationJob.setMapperClass(DQFileReportMapper.class); fileReportAggregationJob.setMapOutputKeyClass(Text.class); fileReportAggregationJob.setMapOutputValueClass(IntWritable.class); fileReportAggregationJob.setReducerClass(DQFileReportReducer.class); fileReportAggregationJob.setOutputKeyClass(Text.class); fileReportAggregationJob.setOutputValueClass(Text.class); fileReportAggregationJob.setNumReduceTasks(1); Path fileReportDir = new Path(destDir + "/" + DQConstants.INTERMEDIATE_REPORT_OUTPUT_DIR); Path fileReportOutputDir = new Path(destDir + "/" + DQConstants.AGGREGATED_REPORT_PLACEHOLDER_FOLDER); FileInputFormat.addInputPath(fileReportAggregationJob, fileReportDir); FileOutputFormat.setOutputPath(fileReportAggregationJob, fileReportOutputDir); if (!fileReportAggregationJob.waitForCompletion(true)) { return 1; } // Merge Report Records MR stuffs Path reportsSrcDir = new Path(destDir + "/" + DQConstants.AGGREGATED_REPORT_PLACEHOLDER_FOLDER); Path reportsDestFile = new Path(destDir + "/" + DQConstants.FILE_REPORT_FILE); FileUtil.copyMerge(srcFs, reportsSrcDir, destFs, reportsDestFile, true, conf, ""); Path reportDestFile = new Path(outputDir.toString() + "/" + DQConstants.FILE_REPORT_FILE); //Read the report file from HDFS and report the percentage DQStats dqStats = getQualityStats(getConf(), reportDestFile); LOGGER.info("Percentage of good records :" + dqStats.getGoodPercent()); props = new GetProperties().getProperties(processId, "dq"); String strThreshold = props.getProperty("min.pass.threshold.percent"); float threshold = Float.parseFloat(strThreshold); dqStats.setThreshold(threshold); 
//Update the result in metadata logResult(dqStats, processId, 0L); if (dqStats.getGoodPercent() < threshold) { LOGGER.error("DQ check did not pass"); throw new DQValidationException(dqStats); } LOGGER.info(dqStats); FileChecksum hdfsChecksum = destFs.getFileChecksum(goodDestFile); String fileHash = hdfsChecksum == null ? "0" : hdfsChecksum.toString(); //Return file info oozie params RegisterFileInfo registerFileInfo = new RegisterFileInfo(); registerFileInfo.setBatchId(null); registerFileInfo.setCreationTs(new Timestamp(new Date().getTime())); registerFileInfo.setFileHash(fileHash); registerFileInfo.setFileSize(destFs.getFileStatus(goodDestFile).getLen()); registerFileInfo.setPath(goodDestFile.toString()); registerFileInfo.setSubProcessId(Integer.parseInt(processId)); OozieUtil oozieUtil = new OozieUtil(); oozieUtil.persistBeanData(registerFileInfo, false); return 0; }
From source file:com.xiaomi.linden.hadoop.indexing.job.LindenJob.java
License:Apache License
/**
 * Configures and runs the "linden-hadoop-indexing" job using the paths and shard count found
 * in the tool configuration.
 *
 * Any existing output directory and index directory are deleted first, and the trash is
 * expunged before the job is created.
 *
 * @param strings command-line arguments (not read by this method)
 * @return 0 when the job succeeds
 * @throws IOException      if the schema file or the linden properties file is not configured
 * @throws RuntimeException if the job finishes unsuccessfully
 */
@Override
public int run(String[] strings) throws Exception {
    Configuration conf = getConf();

    String dir = conf.get(LindenJobConfig.INPUT_DIR, null);
    logger.info("input dir:" + dir);
    Path inputPath = new Path(StringUtils.unEscapeString(dir));
    Path outputPath = new Path(conf.get(LindenJobConfig.OUTPUT_DIR));
    String indexPath = conf.get(LindenJobConfig.INDEX_PATH);

    // Clear results of previous runs.
    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(outputPath)) {
        fs.delete(outputPath, true);
    }
    Path indexDir = new Path(indexPath);
    if (fs.exists(indexDir)) {
        fs.delete(indexDir, true);
    }

    int numShards = conf.getInt(LindenJobConfig.NUM_SHARDS, 1);
    Shard[] shards = createShards(indexPath, numShards);
    Shard.setIndexShards(conf, shards);

    // empty trash
    Trash trash = new Trash(conf);
    trash.expunge();

    Job job = Job.getInstance(conf, "linden-hadoop-indexing");
    job.setJarByClass(LindenJob.class);

    job.setMapperClass(LindenMapper.class);
    job.setCombinerClass(LindenCombiner.class);
    job.setReducerClass(LindenReducer.class);

    job.setMapOutputKeyClass(Shard.class);
    job.setMapOutputValueClass(IntermediateForm.class);
    job.setOutputKeyClass(Shard.class);
    job.setOutputValueClass(Text.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(IndexUpdateOutputFormat.class);

    job.setReduceSpeculativeExecution(false);
    job.setNumReduceTasks(numShards);

    String lindenSchemaFile = conf.get(LindenJobConfig.SCHEMA_FILE_URL);
    if (lindenSchemaFile == null) {
        throw new IOException("no schema file is found");
    }
    logger.info("Adding schema file: " + lindenSchemaFile);
    job.addCacheFile(new URI(lindenSchemaFile + "#lindenSchema"));

    String lindenPropertiesFile = conf.get(LindenJobConfig.LINDEN_PROPERTIES_FILE_URL);
    if (lindenPropertiesFile == null) {
        throw new IOException("no linden properties file is found");
    }
    logger.info("Adding linden properties file: " + lindenPropertiesFile);
    job.addCacheFile(new URI(lindenPropertiesFile + "#lindenProperties"));

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    // Log the effective job settings as a comma-separated summary.
    Path[] inputs = FileInputFormat.getInputPaths(job);
    StringBuilder joinedInputs = new StringBuilder(inputs[0].toString());
    int idx = 1;
    while (idx < inputs.length) {
        joinedInputs.append(",").append(inputs[idx].toString());
        idx++;
    }
    logger.info("mapreduce.input.dir = " + joinedInputs.toString());
    logger.info("mapreduce.output.dir = " + FileOutputFormat.getOutputPath(job).toString());
    logger.info("mapreduce.job.num.reduce.tasks = " + job.getNumReduceTasks());
    logger.info(shards.length + " shards = " + conf.get(LindenJobConfig.INDEX_SHARDS));
    logger.info("mapreduce.input.format.class = " + job.getInputFormatClass());
    logger.info("mapreduce.output.format.class = " + job.getOutputFormatClass());
    logger.info("mapreduce.cluster.temp.dir = " + conf.get(MRJobConfig.TEMP_DIR));

    job.waitForCompletion(true);
    if (job.isSuccessful()) {
        return 0;
    }
    throw new RuntimeException("Job failed");
}
From source file:com.xiaoxiaomo.mr.utils.kafka.HadoopJob.java
License:Apache License
/**
 * Parses the command line, configures the Kafka input format and runs a map-only
 * "kafka.hadoop.loader" job that writes to the HDFS path given as the first positional
 * argument.
 *
 * @param args command-line options followed by the output HDFS path
 * @return 0 if the job completes successfully, -1 otherwise
 */
public int run(String[] args) throws Exception {
    CommandLineParser parser = new PosixParser();
    Options options = buildOptions();
    CommandLine cmd = parser.parse(options, args);

    boolean helpRequested = cmd.hasOption("h");
    if (helpRequested || cmd.getArgs().length == 0) {
        printHelpAndExit(options);
    }
    String hdfsPath = cmd.getArgs()[0];

    Configuration conf = getConf();
    conf.setBoolean("mapred.map.tasks.speculative.execution", false);

    if (!cmd.hasOption("topics")) {
        printHelpAndExit(options);
    } else {
        LOG.info("Using topics: " + cmd.getOptionValue("topics"));
        KafkaInputFormat.configureKafkaTopics(conf, cmd.getOptionValue("topics"));
    }

    String zkConnect = cmd.getOptionValue("zk-connect", "localhost:2181");
    KafkaInputFormat.configureZkConnection(conf, zkConnect);

    if (cmd.hasOption("consumer-group")) {
        String consumerGroup = cmd.getOptionValue("consumer-group", "dev-hadoop-loader");
        CheckpointManager.configureUseZooKeeper(conf, consumerGroup);
    }

    if (cmd.getOptionValue("autooffset-reset") != null) {
        KafkaInputFormat.configureAutoOffsetReset(conf, cmd.getOptionValue("autooffset-reset"));
    }

    JobConf jobConf = new JobConf(conf);
    if (cmd.hasOption("remote")) {
        String ip = cmd.getOptionValue("remote");
        String defaultFs = "hdfs://" + ip + ":8020/";
        LOG.info("Default file system: " + defaultFs);
        jobConf.set("fs.defaultFS", defaultFs);
        String jobTracker = ip + ":8021";
        LOG.info("Remote jobtracker: " + jobTracker);
        jobConf.set("mapred.job.tracker", jobTracker);
    }

    // Prefer a jar located relative to this class's code source, if such a file exists.
    Path jarTarget = new Path(
            getClass().getProtectionDomain().getCodeSource().getLocation() + "../kafka-hadoop-loader.jar");
    File jarFile = new File(jarTarget.toUri());
    if (jarFile.exists()) {
        // running from IDE/ as maven
        jobConf.setJar(jarTarget.toUri().getPath());
        LOG.info("Using target jar: " + jarTarget.toString());
    } else {
        // running from jar remotely or locally
        jobConf.setJarByClass(getClass());
        LOG.info("Using parent jar: " + jobConf.getJar());
    }

    Job job = Job.getInstance(jobConf, "kafka.hadoop.loader");
    job.setInputFormatClass(KafkaInputFormat.class);
    job.setMapperClass(HadoopJobMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(MultiOutputFormat.class);
    job.setNumReduceTasks(0);

    MultiOutputFormat.setOutputPath(job, new Path(hdfsPath));
    String compressSetting = cmd.getOptionValue("compress-output", "on");
    MultiOutputFormat.setCompressOutput(job, compressSetting.equals("on"));

    LOG.info("Output hdfs location: {}", hdfsPath);
    LOG.info("Output hdfs compression: {}", MultiOutputFormat.getCompressOutput(job));

    if (job.waitForCompletion(true)) {
        return 0;
    }
    return -1;
}
From source file:com.yahoo.druid.hadoop.DruidInputFormatTest.java
License:Apache License
@Test public void testSampleMRJob() throws Exception { Job job = Job.getInstance(new Configuration(), "Druid-Loader-Sample-Test-Job"); job.getConfiguration().set("mapreduce.job.acl-view-job", "*"); job.getConfiguration().set("mapreduce.map.java.opts", "-Duser.timezone=UTC"); job.getConfiguration().set(DruidInputFormat.CONF_DRUID_OVERLORD_HOSTPORT, "localhost:" + overlordTestPort); job.getConfiguration().set(DruidInputFormat.CONF_DRUID_SCHEMA, "{" + "\"dataSource\":\"testDataSource\"," + "\"interval\":\"1970-01-01T00:00:00.000Z/3000-01-01T00:00:00.000Z\"," + "\"granularity\":\"NONE\"," + "\"dimensions\":[\"host\"]," + "\"metrics\":[\"visited_sum\",\"unique_hosts\"]" + "}"); job.setMapperClass(SampleMapper.class); job.setNumReduceTasks(0); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(NullWritable.class); job.setInputFormatClass(DruidInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); String outputPath = tempFolder.newFolder() + "/out"; TextOutputFormat.setOutputPath(job, new Path(outputPath)); Assert.assertTrue(job.waitForCompletion(true)); //verify that the SampleMapper actually ran and verified the data Assert.assertTrue(FileUtils.readFileToString(new File(outputPath + "/part-m-00000")).startsWith("SUCCESS")); }
From source file:com.yahoo.druid.hadoop.example.SamplePrintMRJob.java
License:Apache License
@Override public int run(String[] args) throws Exception { // When implementing tool Configuration conf = this.getConf(); // Create job Job job = new Job(conf, "Druid-Loader-Sample-Job"); job.setJarByClass(SamplePrintMRJob.class); // job.setJobName("Druid-Loader-Sample-Job"); job.getConfiguration().set("mapreduce.job.acl-view-job", "*"); job.getConfiguration().set("mapreduce.job.queuename", "default"); job.getConfiguration().set("mapreduce.map.java.opts", "-Duser.timezone=UTC"); //job.getConfiguration().set("mapreduce.map.memory.mb", "1024"); job.getConfiguration().set(DruidInputFormat.CONF_DRUID_STORAGE_STORAGE_DIR, "/tmp/druid/storage"); job.getConfiguration().set(DruidInputFormat.CONF_DRUID_OVERLORD_HOSTPORT, "localhost:8080"); job.getConfiguration().set(DruidInputFormat.CONF_DRUID_DATASOURCE, "wikipedia"); job.getConfiguration().set(DruidInputFormat.CONF_DRUID_INTERVAL, "2009-01-01T00:00:00.000/2050-01-01T00:00:00.000"); job.getConfiguration().set(DruidInputFormat.CONF_DRUID_SCHEMA_FILE, "/tmp/druid/schema/druid_fun_mr.json"); job.setMapperClass(DruidPrintMapper.class); job.setNumReduceTasks(0); job.setOutputKeyClass(DateTime.class); job.setOutputValueClass(Map.class); job.setInputFormatClass(DruidInputFormat.class); job.setOutputFormatClass(NullOutputFormat.class); System.out.println("Starting Druid Loader Sample Job....."); return job.waitForCompletion(true) ? 0 : 1; //System.exit(job.waitForCompletion(true) ? 0 : 1); }
From source file:com.yahoo.glimmer.indexing.preprocessor.PrepTool.java
License:Open Source License
@Override public int run(String[] args) throws Exception { SimpleJSAP jsap = new SimpleJSAP(PrepTool.class.getName(), "RDF tuples pre-processor for Glimmer", new Parameter[] { new Switch(NO_CONTEXTS_ARG, 'C', NO_CONTEXTS_ARG, "Don't process the contexts for each tuple."), new FlaggedOption(ONTOLOGY_ARG, JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'O', ONTOLOGY_ARG), new FlaggedOption(REDUCER_COUNT_ARG, JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'r', REDUCER_COUNT_ARG), new UnflaggedOption(INPUT_ARG, JSAP.STRING_PARSER, JSAP.REQUIRED, "HDFS location for the input data."), new UnflaggedOption(OUTPUT_ARG, JSAP.STRING_PARSER, JSAP.REQUIRED, "HDFS location for the out data."), }); JSAPResult jsapResult = jsap.parse(args); if (!jsapResult.success()) { System.err.print(jsap.getUsage()); System.exit(1);/*from w w w . j a va 2s. co m*/ } Configuration config = getConf(); boolean withContexts = !jsapResult.getBoolean(NO_CONTEXTS_ARG, false); config.setBoolean(TuplesToResourcesMapper.INCLUDE_CONTEXTS_KEY, withContexts); // The ontology if any... String ontologyFilename = jsapResult.getString(ONTOLOGY_ARG); if (ontologyFilename != null) { // Load the ontology InputStream ontologyInputStream = new FileInputStream(ontologyFilename); OWLOntology ontology = OntologyLoader.load(ontologyInputStream); System.out.println( "Loaded ontology from " + ontologyFilename + " with " + ontology.getAxiomCount() + " axioms."); ArrayList<String> ontologyClasses = new ArrayList<String>(); for (OWLClass owlClass : ontology.getClassesInSignature()) { ontologyClasses.add(owlClass.getIRI().toString()); } System.out.println("Adding " + ontologyClasses.size() + " classes from ontology."); config.setStrings(TuplesToResourcesMapper.EXTRA_RESOURCES, ontologyClasses.toArray(new String[0])); } else { System.out.println("No ontology filename set in conf. 
No ontology has been loaded."); } Job job = Job.getInstance(config); job.setJarByClass(PrepTool.class); job.setJobName(PrepTool.class.getName() + "-part1-" + System.currentTimeMillis()); job.setInputFormatClass(TextInputFormat.class); job.setMapperClass(TuplesToResourcesMapper.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); int reducerCount = jsapResult.getInt(REDUCER_COUNT_ARG, DEFAULT_REDUCER_COUNT); job.setNumReduceTasks(reducerCount); if (reducerCount == 1) { // We assign 'global' ids in the reducer. For this to work, there // can be only one. But using just one reducer, we run out of local disk space during the // pre-reduce merge with big data sets like WCC. job.setReducerClass(ResourcesReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Object.class); job.setOutputFormatClass(ResourceRecordWriter.OutputFormat.class); } else { /* * TODO: Take the functionality of the reducer and move it to run on * the gateway. We then use n identity reducers, the output of which * will be read and merged as streams on the gateway. */ } FileInputFormat.setInputPaths(job, new Path(jsapResult.getString(INPUT_ARG))); Path outputDir = new Path(jsapResult.getString(OUTPUT_ARG)); FileOutputFormat.setOutputPath(job, outputDir); if (!job.waitForCompletion(true)) { System.err.println("Failed to process tuples from " + jsapResult.getString(INPUT_ARG)); return 1; } // IF THERE WAS ONLY ONE REDUCER WE NOW HAVE // One file per reducer containing lists of urls(recourses) for // subjects, predicates, objects and contexts. // One file per reducer that contains all resources. subjects + // predicates + objects + contexts. // One file per reducer that contains the subjects + all <predicate> // <object>|"Literal" <context> on that subject. // IF THERE WAS MORE THAN ONE REDUCER WE NOW HAVE N FILES THAT NEED TO BE MERGED ON THE GATEWAY. TODO. return 0; }
From source file:com.yahoo.labs.yamall.hadoop.Test.java
License:Open Source License
/** * Run the map/reduce job/*from w ww. j a va 2 s .c o m*/ */ public final int run(final String[] args) throws Exception { startLogger(Level.INFO); Configuration conf = getConf(); conf.set("yamall.vw_model", args[2]); conf.setIfUnset("yamall.bit_precision", "18"); conf.setIfUnset("yamall.parser", "vw"); // Print to screen all the options TreeMap<String, String> map = new TreeMap<String, String>(); for (Map.Entry<String, String> entry : conf) { map.put(entry.getKey(), entry.getValue()); } for (Map.Entry<String, String> entry : map.entrySet()) { System.out.printf("%s=%s\n", entry.getKey(), entry.getValue()); } Job job = Job.getInstance(conf, "Yamall Test on MapReduce"); job.setNumReduceTasks(1); job.setJarByClass(Test.class); job.setMapperClass(TestMapper.class); job.setMapOutputKeyClass(DoubleWritable.class); job.setReducerClass(TestReducer.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(CompositeDoubleTextWritable.class); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); MultipleOutputs.addNamedOutput(job, "out", TextOutputFormat.class, NullWritable.class, Text.class); return job.waitForCompletion(true) ? 0 : 1; }
From source file:com.yahoo.labs.yamall.hadoop.Train.java
License:Open Source License
/** * Run the map/reduce job//from w w w . j a v a 2 s.c o m */ public final int run(final String[] args) throws Exception { startLogger(Level.INFO); Configuration conf = getConf(); conf.set("yamall.output", args[1]); conf.setIfUnset("yamall.bit_precision", "18"); conf.setIfUnset("yamall.parser", "vw"); // Print to screen all the options TreeMap<String, String> map = new TreeMap<String, String>(); for (Map.Entry<String, String> entry : conf) { map.put(entry.getKey(), entry.getValue()); } for (Map.Entry<String, String> entry : map.entrySet()) { System.out.printf("%s=%s\n", entry.getKey(), entry.getValue()); } Job job = Job.getInstance(conf, "Yamall Train on MapReduce"); job.setNumReduceTasks(1); // important job.setJarByClass(Train.class); job.setMapperClass(TrainMapper.class); job.setMapOutputKeyClass(DoubleWritable.class); job.setMapOutputValueClass(InstanceOrHashMapWritable.class); job.setReducerClass(TrainReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); return job.waitForCompletion(true) ? 0 : 1; }
From source file:com.yahoo.semsearch.fastlinking.io.RepackWikipedia.java
License:Apache License
@SuppressWarnings("static-access") @Override//from w w w .j av a2 s . co m public int run(String[] args) throws Exception { Options options = new Options(); options.addOption( OptionBuilder.withArgName("path").hasArg().withDescription("XML dump file").create(INPUT_OPTION)); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output location") .create(OUTPUT_OPTION)); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("mapping file") .create(MAPPING_FILE_OPTION)); options.addOption(OptionBuilder.withArgName("block|record|none").hasArg() .withDescription("compression type").create(COMPRESSION_TYPE_OPTION)); options.addOption(OptionBuilder.withArgName("en|sv|de").hasArg().withDescription("two-letter language code") .create(LANGUAGE_OPTION)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION) || !cmdline.hasOption(MAPPING_FILE_OPTION) || !cmdline.hasOption(COMPRESSION_TYPE_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } String inputPath = cmdline.getOptionValue(INPUT_OPTION); String outputPath = cmdline.getOptionValue(OUTPUT_OPTION); String mappingFile = cmdline.getOptionValue(MAPPING_FILE_OPTION); String compressionType = cmdline.getOptionValue(COMPRESSION_TYPE_OPTION); if (!"block".equals(compressionType) && !"record".equals(compressionType) && !"none".equals(compressionType)) { System.err.println("Error: \"" + compressionType + "\" unknown compression type!"); return -1; } String language = null; if (cmdline.hasOption(LANGUAGE_OPTION)) { language = cmdline.getOptionValue(LANGUAGE_OPTION); if (language.length() != 2) { System.err.println("Error: 
\"" + language + "\" unknown language!"); return -1; } } // this is the default block size int blocksize = 1000000; Job job = Job.getInstance(getConf()); job.setJarByClass(RepackWikipedia.class); job.setJobName(String.format("RepackWikipedia[%s: %s, %s: %s, %s: %s, %s: %s]", INPUT_OPTION, inputPath, OUTPUT_OPTION, outputPath, COMPRESSION_TYPE_OPTION, compressionType, LANGUAGE_OPTION, language)); job.getConfiguration().set(DOCNO_MAPPING_FIELD, mappingFile); LOG.info("Tool name: " + this.getClass().getName()); LOG.info(" - XML dump file: " + inputPath); LOG.info(" - output path: " + outputPath); LOG.info(" - docno mapping data file: " + mappingFile); LOG.info(" - compression type: " + compressionType); LOG.info(" - language: " + language); if ("block".equals(compressionType)) { LOG.info(" - block size: " + blocksize); } job.setNumReduceTasks(0); FileInputFormat.addInputPath(job, new Path(inputPath)); FileOutputFormat.setOutputPath(job, new Path(outputPath)); if ("none".equals(compressionType)) { FileOutputFormat.setCompressOutput(job, false); } else { FileOutputFormat.setCompressOutput(job, true); if ("record".equals(compressionType)) { SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.RECORD); } else { SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK); job.getConfiguration().setInt("io.seqfile.compress.blocksize", blocksize); } } if (language != null) { job.getConfiguration().set("wiki.language", language); } job.setInputFormatClass(WikipediaPageInputFormat.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(WikipediaPageFactory.getWikipediaPageClass(language)); //job.setOutputValueClass(EnglishWikipediaPage.class); job.setMapperClass(MyMapper.class); // Delete the output directory if it exists already. FileSystem.get(getConf()).delete(new Path(outputPath), true); return job.waitForCompletion(true) ? 0 : -1; }
From source file:com.yahoo.semsearch.fastlinking.io.WikipediaDocnoMappingBuilder.java
License:Apache License
@SuppressWarnings("static-access") @Override//w w w . ja v a 2 s .c o m public int run(String[] args) throws Exception { Options options = new Options(); options.addOption( OptionBuilder.withArgName("path").hasArg().withDescription("XML dump file").create(INPUT_OPTION)); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output file") .create(OUTPUT_FILE_OPTION)); options.addOption(OptionBuilder.withArgName("en|sv|de|cs|es|zh|ar|tr|it").hasArg() .withDescription("two-letter language code").create(LANGUAGE_OPTION)); options.addOption(KEEP_ALL_OPTION, false, "keep all pages"); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_FILE_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } String language = null; if (cmdline.hasOption(LANGUAGE_OPTION)) { language = cmdline.getOptionValue(LANGUAGE_OPTION); if (language.length() != 2) { System.err.println("Error: \"" + language + "\" unknown language!"); return -1; } } String inputPath = cmdline.getOptionValue(INPUT_OPTION); String outputFile = cmdline.getOptionValue(OUTPUT_FILE_OPTION); boolean keepAll = cmdline.hasOption(KEEP_ALL_OPTION); String tmpPath = "tmp-" + WikipediaDocnoMappingBuilder.class.getSimpleName() + "-" + RANDOM.nextInt(10000); LOG.info("Tool name: " + this.getClass().getName()); LOG.info(" - input: " + inputPath); LOG.info(" - output file: " + outputFile); LOG.info(" - keep all pages: " + keepAll); LOG.info(" - language: " + language); Job job = Job.getInstance(getConf()); job.setJarByClass(WikipediaDocnoMappingBuilder.class); job.setJobName(String.format("BuildWikipediaDocnoMapping[%s: %s, %s: %s, %s: %s]", INPUT_OPTION, 
inputPath, OUTPUT_FILE_OPTION, outputFile, LANGUAGE_OPTION, language)); job.getConfiguration().setBoolean(KEEP_ALL_OPTION, keepAll); if (language != null) { job.getConfiguration().set("wiki.language", language); } job.setNumReduceTasks(1); FileInputFormat.setInputPaths(job, new Path(inputPath)); FileOutputFormat.setOutputPath(job, new Path(tmpPath)); FileOutputFormat.setCompressOutput(job, false); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(IntWritable.class); job.setInputFormatClass(WikipediaPageInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); job.setMapperClass(MyMapper.class); job.setReducerClass(MyReducer.class); // Delete the output directory if it exists already. FileSystem.get(getConf()).delete(new Path(tmpPath), true); if (job.waitForCompletion(true)) { // long cnt = keepAll ? job.getCounters().findCounter(PageTypes.TOTAL).getValue() : job.getCounters().findCounter(PageTypes.ARTICLE).getValue(); long cnt = job.getCounters() .findCounter("org.apache.hadoop.mapred.Task$Counter", "REDUCE_OUTPUT_RECORDS").getValue(); WikipediaDocnoMapping.writeDocnoMappingData(FileSystem.get(getConf()), tmpPath + "/part-r-00000", (int) cnt, outputFile); FileSystem.get(getConf()).delete(new Path(tmpPath), true); return 0; } else { return -1; } }