Example usage for org.apache.hadoop.mapreduce Job setNumReduceTasks

Introduction

In this page you can find the example usage for org.apache.hadoop.mapreduce Job setNumReduceTasks.

Prototype

public void setNumReduceTasks(int tasks) throws IllegalStateException

Source Link

Document

Set the number of reduce tasks for the job.

Usage

From source file:com.wipro.ats.bdre.dq.DQDriver.java

License:Apache License

@Override
public int run(String[] arg) throws Exception {
    String processId = arg[0];/* www .j  a va 2s .  c o  m*/
    String sPath = arg[1];
    String destDir = arg[2];

    Properties props = new GetProperties().getProperties(processId, "dq");
    LOGGER.debug("props=" + props);
    Configuration conf = getConf();

    conf.set("dq.process.id", processId);
    Job job = Job.getInstance(conf);
    job.setJobName("Data Quality " + processId);
    job.setJarByClass(DQDriver.class);
    job.setMapperClass(DQMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    //Reducer is not required
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);
    Path inputFilePath = new Path(sPath);
    FileInputFormat.addInputPath(job, inputFilePath);
    FileOutputFormat.setOutputPath(job, removeIfExistAndSetOutputPath(conf, destDir));
    MultipleOutputs.addNamedOutput(job, DQConstants.GOOD_RECORDS_FILE, TextOutputFormat.class, Text.class,
            NullWritable.class);
    MultipleOutputs.addNamedOutput(job, DQConstants.BAD_RECORDS_FILE, TextOutputFormat.class, Text.class,
            NullWritable.class);
    MultipleOutputs.addNamedOutput(job, DQConstants.FILE_REPORT_FILE, TextOutputFormat.class, Text.class,
            NullWritable.class);

    if (!job.waitForCompletion(true)) {
        return 1;
    }

    Path outputDir = new Path(destDir);
    FileSystem srcFs = outputDir.getFileSystem(getConf());
    FileSystem destFs = outputDir.getFileSystem(getConf());

    //Valid Records
    Path goodFilesSrcDir = new Path(destDir + "/" + DQConstants.INTERMEDIATE_GOOD_RECORD_OUTPUT_DIR);
    //Input and quality filtered file should have same name (but different path)
    Path goodDestFile = new Path(destDir + "/" + inputFilePath.getName());
    if (srcFs.exists(goodFilesSrcDir)) {
        FileUtil.copyMerge(srcFs, goodFilesSrcDir, destFs, goodDestFile, true, conf, "");
    }
    // Invalid Records
    Path badFilesSrcDir = new Path(destDir + "/" + DQConstants.INTERMEDIATE_BAD_RECORD_OUTPUT_DIR);
    Path badDestFile = new Path(destDir + "/" + DQConstants.BAD_RECORDS_FILE);
    if (srcFs.exists(badFilesSrcDir)) {
        FileUtil.copyMerge(srcFs, badFilesSrcDir, destFs, badDestFile, true, conf, "");
    }

    // Preparing report aggregation job
    Job fileReportAggregationJob = Job.getInstance(conf);
    fileReportAggregationJob.setJobName("File Report Computing " + processId);
    fileReportAggregationJob.setJarByClass(DQMain.class);

    fileReportAggregationJob.setMapperClass(DQFileReportMapper.class);
    fileReportAggregationJob.setMapOutputKeyClass(Text.class);
    fileReportAggregationJob.setMapOutputValueClass(IntWritable.class);

    fileReportAggregationJob.setReducerClass(DQFileReportReducer.class);
    fileReportAggregationJob.setOutputKeyClass(Text.class);
    fileReportAggregationJob.setOutputValueClass(Text.class);

    fileReportAggregationJob.setNumReduceTasks(1);

    Path fileReportDir = new Path(destDir + "/" + DQConstants.INTERMEDIATE_REPORT_OUTPUT_DIR);
    Path fileReportOutputDir = new Path(destDir + "/" + DQConstants.AGGREGATED_REPORT_PLACEHOLDER_FOLDER);

    FileInputFormat.addInputPath(fileReportAggregationJob, fileReportDir);
    FileOutputFormat.setOutputPath(fileReportAggregationJob, fileReportOutputDir);

    if (!fileReportAggregationJob.waitForCompletion(true)) {
        return 1;
    }

    // Merge Report Records MR stuffs
    Path reportsSrcDir = new Path(destDir + "/" + DQConstants.AGGREGATED_REPORT_PLACEHOLDER_FOLDER);
    Path reportsDestFile = new Path(destDir + "/" + DQConstants.FILE_REPORT_FILE);
    FileUtil.copyMerge(srcFs, reportsSrcDir, destFs, reportsDestFile, true, conf, "");

    Path reportDestFile = new Path(outputDir.toString() + "/" + DQConstants.FILE_REPORT_FILE);
    //Read the report file from HDFS and report the percentage
    DQStats dqStats = getQualityStats(getConf(), reportDestFile);
    LOGGER.info("Percentage of good records :" + dqStats.getGoodPercent());
    props = new GetProperties().getProperties(processId, "dq");
    String strThreshold = props.getProperty("min.pass.threshold.percent");
    float threshold = Float.parseFloat(strThreshold);
    dqStats.setThreshold(threshold);
    //Update the result in metadata
    logResult(dqStats, processId, 0L);
    if (dqStats.getGoodPercent() < threshold) {
        LOGGER.error("DQ check did not pass");
        throw new DQValidationException(dqStats);
    }
    LOGGER.info(dqStats);
    FileChecksum hdfsChecksum = destFs.getFileChecksum(goodDestFile);
    String fileHash = hdfsChecksum == null ? "0" : hdfsChecksum.toString();
    //Return file info oozie params
    RegisterFileInfo registerFileInfo = new RegisterFileInfo();
    registerFileInfo.setBatchId(null);
    registerFileInfo.setCreationTs(new Timestamp(new Date().getTime()));
    registerFileInfo.setFileHash(fileHash);
    registerFileInfo.setFileSize(destFs.getFileStatus(goodDestFile).getLen());
    registerFileInfo.setPath(goodDestFile.toString());
    registerFileInfo.setSubProcessId(Integer.parseInt(processId));
    OozieUtil oozieUtil = new OozieUtil();
    oozieUtil.persistBeanData(registerFileInfo, false);

    return 0;
}

From source file:com.xiaomi.linden.hadoop.indexing.job.LindenJob.java

License:Apache License

@Override
public int run(String[] strings) throws Exception {
    Configuration conf = getConf();
    String dir = conf.get(LindenJobConfig.INPUT_DIR, null);
    logger.info("input dir:" + dir);
    Path inputPath = new Path(StringUtils.unEscapeString(dir));
    Path outputPath = new Path(conf.get(LindenJobConfig.OUTPUT_DIR));
    String indexPath = conf.get(LindenJobConfig.INDEX_PATH);

    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(outputPath)) {
        fs.delete(outputPath, true);/*ww w . j  a v  a2 s.c o m*/
    }
    if (fs.exists(new Path(indexPath))) {
        fs.delete(new Path(indexPath), true);
    }

    int numShards = conf.getInt(LindenJobConfig.NUM_SHARDS, 1);
    Shard[] shards = createShards(indexPath, numShards);

    Shard.setIndexShards(conf, shards);

    //empty trash;
    (new Trash(conf)).expunge();

    Job job = Job.getInstance(conf, "linden-hadoop-indexing");
    job.setJarByClass(LindenJob.class);
    job.setMapperClass(LindenMapper.class);
    job.setCombinerClass(LindenCombiner.class);
    job.setReducerClass(LindenReducer.class);
    job.setMapOutputKeyClass(Shard.class);
    job.setMapOutputValueClass(IntermediateForm.class);
    job.setOutputKeyClass(Shard.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(IndexUpdateOutputFormat.class);
    job.setReduceSpeculativeExecution(false);
    job.setNumReduceTasks(numShards);

    String lindenSchemaFile = conf.get(LindenJobConfig.SCHEMA_FILE_URL);
    if (lindenSchemaFile == null) {
        throw new IOException("no schema file is found");
    }
    logger.info("Adding schema file: " + lindenSchemaFile);
    job.addCacheFile(new URI(lindenSchemaFile + "#lindenSchema"));
    String lindenPropertiesFile = conf.get(LindenJobConfig.LINDEN_PROPERTIES_FILE_URL);
    if (lindenPropertiesFile == null) {
        throw new IOException("no linden properties file is found");
    }
    logger.info("Adding linden properties file: " + lindenPropertiesFile);
    job.addCacheFile(new URI(lindenPropertiesFile + "#lindenProperties"));

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    Path[] inputs = FileInputFormat.getInputPaths(job);
    StringBuilder buffer = new StringBuilder(inputs[0].toString());
    for (int i = 1; i < inputs.length; i++) {
        buffer.append(",");
        buffer.append(inputs[i].toString());
    }
    logger.info("mapreduce.input.dir = " + buffer.toString());
    logger.info("mapreduce.output.dir = " + FileOutputFormat.getOutputPath(job).toString());
    logger.info("mapreduce.job.num.reduce.tasks = " + job.getNumReduceTasks());
    logger.info(shards.length + " shards = " + conf.get(LindenJobConfig.INDEX_SHARDS));
    logger.info("mapreduce.input.format.class = " + job.getInputFormatClass());
    logger.info("mapreduce.output.format.class = " + job.getOutputFormatClass());
    logger.info("mapreduce.cluster.temp.dir = " + conf.get(MRJobConfig.TEMP_DIR));

    job.waitForCompletion(true);
    if (!job.isSuccessful()) {
        throw new RuntimeException("Job failed");
    }
    return 0;
}

From source file:com.xiaoxiaomo.mr.utils.kafka.HadoopJob.java

License:Apache License

public int run(String[] args) throws Exception {
    CommandLineParser parser = new PosixParser();
    Options options = buildOptions();//from w  ww  .  j a va2s. com
    CommandLine cmd = parser.parse(options, args);

    if (cmd.hasOption("h") || cmd.getArgs().length == 0) {
        printHelpAndExit(options);
    }

    String hdfsPath = cmd.getArgs()[0];
    Configuration conf = getConf();
    conf.setBoolean("mapred.map.tasks.speculative.execution", false);

    if (cmd.hasOption("topics")) {
        LOG.info("Using topics: " + cmd.getOptionValue("topics"));
        KafkaInputFormat.configureKafkaTopics(conf, cmd.getOptionValue("topics"));
    } else {
        printHelpAndExit(options);
    }

    KafkaInputFormat.configureZkConnection(conf, cmd.getOptionValue("zk-connect", "localhost:2181"));
    if (cmd.hasOption("consumer-group")) {
        CheckpointManager.configureUseZooKeeper(conf,
                cmd.getOptionValue("consumer-group", "dev-hadoop-loader"));
    }

    if (cmd.getOptionValue("autooffset-reset") != null) {
        KafkaInputFormat.configureAutoOffsetReset(conf, cmd.getOptionValue("autooffset-reset"));
    }

    JobConf jobConf = new JobConf(conf);
    if (cmd.hasOption("remote")) {
        String ip = cmd.getOptionValue("remote");
        LOG.info("Default file system: hdfs://" + ip + ":8020/");
        jobConf.set("fs.defaultFS", "hdfs://" + ip + ":8020/");
        LOG.info("Remote jobtracker: " + ip + ":8021");
        jobConf.set("mapred.job.tracker", ip + ":8021");
    }

    Path jarTarget = new Path(
            getClass().getProtectionDomain().getCodeSource().getLocation() + "../kafka-hadoop-loader.jar");

    if (new File(jarTarget.toUri()).exists()) {
        // running from IDE/ as maven
        jobConf.setJar(jarTarget.toUri().getPath());
        LOG.info("Using target jar: " + jarTarget.toString());
    } else {
        // running from jar remotely or locally
        jobConf.setJarByClass(getClass());
        LOG.info("Using parent jar: " + jobConf.getJar());
    }

    Job job = Job.getInstance(jobConf, "kafka.hadoop.loader");

    job.setInputFormatClass(KafkaInputFormat.class);
    job.setMapperClass(HadoopJobMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(MultiOutputFormat.class);
    job.setNumReduceTasks(0);

    MultiOutputFormat.setOutputPath(job, new Path(hdfsPath));
    MultiOutputFormat.setCompressOutput(job, cmd.getOptionValue("compress-output", "on").equals("on"));

    LOG.info("Output hdfs location: {}", hdfsPath);
    LOG.info("Output hdfs compression: {}", MultiOutputFormat.getCompressOutput(job));

    return job.waitForCompletion(true) ? 0 : -1;
}

From source file:com.yahoo.druid.hadoop.DruidInputFormatTest.java

License:Apache License

@Test
public void testSampleMRJob() throws Exception {
    Job job = Job.getInstance(new Configuration(), "Druid-Loader-Sample-Test-Job");

    job.getConfiguration().set("mapreduce.job.acl-view-job", "*");
    job.getConfiguration().set("mapreduce.map.java.opts", "-Duser.timezone=UTC");
    job.getConfiguration().set(DruidInputFormat.CONF_DRUID_OVERLORD_HOSTPORT, "localhost:" + overlordTestPort);
    job.getConfiguration().set(DruidInputFormat.CONF_DRUID_SCHEMA,
            "{" + "\"dataSource\":\"testDataSource\","
                    + "\"interval\":\"1970-01-01T00:00:00.000Z/3000-01-01T00:00:00.000Z\","
                    + "\"granularity\":\"NONE\"," + "\"dimensions\":[\"host\"],"
                    + "\"metrics\":[\"visited_sum\",\"unique_hosts\"]" + "}");

    job.setMapperClass(SampleMapper.class);
    job.setNumReduceTasks(0);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(NullWritable.class);

    job.setInputFormatClass(DruidInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    String outputPath = tempFolder.newFolder() + "/out";
    TextOutputFormat.setOutputPath(job, new Path(outputPath));

    Assert.assertTrue(job.waitForCompletion(true));

    //verify that the SampleMapper actually ran and verified the data
    Assert.assertTrue(FileUtils.readFileToString(new File(outputPath + "/part-m-00000")).startsWith("SUCCESS"));
}

From source file:com.yahoo.druid.hadoop.example.SamplePrintMRJob.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    // When implementing tool
    Configuration conf = this.getConf();

    // Create job
    Job job = new Job(conf, "Druid-Loader-Sample-Job");
    job.setJarByClass(SamplePrintMRJob.class);
    //    job.setJobName("Druid-Loader-Sample-Job");

    job.getConfiguration().set("mapreduce.job.acl-view-job", "*");
    job.getConfiguration().set("mapreduce.job.queuename", "default");
    job.getConfiguration().set("mapreduce.map.java.opts", "-Duser.timezone=UTC");
    //job.getConfiguration().set("mapreduce.map.memory.mb", "1024");

    job.getConfiguration().set(DruidInputFormat.CONF_DRUID_STORAGE_STORAGE_DIR, "/tmp/druid/storage");
    job.getConfiguration().set(DruidInputFormat.CONF_DRUID_OVERLORD_HOSTPORT, "localhost:8080");
    job.getConfiguration().set(DruidInputFormat.CONF_DRUID_DATASOURCE, "wikipedia");
    job.getConfiguration().set(DruidInputFormat.CONF_DRUID_INTERVAL,
            "2009-01-01T00:00:00.000/2050-01-01T00:00:00.000");
    job.getConfiguration().set(DruidInputFormat.CONF_DRUID_SCHEMA_FILE, "/tmp/druid/schema/druid_fun_mr.json");

    job.setMapperClass(DruidPrintMapper.class);
    job.setNumReduceTasks(0);

    job.setOutputKeyClass(DateTime.class);
    job.setOutputValueClass(Map.class);

    job.setInputFormatClass(DruidInputFormat.class);
    job.setOutputFormatClass(NullOutputFormat.class);

    System.out.println("Starting Druid Loader Sample Job.....");
    return job.waitForCompletion(true) ? 0 : 1;
    //System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file:com.yahoo.glimmer.indexing.preprocessor.PrepTool.java

License:Open Source License

@Override
public int run(String[] args) throws Exception {

    SimpleJSAP jsap = new SimpleJSAP(PrepTool.class.getName(), "RDF tuples pre-processor for Glimmer",
            new Parameter[] {
                    new Switch(NO_CONTEXTS_ARG, 'C', NO_CONTEXTS_ARG,
                            "Don't process the contexts for each tuple."),
                    new FlaggedOption(ONTOLOGY_ARG, JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'O',
                            ONTOLOGY_ARG),
                    new FlaggedOption(REDUCER_COUNT_ARG, JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT,
                            JSAP.NOT_REQUIRED, 'r', REDUCER_COUNT_ARG),
                    new UnflaggedOption(INPUT_ARG, JSAP.STRING_PARSER, JSAP.REQUIRED,
                            "HDFS location for the input data."),
                    new UnflaggedOption(OUTPUT_ARG, JSAP.STRING_PARSER, JSAP.REQUIRED,
                            "HDFS location for the out data."), });

    JSAPResult jsapResult = jsap.parse(args);
    if (!jsapResult.success()) {
        System.err.print(jsap.getUsage());
        System.exit(1);/*from   w w w  .  j  a va  2s.  co m*/
    }

    Configuration config = getConf();

    boolean withContexts = !jsapResult.getBoolean(NO_CONTEXTS_ARG, false);
    config.setBoolean(TuplesToResourcesMapper.INCLUDE_CONTEXTS_KEY, withContexts);

    // The ontology if any...
    String ontologyFilename = jsapResult.getString(ONTOLOGY_ARG);
    if (ontologyFilename != null) {
        // Load the ontology
        InputStream ontologyInputStream = new FileInputStream(ontologyFilename);
        OWLOntology ontology = OntologyLoader.load(ontologyInputStream);
        System.out.println(
                "Loaded ontology from " + ontologyFilename + " with " + ontology.getAxiomCount() + " axioms.");

        ArrayList<String> ontologyClasses = new ArrayList<String>();
        for (OWLClass owlClass : ontology.getClassesInSignature()) {
            ontologyClasses.add(owlClass.getIRI().toString());
        }
        System.out.println("Adding " + ontologyClasses.size() + " classes from ontology.");
        config.setStrings(TuplesToResourcesMapper.EXTRA_RESOURCES, ontologyClasses.toArray(new String[0]));
    } else {
        System.out.println("No ontology filename set in conf.  No ontology has been loaded.");
    }

    Job job = Job.getInstance(config);
    job.setJarByClass(PrepTool.class);

    job.setJobName(PrepTool.class.getName() + "-part1-" + System.currentTimeMillis());
    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(TuplesToResourcesMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    int reducerCount = jsapResult.getInt(REDUCER_COUNT_ARG, DEFAULT_REDUCER_COUNT);
    job.setNumReduceTasks(reducerCount);
    if (reducerCount == 1) {
        // We assign 'global' ids in the reducer. For this to work, there
        // can be only one. But using just one reducer, we run out of local disk space during the
        // pre-reduce merge with big data sets like WCC.

        job.setReducerClass(ResourcesReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Object.class);
        job.setOutputFormatClass(ResourceRecordWriter.OutputFormat.class);
    } else {
        /*
         * TODO: Take the functionality of the reducer and move it to run on
         * the gateway. We then use n identity reducers, the output of which
         * will be read and merged as streams on the gateway.
         */
    }

    FileInputFormat.setInputPaths(job, new Path(jsapResult.getString(INPUT_ARG)));

    Path outputDir = new Path(jsapResult.getString(OUTPUT_ARG));
    FileOutputFormat.setOutputPath(job, outputDir);

    if (!job.waitForCompletion(true)) {
        System.err.println("Failed to process tuples from " + jsapResult.getString(INPUT_ARG));
        return 1;
    }

    // IF THERE WAS ONLY ONE REDUCER WE NOW HAVE
    // One file per reducer containing lists of urls(recourses) for
    // subjects, predicates, objects and contexts.
    // One file per reducer that contains all resources. subjects +
    // predicates + objects + contexts.
    // One file per reducer that contains the subjects + all <predicate>
    // <object>|"Literal" <context> on that subject.

    // IF THERE WAS MORE THAN ONE REDUCER WE NOW HAVE N FILES THAT NEED TO BE MERGED ON THE GATEWAY. TODO.

    return 0;
}

From source file:com.yahoo.labs.yamall.hadoop.Test.java

License:Open Source License

/**
 * Run the map/reduce job/*from w  ww. j a  va 2 s .c  o m*/
 */
public final int run(final String[] args) throws Exception {

    startLogger(Level.INFO);

    Configuration conf = getConf();
    conf.set("yamall.vw_model", args[2]);
    conf.setIfUnset("yamall.bit_precision", "18");
    conf.setIfUnset("yamall.parser", "vw");

    // Print to screen all the options
    TreeMap<String, String> map = new TreeMap<String, String>();
    for (Map.Entry<String, String> entry : conf) {
        map.put(entry.getKey(), entry.getValue());
    }
    for (Map.Entry<String, String> entry : map.entrySet()) {
        System.out.printf("%s=%s\n", entry.getKey(), entry.getValue());
    }

    Job job = Job.getInstance(conf, "Yamall Test on MapReduce");
    job.setNumReduceTasks(1);
    job.setJarByClass(Test.class);
    job.setMapperClass(TestMapper.class);
    job.setMapOutputKeyClass(DoubleWritable.class);
    job.setReducerClass(TestReducer.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(CompositeDoubleTextWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    MultipleOutputs.addNamedOutput(job, "out", TextOutputFormat.class, NullWritable.class, Text.class);

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:com.yahoo.labs.yamall.hadoop.Train.java

License:Open Source License

/**
 * Run the map/reduce job//from  w w w .  j a  v  a  2  s.c o  m
 */
public final int run(final String[] args) throws Exception {

    startLogger(Level.INFO);

    Configuration conf = getConf();
    conf.set("yamall.output", args[1]);
    conf.setIfUnset("yamall.bit_precision", "18");
    conf.setIfUnset("yamall.parser", "vw");

    // Print to screen all the options
    TreeMap<String, String> map = new TreeMap<String, String>();
    for (Map.Entry<String, String> entry : conf) {
        map.put(entry.getKey(), entry.getValue());
    }
    for (Map.Entry<String, String> entry : map.entrySet()) {
        System.out.printf("%s=%s\n", entry.getKey(), entry.getValue());
    }

    Job job = Job.getInstance(conf, "Yamall Train on MapReduce");
    job.setNumReduceTasks(1); // important
    job.setJarByClass(Train.class);
    job.setMapperClass(TrainMapper.class);
    job.setMapOutputKeyClass(DoubleWritable.class);
    job.setMapOutputValueClass(InstanceOrHashMapWritable.class);
    job.setReducerClass(TrainReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:com.yahoo.semsearch.fastlinking.io.RepackWikipedia.java

License:Apache License

@SuppressWarnings("static-access")
@Override//from w w w .j  av  a2 s  . co  m
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("XML dump file").create(INPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output location")
            .create(OUTPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("mapping file")
            .create(MAPPING_FILE_OPTION));
    options.addOption(OptionBuilder.withArgName("block|record|none").hasArg()
            .withDescription("compression type").create(COMPRESSION_TYPE_OPTION));
    options.addOption(OptionBuilder.withArgName("en|sv|de").hasArg().withDescription("two-letter language code")
            .create(LANGUAGE_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)
            || !cmdline.hasOption(MAPPING_FILE_OPTION) || !cmdline.hasOption(COMPRESSION_TYPE_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT_OPTION);
    String outputPath = cmdline.getOptionValue(OUTPUT_OPTION);
    String mappingFile = cmdline.getOptionValue(MAPPING_FILE_OPTION);
    String compressionType = cmdline.getOptionValue(COMPRESSION_TYPE_OPTION);

    if (!"block".equals(compressionType) && !"record".equals(compressionType)
            && !"none".equals(compressionType)) {
        System.err.println("Error: \"" + compressionType + "\" unknown compression type!");
        return -1;
    }

    String language = null;
    if (cmdline.hasOption(LANGUAGE_OPTION)) {
        language = cmdline.getOptionValue(LANGUAGE_OPTION);
        if (language.length() != 2) {
            System.err.println("Error: \"" + language + "\" unknown language!");
            return -1;
        }
    }

    // this is the default block size
    int blocksize = 1000000;

    Job job = Job.getInstance(getConf());
    job.setJarByClass(RepackWikipedia.class);
    job.setJobName(String.format("RepackWikipedia[%s: %s, %s: %s, %s: %s, %s: %s]", INPUT_OPTION, inputPath,
            OUTPUT_OPTION, outputPath, COMPRESSION_TYPE_OPTION, compressionType, LANGUAGE_OPTION, language));

    job.getConfiguration().set(DOCNO_MAPPING_FIELD, mappingFile);

    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - XML dump file: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - docno mapping data file: " + mappingFile);
    LOG.info(" - compression type: " + compressionType);
    LOG.info(" - language: " + language);

    if ("block".equals(compressionType)) {
        LOG.info(" - block size: " + blocksize);
    }

    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    if ("none".equals(compressionType)) {
        FileOutputFormat.setCompressOutput(job, false);
    } else {
        FileOutputFormat.setCompressOutput(job, true);

        if ("record".equals(compressionType)) {
            SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.RECORD);
        } else {
            SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
            job.getConfiguration().setInt("io.seqfile.compress.blocksize", blocksize);
        }
    }

    if (language != null) {
        job.getConfiguration().set("wiki.language", language);
    }

    job.setInputFormatClass(WikipediaPageInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(WikipediaPageFactory.getWikipediaPageClass(language));
    //job.setOutputValueClass(EnglishWikipediaPage.class);

    job.setMapperClass(MyMapper.class);

    // Delete the output directory if it exists already.
    FileSystem.get(getConf()).delete(new Path(outputPath), true);

    return job.waitForCompletion(true) ? 0 : -1;

}

From source file:com.yahoo.semsearch.fastlinking.io.WikipediaDocnoMappingBuilder.java

License:Apache License

@SuppressWarnings("static-access")
@Override//w w  w  . ja  v  a  2 s .c o  m
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("XML dump file").create(INPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output file")
            .create(OUTPUT_FILE_OPTION));
    options.addOption(OptionBuilder.withArgName("en|sv|de|cs|es|zh|ar|tr|it").hasArg()
            .withDescription("two-letter language code").create(LANGUAGE_OPTION));
    options.addOption(KEEP_ALL_OPTION, false, "keep all pages");

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_FILE_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String language = null;
    if (cmdline.hasOption(LANGUAGE_OPTION)) {
        language = cmdline.getOptionValue(LANGUAGE_OPTION);
        if (language.length() != 2) {
            System.err.println("Error: \"" + language + "\" unknown language!");
            return -1;
        }
    }

    String inputPath = cmdline.getOptionValue(INPUT_OPTION);
    String outputFile = cmdline.getOptionValue(OUTPUT_FILE_OPTION);
    boolean keepAll = cmdline.hasOption(KEEP_ALL_OPTION);

    String tmpPath = "tmp-" + WikipediaDocnoMappingBuilder.class.getSimpleName() + "-" + RANDOM.nextInt(10000);

    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output file: " + outputFile);
    LOG.info(" - keep all pages: " + keepAll);
    LOG.info(" - language: " + language);

    Job job = Job.getInstance(getConf());
    job.setJarByClass(WikipediaDocnoMappingBuilder.class);
    job.setJobName(String.format("BuildWikipediaDocnoMapping[%s: %s, %s: %s, %s: %s]", INPUT_OPTION, inputPath,
            OUTPUT_FILE_OPTION, outputFile, LANGUAGE_OPTION, language));

    job.getConfiguration().setBoolean(KEEP_ALL_OPTION, keepAll);
    if (language != null) {
        job.getConfiguration().set("wiki.language", language);
    }
    job.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(tmpPath));
    FileOutputFormat.setCompressOutput(job, false);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(IntWritable.class);
    job.setInputFormatClass(WikipediaPageInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    // Delete the output directory if it exists already.
    FileSystem.get(getConf()).delete(new Path(tmpPath), true);

    if (job.waitForCompletion(true)) {

        //         long cnt = keepAll ? job.getCounters().findCounter(PageTypes.TOTAL).getValue() : job.getCounters().findCounter(PageTypes.ARTICLE).getValue();
        long cnt = job.getCounters()
                .findCounter("org.apache.hadoop.mapred.Task$Counter", "REDUCE_OUTPUT_RECORDS").getValue();
        WikipediaDocnoMapping.writeDocnoMappingData(FileSystem.get(getConf()), tmpPath + "/part-r-00000",
                (int) cnt, outputFile);
        FileSystem.get(getConf()).delete(new Path(tmpPath), true);
        return 0;

    } else {
        return -1;
    }
}