Example usage for org.apache.hadoop.mapreduce Job setOutputFormatClass

List of usage examples for org.apache.hadoop.mapreduce Job setOutputFormatClass

Introduction

This page lists example usage of org.apache.hadoop.mapreduce Job setOutputFormatClass, collected from open-source projects.

Prototype

public void setOutputFormatClass(Class<? extends OutputFormat> cls) throws IllegalStateException 

Document

Set the OutputFormat for the job.
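
Before the project listings below, here is a minimal, self-contained sketch of a job configuration that calls setOutputFormatClass. It is not taken from any of the source files on this page; the class name OutputFormatExample and the use of Hadoop's default (identity) mapper and reducer are illustrative assumptions.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class OutputFormatExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "setOutputFormatClass example");
        job.setJarByClass(OutputFormatExample.class);

        // Read plain text lines; TextInputFormat produces <byte offset, line> pairs.
        job.setInputFormatClass(TextInputFormat.class);

        // Write the job output as tab-separated text. Per the prototype above, this
        // must be called before the job is submitted or an IllegalStateException is thrown.
        job.setOutputFormatClass(TextOutputFormat.class);

        // With the default (identity) mapper and reducer the output records keep
        // the input types, so declare matching key/value classes.
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}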

Usage

From source file:com.yahoo.glimmer.indexing.preprocessor.PrepTool.java

License:Open Source License

@Override
public int run(String[] args) throws Exception {

    SimpleJSAP jsap = new SimpleJSAP(PrepTool.class.getName(), "RDF tuples pre-processor for Glimmer",
            new Parameter[] {
                    new Switch(NO_CONTEXTS_ARG, 'C', NO_CONTEXTS_ARG,
                            "Don't process the contexts for each tuple."),
                    new FlaggedOption(ONTOLOGY_ARG, JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'O',
                            ONTOLOGY_ARG),
                    new FlaggedOption(REDUCER_COUNT_ARG, JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT,
                            JSAP.NOT_REQUIRED, 'r', REDUCER_COUNT_ARG),
                    new UnflaggedOption(INPUT_ARG, JSAP.STRING_PARSER, JSAP.REQUIRED,
                            "HDFS location for the input data."),
                    new UnflaggedOption(OUTPUT_ARG, JSAP.STRING_PARSER, JSAP.REQUIRED,
                            "HDFS location for the out data."), });

    JSAPResult jsapResult = jsap.parse(args);
    if (!jsapResult.success()) {
        System.err.print(jsap.getUsage());
        System.exit(1);
    }

    Configuration config = getConf();

    boolean withContexts = !jsapResult.getBoolean(NO_CONTEXTS_ARG, false);
    config.setBoolean(TuplesToResourcesMapper.INCLUDE_CONTEXTS_KEY, withContexts);

    // The ontology if any...
    String ontologyFilename = jsapResult.getString(ONTOLOGY_ARG);
    if (ontologyFilename != null) {
        // Load the ontology
        InputStream ontologyInputStream = new FileInputStream(ontologyFilename);
        OWLOntology ontology = OntologyLoader.load(ontologyInputStream);
        System.out.println(
                "Loaded ontology from " + ontologyFilename + " with " + ontology.getAxiomCount() + " axioms.");

        ArrayList<String> ontologyClasses = new ArrayList<String>();
        for (OWLClass owlClass : ontology.getClassesInSignature()) {
            ontologyClasses.add(owlClass.getIRI().toString());
        }
        System.out.println("Adding " + ontologyClasses.size() + " classes from ontology.");
        config.setStrings(TuplesToResourcesMapper.EXTRA_RESOURCES, ontologyClasses.toArray(new String[0]));
    } else {
        System.out.println("No ontology filename set in conf.  No ontology has been loaded.");
    }

    Job job = Job.getInstance(config);
    job.setJarByClass(PrepTool.class);

    job.setJobName(PrepTool.class.getName() + "-part1-" + System.currentTimeMillis());
    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(TuplesToResourcesMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    int reducerCount = jsapResult.getInt(REDUCER_COUNT_ARG, DEFAULT_REDUCER_COUNT);
    job.setNumReduceTasks(reducerCount);
    if (reducerCount == 1) {
        // We assign 'global' ids in the reducer. For this to work, there
        // can be only one. But using just one reducer, we run out of local disk space during the
        // pre-reduce merge with big data sets like WCC.

        job.setReducerClass(ResourcesReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Object.class);
        job.setOutputFormatClass(ResourceRecordWriter.OutputFormat.class);
    } else {
        /*
         * TODO: Take the functionality of the reducer and move it to run on
         * the gateway. We then use n identity reducers, the output of which
         * will be read and merged as streams on the gateway.
         */
    }

    FileInputFormat.setInputPaths(job, new Path(jsapResult.getString(INPUT_ARG)));

    Path outputDir = new Path(jsapResult.getString(OUTPUT_ARG));
    FileOutputFormat.setOutputPath(job, outputDir);

    if (!job.waitForCompletion(true)) {
        System.err.println("Failed to process tuples from " + jsapResult.getString(INPUT_ARG));
        return 1;
    }

    // IF THERE WAS ONLY ONE REDUCER WE NOW HAVE
    // One file per reducer containing lists of URLs (resources) for
    // subjects, predicates, objects and contexts.
    // One file per reducer that contains all resources. subjects +
    // predicates + objects + contexts.
    // One file per reducer that contains the subjects + all <predicate>
    // <object>|"Literal" <context> on that subject.

    // IF THERE WAS MORE THAN ONE REDUCER WE NOW HAVE N FILES THAT NEED TO BE MERGED ON THE GATEWAY. TODO.

    return 0;
}

From source file:com.yahoo.semsearch.fastlinking.io.RepackWikipedia.java

License:Apache License

@SuppressWarnings("static-access")
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("XML dump file").create(INPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output location")
            .create(OUTPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("mapping file")
            .create(MAPPING_FILE_OPTION));
    options.addOption(OptionBuilder.withArgName("block|record|none").hasArg()
            .withDescription("compression type").create(COMPRESSION_TYPE_OPTION));
    options.addOption(OptionBuilder.withArgName("en|sv|de").hasArg().withDescription("two-letter language code")
            .create(LANGUAGE_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)
            || !cmdline.hasOption(MAPPING_FILE_OPTION) || !cmdline.hasOption(COMPRESSION_TYPE_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT_OPTION);
    String outputPath = cmdline.getOptionValue(OUTPUT_OPTION);
    String mappingFile = cmdline.getOptionValue(MAPPING_FILE_OPTION);
    String compressionType = cmdline.getOptionValue(COMPRESSION_TYPE_OPTION);

    if (!"block".equals(compressionType) && !"record".equals(compressionType)
            && !"none".equals(compressionType)) {
        System.err.println("Error: \"" + compressionType + "\" unknown compression type!");
        return -1;
    }

    String language = null;
    if (cmdline.hasOption(LANGUAGE_OPTION)) {
        language = cmdline.getOptionValue(LANGUAGE_OPTION);
        if (language.length() != 2) {
            System.err.println("Error: \"" + language + "\" unknown language!");
            return -1;
        }
    }

    // this is the default block size
    int blocksize = 1000000;

    Job job = Job.getInstance(getConf());
    job.setJarByClass(RepackWikipedia.class);
    job.setJobName(String.format("RepackWikipedia[%s: %s, %s: %s, %s: %s, %s: %s]", INPUT_OPTION, inputPath,
            OUTPUT_OPTION, outputPath, COMPRESSION_TYPE_OPTION, compressionType, LANGUAGE_OPTION, language));

    job.getConfiguration().set(DOCNO_MAPPING_FIELD, mappingFile);

    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - XML dump file: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - docno mapping data file: " + mappingFile);
    LOG.info(" - compression type: " + compressionType);
    LOG.info(" - language: " + language);

    if ("block".equals(compressionType)) {
        LOG.info(" - block size: " + blocksize);
    }

    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    if ("none".equals(compressionType)) {
        FileOutputFormat.setCompressOutput(job, false);
    } else {
        FileOutputFormat.setCompressOutput(job, true);

        if ("record".equals(compressionType)) {
            SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.RECORD);
        } else {
            SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
            job.getConfiguration().setInt("io.seqfile.compress.blocksize", blocksize);
        }
    }

    if (language != null) {
        job.getConfiguration().set("wiki.language", language);
    }

    job.setInputFormatClass(WikipediaPageInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(WikipediaPageFactory.getWikipediaPageClass(language));
    //job.setOutputValueClass(EnglishWikipediaPage.class);

    job.setMapperClass(MyMapper.class);

    // Delete the output directory if it exists already.
    FileSystem.get(getConf()).delete(new Path(outputPath), true);

    return job.waitForCompletion(true) ? 0 : -1;

}

From source file:com.yahoo.semsearch.fastlinking.io.WikipediaDocnoMappingBuilder.java

License:Apache License

@SuppressWarnings("static-access")
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("XML dump file").create(INPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output file")
            .create(OUTPUT_FILE_OPTION));
    options.addOption(OptionBuilder.withArgName("en|sv|de|cs|es|zh|ar|tr|it").hasArg()
            .withDescription("two-letter language code").create(LANGUAGE_OPTION));
    options.addOption(KEEP_ALL_OPTION, false, "keep all pages");

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_FILE_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String language = null;
    if (cmdline.hasOption(LANGUAGE_OPTION)) {
        language = cmdline.getOptionValue(LANGUAGE_OPTION);
        if (language.length() != 2) {
            System.err.println("Error: \"" + language + "\" unknown language!");
            return -1;
        }
    }

    String inputPath = cmdline.getOptionValue(INPUT_OPTION);
    String outputFile = cmdline.getOptionValue(OUTPUT_FILE_OPTION);
    boolean keepAll = cmdline.hasOption(KEEP_ALL_OPTION);

    String tmpPath = "tmp-" + WikipediaDocnoMappingBuilder.class.getSimpleName() + "-" + RANDOM.nextInt(10000);

    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output file: " + outputFile);
    LOG.info(" - keep all pages: " + keepAll);
    LOG.info(" - language: " + language);

    Job job = Job.getInstance(getConf());
    job.setJarByClass(WikipediaDocnoMappingBuilder.class);
    job.setJobName(String.format("BuildWikipediaDocnoMapping[%s: %s, %s: %s, %s: %s]", INPUT_OPTION, inputPath,
            OUTPUT_FILE_OPTION, outputFile, LANGUAGE_OPTION, language));

    job.getConfiguration().setBoolean(KEEP_ALL_OPTION, keepAll);
    if (language != null) {
        job.getConfiguration().set("wiki.language", language);
    }
    job.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(tmpPath));
    FileOutputFormat.setCompressOutput(job, false);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(IntWritable.class);
    job.setInputFormatClass(WikipediaPageInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    // Delete the output directory if it exists already.
    FileSystem.get(getConf()).delete(new Path(tmpPath), true);

    if (job.waitForCompletion(true)) {

        //         long cnt = keepAll ? job.getCounters().findCounter(PageTypes.TOTAL).getValue() : job.getCounters().findCounter(PageTypes.ARTICLE).getValue();
        long cnt = job.getCounters()
                .findCounter("org.apache.hadoop.mapred.Task$Counter", "REDUCE_OUTPUT_RECORDS").getValue();
        WikipediaDocnoMapping.writeDocnoMappingData(FileSystem.get(getConf()), tmpPath + "/part-r-00000",
                (int) cnt, outputFile);
        FileSystem.get(getConf()).delete(new Path(tmpPath), true);
        return 0;

    } else {
        return -1;
    }
}

From source file:com.yahoo.semsearch.fastlinking.utils.RunFELOntheGrid.java

License:Apache License

public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    Job job = new Job(conf);
    //Job job = Job.getInstance( conf );
    job.setJarByClass(RunFELOntheGrid.class);
    // Process custom command-line options
    Path in = new Path(args[0]);
    Path out = new Path(args[1]);
    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // Specify various job-specific parameters
    job.setJobName("Entity Linker");
    job.setNumReduceTasks(100);
    job.setJarByClass(RunFELOntheGrid.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);
    job.setMapperClass(FELMapper.class);
    job.setReducerClass(FELReducer.class);
    job.setCombinerClass(FELReducer.class);

    job.waitForCompletion(true);

    return 0;
}

From source file:com.yahoo.semsearch.fastlinking.w2v.EntityEmbeddings.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    Job job = new Job(conf);
    //Job job = Job.getInstance( conf );
    job.setJarByClass(EntityEmbeddings.class);
    // Process custom command-line options
    Path in = new Path(args[0]);
    Path out = new Path(args[1]);

    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // Specify various job-specific parameters
    job.setJobName("Entity embeddings");
    job.setNumReduceTasks(1);
    job.setJarByClass(EntityEmbeddings.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setMapperClass(EntityEMapper.class);
    job.setReducerClass(EntityEReducer.class);
    job.setCombinerClass(EntityEReducer.class);

    job.waitForCompletion(true);

    return 0;
}

From source file:com.yahoo.ycsb.bulk.hbase.BulkDataGeneratorJob.java

License:Apache License

/**
 * Parameters for bulk loader specified through the config file:
 *
 * - prefix for the row keys
 * - range start
 * - range end (inclusive)
 * - num splits (or number of partitions).
 * - user
 * - password
 * - table
 *
 * For the accepted default options
 * @see org.apache.hadoop.util.Tool#run(java.lang.String[])
 */
public int run(String[] args) throws Exception {
    Configuration conf = this.getConf();

    Util.printArgs("run", args, System.err);
    printKeyValues(conf, ARG_KEYS, System.err);

    if (args.length > 1 || (args.length == 1 && "-help".compareToIgnoreCase(args[0]) == 0)) {
        System.err.println("Usage: " + this.getClass().getName()
                + "input_path [generic options] [input_paths...] ouptut_path");
        GenericOptionsParser.printGenericCommandUsage(System.err);
        return 1;
    }

    // Time run
    long startTime = System.currentTimeMillis();
    String workdir;

    if (args.length == 1) {
        /* override workdir in the config if it is specified in the
         * command line
         */
        conf.set(ARG_KEY_OUTDIR, args[0]);
        workdir = args[0];
    }

    workdir = conf.get(ARG_KEY_OUTDIR);

    if (workdir == null) {
        System.err.println("No output directory specified");
        return 1;
    }

    /* Initialize job, check parameters and decide which mapper to use */
    Job job = new Job(conf, conf.get(ARG_KEY_JOBNAME, "YCSB KV data generator"));

    /* these settings are the same (i.e., fixed) independent of the
     * parameters */
    job.setJarByClass(this.getClass());
    // job.setInputFormatClass(TextInputFormat.class);
    job.setInputFormatClass(NLineInputFormat.class);

    /* these settings should depend on the type of output file */
    job.setOutputFormatClass(HFileOutputFormat.class);
    /* not sure the next two are needed */
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(KeyValue.class);

    this.createInputFile(job, workdir);

    HFileOutputFormat.setOutputPath(job, new Path(workdir + "/files"));

    /* depending on whether the keys need to be sorted and hashed, then
     * decide which mapper and reducer to use 
     */
    boolean hashKeys = conf.getBoolean(ARG_KEY_HASH_KEYS, false);
    boolean sortKeys = conf.getBoolean(ARG_KEY_SORTKEYS, true);

    /* get splits file name: side-effect -> this may generate a splits file  */
    String splitsfile = this.getSplitsFile(job, workdir);

    if (sortKeys && hashKeys) { /* do a full map reduce job */
        job.setMapperClass(RowGeneratorMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setPartitionerClass(RangePartitioner.class);

        if (splitsfile == null) {
            /* Auto generate the splits file either from:
             * - the input key ranges
             * - from the current table splits
             */
            throw new InvalidInputException("No splits specified");
        }

        /* Set splits file */
        RangePartitioner.setSplitFile(job, splitsfile);

        /* Add reducer (based on mapper code) */
        job.setReducerClass(RowGeneratorReduce.class);

        /* the number of reducers is dependent on the number of
         * partitions
         */
        int numReduce = conf.getInt(ARG_KEY_NUMREDUCE, 1);
        job.setNumReduceTasks(numReduce);
    } else { /* perform a map only job */
        job.setMapperClass(RowGeneratorMapOnly.class);
        /* map output key and value types are the same as
         * for the job
         */
        job.setMapOutputKeyClass(job.getOutputKeyClass());
        job.setMapOutputValueClass(job.getOutputValueClass());
        job.setNumReduceTasks(0);
    }

    job.waitForCompletion(true);

    //        JobClient.runJob(conf);
    SimpleDateFormat df = new SimpleDateFormat("yyyy.MM.dd HH:mm:ss.SSS z");
    SimpleDateFormat ddf = new SimpleDateFormat("HH:mm:ss.SSS");
    ddf.setTimeZone(TimeZone.getTimeZone("UTC"));
    long endTime = System.currentTimeMillis();
    System.out.println("Start time (ms): " + df.format(new Date(startTime)) + " -- " + startTime);
    System.out.println("End time (ms): " + df.format(new Date(endTime)) + " -- " + endTime);
    System.out
            .println("Elapsed time (ms): " + ddf.format(endTime - startTime) + " -- " + (endTime - startTime));
    return 0;
}

From source file:com.yassergonzalez.pagerank.PageRank.java

License:Apache License

private void pageRankIteration(int iter, Configuration conf, Path outputDir) throws Exception {

    // This job performs an iteration of the power iteration method to
    // compute PageRank. The map task processes each block M_{i,j}, loads
    // the corresponding stripe j of the vector v_{k-1} and produces the
    // partial result of the stripe i of the vector v_k. The reduce task
    // sums all the partial results of v_k and adds the teleportation factor
    // (the combiner only sums all the partial results). See Section 5.2
    // (and 5.2.3 in particular) of Mining of Massive Datasets
    // (http://infolab.stanford.edu/~ullman/mmds.html) for details. The
    // output is written in a "vk" subdir of the output dir, where k is the
    // iteration number. MapFileOutputFormat is used to keep an array of the
    // stripes of v.

    Job job = Job.getInstance(conf, "PageRank:Iteration");

    job.setJarByClass(PageRank.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(PageRankIterationMapper.class);
    job.setMapOutputKeyClass(ShortWritable.class);
    job.setMapOutputValueClass(FloatArrayWritable.class);
    job.setCombinerClass(PageRankIterationCombiner.class);
    job.setReducerClass(PageRankIterationReducer.class);
    job.setOutputFormatClass(MapFileOutputFormat.class);
    job.setOutputKeyClass(ShortWritable.class);
    job.setOutputValueClass(FloatArrayWritable.class);
    FileInputFormat.addInputPath(job, new Path(outputDir, "M"));
    FileOutputFormat.setOutputPath(job, new Path(outputDir, "v" + iter));

    job.waitForCompletion(true);
}
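
The comment at the top of pageRankIteration describes the math this job distributes. Purely as an illustration of that update, the sketch below runs the same power iteration, v_k = beta * M * v_{k-1} + (1 - beta) / n, on a tiny in-memory matrix; the 3-node graph and the damping factor 0.85 are assumptions for the example and do not come from the project above.

public class PowerIterationSketch {
    public static void main(String[] args) {
        // Column-stochastic transition matrix M for an assumed 3-node graph:
        // every node links to the other two, so each column sums to 1.
        double[][] M = {
                { 0.0, 0.5, 0.5 },
                { 0.5, 0.0, 0.5 },
                { 0.5, 0.5, 0.0 } };
        double beta = 0.85;                    // damping factor (assumed)
        int n = M.length;
        double[] v = new double[n];
        java.util.Arrays.fill(v, 1.0 / n);     // uniform start vector

        for (int iter = 0; iter < 20; iter++) {
            double[] next = new double[n];
            for (int i = 0; i < n; i++) {
                double sum = 0.0;
                for (int j = 0; j < n; j++) {
                    sum += M[i][j] * v[j];     // partial results, summed as in the reducer
                }
                next[i] = beta * sum + (1 - beta) / n;  // teleportation term added per entry
            }
            v = next;
        }
        System.out.println(java.util.Arrays.toString(v));
    }
}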

From source file:com.yourcompany.hadoop.mapreduce.hcatalog.HCatalogExampleDriver.java

License:Apache License

public int run(String[] args) throws Exception {
    Job job = new Job();
    parseArguements(args, job);

    job.setJarByClass(HCatalogExampleDriver.class);

    job.setInputFormatClass(HCatInputFormat.class);
    job.setOutputFormatClass(HCatOutputFormat.class);

    job.setMapperClass(HCatalogExampleMapper.class);
    job.setReducerClass(HCatalogExampleReducer.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(WritableComparable.class);
    job.setOutputValueClass(DefaultHCatRecord.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:com.zjy.mongo.util.MongoTool.java

License:Apache License

private int runMapReduceJob(final Configuration conf) throws IOException {
    final Job job = Job.getInstance(conf, getJobName());
    /**
     * Any arguments specified with -D <property>=<value> on the CLI will be
     * picked up and set here; they override any XML-level values. Note that
     * the space after -D is important: without it, the option is picked up
     * by Java itself rather than by the job configuration.
     */
    // TODO - Do we need to set job name somehow more specifically?
    // This may or may not be correct/sane
    job.setJarByClass(getClass());
    final Class<? extends Mapper> mapper = MongoConfigUtil.getMapper(conf);

    if (LOG.isDebugEnabled()) {
        LOG.debug("Mapper Class: " + mapper);
        LOG.debug("Input URI: " + conf.get(MongoConfigUtil.INPUT_URI));
    }
    job.setMapperClass(mapper);
    Class<? extends Reducer> combiner = MongoConfigUtil.getCombiner(conf);
    if (combiner != null) {
        job.setCombinerClass(combiner);
    }
    job.setReducerClass(MongoConfigUtil.getReducer(conf));

    job.setOutputFormatClass(MongoConfigUtil.getOutputFormat(conf));
    job.setOutputKeyClass(MongoConfigUtil.getOutputKey(conf));
    job.setOutputValueClass(MongoConfigUtil.getOutputValue(conf));
    job.setInputFormatClass(MongoConfigUtil.getInputFormat(conf));
    Class mapOutputKeyClass = MongoConfigUtil.getMapperOutputKey(conf);
    Class mapOutputValueClass = MongoConfigUtil.getMapperOutputValue(conf);

    if (mapOutputKeyClass != null) {
        job.setMapOutputKeyClass(mapOutputKeyClass);
    }
    if (mapOutputValueClass != null) {
        job.setMapOutputValueClass(mapOutputValueClass);
    }

    /**
     * Determines if the job will run verbosely e.g. print debug output
     * Only works with foreground jobs
     */
    final boolean verbose = MongoConfigUtil.isJobVerbose(conf);
    /**
     * Run job in foreground aka wait for completion or background?
     */
    final boolean background = MongoConfigUtil.isJobBackground(conf);
    try {
        if (background) {
            LOG.info("Setting up and running MapReduce job in background.");
            job.submit();
            return 0;
        } else {
            LOG.info("Setting up and running MapReduce job in foreground, will wait for results.  {Verbose? "
                    + verbose + "}");
            return job.waitForCompletion(true) ? 0 : 1;
        }
    } catch (final Exception e) {
        LOG.error("Exception while executing job... ", e);
        return 1;
    }
}

From source file:cosmos.mapred.MediawikiIngestJob.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (1 != args.length) {
        System.err.println("Usage: input.xml,input.xml,input.xml...");
        return 1;
    }

    String inputFiles = args[0];

    Configuration conf = getConf();
    System.out.println("path " + conf.get("fs.default.name"));
    conf.addResource(new Path("/opt/hadoop/conf/hdfs-site.xml"));
    conf.addResource(new Path("/opt/hadoop/conf/core-site.xml"));

    conf.addResource(new Path("/opt/hadoop/conf/mapred-site.xml"));

    System.out.println("path " + conf.get("fs.default.name"));
    //System.exit(1);
    Job job = new Job(conf, "Mediawiki Ingest");

    job.setJarByClass(MediawikiIngestJob.class);

    String tablename = "sortswiki";
    String zookeepers = "localhost:2181";
    String instanceName = "accumulo";
    String user = "root";
    PasswordToken passwd = new PasswordToken("secret");

    FileInputFormat.setInputPaths(job, inputFiles);

    job.setMapperClass(MediawikiMapper.class);
    job.setNumReduceTasks(0);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Mutation.class);
    job.setOutputFormatClass(AccumuloOutputFormat.class);

    BatchWriterConfig bwConfig = new BatchWriterConfig();

    job.setInputFormatClass(MediawikiInputFormat.class);
    AccumuloOutputFormat.setZooKeeperInstance(job, instanceName, zookeepers);
    AccumuloOutputFormat.setConnectorInfo(job, user, passwd);
    AccumuloOutputFormat.setBatchWriterOptions(job, bwConfig);
    AccumuloOutputFormat.setCreateTables(job, true);
    AccumuloOutputFormat.setDefaultTableName(job, tablename);

    return job.waitForCompletion(true) ? 0 : 1;
}