Example usage for org.apache.hadoop.mapreduce Job setOutputFormatClass

List of usage examples for org.apache.hadoop.mapreduce Job setOutputFormatClass

Introduction

This page lists example usage of org.apache.hadoop.mapreduce Job setOutputFormatClass, collected from open-source projects.

Prototype

public void setOutputFormatClass(Class<? extends OutputFormat> cls) throws IllegalStateException 

Document

Set the OutputFormat for the job.
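
Before the project listings below, here is a minimal, self-contained sketch of a job configuration that calls setOutputFormatClass. It is not taken from any of the source files on this page; the class name OutputFormatExample and the use of Hadoop's default (identity) mapper and reducer are illustrative assumptions.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class OutputFormatExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "setOutputFormatClass example");
        job.setJarByClass(OutputFormatExample.class);

        // Read plain text lines; TextInputFormat produces <byte offset, line> pairs.
        job.setInputFormatClass(TextInputFormat.class);

        // Write the job output as tab-separated text. Per the prototype above, this
        // must be called before the job is submitted or an IllegalStateException is thrown.
        job.setOutputFormatClass(TextOutputFormat.class);

        // With the default (identity) mapper and reducer the output records keep
        // the input types, so declare matching key/value classes.
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}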

Usage

From source file:com.yahoo.glimmer.indexing.preprocessor.PrepTool.java

License:Open Source License

@Override
public int run(String[] args) throws Exception {

    SimpleJSAP jsap = new SimpleJSAP(PrepTool.class.getName(), "RDF tuples pre-processor for Glimmer",
            new Parameter[] {
                    new Switch(NO_CONTEXTS_ARG, 'C', NO_CONTEXTS_ARG,
                            "Don't process the contexts for each tuple."),
                    new FlaggedOption(ONTOLOGY_ARG, JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'O',
                            ONTOLOGY_ARG),
                    new FlaggedOption(REDUCER_COUNT_ARG, JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT,
                            JSAP.NOT_REQUIRED, 'r', REDUCER_COUNT_ARG),
                    new UnflaggedOption(INPUT_ARG, JSAP.STRING_PARSER, JSAP.REQUIRED,
                            "HDFS location for the input data."),
                    new UnflaggedOption(OUTPUT_ARG, JSAP.STRING_PARSER, JSAP.REQUIRED,
                            "HDFS location for the out data."), });

    JSAPResult jsapResult = jsap.parse(args);
    if (!jsapResult.success()) {
        System.err.print(jsap.getUsage());
        System.exit(1);
    }

    Configuration config = getConf();

    boolean withContexts = !jsapResult.getBoolean(NO_CONTEXTS_ARG, false);
    config.setBoolean(TuplesToResourcesMapper.INCLUDE_CONTEXTS_KEY, withContexts);

    // The ontology if any...
    String ontologyFilename = jsapResult.getString(ONTOLOGY_ARG);
    if (ontologyFilename != null) {
        // Load the ontology
        InputStream ontologyInputStream = new FileInputStream(ontologyFilename);
        OWLOntology ontology = OntologyLoader.load(ontologyInputStream);
        System.out.println(
                "Loaded ontology from " + ontologyFilename + " with " + ontology.getAxiomCount() + " axioms.");

        ArrayList<String> ontologyClasses = new ArrayList<String>();
        for (OWLClass owlClass : ontology.getClassesInSignature()) {
            ontologyClasses.add(owlClass.getIRI().toString());
        }
        System.out.println("Adding " + ontologyClasses.size() + " classes from ontology.");
        config.setStrings(TuplesToResourcesMapper.EXTRA_RESOURCES, ontologyClasses.toArray(new String[0]));
    } else {
        System.out.println("No ontology filename set in conf.  No ontology has been loaded.");
    }

    Job job = Job.getInstance(config);
    job.setJarByClass(PrepTool.class);

    job.setJobName(PrepTool.class.getName() + "-part1-" + System.currentTimeMillis());
    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(TuplesToResourcesMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    int reducerCount = jsapResult.getInt(REDUCER_COUNT_ARG, DEFAULT_REDUCER_COUNT);
    job.setNumReduceTasks(reducerCount);
    if (reducerCount == 1) {
        // We assign 'global' ids in the reducer. For this to work, there
        // can be only one. But using just one reducer, we run out of local disk space during the
        // pre-reduce merge with big data sets like WCC.

        job.setReducerClass(ResourcesReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Object.class);
        job.setOutputFormatClass(ResourceRecordWriter.OutputFormat.class);
    } else {
        /*
         * TODO: Take the functionality of the reducer and move it to run on
         * the gateway. We then use n identity reducers, the output of which
         * will be read and merged as streams on the gateway.
         */
    }

    FileInputFormat.setInputPaths(job, new Path(jsapResult.getString(INPUT_ARG)));

    Path outputDir = new Path(jsapResult.getString(OUTPUT_ARG));
    FileOutputFormat.setOutputPath(job, outputDir);

    if (!job.waitForCompletion(true)) {
        System.err.println("Failed to process tuples from " + jsapResult.getString(INPUT_ARG));
        return 1;
    }

    // IF THERE WAS ONLY ONE REDUCER WE NOW HAVE
    // One file per reducer containing lists of URLs (resources) for
    // subjects, predicates, objects and contexts.
    // One file per reducer that contains all resources. subjects +
    // predicates + objects + contexts.
    // One file per reducer that contains the subjects + all <predicate>
    // <object>|"Literal" <context> on that subject.

    // IF THERE WAS MORE THAN ONE REDUCER WE NOW HAVE N FILES THAT NEED TO BE MERGED ON THE GATEWAY. TODO.

    return 0;
}

From source file:com.yahoo.semsearch.fastlinking.io.RepackWikipedia.java

License:Apache License

@SuppressWarnings("static-access")
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("XML dump file").create(INPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output location")
            .create(OUTPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("mapping file")
            .create(MAPPING_FILE_OPTION));
    options.addOption(OptionBuilder.withArgName("block|record|none").hasArg()
            .withDescription("compression type").create(COMPRESSION_TYPE_OPTION));
    options.addOption(OptionBuilder.withArgName("en|sv|de").hasArg().withDescription("two-letter language code")
            .create(LANGUAGE_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)
            || !cmdline.hasOption(MAPPING_FILE_OPTION) || !cmdline.hasOption(COMPRESSION_TYPE_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT_OPTION);
    String outputPath = cmdline.getOptionValue(OUTPUT_OPTION);
    String mappingFile = cmdline.getOptionValue(MAPPING_FILE_OPTION);
    String compressionType = cmdline.getOptionValue(COMPRESSION_TYPE_OPTION);

    if (!"block".equals(compressionType) && !"record".equals(compressionType)
            && !"none".equals(compressionType)) {
        System.err.println("Error: \"" + compressionType + "\" unknown compression type!");
        return -1;
    }

    String language = null;
    if (cmdline.hasOption(LANGUAGE_OPTION)) {
        language = cmdline.getOptionValue(LANGUAGE_OPTION);
        if (language.length() != 2) {
            System.err.println("Error: \"" + language + "\" unknown language!");
            return -1;
        }
    }

    // this is the default block size
    int blocksize = 1000000;

    Job job = Job.getInstance(getConf());
    job.setJarByClass(RepackWikipedia.class);
    job.setJobName(String.format("RepackWikipedia[%s: %s, %s: %s, %s: %s, %s: %s]", INPUT_OPTION, inputPath,
            OUTPUT_OPTION, outputPath, COMPRESSION_TYPE_OPTION, compressionType, LANGUAGE_OPTION, language));

    job.getConfiguration().set(DOCNO_MAPPING_FIELD, mappingFile);

    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - XML dump file: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - docno mapping data file: " + mappingFile);
    LOG.info(" - compression type: " + compressionType);
    LOG.info(" - language: " + language);

    if ("block".equals(compressionType)) {
        LOG.info(" - block size: " + blocksize);
    }

    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    if ("none".equals(compressionType)) {
        FileOutputFormat.setCompressOutput(job, false);
    } else {
        FileOutputFormat.setCompressOutput(job, true);

        if ("record".equals(compressionType)) {
            SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.RECORD);
        } else {
            SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
            job.getConfiguration().setInt("io.seqfile.compress.blocksize", blocksize);
        }
    }

    if (language != null) {
        job.getConfiguration().set("wiki.language", language);
    }

    job.setInputFormatClass(WikipediaPageInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(WikipediaPageFactory.getWikipediaPageClass(language));
    //job.setOutputValueClass(EnglishWikipediaPage.class);

    job.setMapperClass(MyMapper.class);

    // Delete the output directory if it exists already.
    FileSystem.get(getConf()).delete(new Path(outputPath), true);

    return job.waitForCompletion(true) ? 0 : -1;

}

From source file:com.yahoo.semsearch.fastlinking.io.WikipediaDocnoMappingBuilder.java

License:Apache License

@SuppressWarnings("static-access")
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("XML dump file").create(INPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output file")
            .create(OUTPUT_FILE_OPTION));
    options.addOption(OptionBuilder.withArgName("en|sv|de|cs|es|zh|ar|tr|it").hasArg()
            .withDescription("two-letter language code").create(LANGUAGE_OPTION));
    options.addOption(KEEP_ALL_OPTION, false, "keep all pages");

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_FILE_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String language = null;
    if (cmdline.hasOption(LANGUAGE_OPTION)) {
        language = cmdline.getOptionValue(LANGUAGE_OPTION);
        if (language.length() != 2) {
            System.err.println("Error: \"" + language + "\" unknown language!");
            return -1;
        }
    }

    String inputPath = cmdline.getOptionValue(INPUT_OPTION);
    String outputFile = cmdline.getOptionValue(OUTPUT_FILE_OPTION);
    boolean keepAll = cmdline.hasOption(KEEP_ALL_OPTION);

    String tmpPath = "tmp-" + WikipediaDocnoMappingBuilder.class.getSimpleName() + "-" + RANDOM.nextInt(10000);

    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output file: " + outputFile);
    LOG.info(" - keep all pages: " + keepAll);
    LOG.info(" - language: " + language);

    Job job = Job.getInstance(getConf());
    job.setJarByClass(WikipediaDocnoMappingBuilder.class);
    job.setJobName(String.format("BuildWikipediaDocnoMapping[%s: %s, %s: %s, %s: %s]", INPUT_OPTION, inputPath,
            OUTPUT_FILE_OPTION, outputFile, LANGUAGE_OPTION, language));

    job.getConfiguration().setBoolean(KEEP_ALL_OPTION, keepAll);
    if (language != null) {
        job.getConfiguration().set("wiki.language", language);
    }
    job.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(tmpPath));
    FileOutputFormat.setCompressOutput(job, false);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(IntWritable.class);
    job.setInputFormatClass(WikipediaPageInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    // Delete the output directory if it exists already.
    FileSystem.get(getConf()).delete(new Path(tmpPath), true);

    if (job.waitForCompletion(true)) {

        //         long cnt = keepAll ? job.getCounters().findCounter(PageTypes.TOTAL).getValue() : job.getCounters().findCounter(PageTypes.ARTICLE).getValue();
        long cnt = job.getCounters()
                .findCounter("org.apache.hadoop.mapred.Task$Counter", "REDUCE_OUTPUT_RECORDS").getValue();
        WikipediaDocnoMapping.writeDocnoMappingData(FileSystem.get(getConf()), tmpPath + "/part-r-00000",
                (int) cnt, outputFile);
        FileSystem.get(getConf()).delete(new Path(tmpPath), true);
        return 0;

    } else {
        return -1;
    }
}

From source file:com.yahoo.semsearch.fastlinking.utils.RunFELOntheGrid.java

License:Apache License

public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    Job job = new Job(conf);
    //Job job = Job.getInstance( conf );
    job.setJarByClass(RunFELOntheGrid.class);
    // Process custom command-line options
    Path in = new Path(args[0]);
    Path out = new Path(args[1]);
    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // Specify various job-specific parameters
    job.setJobName("Entity Linker");
    job.setNumReduceTasks(100);
    job.setJarByClass(RunFELOntheGrid.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);
    job.setMapperClass(FELMapper.class);
    job.setReducerClass(FELReducer.class);
    job.setCombinerClass(FELReducer.class);

    job.waitForCompletion(true);

    return 0;
}

From source file:com.yahoo.semsearch.fastlinking.w2v.EntityEmbeddings.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    Job job = new Job(conf);
    //Job job = Job.getInstance( conf );
    job.setJarByClass(EntityEmbeddings.class);
    // Process custom command-line options
    Path in = new Path(args[0]);
    Path out = new Path(args[1]);

    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // Specify various job-specific parameters
    job.setJobName("Entity embeddings");
    job.setNumReduceTasks(1);
    job.setJarByClass(EntityEmbeddings.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setMapperClass(EntityEMapper.class);
    job.setReducerClass(EntityEReducer.class);
    job.setCombinerClass(EntityEReducer.class);

    job.waitForCompletion(true);

    return 0;
}

From source file:com.yahoo.ycsb.bulk.hbase.BulkDataGeneratorJob.java

License:Apache License

/**
 * Parameters for bulk loader specified through the config file:
 *
 * - prefix for the row keys
 * - range start
 * - range end (inclusive)
 * - num splits (or number of partitions).
 * - user
 * - password
 * - table
 *
 * For the accepted default options
 * @see org.apache.hadoop.util.Tool#run(java.lang.String[])
 */
public int run(String[] args) throws Exception {
    Configuration conf = this.getConf();

    Util.printArgs("run", args, System.err);
    printKeyValues(conf, ARG_KEYS, System.err);

    if (args.length > 1 || (args.length == 1 && "-help".compareToIgnoreCase(args[0]) == 0)) {
        System.err.println("Usage: " + this.getClass().getName()
                + "input_path [generic options] [input_paths...] ouptut_path");
        GenericOptionsParser.printGenericCommandUsage(System.err);
        return 1;
    }

    // Time run
    long startTime = System.currentTimeMillis();
    String workdir;

    if (args.length == 1) {
        /* override workdir in the config if it is specified in the
         * command line
         */
        conf.set(ARG_KEY_OUTDIR, args[0]);
        workdir = args[0];
    }

    workdir = conf.get(ARG_KEY_OUTDIR);

    if (workdir == null) {
        System.err.println("No output directory specified");
        return 1;
    }

    /* Initialize job, check parameters and decide which mapper to use */
    Job job = new Job(conf, conf.get(ARG_KEY_JOBNAME, "YCSB KV data generator"));

    /* these settings are the same (i.e., fixed) independent of the
     * parameters */
    job.setJarByClass(this.getClass());
    // job.setInputFormatClass(TextInputFormat.class);
    job.setInputFormatClass(NLineInputFormat.class);

    /* these settings should depend on the type of output file */
    job.setOutputFormatClass(HFileOutputFormat.class);
    /* not sure the next two are needed */
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(KeyValue.class);

    this.createInputFile(job, workdir);

    HFileOutputFormat.setOutputPath(job, new Path(workdir + "/files"));

    /* depending on whether the keys need to be sorted and hashed, then
     * decide which mapper and reducer to use 
     */
    boolean hashKeys = conf.getBoolean(ARG_KEY_HASH_KEYS, false);
    boolean sortKeys = conf.getBoolean(ARG_KEY_SORTKEYS, true);

    /* get splits file name: side-effect -> this may generate a splits file  */
    String splitsfile = this.getSplitsFile(job, workdir);

    if (sortKeys && hashKeys) { /* do a full map reduce job */
        job.setMapperClass(RowGeneratorMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setPartitionerClass(RangePartitioner.class);

        if (splitsfile == null) {
            /* Auto generate the splits file either from:
             * - the input key ranges
             * - from the current table splits
             */
            throw new InvalidInputException("No splits specified");
        }

        /* Set splits file */
        RangePartitioner.setSplitFile(job, splitsfile);

        /* Add reducer (based on mapper code) */
        job.setReducerClass(RowGeneratorReduce.class);

        /* the number of reducers is dependent on the number of
         * partitions
         */
        int numReduce = conf.getInt(ARG_KEY_NUMREDUCE, 1);
        job.setNumReduceTasks(numReduce);
    } else { /* perform a map only job */
        job.setMapperClass(RowGeneratorMapOnly.class);
        /* map output key and value types are the same as
         * for the job
         */
        job.setMapOutputKeyClass(job.getOutputKeyClass());
        job.setMapOutputValueClass(job.getOutputValueClass());
        job.setNumReduceTasks(0);
    }

    job.waitForCompletion(true);

    //        JobClient.runJob(conf);
    SimpleDateFormat df = new SimpleDateFormat("yyyy.MM.dd HH:mm:ss.SSS z");
    SimpleDateFormat ddf = new SimpleDateFormat("HH:mm:ss.SSS");
    ddf.setTimeZone(TimeZone.getTimeZone("UTC"));
    long endTime = System.currentTimeMillis();
    System.out.println("Start time (ms): " + df.format(new Date(startTime)) + " -- " + startTime);
    System.out.println("End time (ms): " + df.format(new Date(endTime)) + " -- " + endTime);
    System.out
            .println("Elapsed time (ms): " + ddf.format(endTime - startTime) + " -- " + (endTime - startTime));
    return 0;
}

From source file:com.yassergonzalez.pagerank.PageRank.java

License:Apache License

private void pageRankIteration(int iter, Configuration conf, Path outputDir) throws Exception {

    // This job performs an iteration of the power iteration method to
    // compute PageRank. The map task processes each block M_{i,j}, loads
    // the corresponding stripe j of the vector v_{k-1} and produces the
    // partial result of the stripe i of the vector v_k. The reduce task
    // sums all the partial results of v_k and adds the teleportation factor
    // (the combiner only sums all the partial results). See Section 5.2
    // (and 5.2.3 in particular) of Mining of Massive Datasets
    // (http://infolab.stanford.edu/~ullman/mmds.html) for details. The
    // output is written in a "vk" subdir of the output dir, where k is the
    // iteration number. MapFileOutputFormat is used to keep an array of the
    // stripes of v.

    Job job = Job.getInstance(conf, "PageRank:Iteration");

    job.setJarByClass(PageRank.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(PageRankIterationMapper.class);
    job.setMapOutputKeyClass(ShortWritable.class);
    job.setMapOutputValueClass(FloatArrayWritable.class);
    job.setCombinerClass(PageRankIterationCombiner.class);
    job.setReducerClass(PageRankIterationReducer.class);
    job.setOutputFormatClass(MapFileOutputFormat.class);
    job.setOutputKeyClass(ShortWritable.class);
    job.setOutputValueClass(FloatArrayWritable.class);
    FileInputFormat.addInputPath(job, new Path(outputDir, "M"));
    FileOutputFormat.setOutputPath(job, new Path(outputDir, "v" + iter));

    job.waitForCompletion(true);
}
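
The comment at the top of pageRankIteration describes the math this job distributes. Purely as an illustration of that update, the sketch below runs the same power iteration, v_k = beta * M * v_{k-1} + (1 - beta) / n, on a tiny in-memory matrix; the 3-node graph and the damping factor 0.85 are assumptions for the example and do not come from the project above.

public class PowerIterationSketch {
    public static void main(String[] args) {
        // Column-stochastic transition matrix M for an assumed 3-node graph:
        // every node links to the other two, so each column sums to 1.
        double[][] M = {
                { 0.0, 0.5, 0.5 },
                { 0.5, 0.0, 0.5 },
                { 0.5, 0.5, 0.0 } };
        double beta = 0.85;                    // damping factor (assumed)
        int n = M.length;
        double[] v = new double[n];
        java.util.Arrays.fill(v, 1.0 / n);     // uniform start vector

        for (int iter = 0; iter < 20; iter++) {
            double[] next = new double[n];
            for (int i = 0; i < n; i++) {
                double sum = 0.0;
                for (int j = 0; j < n; j++) {
                    sum += M[i][j] * v[j];     // partial results, summed as in the reducer
                }
                next[i] = beta * sum + (1 - beta) / n;  // teleportation term added per entry
            }
            v = next;
        }
        System.out.println(java.util.Arrays.toString(v));
    }
}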

From source file:com.yourcompany.hadoop.mapreduce.hcatalog.HCatalogExampleDriver.java

License:Apache License

public int run(String[] args) throws Exception {
    Job job = new Job();
    parseArguements(args, job);

    job.setJarByClass(HCatalogExampleDriver.class);

    job.setInputFormatClass(HCatInputFormat.class);
    job.setOutputFormatClass(HCatOutputFormat.class);

    job.setMapperClass(HCatalogExampleMapper.class);
    job.setReducerClass(HCatalogExampleReducer.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(WritableComparable.class);
    job.setOutputValueClass(DefaultHCatRecord.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:com.zjy.mongo.util.MongoTool.java

License:Apache License

private int runMapReduceJob(final Configuration conf) throws IOException {
    final Job job = Job.getInstance(conf, getJobName());
    /**
     * Any arguments specified with -D <property>=<value> on the CLI will be
     * picked up and set here; they override any XML-level values. Note that
     * the space after -D is important: without it, the option is picked up
     * by Java itself rather than by the job configuration.
     */
    // TODO - Do we need to set job name somehow more specifically?
    // This may or may not be correct/sane
    job.setJarByClass(getClass());
    final Class<? extends Mapper> mapper = MongoConfigUtil.getMapper(conf);

    if (LOG.isDebugEnabled()) {
        LOG.debug("Mapper Class: " + mapper);
        LOG.debug("Input URI: " + conf.get(MongoConfigUtil.INPUT_URI));
    }
    job.setMapperClass(mapper);
    Class<? extends Reducer> combiner = MongoConfigUtil.getCombiner(conf);
    if (combiner != null) {
        job.setCombinerClass(combiner);
    }
    job.setReducerClass(MongoConfigUtil.getReducer(conf));

    job.setOutputFormatClass(MongoConfigUtil.getOutputFormat(conf));
    job.setOutputKeyClass(MongoConfigUtil.getOutputKey(conf));
    job.setOutputValueClass(MongoConfigUtil.getOutputValue(conf));
    job.setInputFormatClass(MongoConfigUtil.getInputFormat(conf));
    Class mapOutputKeyClass = MongoConfigUtil.getMapperOutputKey(conf);
    Class mapOutputValueClass = MongoConfigUtil.getMapperOutputValue(conf);

    if (mapOutputKeyClass != null) {
        job.setMapOutputKeyClass(mapOutputKeyClass);
    }
    if (mapOutputValueClass != null) {
        job.setMapOutputValueClass(mapOutputValueClass);
    }

    /**
     * Determines if the job will run verbosely e.g. print debug output
     * Only works with foreground jobs
     */
    final boolean verbose = MongoConfigUtil.isJobVerbose(conf);
    /**
     * Run job in foreground aka wait for completion or background?
     */
    final boolean background = MongoConfigUtil.isJobBackground(conf);
    try {
        if (background) {
            LOG.info("Setting up and running MapReduce job in background.");
            job.submit();
            return 0;
        } else {
            LOG.info("Setting up and running MapReduce job in foreground, will wait for results.  {Verbose? "
                    + verbose + "}");
            return job.waitForCompletion(true) ? 0 : 1;
        }
    } catch (final Exception e) {
        LOG.error("Exception while executing job... ", e);
        return 1;
    }
}

From source file:cosmos.mapred.MediawikiIngestJob.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (1 != args.length) {
        System.err.println("Usage: input.xml,input.xml,input.xml...");
        return 1;
    }

    String inputFiles = args[0];

    Configuration conf = getConf();
    System.out.println("path " + conf.get("fs.default.name"));
    conf.addResource(new Path("/opt/hadoop/conf/hdfs-site.xml"));
    conf.addResource(new Path("/opt/hadoop/conf/core-site.xml"));

    conf.addResource(new Path("/opt/hadoop/conf/mapred-site.xml"));

    System.out.println("path " + conf.get("fs.default.name"));
    //System.exit(1);
    Job job = new Job(conf, "Mediawiki Ingest");

    job.setJarByClass(MediawikiIngestJob.class);

    String tablename = "sortswiki";
    String zookeepers = "localhost:2181";
    String instanceName = "accumulo";
    String user = "root";
    PasswordToken passwd = new PasswordToken("secret");

    FileInputFormat.setInputPaths(job, inputFiles);

    job.setMapperClass(MediawikiMapper.class);
    job.setNumReduceTasks(0);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Mutation.class);
    job.setOutputFormatClass(AccumuloOutputFormat.class);

    BatchWriterConfig bwConfig = new BatchWriterConfig();

    job.setInputFormatClass(MediawikiInputFormat.class);
    AccumuloOutputFormat.setZooKeeperInstance(job, instanceName, zookeepers);
    AccumuloOutputFormat.setConnectorInfo(job, user, passwd);
    AccumuloOutputFormat.setBatchWriterOptions(job, bwConfig);
    AccumuloOutputFormat.setCreateTables(job, true);
    AccumuloOutputFormat.setDefaultTableName(job, tablename);

    return job.waitForCompletion(true) ? 0 : 1;
}