Example usage for org.apache.hadoop.mapreduce Job setOutputFormatClass

List of usage examples for org.apache.hadoop.mapreduce Job setOutputFormatClass

Introduction

On this page you can find example usage for org.apache.hadoop.mapreduce Job setOutputFormatClass.

Prototype

public void setOutputFormatClass(Class<? extends OutputFormat> cls) throws IllegalStateException 

Document

Set the OutputFormat for the job.
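
This method must be called while the job is still being defined; once the job has been submitted it throws the IllegalStateException declared in the prototype above. As a quick orientation before the examples, here is a minimal, self-contained sketch (not taken from any of the source files below; the driver class name LineCopyDriver and the argument handling are illustrative) showing where setOutputFormatClass fits into a typical new-API job setup:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class LineCopyDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "line-copy");
        job.setJarByClass(LineCopyDriver.class);

        // Map-only job: the default (identity) Mapper passes each
        // (byte offset, line) pair straight through to the output.
        job.setNumReduceTasks(0);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        job.setInputFormatClass(TextInputFormat.class);
        // Set the OutputFormat for the job -- the method documented on this page.
        job.setOutputFormatClass(TextOutputFormat.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Most of the examples below follow the same pattern, swapping TextOutputFormat for formats such as TableOutputFormat, MongoOutputFormat, or a custom OutputFormat implementation.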

Usage

From source file:com.wibidata.wibidota.dotaloader.DotaValuesCounter.java

License:Apache License

public final int run(final String[] args) throws Exception {
    Job job = new Job(super.getConf(), "Dota Value Counter");
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);

    job.setMapperClass(Map.class);
    job.setCombinerClass(Add.class);
    job.setReducerClass(Add.class);

    job.setJarByClass(DotaValuesCounter.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    if (job.waitForCompletion(true)) {
        return 0;
    } else {
        return -1;
    }
}

From source file:com.wibidata.wibidota.DotaMaxAccountId.java

License:Apache License

public final int run(final String[] args) throws Exception {
    Job job = new Job(super.getConf(), "Dota Max Builder");
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);

    job.setMapperClass(DotaMaxAccountId.Map.class);
    job.setCombinerClass(DotaMaxAccountId.TakeMax.class);
    job.setReducerClass(DotaMaxAccountId.TakeMax.class);

    job.setJarByClass(DotaMaxAccountId.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    if (job.waitForCompletion(true)) {
        return 0;
    } else {
        return -1;
    }
}

From source file:com.wipro.ats.bdre.datagen.mr.Driver.java

License:Apache License

/**
 * @param args the cli arguments
 */
@Override
public int run(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

    Configuration conf = getConf();
    GetGeneralConfig generalConfig = new GetGeneralConfig();
    GeneralConfig gc = generalConfig.byConigGroupAndKey("imconfig", "common.default-fs-name");
    conf.set("fs.defaultFS", gc.getDefaultVal());

    String processId = args[0];
    Path outputDir = new Path(ResolvePath.replaceVars(args[1]));

    Properties dataProps = Config.getDataProperties(processId);
    Properties tableProps = Config.getTableProperties(processId);

    TableUtil tableUtil = new TableUtil();
    Table table = tableUtil.formTableFromConfig(processId);
    FileSystem fs = FileSystem.get(conf);
    LOGGER.info("Default FS =" + conf.get("fs.defaultFS"));
    //set in the conf for mappers to use
    conf.set(Config.SEPARATOR_KEY, tableProps.getProperty("separator"));
    conf.set(Config.PID_KEY, processId);
    conf.setLong(Config.NUM_ROWS_KEY, Long.parseLong(dataProps.getProperty("numRows")));
    conf.setInt(Config.NUM_SPLITS_KEY, Integer.parseInt(dataProps.getProperty("numSplits")));

    Job job = Job.getInstance(conf);
    Path mrOutputPath = new Path(outputDir.toString() + "/MROUT/" + table.getTableName());

    FileOutputFormat.setOutputPath(job, mrOutputPath);
    job.setJobName("Datagen-" + table.getTableName());
    job.setJarByClass(Driver.class);
    job.setMapperClass(RecordGenMapper.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormatClass(RangeInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.waitForCompletion(true);

    //merge and create a single file

    Path srcDir = mrOutputPath;
    Path destFile = new Path(outputDir.toString() + "/" + table.getTableName());
    FileUtil.copyMerge(fs, srcDir, fs, destFile, true, conf, "");

    //Return file info oozie params
    RegisterFileInfo registerFileInfo = new RegisterFileInfo();
    registerFileInfo.setBatchId(null);
    registerFileInfo.setCreationTs(new Timestamp(new Date().getTime()));
    registerFileInfo.setFileHash("0");
    registerFileInfo.setFileSize(0L);
    registerFileInfo.setPath(destFile.toString());
    registerFileInfo.setSubProcessId(Integer.parseInt(processId));
    OozieUtil oozieUtil = new OozieUtil();
    oozieUtil.persistBeanData(registerFileInfo, false);
    return 0;
}

From source file:com.xiaomi.linden.hadoop.indexing.job.LindenJob.java

License:Apache License

@Override
public int run(String[] strings) throws Exception {
    Configuration conf = getConf();
    String dir = conf.get(LindenJobConfig.INPUT_DIR, null);
    logger.info("input dir:" + dir);
    Path inputPath = new Path(StringUtils.unEscapeString(dir));
    Path outputPath = new Path(conf.get(LindenJobConfig.OUTPUT_DIR));
    String indexPath = conf.get(LindenJobConfig.INDEX_PATH);

    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(outputPath)) {
        fs.delete(outputPath, true);
    }
    if (fs.exists(new Path(indexPath))) {
        fs.delete(new Path(indexPath), true);
    }

    int numShards = conf.getInt(LindenJobConfig.NUM_SHARDS, 1);
    Shard[] shards = createShards(indexPath, numShards);

    Shard.setIndexShards(conf, shards);

    //empty trash;
    (new Trash(conf)).expunge();

    Job job = Job.getInstance(conf, "linden-hadoop-indexing");
    job.setJarByClass(LindenJob.class);
    job.setMapperClass(LindenMapper.class);
    job.setCombinerClass(LindenCombiner.class);
    job.setReducerClass(LindenReducer.class);
    job.setMapOutputKeyClass(Shard.class);
    job.setMapOutputValueClass(IntermediateForm.class);
    job.setOutputKeyClass(Shard.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(IndexUpdateOutputFormat.class);
    job.setReduceSpeculativeExecution(false);
    job.setNumReduceTasks(numShards);

    String lindenSchemaFile = conf.get(LindenJobConfig.SCHEMA_FILE_URL);
    if (lindenSchemaFile == null) {
        throw new IOException("no schema file is found");
    }
    logger.info("Adding schema file: " + lindenSchemaFile);
    job.addCacheFile(new URI(lindenSchemaFile + "#lindenSchema"));
    String lindenPropertiesFile = conf.get(LindenJobConfig.LINDEN_PROPERTIES_FILE_URL);
    if (lindenPropertiesFile == null) {
        throw new IOException("no linden properties file is found");
    }
    logger.info("Adding linden properties file: " + lindenPropertiesFile);
    job.addCacheFile(new URI(lindenPropertiesFile + "#lindenProperties"));

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    Path[] inputs = FileInputFormat.getInputPaths(job);
    StringBuilder buffer = new StringBuilder(inputs[0].toString());
    for (int i = 1; i < inputs.length; i++) {
        buffer.append(",");
        buffer.append(inputs[i].toString());
    }
    logger.info("mapreduce.input.dir = " + buffer.toString());
    logger.info("mapreduce.output.dir = " + FileOutputFormat.getOutputPath(job).toString());
    logger.info("mapreduce.job.num.reduce.tasks = " + job.getNumReduceTasks());
    logger.info(shards.length + " shards = " + conf.get(LindenJobConfig.INDEX_SHARDS));
    logger.info("mapreduce.input.format.class = " + job.getInputFormatClass());
    logger.info("mapreduce.output.format.class = " + job.getOutputFormatClass());
    logger.info("mapreduce.cluster.temp.dir = " + conf.get(MRJobConfig.TEMP_DIR));

    job.waitForCompletion(true);
    if (!job.isSuccessful()) {
        throw new RuntimeException("Job failed");
    }
    return 0;
}

From source file:com.xiaoxiaomo.mr.utils.kafka.HadoopJob.java

License:Apache License

public int run(String[] args) throws Exception {
    CommandLineParser parser = new PosixParser();
    Options options = buildOptions();
    CommandLine cmd = parser.parse(options, args);

    if (cmd.hasOption("h") || cmd.getArgs().length == 0) {
        printHelpAndExit(options);
    }

    String hdfsPath = cmd.getArgs()[0];
    Configuration conf = getConf();
    conf.setBoolean("mapred.map.tasks.speculative.execution", false);

    if (cmd.hasOption("topics")) {
        LOG.info("Using topics: " + cmd.getOptionValue("topics"));
        KafkaInputFormat.configureKafkaTopics(conf, cmd.getOptionValue("topics"));
    } else {
        printHelpAndExit(options);
    }

    KafkaInputFormat.configureZkConnection(conf, cmd.getOptionValue("zk-connect", "localhost:2181"));
    if (cmd.hasOption("consumer-group")) {
        CheckpointManager.configureUseZooKeeper(conf,
                cmd.getOptionValue("consumer-group", "dev-hadoop-loader"));
    }

    if (cmd.getOptionValue("autooffset-reset") != null) {
        KafkaInputFormat.configureAutoOffsetReset(conf, cmd.getOptionValue("autooffset-reset"));
    }

    JobConf jobConf = new JobConf(conf);
    if (cmd.hasOption("remote")) {
        String ip = cmd.getOptionValue("remote");
        LOG.info("Default file system: hdfs://" + ip + ":8020/");
        jobConf.set("fs.defaultFS", "hdfs://" + ip + ":8020/");
        LOG.info("Remote jobtracker: " + ip + ":8021");
        jobConf.set("mapred.job.tracker", ip + ":8021");
    }

    Path jarTarget = new Path(
            getClass().getProtectionDomain().getCodeSource().getLocation() + "../kafka-hadoop-loader.jar");

    if (new File(jarTarget.toUri()).exists()) {
        // running from IDE/ as maven
        jobConf.setJar(jarTarget.toUri().getPath());
        LOG.info("Using target jar: " + jarTarget.toString());
    } else {
        // running from jar remotely or locally
        jobConf.setJarByClass(getClass());
        LOG.info("Using parent jar: " + jobConf.getJar());
    }

    Job job = Job.getInstance(jobConf, "kafka.hadoop.loader");

    job.setInputFormatClass(KafkaInputFormat.class);
    job.setMapperClass(HadoopJobMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(MultiOutputFormat.class);
    job.setNumReduceTasks(0);

    MultiOutputFormat.setOutputPath(job, new Path(hdfsPath));
    MultiOutputFormat.setCompressOutput(job, cmd.getOptionValue("compress-output", "on").equals("on"));

    LOG.info("Output hdfs location: {}", hdfsPath);
    LOG.info("Output hdfs compression: {}", MultiOutputFormat.getCompressOutput(job));

    return job.waitForCompletion(true) ? 0 : -1;
}

From source file:com.xoriant.kafkaProducer.MyConsumer.java

License:Apache License

public static void main(String[] args) throws IOException {
    // System.setProperty("spark.executor.memory", "8g");
    System.setProperty("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    // Create the context with a 1 second batch size
    SparkConf sparkConf = new SparkConf();
    // final Configuration config = new Configuration();
    Configuration hadoopConfig = new Configuration();
    hadoopConfig.set("mapreduce.output.textoutputformat.separator", ",");
    sparkConf.setMaster("local[2]");
    sparkConf.setAppName("Insurance");
    JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);

    JavaStreamingContext javaStreamingContext = new JavaStreamingContext(javaSparkContext, new Duration(500));

    int numThreads = Integer.parseInt(args[3]);
    Map<String, Integer> topicMap = new HashMap<String, Integer>();
    String[] topics = args[2].split(",");
    for (String topic : topics) {
        topicMap.put(topic, numThreads);
    }

    // 3. create connection with HBase
    Configuration config = null;

    try {
        config = HBaseConfiguration.create();
        config.set("hbase.zookeeper.quorum", "192.168.1.114");
        config.set("hbase.zookeeper.property.clientPort", "2181");

        // config.set("mapreduce.job.output.key.class",
        // Text.class.getName());
        // config.set("mapreduce.job.output.value.class",
        // IntWritable.class.getName());
        // config.set("mapreduce.outputformat.class" ,
        // TableOutputFormat.class.getName());
        // config.set("hbase.master", "127.0.0.1:60000");
        HBaseAdmin.checkHBaseAvailable(config);

        System.out.println("HBase is running!");
    } catch (MasterNotRunningException e) {
        System.out.println("HBase is not running!");
        System.exit(1);
    } catch (Exception ce) {
        System.out.println("here.....");
        ce.printStackTrace();
    }

    // config.set(TableInputFormat.INPUT_TABLE, rawTableName);

    // 4. new Hadoop API configuration
    final Job newAPIJobConfigurationState = Job.getInstance(config);
    newAPIJobConfigurationState.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, stateTable);
    newAPIJobConfigurationState.setOutputFormatClass(org.apache.hadoop.hbase.mapreduce.TableOutputFormat.class);

    final Job newAPIJobConfigurationUser = Job.getInstance(config);
    newAPIJobConfigurationUser.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, "user_total_stream");
    newAPIJobConfigurationUser.setOutputFormatClass(org.apache.hadoop.hbase.mapreduce.TableOutputFormat.class);

    final Job paymentHistoryConfig = Job.getInstance(config);
    paymentHistoryConfig.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, "payment_history_stream");
    paymentHistoryConfig.setOutputFormatClass(org.apache.hadoop.hbase.mapreduce.TableOutputFormat.class);
    /*
     * Set<String> topics = new HashSet<String>(); topics.add("test");
     * 
     * 
     * Map<String, String> kafkaParams = new HashMap<String, String>();
     * kafkaParams.put("metadata.broker.list", "10.20.0.199:9092");
     */
    /*
     * JavaPairInputDStream<String, String> stream = KafkaUtils
     * .createDirectStream(javaStreamingContext, String.class, String.class,
     * StringDecoder.class, StringDecoder.class, kafkaParams, topics);
     */

    JavaPairReceiverInputDStream<String, String> stream = KafkaUtils.createStream(javaStreamingContext, args[0],
            args[1], topicMap);

    System.out.println(
            "Got my DStream! connecting to zookeeper " + args[0] + " group " + args[1] + " topics" + topicMap);

    stream.count().print();

    JavaDStream<Tuple11<String, String, String, String, String, String, String, String, String, String, String>> records = stream
            .map(new Function<Tuple2<String, String>, Tuple11<String, String, String, String, String, String, String, String, String, String, String>>() {

                private static final long serialVersionUID = 1L;

                public Tuple11<String, String, String, String, String, String, String, String, String, String, String> call(
                        Tuple2<String, String> defaultKeyAndRecords) throws Exception {

                    String[] fields = defaultKeyAndRecords._2().split(",");

                    return new Tuple11<String, String, String, String, String, String, String, String, String, String, String>(
                            fields[0], fields[1], fields[2], fields[3], fields[4], fields[5], fields[6],
                            fields[7], fields[8], fields[9], fields[10]);
                }
            });

    records.foreachRDD(
            new Function<JavaRDD<Tuple11<String, String, String, String, String, String, String, String, String, String, String>>, Void>() {
                private static final long serialVersionUID = -3333697808496161495L;

                public Void call(
                        JavaRDD<Tuple11<String, String, String, String, String, String, String, String, String, String, String>> rdd)
                        throws Exception {
                    saveToHBasePaymentHistory(rdd, paymentHistoryConfig.getConfiguration());
                    return null;
                }
            });

    JavaPairDStream<String, String> window = records.mapToPair(
            new PairFunction<Tuple11<String, String, String, String, String, String, String, String, String, String, String>, String, String>() {

                private static final long serialVersionUID = -8849699432349098738L;

                public Tuple2<String, String> call(
                        Tuple11<String, String, String, String, String, String, String, String, String, String, String> arg0)
                        throws Exception {

                    String str = arg0._2() + "," + arg0._3() + "," + arg0._4() + "," + arg0._5() + ","
                            + arg0._6() + "," + arg0._7() + "," + arg0._8() + "," + arg0._9() + "," + arg0._10()
                            + "," + arg0._11();

                    return new Tuple2<String, String>(arg0._1(), str);
                }
            }).window(new Duration(60000), new Duration(60000));

    window.saveAsNewAPIHadoopFiles("hdfs://192.168.1.114/user/hadoop/StreamingData/Insurancedata", "",
            Text.class, Text.class, TextOutputFormat.class, hadoopConfig);

    JavaPairDStream<String, Integer> recordsMapState = records.mapToPair(
            new PairFunction<Tuple11<String, String, String, String, String, String, String, String, String, String, String>, String, Integer>() {
                private static final long serialVersionUID = 1L;

                public Tuple2<String, Integer> call(
                        Tuple11<String, String, String, String, String, String, String, String, String, String, String> arg0)
                        throws Exception {
                    String key = arg0._10();
                    Integer value = new Integer(arg0._7());

                    return new Tuple2<String, Integer>(key, value);
                }

            });

    JavaPairDStream<String, Integer> recordsMapUser = records.mapToPair(
            new PairFunction<Tuple11<String, String, String, String, String, String, String, String, String, String, String>, String, Integer>() {
                private static final long serialVersionUID = 1L;

                public Tuple2<String, Integer> call(
                        Tuple11<String, String, String, String, String, String, String, String, String, String, String> arg0)
                        throws Exception {
                    String key = arg0._1();
                    Integer value = new Integer(arg0._7());

                    return new Tuple2<String, Integer>(key, value);
                }

            });

    JavaPairDStream<String, Integer> reduceByKeyAndWindowState = recordsMapState
            .reduceByKeyAndWindow(new Function2<Integer, Integer, Integer>() {
                private static final long serialVersionUID = 197675516004789269L;

                public Integer call(Integer val1, Integer val2) throws Exception {
                    return val1 + val2;

                }
            }, new Duration(86400000), new Duration(10000));

    JavaPairDStream<String, Integer> reduceByKeyAndWindowUser = recordsMapUser
            .reduceByKeyAndWindow(new Function2<Integer, Integer, Integer>() {
                private static final long serialVersionUID = 197675516004789269L;

                public Integer call(Integer val1, Integer val2) throws Exception {
                    return val1 + val2;

                }
            }, new Duration(86400000), new Duration(60000));

    // reduce.count();
    reduceByKeyAndWindowState.print();

    reduceByKeyAndWindowState.foreachRDD(new Function<JavaPairRDD<String, Integer>, Void>() {
        private static final long serialVersionUID = 8534726505385048702L;

        public Void call(JavaPairRDD<String, Integer> rdd) throws Exception {
            saveToHBase(rdd, newAPIJobConfigurationState.getConfiguration());
            return null;
        }
    });

    reduceByKeyAndWindowUser.foreachRDD(new Function<JavaPairRDD<String, Integer>, Void>() {
        private static final long serialVersionUID = 8534726505385048702L;

        public Void call(JavaPairRDD<String, Integer> rdd) throws Exception {
            saveToHBase(rdd, newAPIJobConfigurationUser.getConfiguration());
            return null;
        }
    });

    javaStreamingContext.start();
    javaStreamingContext.awaitTermination();
}

From source file:com.xyz.reccommendation.driver.SKU2SKUCount.java

License:Apache License

public static void main(String[] args) throws Exception {

    final Configuration conf = new Configuration();

    String envt = null;

    if (args.length > 0) {
        envt = args[0];
    } else {
        envt = "dev";
    }

    Properties prop = new Properties();

    try {
        // load a properties file from class path, inside static method
        prop.load(SKU2SKUCount.class.getClassLoader().getResourceAsStream("config-" + envt + ".properties"));

    } catch (IOException ex) {
        ex.printStackTrace();
        System.exit(1);
    }

    MongoConfigUtil.setOutputURI(conf, "mongodb://" + prop.getProperty("mongodb.ip") + "/"
            + prop.getProperty("mongodb.dbname") + ".out_stat_custom");

    log.debug("MongoDB URL : mongodb://" + prop.getProperty("mongodb.ip") + "/"
            + prop.getProperty("mongodb.dbname") + "." + ".out_stat_custom");

    log.debug("Conf: " + conf);

    MongoConfigUtil.setCreateInputSplits(conf, false);
    args = new GenericOptionsParser(conf, args).getRemainingArgs();

    final Job job = new Job(conf,
            "Count the sku to sku mapping from pview data on hdfs in \"inputPview\" path.");

    job.setJarByClass(SKU2SKUCount.class);

    job.setMapperClass(TokenizerMapper.class);

    // job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BSONWritable.class);

    job.setInputFormatClass(KeyValueTextInputFormat.class);
    job.setOutputFormatClass(MongoOutputFormat.class);

    FileInputFormat.setInputPaths(job, new Path("inputPview"));

    System.exit(job.waitForCompletion(true) ? 0 : 1);

}

From source file:com.yahoo.druid.hadoop.DruidInputFormatTest.java

License:Apache License

@Test
public void testSampleMRJob() throws Exception {
    Job job = Job.getInstance(new Configuration(), "Druid-Loader-Sample-Test-Job");

    job.getConfiguration().set("mapreduce.job.acl-view-job", "*");
    job.getConfiguration().set("mapreduce.map.java.opts", "-Duser.timezone=UTC");
    job.getConfiguration().set(DruidInputFormat.CONF_DRUID_OVERLORD_HOSTPORT, "localhost:" + overlordTestPort);
    job.getConfiguration().set(DruidInputFormat.CONF_DRUID_SCHEMA,
            "{" + "\"dataSource\":\"testDataSource\","
                    + "\"interval\":\"1970-01-01T00:00:00.000Z/3000-01-01T00:00:00.000Z\","
                    + "\"granularity\":\"NONE\"," + "\"dimensions\":[\"host\"],"
                    + "\"metrics\":[\"visited_sum\",\"unique_hosts\"]" + "}");

    job.setMapperClass(SampleMapper.class);
    job.setNumReduceTasks(0);

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(NullWritable.class);

    job.setInputFormatClass(DruidInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    String outputPath = tempFolder.newFolder() + "/out";
    TextOutputFormat.setOutputPath(job, new Path(outputPath));

    Assert.assertTrue(job.waitForCompletion(true));

    //verify that the SampleMapper actually ran and verified the data
    Assert.assertTrue(FileUtils.readFileToString(new File(outputPath + "/part-m-00000")).startsWith("SUCCESS"));
}

From source file:com.yahoo.druid.hadoop.example.SamplePrintMRJob.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    // When implementing tool
    Configuration conf = this.getConf();

    // Create job
    Job job = new Job(conf, "Druid-Loader-Sample-Job");
    job.setJarByClass(SamplePrintMRJob.class);
    //    job.setJobName("Druid-Loader-Sample-Job");

    job.getConfiguration().set("mapreduce.job.acl-view-job", "*");
    job.getConfiguration().set("mapreduce.job.queuename", "default");
    job.getConfiguration().set("mapreduce.map.java.opts", "-Duser.timezone=UTC");
    //job.getConfiguration().set("mapreduce.map.memory.mb", "1024");

    job.getConfiguration().set(DruidInputFormat.CONF_DRUID_STORAGE_STORAGE_DIR, "/tmp/druid/storage");
    job.getConfiguration().set(DruidInputFormat.CONF_DRUID_OVERLORD_HOSTPORT, "localhost:8080");
    job.getConfiguration().set(DruidInputFormat.CONF_DRUID_DATASOURCE, "wikipedia");
    job.getConfiguration().set(DruidInputFormat.CONF_DRUID_INTERVAL,
            "2009-01-01T00:00:00.000/2050-01-01T00:00:00.000");
    job.getConfiguration().set(DruidInputFormat.CONF_DRUID_SCHEMA_FILE, "/tmp/druid/schema/druid_fun_mr.json");

    job.setMapperClass(DruidPrintMapper.class);
    job.setNumReduceTasks(0);

    job.setOutputKeyClass(DateTime.class);
    job.setOutputValueClass(Map.class);

    job.setInputFormatClass(DruidInputFormat.class);
    job.setOutputFormatClass(NullOutputFormat.class);

    System.out.println("Starting Druid Loader Sample Job.....");
    return job.waitForCompletion(true) ? 0 : 1;
    //System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file:com.yahoo.glimmer.indexing.generator.TripleIndexGenerator.java

License:Open Source License

public int run(String[] args) throws Exception {
    SimpleJSAP jsap = new SimpleJSAP(TripleIndexGenerator.class.getName(),
            "Generates a keyword index from RDF data.",
            new Parameter[] {
                    new Switch(NO_CONTEXTS_ARG, 'C', "withoutContexts",
                            "Don't process the contexts for each tuple."),
                    new FlaggedOption(METHOD_ARG, JSAP.STRING_PARSER, "horizontal", JSAP.REQUIRED, 'm',
                            METHOD_ARG, "horizontal or vertical."),
                    new FlaggedOption(PREDICATES_ARG, JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED,
                            'p', PREDICATES_ARG, "Subset of the properties to be indexed."),
                    new FlaggedOption(RESOURCE_PREFIX_ARG, JSAP.STRING_PARSER, "@", JSAP.NOT_REQUIRED, 'r',
                            RESOURCE_PREFIX_ARG,
                            "Prefix to add to object resource hash values when indexing. Stops queries for numbers matching resource hash values. Default is '@'"),

                    new UnflaggedOption("input", JSAP.STRING_PARSER, JSAP.REQUIRED,
                            "HDFS location for the input data."),
                    new UnflaggedOption(NUMBER_OF_DOCS_ARG, JSAP.LONG_PARSER, JSAP.REQUIRED,
                            "Number of documents to index"),
                    new UnflaggedOption("output", JSAP.STRING_PARSER, JSAP.REQUIRED,
                            "HDFS location for the output."),
                    new UnflaggedOption(RESOURCES_HASH_ARG, JSAP.STRING_PARSER, JSAP.REQUIRED,
                            "HDFS location of the resources hash file."), });

    JSAPResult jsapResult = jsap.parse(args);

    // check whether the command line was valid, and if it wasn't,
    // display usage information and exit.
    if (!jsapResult.success()) {
        System.err.println();
        System.err.println("Usage: java " + TripleIndexGenerator.class.getName());
        System.err.println("                " + jsap.getUsage());
        System.err.println();
        System.exit(1);
    }

    Job job = Job.getInstance(getConf());
    job.setJarByClass(TripleIndexGenerator.class);
    job.setJobName("TripleIndexGenerator" + System.currentTimeMillis());

    FileInputFormat.setInputPaths(job, new Path(jsapResult.getString("input")));
    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(DocumentMapper.class);
    job.setMapOutputKeyClass(TermKey.class);
    job.setMapOutputValueClass(TermValue.class);

    job.setPartitionerClass(TermKey.FirstPartitioner.class);
    job.setGroupingComparatorClass(TermKey.FirstGroupingComparator.class);

    job.setReducerClass(TermReduce.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(IndexRecordWriterValue.class);
    job.setOutputFormatClass(IndexRecordWriter.OutputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(jsapResult.getString("output")));

    Configuration conf = job.getConfiguration();

    conf.setClass("mapred.output.key.comparator.class", TermKey.Comparator.class, WritableComparator.class);
    conf.set("mapreduce.user.classpath.first", "true");

    long numDocs = jsapResult.getLong(NUMBER_OF_DOCS_ARG);
    conf.setLong(NUMBER_OF_DOCUMENTS, numDocs);
    // Set this in a attempt to get around the 2GB of ram task limit on our cluster.
    // Setting this in the hope of fixing Direct buffer memory errors
    conf.setInt(INDEX_WRITER_CACHE_SIZE, 1024 * 1024);

    conf.set(OUTPUT_DIR, jsapResult.getString("output"));

    boolean withContexts = !jsapResult.getBoolean(NO_CONTEXTS_ARG, false);
    if (jsapResult.getString(METHOD_ARG).equalsIgnoreCase(METHOD_ARG_VALUE_HORIZONTAL)) {
        HorizontalDocumentFactory.setupConf(conf, withContexts, jsapResult.getString(RESOURCES_HASH_ARG),
                jsapResult.getString(RESOURCE_PREFIX_ARG));
    } else if (jsapResult.getString(METHOD_ARG).equalsIgnoreCase(METHOD_ARG_VALUE_VERTICAL)) {
        if (!jsapResult.contains(PREDICATES_ARG)) {
            throw new IllegalArgumentException("When '" + METHOD_ARG + "' is '" + METHOD_ARG_VALUE_VERTICAL
                    + "' you have to give a predicates file too.");
        }
        VerticalDocumentFactory.setupConf(conf, withContexts, jsapResult.getString(RESOURCES_HASH_ARG),
                jsapResult.getString(RESOURCE_PREFIX_ARG), jsapResult.getString(PREDICATES_ARG));
    } else {
        throw new IllegalArgumentException(METHOD_ARG + " should be '" + METHOD_ARG_VALUE_HORIZONTAL + "' or '"
                + METHOD_ARG_VALUE_VERTICAL + "'");
    }

    conf.setInt("mapreduce.input.linerecordreader.line.maxlength", 1024 * 1024);

    boolean success = job.waitForCompletion(true);

    return success ? 0 : 1;
}