List of usage examples for org.apache.hadoop.mapreduce.Job.setOutputFormatClass
public void setOutputFormatClass(Class<? extends OutputFormat> cls) throws IllegalStateException
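Before the larger examples below, here is a minimal sketch of the call in context. It is an assumed, self-contained map-only job (the job name, input/output paths from args, and the use of the identity Mapper are illustrative choices, not taken from any example on this page): setOutputFormatClass must be called before the job is submitted, otherwise it throws IllegalStateException.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class OutputFormatExample {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "output-format-example");
        job.setJarByClass(OutputFormatExample.class);
        // Identity mapper, map-only job: TextInputFormat emits (LongWritable offset, Text line)
        job.setMapperClass(Mapper.class);
        job.setNumReduceTasks(0);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        job.setInputFormatClass(TextInputFormat.class);
        // Plain-text output; must be set before submission or an IllegalStateException is thrown
        job.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

The same pattern recurs in every example that follows: configure the Job, point setOutputFormatClass at the desired OutputFormat (TextOutputFormat, TableOutputFormat, MongoOutputFormat, etc.), then submit with waitForCompletion.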
From source file:com.wibidata.wibidota.dotaloader.DotaValuesCounter.java
License:Apache License
public final int run(final String[] args) throws Exception { Job job = new Job(super.getConf(), "Dota Value Counter"); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(LongWritable.class); job.setMapperClass(Map.class); job.setCombinerClass(Add.class); job.setReducerClass(Add.class); job.setJarByClass(DotaValuesCounter.class); job.setInputFormatClass(TextInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); if (job.waitForCompletion(true)) { return 0; } else {//from w w w .j a v a 2s .c o m return -1; } }
From source file:com.wibidata.wibidota.DotaMaxAccountId.java
License:Apache License
public final int run(final String[] args) throws Exception { Job job = new Job(super.getConf(), "Dota Max Builder"); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(LongWritable.class); job.setMapperClass(DotaMaxAccountId.Map.class); job.setCombinerClass(DotaMaxAccountId.TakeMax.class); job.setReducerClass(DotaMaxAccountId.TakeMax.class); job.setJarByClass(DotaMaxAccountId.class); job.setInputFormatClass(TextInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); if (job.waitForCompletion(true)) { return 0; } else {//from w w w .j a v a 2s .co m return -1; } }
From source file:com.wipro.ats.bdre.datagen.mr.Driver.java
License:Apache License
/**
 * @param args the cli arguments
 */
@Override
public int run(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = getConf();
    GetGeneralConfig generalConfig = new GetGeneralConfig();
    GeneralConfig gc = generalConfig.byConigGroupAndKey("imconfig", "common.default-fs-name");
    conf.set("fs.defaultFS", gc.getDefaultVal());
    String processId = args[0];
    Path outputDir = new Path(ResolvePath.replaceVars(args[1]));
    Properties dataProps = Config.getDataProperties(processId);
    Properties tableProps = Config.getTableProperties(processId);
    TableUtil tableUtil = new TableUtil();
    Table table = tableUtil.formTableFromConfig(processId);
    FileSystem fs = FileSystem.get(conf);
    LOGGER.info("Default FS =" + conf.get("fs.defaultFS"));
    // set in the conf for mappers to use
    conf.set(Config.SEPARATOR_KEY, tableProps.getProperty("separator"));
    conf.set(Config.PID_KEY, processId);
    conf.setLong(Config.NUM_ROWS_KEY, Long.parseLong(dataProps.getProperty("numRows")));
    conf.setInt(Config.NUM_SPLITS_KEY, Integer.parseInt(dataProps.getProperty("numSplits")));
    Job job = Job.getInstance(conf);
    Path mrOutputPath = new Path(outputDir.toString() + "/MROUT/" + table.getTableName());
    FileOutputFormat.setOutputPath(job, mrOutputPath);
    job.setJobName("Datagen-" + table.getTableName());
    job.setJarByClass(Driver.class);
    job.setMapperClass(RecordGenMapper.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormatClass(RangeInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.waitForCompletion(true);
    // merge and create a single file
    Path srcDir = mrOutputPath;
    Path destFile = new Path(outputDir.toString() + "/" + table.getTableName());
    FileUtil.copyMerge(fs, srcDir, fs, destFile, true, conf, "");
    // Return file info oozie params
    RegisterFileInfo registerFileInfo = new RegisterFileInfo();
    registerFileInfo.setBatchId(null);
    registerFileInfo.setCreationTs(new Timestamp(new Date().getTime()));
    registerFileInfo.setFileHash("0");
    registerFileInfo.setFileSize(0L);
    registerFileInfo.setPath(destFile.toString());
    registerFileInfo.setSubProcessId(Integer.parseInt(processId));
    OozieUtil oozieUtil = new OozieUtil();
    oozieUtil.persistBeanData(registerFileInfo, false);
    return 0;
}
From source file:com.xiaomi.linden.hadoop.indexing.job.LindenJob.java
License:Apache License
@Override
public int run(String[] strings) throws Exception {
    Configuration conf = getConf();
    String dir = conf.get(LindenJobConfig.INPUT_DIR, null);
    logger.info("input dir:" + dir);
    Path inputPath = new Path(StringUtils.unEscapeString(dir));
    Path outputPath = new Path(conf.get(LindenJobConfig.OUTPUT_DIR));
    String indexPath = conf.get(LindenJobConfig.INDEX_PATH);
    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(outputPath)) {
        fs.delete(outputPath, true);
    }
    if (fs.exists(new Path(indexPath))) {
        fs.delete(new Path(indexPath), true);
    }
    int numShards = conf.getInt(LindenJobConfig.NUM_SHARDS, 1);
    Shard[] shards = createShards(indexPath, numShards);
    Shard.setIndexShards(conf, shards);
    // empty trash;
    (new Trash(conf)).expunge();
    Job job = Job.getInstance(conf, "linden-hadoop-indexing");
    job.setJarByClass(LindenJob.class);
    job.setMapperClass(LindenMapper.class);
    job.setCombinerClass(LindenCombiner.class);
    job.setReducerClass(LindenReducer.class);
    job.setMapOutputKeyClass(Shard.class);
    job.setMapOutputValueClass(IntermediateForm.class);
    job.setOutputKeyClass(Shard.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(IndexUpdateOutputFormat.class);
    job.setReduceSpeculativeExecution(false);
    job.setNumReduceTasks(numShards);
    String lindenSchemaFile = conf.get(LindenJobConfig.SCHEMA_FILE_URL);
    if (lindenSchemaFile == null) {
        throw new IOException("no schema file is found");
    }
    logger.info("Adding schema file: " + lindenSchemaFile);
    job.addCacheFile(new URI(lindenSchemaFile + "#lindenSchema"));
    String lindenPropertiesFile = conf.get(LindenJobConfig.LINDEN_PROPERTIES_FILE_URL);
    if (lindenPropertiesFile == null) {
        throw new IOException("no linden properties file is found");
    }
    logger.info("Adding linden properties file: " + lindenPropertiesFile);
    job.addCacheFile(new URI(lindenPropertiesFile + "#lindenProperties"));
    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);
    Path[] inputs = FileInputFormat.getInputPaths(job);
    StringBuilder buffer = new StringBuilder(inputs[0].toString());
    for (int i = 1; i < inputs.length; i++) {
        buffer.append(",");
        buffer.append(inputs[i].toString());
    }
    logger.info("mapreduce.input.dir = " + buffer.toString());
    logger.info("mapreduce.output.dir = " + FileOutputFormat.getOutputPath(job).toString());
    logger.info("mapreduce.job.num.reduce.tasks = " + job.getNumReduceTasks());
    logger.info(shards.length + " shards = " + conf.get(LindenJobConfig.INDEX_SHARDS));
    logger.info("mapreduce.input.format.class = " + job.getInputFormatClass());
    logger.info("mapreduce.output.format.class = " + job.getOutputFormatClass());
    logger.info("mapreduce.cluster.temp.dir = " + conf.get(MRJobConfig.TEMP_DIR));
    job.waitForCompletion(true);
    if (!job.isSuccessful()) {
        throw new RuntimeException("Job failed");
    }
    return 0;
}
From source file:com.xiaoxiaomo.mr.utils.kafka.HadoopJob.java
License:Apache License
public int run(String[] args) throws Exception {
    CommandLineParser parser = new PosixParser();
    Options options = buildOptions();
    CommandLine cmd = parser.parse(options, args);
    if (cmd.hasOption("h") || cmd.getArgs().length == 0) {
        printHelpAndExit(options);
    }
    String hdfsPath = cmd.getArgs()[0];
    Configuration conf = getConf();
    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    if (cmd.hasOption("topics")) {
        LOG.info("Using topics: " + cmd.getOptionValue("topics"));
        KafkaInputFormat.configureKafkaTopics(conf, cmd.getOptionValue("topics"));
    } else {
        printHelpAndExit(options);
    }
    KafkaInputFormat.configureZkConnection(conf, cmd.getOptionValue("zk-connect", "localhost:2181"));
    if (cmd.hasOption("consumer-group")) {
        CheckpointManager.configureUseZooKeeper(conf, cmd.getOptionValue("consumer-group", "dev-hadoop-loader"));
    }
    if (cmd.getOptionValue("autooffset-reset") != null) {
        KafkaInputFormat.configureAutoOffsetReset(conf, cmd.getOptionValue("autooffset-reset"));
    }
    JobConf jobConf = new JobConf(conf);
    if (cmd.hasOption("remote")) {
        String ip = cmd.getOptionValue("remote");
        LOG.info("Default file system: hdfs://" + ip + ":8020/");
        jobConf.set("fs.defaultFS", "hdfs://" + ip + ":8020/");
        LOG.info("Remote jobtracker: " + ip + ":8021");
        jobConf.set("mapred.job.tracker", ip + ":8021");
    }
    Path jarTarget = new Path(
            getClass().getProtectionDomain().getCodeSource().getLocation() + "../kafka-hadoop-loader.jar");
    if (new File(jarTarget.toUri()).exists()) {
        // running from IDE / as maven
        jobConf.setJar(jarTarget.toUri().getPath());
        LOG.info("Using target jar: " + jarTarget.toString());
    } else {
        // running from jar remotely or locally
        jobConf.setJarByClass(getClass());
        LOG.info("Using parent jar: " + jobConf.getJar());
    }
    Job job = Job.getInstance(jobConf, "kafka.hadoop.loader");
    job.setInputFormatClass(KafkaInputFormat.class);
    job.setMapperClass(HadoopJobMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(MultiOutputFormat.class);
    job.setNumReduceTasks(0);
    MultiOutputFormat.setOutputPath(job, new Path(hdfsPath));
    MultiOutputFormat.setCompressOutput(job, cmd.getOptionValue("compress-output", "on").equals("on"));
    LOG.info("Output hdfs location: {}", hdfsPath);
    LOG.info("Output hdfs compression: {}", MultiOutputFormat.getCompressOutput(job));
    return job.waitForCompletion(true) ? 0 : -1;
}
From source file:com.xoriant.kafkaProducer.MyConsumer.java
License:Apache License
public static void main(String[] args) throws IOException {
    // System.setProperty("spark.executor.memory", "8g");
    System.setProperty("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    // Create the context with a 1 second batch size
    SparkConf sparkConf = new SparkConf();
    // final Configuration config = new Configuration();
    Configuration hadoopConfig = new Configuration();
    hadoopConfig.set("mapreduce.output.textoutputformat.separator", ",");
    sparkConf.setMaster("local[2]");
    sparkConf.setAppName("Insurance");
    JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
    JavaStreamingContext javaStreamingContext = new JavaStreamingContext(javaSparkContext, new Duration(500));
    int numThreads = Integer.parseInt(args[3]);
    Map<String, Integer> topicMap = new HashMap<String, Integer>();
    String[] topics = args[2].split(",");
    for (String topic : topics) {
        topicMap.put(topic, numThreads);
    }
    // 3. create connection with HBase
    Configuration config = null;
    try {
        config = HBaseConfiguration.create();
        config.set("hbase.zookeeper.quorum", "192.168.1.114");
        config.set("hbase.zookeeper.property.clientPort", "2181");
        // config.set("mapreduce.job.output.key.class", Text.class.getName());
        // config.set("mapreduce.job.output.value.class", IntWritable.class.getName());
        // config.set("mapreduce.outputformat.class", TableOutputFormat.class.getName());
        // config.set("hbase.master", "127.0.0.1:60000");
        HBaseAdmin.checkHBaseAvailable(config);
        System.out.println("HBase is running!");
    } catch (MasterNotRunningException e) {
        System.out.println("HBase is not running!");
        System.exit(1);
    } catch (Exception ce) {
        System.out.println("here.....");
        ce.printStackTrace();
    }
    // config.set(TableInputFormat.INPUT_TABLE, rawTableName);
    // 4. new Hadoop API configuration
    final Job newAPIJobConfigurationState = Job.getInstance(config);
    newAPIJobConfigurationState.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, stateTable);
    newAPIJobConfigurationState.setOutputFormatClass(org.apache.hadoop.hbase.mapreduce.TableOutputFormat.class);
    final Job newAPIJobConfigurationUser = Job.getInstance(config);
    newAPIJobConfigurationUser.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, "user_total_stream");
    newAPIJobConfigurationUser.setOutputFormatClass(org.apache.hadoop.hbase.mapreduce.TableOutputFormat.class);
    final Job paymentHistoryConfig = Job.getInstance(config);
    paymentHistoryConfig.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, "payment_history_stream");
    paymentHistoryConfig.setOutputFormatClass(org.apache.hadoop.hbase.mapreduce.TableOutputFormat.class);
    /*
     * Set<String> topics = new HashSet<String>(); topics.add("test");
     *
     * Map<String, String> kafkaParams = new HashMap<String, String>();
     * kafkaParams.put("metadata.broker.list", "10.20.0.199:9092");
     */
    /*
     * JavaPairInputDStream<String, String> stream = KafkaUtils
     *         .createDirectStream(javaStreamingContext, String.class, String.class,
     *                 StringDecoder.class, StringDecoder.class, kafkaParams, topics);
     */
    JavaPairReceiverInputDStream<String, String> stream = KafkaUtils.createStream(javaStreamingContext,
            args[0], args[1], topicMap);
    System.out.println("Got my DStream!\n connecting to zookeeper " + args[0] + " group " + args[1]
            + " topics" + topicMap);
    stream.count().print();

    JavaDStream<Tuple11<String, String, String, String, String, String, String, String, String, String, String>> records = stream
            .map(new Function<Tuple2<String, String>, Tuple11<String, String, String, String, String, String, String, String, String, String, String>>() {
                private static final long serialVersionUID = 1L;

                public Tuple11<String, String, String, String, String, String, String, String, String, String, String> call(
                        Tuple2<String, String> defaultKeyAndRecords) throws Exception {
                    String[] fields = defaultKeyAndRecords._2().split(",");
                    return new Tuple11<String, String, String, String, String, String, String, String, String, String, String>(
                            fields[0], fields[1], fields[2], fields[3], fields[4], fields[5], fields[6],
                            fields[7], fields[8], fields[9], fields[10]);
                }
            });

    records.foreachRDD(
            new Function<JavaRDD<Tuple11<String, String, String, String, String, String, String, String, String, String, String>>, Void>() {
                private static final long serialVersionUID = -3333697808496161495L;

                public Void call(
                        JavaRDD<Tuple11<String, String, String, String, String, String, String, String, String, String, String>> rdd)
                        throws Exception {
                    saveToHBasePaymentHistory(rdd, paymentHistoryConfig.getConfiguration());
                    return null;
                }
            });

    JavaPairDStream<String, String> window = records.mapToPair(
            new PairFunction<Tuple11<String, String, String, String, String, String, String, String, String, String, String>, String, String>() {
                private static final long serialVersionUID = -8849699432349098738L;

                public Tuple2<String, String> call(
                        Tuple11<String, String, String, String, String, String, String, String, String, String, String> arg0)
                        throws Exception {
                    String str = arg0._2() + "," + arg0._3() + "," + arg0._4() + "," + arg0._5() + ","
                            + arg0._6() + "," + arg0._7() + "," + arg0._8() + "," + arg0._9() + ","
                            + arg0._10() + "," + arg0._11();
                    return new Tuple2<String, String>(arg0._1(), str);
                }
            }).window(new Duration(60000), new Duration(60000));

    window.saveAsNewAPIHadoopFiles("hdfs://192.168.1.114/user/hadoop/StreamingData/Insurancedata", "",
            Text.class, Text.class, TextOutputFormat.class, hadoopConfig);

    JavaPairDStream<String, Integer> recordsMapState = records.mapToPair(
            new PairFunction<Tuple11<String, String, String, String, String, String, String, String, String, String, String>, String, Integer>() {
                private static final long serialVersionUID = 1L;

                public Tuple2<String, Integer> call(
                        Tuple11<String, String, String, String, String, String, String, String, String, String, String> arg0)
                        throws Exception {
                    String key = arg0._10();
                    Integer value = new Integer(arg0._7());
                    return new Tuple2<String, Integer>(key, value);
                }
            });

    JavaPairDStream<String, Integer> recordsMapUser = records.mapToPair(
            new PairFunction<Tuple11<String, String, String, String, String, String, String, String, String, String, String>, String, Integer>() {
                private static final long serialVersionUID = 1L;

                public Tuple2<String, Integer> call(
                        Tuple11<String, String, String, String, String, String, String, String, String, String, String> arg0)
                        throws Exception {
                    String key = arg0._1();
                    Integer value = new Integer(arg0._7());
                    return new Tuple2<String, Integer>(key, value);
                }
            });

    JavaPairDStream<String, Integer> reduceByKeyAndWindowState = recordsMapState
            .reduceByKeyAndWindow(new Function2<Integer, Integer, Integer>() {
                private static final long serialVersionUID = 197675516004789269L;

                public Integer call(Integer val1, Integer val2) throws Exception {
                    return val1 + val2;
                }
            }, new Duration(86400000), new Duration(10000));

    JavaPairDStream<String, Integer> reduceByKeyAndWindowUser = recordsMapUser
            .reduceByKeyAndWindow(new Function2<Integer, Integer, Integer>() {
                private static final long serialVersionUID = 197675516004789269L;

                public Integer call(Integer val1, Integer val2) throws Exception {
                    return val1 + val2;
                }
            }, new Duration(86400000), new Duration(60000));

    // reduce.count();
    reduceByKeyAndWindowState.print();

    reduceByKeyAndWindowState.foreachRDD(new Function<JavaPairRDD<String, Integer>, Void>() {
        private static final long serialVersionUID = 8534726505385048702L;

        public Void call(JavaPairRDD<String, Integer> rdd) throws Exception {
            saveToHBase(rdd, newAPIJobConfigurationState.getConfiguration());
            return null;
        }
    });

    reduceByKeyAndWindowUser.foreachRDD(new Function<JavaPairRDD<String, Integer>, Void>() {
        private static final long serialVersionUID = 8534726505385048702L;

        public Void call(JavaPairRDD<String, Integer> rdd) throws Exception {
            saveToHBase(rdd, newAPIJobConfigurationUser.getConfiguration());
            return null;
        }
    });

    javaStreamingContext.start();
    javaStreamingContext.awaitTermination();
}
From source file:com.xyz.reccommendation.driver.SKU2SKUCount.java
License:Apache License
public static void main(String[] args) throws Exception {
    final Configuration conf = new Configuration();
    String envt = null;
    if (args.length > 0) {
        envt = args[0];
    } else {
        envt = "dev";
    }
    Properties prop = new Properties();
    try {
        // load a properties file from class path, inside static method
        prop.load(SKU2SKUCount.class.getClassLoader().getResourceAsStream("config-" + envt + ".properties"));
    } catch (IOException ex) {
        ex.printStackTrace();
        System.exit(1);
    }
    MongoConfigUtil.setOutputURI(conf, "mongodb://" + prop.getProperty("mongodb.ip") + "/"
            + prop.getProperty("mongodb.dbname") + ".out_stat_custom");
    log.debug("MongoDB URL : mongodb://" + prop.getProperty("mongodb.ip") + "/"
            + prop.getProperty("mongodb.dbname") + "." + ".out_stat_custom");
    log.debug("Conf: " + conf);
    MongoConfigUtil.setCreateInputSplits(conf, false);
    args = new GenericOptionsParser(conf, args).getRemainingArgs();
    final Job job = new Job(conf,
            "Count the sku to sku mapping from pview data on hdfs in \"inputPview\" path.");
    job.setJarByClass(SKU2SKUCount.class);
    job.setMapperClass(TokenizerMapper.class);
    // job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BSONWritable.class);
    job.setInputFormatClass(KeyValueTextInputFormat.class);
    job.setOutputFormatClass(MongoOutputFormat.class);
    FileInputFormat.setInputPaths(job, new Path("inputPview"));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:com.yahoo.druid.hadoop.DruidInputFormatTest.java
License:Apache License
@Test
public void testSampleMRJob() throws Exception {
    Job job = Job.getInstance(new Configuration(), "Druid-Loader-Sample-Test-Job");
    job.getConfiguration().set("mapreduce.job.acl-view-job", "*");
    job.getConfiguration().set("mapreduce.map.java.opts", "-Duser.timezone=UTC");
    job.getConfiguration().set(DruidInputFormat.CONF_DRUID_OVERLORD_HOSTPORT, "localhost:" + overlordTestPort);
    job.getConfiguration().set(DruidInputFormat.CONF_DRUID_SCHEMA,
            "{" + "\"dataSource\":\"testDataSource\","
                    + "\"interval\":\"1970-01-01T00:00:00.000Z/3000-01-01T00:00:00.000Z\","
                    + "\"granularity\":\"NONE\","
                    + "\"dimensions\":[\"host\"],"
                    + "\"metrics\":[\"visited_sum\",\"unique_hosts\"]" + "}");
    job.setMapperClass(SampleMapper.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(NullWritable.class);
    job.setInputFormatClass(DruidInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    String outputPath = tempFolder.newFolder() + "/out";
    TextOutputFormat.setOutputPath(job, new Path(outputPath));
    Assert.assertTrue(job.waitForCompletion(true));
    // verify that the SampleMapper actually ran and verified the data
    Assert.assertTrue(FileUtils.readFileToString(new File(outputPath + "/part-m-00000")).startsWith("SUCCESS"));
}
From source file:com.yahoo.druid.hadoop.example.SamplePrintMRJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    // When implementing Tool
    Configuration conf = this.getConf();
    // Create job
    Job job = new Job(conf, "Druid-Loader-Sample-Job");
    job.setJarByClass(SamplePrintMRJob.class);
    // job.setJobName("Druid-Loader-Sample-Job");
    job.getConfiguration().set("mapreduce.job.acl-view-job", "*");
    job.getConfiguration().set("mapreduce.job.queuename", "default");
    job.getConfiguration().set("mapreduce.map.java.opts", "-Duser.timezone=UTC");
    // job.getConfiguration().set("mapreduce.map.memory.mb", "1024");
    job.getConfiguration().set(DruidInputFormat.CONF_DRUID_STORAGE_STORAGE_DIR, "/tmp/druid/storage");
    job.getConfiguration().set(DruidInputFormat.CONF_DRUID_OVERLORD_HOSTPORT, "localhost:8080");
    job.getConfiguration().set(DruidInputFormat.CONF_DRUID_DATASOURCE, "wikipedia");
    job.getConfiguration().set(DruidInputFormat.CONF_DRUID_INTERVAL,
            "2009-01-01T00:00:00.000/2050-01-01T00:00:00.000");
    job.getConfiguration().set(DruidInputFormat.CONF_DRUID_SCHEMA_FILE, "/tmp/druid/schema/druid_fun_mr.json");
    job.setMapperClass(DruidPrintMapper.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(DateTime.class);
    job.setOutputValueClass(Map.class);
    job.setInputFormatClass(DruidInputFormat.class);
    job.setOutputFormatClass(NullOutputFormat.class);
    System.out.println("Starting Druid Loader Sample Job.....");
    return job.waitForCompletion(true) ? 0 : 1;
    // System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:com.yahoo.glimmer.indexing.generator.TripleIndexGenerator.java
License:Open Source License
public int run(String[] args) throws Exception {
    SimpleJSAP jsap = new SimpleJSAP(TripleIndexGenerator.class.getName(),
            "Generates a keyword index from RDF data.",
            new Parameter[] {
                    new Switch(NO_CONTEXTS_ARG, 'C', "withoutContexts",
                            "Don't process the contexts for each tuple."),
                    new FlaggedOption(METHOD_ARG, JSAP.STRING_PARSER, "horizontal", JSAP.REQUIRED, 'm',
                            METHOD_ARG, "horizontal or vertical."),
                    new FlaggedOption(PREDICATES_ARG, JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED,
                            'p', PREDICATES_ARG, "Subset of the properties to be indexed."),
                    new FlaggedOption(RESOURCE_PREFIX_ARG, JSAP.STRING_PARSER, "@", JSAP.NOT_REQUIRED, 'r',
                            RESOURCE_PREFIX_ARG,
                            "Prefix to add to object resource hash values when indexing. Stops queries for numbers matching resource hash values. Default is '@'"),
                    new UnflaggedOption("input", JSAP.STRING_PARSER, JSAP.REQUIRED,
                            "HDFS location for the input data."),
                    new UnflaggedOption(NUMBER_OF_DOCS_ARG, JSAP.LONG_PARSER, JSAP.REQUIRED,
                            "Number of documents to index"),
                    new UnflaggedOption("output", JSAP.STRING_PARSER, JSAP.REQUIRED,
                            "HDFS location for the output."),
                    new UnflaggedOption(RESOURCES_HASH_ARG, JSAP.STRING_PARSER, JSAP.REQUIRED,
                            "HDFS location of the resources hash file."), });

    JSAPResult jsapResult = jsap.parse(args);

    // check whether the command line was valid, and if it wasn't,
    // display usage information and exit.
    if (!jsapResult.success()) {
        System.err.println();
        System.err.println("Usage: java " + TripleIndexGenerator.class.getName());
        System.err.println("  " + jsap.getUsage());
        System.err.println();
        System.exit(1);
    }

    Job job = Job.getInstance(getConf());
    job.setJarByClass(TripleIndexGenerator.class);
    job.setJobName("TripleIndexGenerator" + System.currentTimeMillis());

    FileInputFormat.setInputPaths(job, new Path(jsapResult.getString("input")));
    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(DocumentMapper.class);
    job.setMapOutputKeyClass(TermKey.class);
    job.setMapOutputValueClass(TermValue.class);

    job.setPartitionerClass(TermKey.FirstPartitioner.class);
    job.setGroupingComparatorClass(TermKey.FirstGroupingComparator.class);

    job.setReducerClass(TermReduce.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(IndexRecordWriterValue.class);
    job.setOutputFormatClass(IndexRecordWriter.OutputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(jsapResult.getString("output")));

    Configuration conf = job.getConfiguration();
    conf.setClass("mapred.output.key.comparator.class", TermKey.Comparator.class, WritableComparator.class);
    conf.set("mapreduce.user.classpath.first", "true");

    long numDocs = jsapResult.getLong(NUMBER_OF_DOCS_ARG);
    conf.setLong(NUMBER_OF_DOCUMENTS, numDocs);
    // Set this in an attempt to get around the 2GB of ram task limit on our cluster.
    // Setting this in the hope of fixing Direct buffer memory errors
    conf.setInt(INDEX_WRITER_CACHE_SIZE, 1024 * 1024);
    conf.set(OUTPUT_DIR, jsapResult.getString("output"));

    boolean withContexts = !jsapResult.getBoolean(NO_CONTEXTS_ARG, false);
    if (jsapResult.getString(METHOD_ARG).equalsIgnoreCase(METHOD_ARG_VALUE_HORIZONTAL)) {
        HorizontalDocumentFactory.setupConf(conf, withContexts, jsapResult.getString(RESOURCES_HASH_ARG),
                jsapResult.getString(RESOURCE_PREFIX_ARG));
    } else if (jsapResult.getString(METHOD_ARG).equalsIgnoreCase(METHOD_ARG_VALUE_VERTICAL)) {
        if (!jsapResult.contains(PREDICATES_ARG)) {
            throw new IllegalArgumentException("When '" + METHOD_ARG + "' is '" + METHOD_ARG_VALUE_VERTICAL
                    + "' you have to give a predicates file too.");
        }
        VerticalDocumentFactory.setupConf(conf, withContexts, jsapResult.getString(RESOURCES_HASH_ARG),
                jsapResult.getString(RESOURCE_PREFIX_ARG), jsapResult.getString(PREDICATES_ARG));
    } else {
        throw new IllegalArgumentException(METHOD_ARG + " should be '" + METHOD_ARG_VALUE_HORIZONTAL + "' or '"
                + METHOD_ARG_VALUE_VERTICAL + "'");
    }

    conf.setInt("mapreduce.input.linerecordreader.line.maxlength", 1024 * 1024);

    boolean success = job.waitForCompletion(true);

    return success ? 0 : 1;
}