Example usage for org.apache.hadoop.mapred JobConf set

List of usage examples for org.apache.hadoop.mapred JobConf set

Introduction

On this page you can find usage examples for org.apache.hadoop.mapred JobConf set.

Prototype

public void set(String name, String value) 

Document

Set the value of the name property.
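
A minimal sketch of the call in isolation (the property names below are standard Hadoop keys; the surrounding class is illustrative only):

import org.apache.hadoop.mapred.JobConf;

public class JobConfSetExample {
    public static void main(String[] args) {
        JobConf conf = new JobConf();
        // set(name, value) stores a plain string property in the configuration
        conf.set("mapreduce.job.name", "jobconf-set-example");
        // typed setters such as setBoolean/setInt are convenience wrappers
        // that ultimately store string values as well
        conf.setBoolean("mapreduce.map.speculative", false);
        // values are read back with get(name) or get(name, defaultValue)
        System.out.println(conf.get("mapreduce.job.name"));
    }
}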

Usage

From source file:com.uber.hoodie.hadoop.realtime.HoodieRealtimeRecordReaderTest.java

License:Apache License

@Test
public void testSchemaEvolutionAndRollbackBlockInLastLogFile() throws Exception {
    // initial commit
    List<String> logFilePaths = new ArrayList<>();
    Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getSimpleSchema());
    HoodieTestUtils.initTableType(hadoopConf, basePath.getRoot().getAbsolutePath(),
            HoodieTableType.MERGE_ON_READ);
    String commitTime = "100";
    int numberOfRecords = 100;
    int numberOfLogRecords = numberOfRecords / 2;
    File partitionDir = InputFormatTestUtil.prepareSimpleParquetDataset(basePath, schema, 1, numberOfRecords,
            commitTime);
    InputFormatTestUtil.commit(basePath, commitTime);
    // Add the paths
    FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());
    List<Field> firstSchemaFields = schema.getFields();

    // update files and generate new log file but don't commit
    schema = SchemaTestUtil.getComplexEvolvedSchema();
    String newCommitTime = "101";
    HoodieLogFormat.Writer writer = writeDataBlockToLogFile(partitionDir, schema, "fileid0", commitTime,
            newCommitTime, numberOfLogRecords, 0, 1);
    long size = writer.getCurrentSize();
    logFilePaths.add(writer.getLogFile().getPath().toString());
    writer.close();
    assertTrue("block - size should be > 0", size > 0);

    // write rollback for the previous block in new log file version
    newCommitTime = "102";
    writer = writeRollbackBlockToLogFile(partitionDir, schema, "fileid0", commitTime, newCommitTime, "101", 1);
    size = writer.getCurrentSize();
    logFilePaths.add(writer.getLogFile().getPath().toString());
    writer.close();
    assertTrue("block - size should be > 0", size > 0);
    InputFormatTestUtil.deltaCommit(basePath, newCommitTime);

    //create a split with baseFile (parquet file written earlier) and new log file(s)
    HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(
            new FileSplit(new Path(partitionDir + "/fileid0_1_" + commitTime + ".parquet"), 0, 1, jobConf),
            basePath.getRoot().getPath(), logFilePaths, newCommitTime);

    //create a RecordReader to be used by HoodieRealtimeRecordReader
    RecordReader<NullWritable, ArrayWritable> reader = new MapredParquetInputFormat().getRecordReader(
            new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()), (String[]) null), jobConf, null);
    // fresh local JobConf for column projection; shadows the class-level jobConf used above
    JobConf jobConf = new JobConf();
    List<Schema.Field> fields = schema.getFields();

    assertFalse(firstSchemaFields.containsAll(fields));

    // Try to read all the fields passed by the new schema
    String names = fields.stream().map(f -> f.name()).collect(Collectors.joining(","));
    String positions = fields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(","));
    jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
    jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, positions);
    jobConf.set("partition_columns", "datestr");

    HoodieRealtimeRecordReader recordReader = null;
    try {
        // validate record reader compaction
        recordReader = new HoodieRealtimeRecordReader(split, jobConf, reader);
        throw new RuntimeException("should've failed the previous line");
    } catch (HoodieException e) {
        // expected, field not found since the data written with the evolved schema was rolled back
    }

    // Try to read all the fields passed by the new schema
    names = firstSchemaFields.stream().map(f -> f.name()).collect(Collectors.joining(","));
    positions = firstSchemaFields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(","));
    jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
    jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, positions);
    jobConf.set("partition_columns", "datestr");
    // This time read only the fields which are part of parquet
    recordReader = new HoodieRealtimeRecordReader(split, jobConf, reader);
    // use reader to read base Parquet File and log file
    NullWritable key = recordReader.createKey();
    ArrayWritable value = recordReader.createValue();
    while (recordReader.next(key, value)) {
        // keep reading
    }
}

From source file:com.xiaoxiaomo.mr.utils.kafka.HadoopJob.java

License:Apache License

public int run(String[] args) throws Exception {
    CommandLineParser parser = new PosixParser();
    Options options = buildOptions();
    CommandLine cmd = parser.parse(options, args);

    if (cmd.hasOption("h") || cmd.getArgs().length == 0) {
        printHelpAndExit(options);
    }

    String hdfsPath = cmd.getArgs()[0];
    Configuration conf = getConf();
    conf.setBoolean("mapred.map.tasks.speculative.execution", false);

    if (cmd.hasOption("topics")) {
        LOG.info("Using topics: " + cmd.getOptionValue("topics"));
        KafkaInputFormat.configureKafkaTopics(conf, cmd.getOptionValue("topics"));
    } else {
        printHelpAndExit(options);
    }

    KafkaInputFormat.configureZkConnection(conf, cmd.getOptionValue("zk-connect", "localhost:2181"));
    if (cmd.hasOption("consumer-group")) {
        CheckpointManager.configureUseZooKeeper(conf,
                cmd.getOptionValue("consumer-group", "dev-hadoop-loader"));
    }

    if (cmd.getOptionValue("autooffset-reset") != null) {
        KafkaInputFormat.configureAutoOffsetReset(conf, cmd.getOptionValue("autooffset-reset"));
    }

    JobConf jobConf = new JobConf(conf);
    if (cmd.hasOption("remote")) {
        String ip = cmd.getOptionValue("remote");
        LOG.info("Default file system: hdfs://" + ip + ":8020/");
        jobConf.set("fs.defaultFS", "hdfs://" + ip + ":8020/");
        LOG.info("Remote jobtracker: " + ip + ":8021");
        jobConf.set("mapred.job.tracker", ip + ":8021");
    }

    Path jarTarget = new Path(
            getClass().getProtectionDomain().getCodeSource().getLocation() + "../kafka-hadoop-loader.jar");

    if (new File(jarTarget.toUri()).exists()) {
        // running from IDE/ as maven
        jobConf.setJar(jarTarget.toUri().getPath());
        LOG.info("Using target jar: " + jarTarget.toString());
    } else {
        // running from jar remotely or locally
        jobConf.setJarByClass(getClass());
        LOG.info("Using parent jar: " + jobConf.getJar());
    }

    Job job = Job.getInstance(jobConf, "kafka.hadoop.loader");

    job.setInputFormatClass(KafkaInputFormat.class);
    job.setMapperClass(HadoopJobMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(MultiOutputFormat.class);
    job.setNumReduceTasks(0);

    MultiOutputFormat.setOutputPath(job, new Path(hdfsPath));
    MultiOutputFormat.setCompressOutput(job, cmd.getOptionValue("compress-output", "on").equals("on"));

    LOG.info("Output hdfs location: {}", hdfsPath);
    LOG.info("Output hdfs compression: {}", MultiOutputFormat.getCompressOutput(job));

    return job.waitForCompletion(true) ? 0 : -1;
}

From source file:com.yahoo.druid.hadoop.HiveDatasourceInputFormat.java

License:Apache License

@Override
public InputSplit[] getSplits(JobConf jobConf, int numSplits) throws IOException {
    logger.info("checkPost #5");

    String overlordUrl = jobConf.get(CONF_DRUID_OVERLORD_HOSTPORT);
    Preconditions.checkArgument(overlordUrl != null && !overlordUrl.isEmpty(),
            CONF_DRUID_OVERLORD_HOSTPORT + " not defined");

    logger.info("druid overlord url = " + overlordUrl);

    String schemaStr = jobConf.get(CONF_DRUID_SCHEMA);

    Preconditions.checkArgument(schemaStr != null && !schemaStr.isEmpty(),
            "schema undefined,  provide " + CONF_DRUID_SCHEMA);
    logger.info("schema = " + schemaStr);

    DatasourceIngestionSpec ingestionSpec = HadoopDruidIndexerConfig.JSON_MAPPER.readValue(schemaStr,
            DatasourceIngestionSpec.class);
    String segmentsStr = getSegmentsToLoad(ingestionSpec.getDataSource(), ingestionSpec.getIntervals(),
            overlordUrl);
    logger.info("segments list received from overlord = " + segmentsStr);

    List<DataSegment> segmentsList = HadoopDruidIndexerConfig.JSON_MAPPER.readValue(segmentsStr,
            new TypeReference<List<DataSegment>>() {
            });
    VersionedIntervalTimeline<String, DataSegment> timeline = new VersionedIntervalTimeline<>(
            Ordering.natural());
    for (DataSegment segment : segmentsList) {
        timeline.add(segment.getInterval(), segment.getVersion(), segment.getShardSpec().createChunk(segment));
    }
    final List<TimelineObjectHolder<String, DataSegment>> timeLineSegments = timeline
            .lookup(ingestionSpec.getIntervals().get(0));
    final List<WindowedDataSegment> windowedSegments = new ArrayList<>();
    for (TimelineObjectHolder<String, DataSegment> holder : timeLineSegments) {
        for (PartitionChunk<DataSegment> chunk : holder.getObject()) {
            windowedSegments.add(new WindowedDataSegment(chunk.getObject(), holder.getInterval()));
        }
    }

    jobConf.set(CONF_INPUT_SEGMENTS, HadoopDruidIndexerConfig.JSON_MAPPER.writeValueAsString(windowedSegments));

    segmentsStr = Preconditions.checkNotNull(jobConf.get(CONF_INPUT_SEGMENTS), "No segments found to read");
    List<WindowedDataSegment> segments = HadoopDruidIndexerConfig.JSON_MAPPER.readValue(segmentsStr,
            new TypeReference<List<WindowedDataSegment>>() {
            });
    if (segments == null || segments.size() == 0) {
        throw new ISE("No segments found to read");
    }

    logger.info("segments to read " + segmentsStr);

    long maxSize = numSplits;

    if (maxSize > 0) {
        // combining is to happen, let us sort the segments list by size so that
        // they are combined appropriately
        Collections.sort(segments, new Comparator<WindowedDataSegment>() {
            @Override
            public int compare(WindowedDataSegment s1, WindowedDataSegment s2) {
                return Long.compare(s1.getSegment().getSize(), s2.getSegment().getSize());
            }
        });
    }

    List<InputSplit> splits = Lists.newArrayList();

    List<WindowedDataSegment> list = new ArrayList<>();
    long size = 0;

    // JobConf dummyConf = new JobConf();
    Job job = new Job(jobConf);
    JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(job);
    Path[] paths = org.apache.hadoop.mapreduce.lib.input.FileInputFormat.getInputPaths(jobContext);
    logger.info("dummyPath : " + paths);

    jobConf.set("druid.hive.dummyfilename", paths[0].toString());

    InputFormat fio = supplier.get();
    for (WindowedDataSegment segment : segments) {
        if (size + segment.getSegment().getSize() > maxSize && size > 0) {
            splits.add(toDataSourceSplit(list, fio, jobConf, paths[0]));
            list = Lists.newArrayList();
            size = 0;
        }

        list.add(segment);
        size += segment.getSegment().getSize();
    }

    if (list.size() > 0) {
        splits.add(toDataSourceSplit(list, fio, jobConf, paths[0]));
    }

    logger.info("Number of splits: " + splits.size());
    for (InputSplit split : splits) {
        logger.info(split.getClass().getName());
        for (String location : split.getLocations())
            logger.info(location);
    }
    return Iterables.toArray(splits, InputSplit.class);
}

From source file:com.yolodata.tbana.cascading.csv.CSVLine.java

License:Open Source License

@Override
public void sourceConfInit(FlowProcess<JobConf> flowProcess, Tap<JobConf, RecordReader, OutputCollector> tap,
        JobConf conf) {
    if (hasZippedFiles(FileInputFormat.getInputPaths(conf)))
        throw new IllegalStateException(
                "cannot read zip files: " + Arrays.toString(FileInputFormat.getInputPaths(conf)));

    conf.set(CSVLineRecordReader.FORMAT_DELIMITER, CSVLineRecordReader.DEFAULT_DELIMITER);
    conf.set(CSVLineRecordReader.FORMAT_SEPARATOR, CSVLineRecordReader.DEFAULT_SEPARATOR);
    conf.setBoolean(CSVLineRecordReader.IS_ZIPFILE, false);
    conf.setInt(CSVNLineInputFormat.LINES_PER_MAP, 40000);

    conf.setInputFormat(CSVNLineInputFormat.class);
}

From source file:com.yolodata.tbana.cascading.shuttl.ShuttlCsv.java

License:Open Source License

@Override
public void sourceConfInit(FlowProcess<JobConf> flowProcess, Tap<JobConf, RecordReader, OutputCollector> tap,
        JobConf conf) {

    conf.set(ShuttlInputFormatConstants.INDEX_LIST, splunkDataQuery.getIndexesString());
    conf.set(ShuttlInputFormatConstants.EARLIEST_TIME, splunkDataQuery.getEarliestTimeString());
    conf.set(ShuttlInputFormatConstants.LATEST_TIME, splunkDataQuery.getLatestTimeString());
    conf.setInputFormat(ShuttlCSVInputFormat.class);
}

From source file:com.yolodata.tbana.cascading.splunk.SplunkScheme.java

License:Open Source License

@Override
public void sourceConfInit(FlowProcess<JobConf> flowProcess, Tap<JobConf, RecordReader, OutputCollector> tap,
        JobConf conf) {

    conf.setInputFormat(SplunkInputFormat.class);
    conf.set(SplunkConf.SPLUNK_SEARCH_QUERY, this.splunkDataQuery.getSplunkQuery());
    conf.set(SplunkConf.SPLUNK_EARLIEST_TIME, this.splunkDataQuery.getEarliestTimeString());
    conf.set(SplunkConf.SPLUNK_LATEST_TIME, this.splunkDataQuery.getLatestTimeString());
}

From source file:com.yolodata.tbana.cascading.splunk.SplunkTap.java

License:Open Source License

public void setConfKey(JobConf conf, String key) {
    String value = splunkLogin.getProperty(key, null);
    if (value != null)
        conf.set(key, value);
}

From source file:com.yolodata.tbana.hadoop.mapred.splunk.inputformat.TestMapper.java

License:Open Source License

public int run(String[] args) throws Exception {
    JobConf jobConf = new JobConf(getConf());

    jobConf.set(SplunkInputFormat.INPUTFORMAT_MODE, args[0]);
    jobConf.setJarByClass(SplunkTestRunner.class);
    jobConf.setNumReduceTasks(1);
    jobConf.setMapperClass(TestMapper.class);
    jobConf.setReducerClass(TestReducer.class);

    jobConf.setInputFormat(SplunkInputFormat.class);
    jobConf.setOutputKeyClass(LongWritable.class);
    jobConf.setOutputValueClass(Text.class);

    TextOutputFormat.setOutputPath(jobConf, new Path(args[1]));

    JobClient.runJob(jobConf);

    return 0;
}

From source file:com.yolodata.tbana.spark.SplunkResultCountExample.java

License:Open Source License

private static void run(JavaSparkContext sparkContext) {
    JobConf conf = new JobConf();

    conf.set(SplunkConf.SPLUNK_USERNAME, "admin");
    conf.set(SplunkConf.SPLUNK_PASSWORD, "changeIt");
    conf.set(SplunkConf.SPLUNK_HOST, "localhost");
    conf.set(SplunkConf.SPLUNK_PORT, "9050");

    SplunkDataQuery query = new SplunkDataQuery();
    conf.set(SplunkConf.SPLUNK_EARLIEST_TIME, query.getEarliestTimeString());
    conf.set(SplunkConf.SPLUNK_LATEST_TIME, query.getLatestTimeString());
    conf.set(SplunkConf.SPLUNK_SEARCH_QUERY, query.getSplunkQuery());

    SplunkRDD rdd = new SplunkRDD(sparkContext.sc(), conf, 2);

    System.out.println("Line count: " + rdd.count());
}

From source file:com.zfylin.demo.bigdata.hadoop.mr.WordCount2.java

License:Apache License

public static void main(String[] args) throws Exception {
    System.setProperty("HADOOP_USER_NAME", "hdfs");

    // Input path on HDFS
    String input = "hdfs://hadoop-master:8020/data/hive/warehouse/channel_test.db/tbl_student";
    // Output path on HDFS (must not already exist when the job is submitted)
    String output = "hdfs://hadoop-master:8020/data/hive/warehouse/channel_test.db/tbl_student/output/";

    JobConf conf = new JobConf(WordCount2.class);
    /**
     * Avoids "Exception message: /bin/bash: line 0: fg: no job control"
     * when submitting from a Windows client to a Linux cluster.
     */
    conf.set("mapreduce.app-submission.cross-platform", "true");

    conf.setJobName("WordCount");
    //        conf.addResource("classpath:/hadoop/core-site.xml");
    //        conf.addResource("classpath:/hadoop/hdfs-site.xml");
    //        conf.addResource("classpath:/hadoop/mapred-site.xml");
    // Output key type
    conf.setOutputKeyClass(Text.class);
    // Output value type
    conf.setOutputValueClass(IntWritable.class);
    // Mapper class
    conf.setMapperClass(WordCountMapper.class);
    /**
     * Combiner: runs the reducer logic on each mapper's local output
     * before the shuffle, reducing the data sent across the network.
     */
    conf.setCombinerClass(WordCountReducer.class);
    // Reducer class
    conf.setReducerClass(WordCountReducer.class);
    /**
     * TextInputFormat: keys are LongWritable byte offsets,
     * values are Text lines.
     */
    conf.setInputFormat(TextInputFormat.class);
    /**
     * TextOutputFormat: writes each key/value pair as text via toString().
     */
    conf.setOutputFormat(TextOutputFormat.class);
    // Input path
    FileInputFormat.setInputPaths(conf, new Path(input));
    // Output path
    FileOutputFormat.setOutputPath(conf, new Path(output));
    // Submit the job and wait for completion
    JobClient.runJob(conf);
    System.exit(0);
}