List of usage examples for org.apache.hadoop.mapred JobConf set
public void set(String name, String value)
Sets the value of the name property.
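For reference, a minimal sketch of the call outside any job setup (assuming only the standard Hadoop mapred client on the classpath; the property name used here is purely illustrative):

import org.apache.hadoop.mapred.JobConf;

public class JobConfSetExample {
    public static void main(String[] args) {
        // JobConf extends Configuration, so set(name, value) stores a plain string property
        JobConf conf = new JobConf();
        conf.set("mapreduce.job.queuename", "default");
        // get(name) returns the stored value, or null if the property was never set
        System.out.println(conf.get("mapreduce.job.queuename"));
    }
}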
From source file:com.uber.hoodie.hadoop.realtime.HoodieRealtimeRecordReaderTest.java
License:Apache License
@Test
public void testSchemaEvolutionAndRollbackBlockInLastLogFile() throws Exception {
    // initial commit
    List<String> logFilePaths = new ArrayList<>();
    Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getSimpleSchema());
    HoodieTestUtils.initTableType(hadoopConf, basePath.getRoot().getAbsolutePath(),
        HoodieTableType.MERGE_ON_READ);
    String commitTime = "100";
    int numberOfRecords = 100;
    int numberOfLogRecords = numberOfRecords / 2;
    File partitionDir = InputFormatTestUtil.prepareSimpleParquetDataset(basePath, schema, 1, numberOfRecords,
        commitTime);
    InputFormatTestUtil.commit(basePath, commitTime);
    // Add the paths
    FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());
    List<Field> firstSchemaFields = schema.getFields();

    // update files and generate new log file but don't commit
    schema = SchemaTestUtil.getComplexEvolvedSchema();
    String newCommitTime = "101";
    HoodieLogFormat.Writer writer = writeDataBlockToLogFile(partitionDir, schema, "fileid0", commitTime,
        newCommitTime, numberOfLogRecords, 0, 1);
    long size = writer.getCurrentSize();
    logFilePaths.add(writer.getLogFile().getPath().toString());
    writer.close();
    assertTrue("block - size should be > 0", size > 0);

    // write rollback for the previous block in new log file version
    newCommitTime = "102";
    writer = writeRollbackBlockToLogFile(partitionDir, schema, "fileid0", commitTime, newCommitTime, "101", 1);
    logFilePaths.add(writer.getLogFile().getPath().toString());
    writer.close();
    assertTrue("block - size should be > 0", size > 0);
    InputFormatTestUtil.deltaCommit(basePath, newCommitTime);

    // create a split with baseFile (parquet file written earlier) and new log file(s)
    HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(
        new FileSplit(new Path(partitionDir + "/fileid0_1_" + commitTime + ".parquet"), 0, 1, jobConf),
        basePath.getRoot().getPath(), logFilePaths, newCommitTime);

    // create a RecordReader to be used by HoodieRealtimeRecordReader
    RecordReader<NullWritable, ArrayWritable> reader = new MapredParquetInputFormat().getRecordReader(
        new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()), (String[]) null), jobConf, null);

    JobConf jobConf = new JobConf();
    List<Schema.Field> fields = schema.getFields();
    assert (firstSchemaFields.containsAll(fields) == false);

    // Try to read all the fields passed by the new schema
    String names = fields.stream().map(f -> f.name()).collect(Collectors.joining(","));
    String positions = fields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(","));
    jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
    jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, positions);
    jobConf.set("partition_columns", "datestr");

    HoodieRealtimeRecordReader recordReader = null;
    try {
        // validate record reader compaction
        recordReader = new HoodieRealtimeRecordReader(split, jobConf, reader);
        throw new RuntimeException("should've failed the previous line");
    } catch (HoodieException e) {
        // expected, field not found since the data written with the evolved schema was rolled back
    }

    // Now project only the fields from the first (original) schema
    names = firstSchemaFields.stream().map(f -> f.name()).collect(Collectors.joining(","));
    positions = firstSchemaFields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(","));
    jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
    jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, positions);
    jobConf.set("partition_columns", "datestr");

    // This time read only the fields which are part of the parquet file
    recordReader = new HoodieRealtimeRecordReader(split, jobConf, reader);
    // use reader to read base Parquet File and log file
    NullWritable key = recordReader.createKey();
    ArrayWritable value = recordReader.createValue();
    while (recordReader.next(key, value)) {
        // keep reading
    }
}
From source file:com.xiaoxiaomo.mr.utils.kafka.HadoopJob.java
License:Apache License
public int run(String[] args) throws Exception {
    CommandLineParser parser = new PosixParser();
    Options options = buildOptions();
    CommandLine cmd = parser.parse(options, args);
    if (cmd.hasOption("h") || cmd.getArgs().length == 0) {
        printHelpAndExit(options);
    }
    String hdfsPath = cmd.getArgs()[0];

    Configuration conf = getConf();
    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    if (cmd.hasOption("topics")) {
        LOG.info("Using topics: " + cmd.getOptionValue("topics"));
        KafkaInputFormat.configureKafkaTopics(conf, cmd.getOptionValue("topics"));
    } else {
        printHelpAndExit(options);
    }
    KafkaInputFormat.configureZkConnection(conf, cmd.getOptionValue("zk-connect", "localhost:2181"));
    if (cmd.hasOption("consumer-group")) {
        CheckpointManager.configureUseZooKeeper(conf, cmd.getOptionValue("consumer-group", "dev-hadoop-loader"));
    }
    if (cmd.getOptionValue("autooffset-reset") != null) {
        KafkaInputFormat.configureAutoOffsetReset(conf, cmd.getOptionValue("autooffset-reset"));
    }

    JobConf jobConf = new JobConf(conf);
    if (cmd.hasOption("remote")) {
        String ip = cmd.getOptionValue("remote");
        LOG.info("Default file system: hdfs://" + ip + ":8020/");
        jobConf.set("fs.defaultFS", "hdfs://" + ip + ":8020/");
        LOG.info("Remote jobtracker: " + ip + ":8021");
        jobConf.set("mapred.job.tracker", ip + ":8021");
    }

    Path jarTarget = new Path(
        getClass().getProtectionDomain().getCodeSource().getLocation() + "../kafka-hadoop-loader.jar");
    if (new File(jarTarget.toUri()).exists()) {
        // running from IDE / as maven
        jobConf.setJar(jarTarget.toUri().getPath());
        LOG.info("Using target jar: " + jarTarget.toString());
    } else {
        // running from jar remotely or locally
        jobConf.setJarByClass(getClass());
        LOG.info("Using parent jar: " + jobConf.getJar());
    }

    Job job = Job.getInstance(jobConf, "kafka.hadoop.loader");
    job.setInputFormatClass(KafkaInputFormat.class);
    job.setMapperClass(HadoopJobMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(MultiOutputFormat.class);
    job.setNumReduceTasks(0);

    MultiOutputFormat.setOutputPath(job, new Path(hdfsPath));
    MultiOutputFormat.setCompressOutput(job, cmd.getOptionValue("compress-output", "on").equals("on"));

    LOG.info("Output hdfs location: {}", hdfsPath);
    LOG.info("Output hdfs compression: {}", MultiOutputFormat.getCompressOutput(job));

    return job.waitForCompletion(true) ? 0 : -1;
}
From source file:com.yahoo.druid.hadoop.HiveDatasourceInputFormat.java
License:Apache License
@Override
public InputSplit[] getSplits(JobConf jobConf, int numSplits) throws IOException {
    logger.info("checkPost #5");

    String overlordUrl = jobConf.get(CONF_DRUID_OVERLORD_HOSTPORT);
    Preconditions.checkArgument(overlordUrl != null && !overlordUrl.isEmpty(),
        CONF_DRUID_OVERLORD_HOSTPORT + " not defined");
    logger.info("druid overlord url = " + overlordUrl);

    String schemaStr = jobConf.get(CONF_DRUID_SCHEMA);
    Preconditions.checkArgument(schemaStr != null && !schemaStr.isEmpty(),
        "schema undefined, provide " + CONF_DRUID_SCHEMA);
    logger.info("schema = " + schemaStr);

    DatasourceIngestionSpec ingestionSpec = HadoopDruidIndexerConfig.JSON_MAPPER.readValue(schemaStr,
        DatasourceIngestionSpec.class);
    String segmentsStr = getSegmentsToLoad(ingestionSpec.getDataSource(), ingestionSpec.getIntervals(),
        overlordUrl);
    logger.info("segments list received from overlord = " + segmentsStr);

    List<DataSegment> segmentsList = HadoopDruidIndexerConfig.JSON_MAPPER.readValue(segmentsStr,
        new TypeReference<List<DataSegment>>() {
        });
    VersionedIntervalTimeline<String, DataSegment> timeline = new VersionedIntervalTimeline<>(
        Ordering.natural());
    for (DataSegment segment : segmentsList) {
        timeline.add(segment.getInterval(), segment.getVersion(), segment.getShardSpec().createChunk(segment));
    }
    final List<TimelineObjectHolder<String, DataSegment>> timeLineSegments = timeline
        .lookup(ingestionSpec.getIntervals().get(0));
    final List<WindowedDataSegment> windowedSegments = new ArrayList<>();
    for (TimelineObjectHolder<String, DataSegment> holder : timeLineSegments) {
        for (PartitionChunk<DataSegment> chunk : holder.getObject()) {
            windowedSegments.add(new WindowedDataSegment(chunk.getObject(), holder.getInterval()));
        }
    }
    jobConf.set(CONF_INPUT_SEGMENTS, HadoopDruidIndexerConfig.JSON_MAPPER.writeValueAsString(windowedSegments));

    segmentsStr = Preconditions.checkNotNull(jobConf.get(CONF_INPUT_SEGMENTS), "No segments found to read");
    List<WindowedDataSegment> segments = HadoopDruidIndexerConfig.JSON_MAPPER.readValue(segmentsStr,
        new TypeReference<List<WindowedDataSegment>>() {
        });
    if (segments == null || segments.size() == 0) {
        throw new ISE("No segments found to read");
    }
    logger.info("segments to read " + segmentsStr);

    long maxSize = numSplits;
    if (maxSize > 0) {
        // combining is to happen, let us sort the segments list by size so that they
        // are combined appropriately
        Collections.sort(segments, new Comparator<WindowedDataSegment>() {
            @Override
            public int compare(WindowedDataSegment s1, WindowedDataSegment s2) {
                return Long.compare(s1.getSegment().getSize(), s2.getSegment().getSize());
            }
        });
    }

    List<InputSplit> splits = Lists.newArrayList();
    List<WindowedDataSegment> list = new ArrayList<>();
    long size = 0;

    // JobConf dummyConf = new JobConf();
    Job job = new Job(jobConf);
    JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(job);
    Path[] paths = org.apache.hadoop.mapreduce.lib.input.FileInputFormat.getInputPaths(jobContext);
    logger.info("dummyPath : " + paths);
    jobConf.set("druid.hive.dummyfilename", paths[0].toString());

    InputFormat fio = supplier.get();
    for (WindowedDataSegment segment : segments) {
        if (size + segment.getSegment().getSize() > maxSize && size > 0) {
            splits.add(toDataSourceSplit(list, fio, jobConf, paths[0]));
            list = Lists.newArrayList();
            size = 0;
        }
        list.add(segment);
        size += segment.getSegment().getSize();
    }
    if (list.size() > 0) {
        splits.add(toDataSourceSplit(list, fio, jobConf, paths[0]));
    }

    logger.info("Number of splits: " + splits.size());
    for (InputSplit split : splits) {
        logger.info(split.getClass().getName());
        for (String location : split.getLocations()) {
            logger.info(location);
        }
    }
    return Iterables.toArray(splits, InputSplit.class);
}
From source file:com.yolodata.tbana.cascading.csv.CSVLine.java
License:Open Source License
@Override
public void sourceConfInit(FlowProcess<JobConf> flowProcess, Tap<JobConf, RecordReader, OutputCollector> tap,
        JobConf conf) {
    if (hasZippedFiles(FileInputFormat.getInputPaths(conf)))
        throw new IllegalStateException(
            "cannot read zip files: " + Arrays.toString(FileInputFormat.getInputPaths(conf)));

    conf.set(CSVLineRecordReader.FORMAT_DELIMITER, CSVLineRecordReader.DEFAULT_DELIMITER);
    conf.set(CSVLineRecordReader.FORMAT_SEPARATOR, CSVLineRecordReader.DEFAULT_SEPARATOR);
    conf.setBoolean(CSVLineRecordReader.IS_ZIPFILE, false);
    conf.setInt(CSVNLineInputFormat.LINES_PER_MAP, 40000);
    conf.setInputFormat(CSVNLineInputFormat.class);
}
From source file:com.yolodata.tbana.cascading.shuttl.ShuttlCsv.java
License:Open Source License
@Override
public void sourceConfInit(FlowProcess<JobConf> flowProcess, Tap<JobConf, RecordReader, OutputCollector> tap,
        JobConf conf) {
    conf.set(ShuttlInputFormatConstants.INDEX_LIST, splunkDataQuery.getIndexesString());
    conf.set(ShuttlInputFormatConstants.EARLIEST_TIME, splunkDataQuery.getEarliestTimeString());
    conf.set(ShuttlInputFormatConstants.LATEST_TIME, splunkDataQuery.getLatestTimeString());
    conf.setInputFormat(ShuttlCSVInputFormat.class);
}
From source file:com.yolodata.tbana.cascading.splunk.SplunkScheme.java
License:Open Source License
@Override
public void sourceConfInit(FlowProcess<JobConf> flowProcess, Tap<JobConf, RecordReader, OutputCollector> tap,
        JobConf conf) {
    conf.setInputFormat(SplunkInputFormat.class);
    conf.set(SplunkConf.SPLUNK_SEARCH_QUERY, this.splunkDataQuery.getSplunkQuery());
    conf.set(SplunkConf.SPLUNK_EARLIEST_TIME, this.splunkDataQuery.getEarliestTimeString());
    conf.set(SplunkConf.SPLUNK_LATEST_TIME, this.splunkDataQuery.getLatestTimeString());
}
From source file:com.yolodata.tbana.cascading.splunk.SplunkTap.java
License:Open Source License
public void setConfKey(JobConf conf, String key) {
    String value = splunkLogin.getProperty(key, null);
    if (value != null)
        conf.set(key, value);
}
From source file:com.yolodata.tbana.hadoop.mapred.splunk.inputformat.TestMapper.java
License:Open Source License
public int run(String[] args) throws Exception {
    JobConf jobConf = new JobConf(getConf());
    jobConf.set(SplunkInputFormat.INPUTFORMAT_MODE, args[0]);
    jobConf.setJarByClass(SplunkTestRunner.class);
    jobConf.setNumReduceTasks(1);
    jobConf.setMapperClass(TestMapper.class);
    jobConf.setReducerClass(TestReducer.class);
    jobConf.setInputFormat(SplunkInputFormat.class);
    jobConf.setOutputKeyClass(LongWritable.class);
    jobConf.setOutputValueClass(Text.class);
    TextOutputFormat.setOutputPath(jobConf, new Path(args[1]));
    JobClient.runJob(jobConf);
    return 0;
}
From source file:com.yolodata.tbana.spark.SplunkResultCountExample.java
License:Open Source License
private static void run(JavaSparkContext sparkContext) {
    JobConf conf = new JobConf();
    conf.set(SplunkConf.SPLUNK_USERNAME, "admin");
    conf.set(SplunkConf.SPLUNK_PASSWORD, "changeIt");
    conf.set(SplunkConf.SPLUNK_HOST, "localhost");
    conf.set(SplunkConf.SPLUNK_PORT, "9050");

    SplunkDataQuery query = new SplunkDataQuery();
    conf.set(SplunkConf.SPLUNK_EARLIEST_TIME, query.getEarliestTimeString());
    conf.set(SplunkConf.SPLUNK_LATEST_TIME, query.getLatestTimeString());
    conf.set(SplunkConf.SPLUNK_SEARCH_QUERY, query.getSplunkQuery());

    SplunkRDD rdd = new SplunkRDD(sparkContext.sc(), conf, 2);
    System.out.println("Line count: " + rdd.count());
}
From source file:com.zfylin.demo.bigdata.hadoop.mr.WordCount2.java
License:Apache License
public static void main(String[] args) throws Exception {
    System.setProperty("HADOOP_USER_NAME", "hdfs");

    // input and output locations on HDFS
    String input = "hdfs://hadoop-master:8020/data/hive/warehouse/channel_test.db/tbl_student";
    String output = "hdfs://hadoop-master:8020/data/hive/warehouse/channel_test.db/tbl_student/output/";

    JobConf conf = new JobConf(WordCount2.class);
    // avoids "ERROR: Exception message: /bin/bash: line 0: fg: no job control" on cross-platform submission
    conf.set("mapreduce.app-submission.cross-platform", "true");
    conf.setJobName("WordCount");
    // conf.addResource("classpath:/hadoop/core-site.xml");
    // conf.addResource("classpath:/hadoop/hdfs-site.xml");
    // conf.addResource("classpath:/hadoop/mapred-site.xml");

    // output key type (Text) and output value type (IntWritable)
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    // mapper
    conf.setMapperClass(WordCountMapper.class);
    // use the reducer as a combiner to pre-aggregate map output locally
    conf.setCombinerClass(WordCountReducer.class);
    // reducer
    conf.setReducerClass(WordCountReducer.class);

    // TextInputFormat: keys are LongWritable byte offsets, values are Text lines
    conf.setInputFormat(TextInputFormat.class);
    // TextOutputFormat: writes keys and values via toString()
    conf.setOutputFormat(TextOutputFormat.class);

    // input and output paths
    FileInputFormat.setInputPaths(conf, new Path(input));
    FileOutputFormat.setOutputPath(conf, new Path(output));

    // submit the job and wait for completion
    JobClient.runJob(conf);
    System.exit(0);
}