List of usage examples for org.apache.hadoop.mapreduce Job getConfiguration
public Configuration getConfiguration()
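The Configuration returned by getConfiguration() is the job's live configuration: properties set on it before submission are serialized with the job and visible to its tasks. A minimal sketch of the typical driver-side pattern (the job name and property key here are arbitrary):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class GetConfigurationExample {
    public static void main(String[] args) throws Exception {
        // Construct a job, then mutate its Configuration in place.
        Job job = new Job(new Configuration(), "example");
        Configuration conf = job.getConfiguration();
        conf.set("io.sort.record.percent", "0.19");

        // Reads back the value set above; in a real driver the job would be
        // configured further and then submitted.
        System.out.println(conf.get("io.sort.record.percent"));
    }
}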
From source file: com.metamx.druid.indexer.DeterminePartitionsJob.java
License: Open Source License

public static void injectSystemProperties(Job job) {
    final Configuration conf = job.getConfiguration();
    for (String propName : System.getProperties().stringPropertyNames()) {
        if (propName.startsWith("hadoop.")) {
            conf.set(propName.substring("hadoop.".length()), System.getProperty(propName));
        }
    }
}
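A hypothetical invocation of the helper above (not part of the source): any JVM system property prefixed with "hadoop." is copied into the job's Configuration with the prefix stripped, so a property set with -Dhadoop.fs.defaultFS=... ends up under fs.defaultFS. The file system URI is an illustrative value.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

import com.metamx.druid.indexer.DeterminePartitionsJob;

public class InjectSystemPropertiesExample {
    public static void main(String[] args) throws Exception {
        // Equivalent to launching the JVM with -Dhadoop.fs.defaultFS=hdfs://namenode:8020
        System.setProperty("hadoop.fs.defaultFS", "hdfs://namenode:8020");

        Job job = new Job(new Configuration(), "example");
        DeterminePartitionsJob.injectSystemProperties(job);

        // The "hadoop." prefix is stripped, so the value lands under "fs.defaultFS".
        System.out.println(job.getConfiguration().get("fs.defaultFS"));
    }
}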
From source file: com.metamx.druid.indexer.DeterminePartitionsJob.java
License: Open Source License

public boolean run() {
    try {
        /*
         * Group by (timestamp, dimensions) so we can correctly count dimension values as they would appear
         * in the final segment.
         */
        if (!config.getPartitionsSpec().isAssumeGrouped()) {
            final Job groupByJob = new Job(new Configuration(), String.format(
                    "%s-determine_partitions_groupby-%s", config.getDataSource(), config.getIntervals()));

            injectSystemProperties(groupByJob);
            groupByJob.setInputFormatClass(TextInputFormat.class);
            groupByJob.setMapperClass(DeterminePartitionsGroupByMapper.class);
            groupByJob.setMapOutputKeyClass(BytesWritable.class);
            groupByJob.setMapOutputValueClass(NullWritable.class);
            groupByJob.setCombinerClass(DeterminePartitionsGroupByReducer.class);
            groupByJob.setReducerClass(DeterminePartitionsGroupByReducer.class);
            groupByJob.setOutputKeyClass(BytesWritable.class);
            groupByJob.setOutputValueClass(NullWritable.class);
            groupByJob.setOutputFormatClass(SequenceFileOutputFormat.class);
            groupByJob.setJarByClass(DeterminePartitionsJob.class);

            config.addInputPaths(groupByJob);
            config.intoConfiguration(groupByJob);
            FileOutputFormat.setOutputPath(groupByJob, config.makeGroupedDataDir());

            groupByJob.submit();
            log.info("Job %s submitted, status available at: %s", groupByJob.getJobName(),
                    groupByJob.getTrackingURL());

            if (!groupByJob.waitForCompletion(true)) {
                log.error("Job failed: %s", groupByJob.getJobID());
                return false;
            }
        } else {
            log.info("Skipping group-by job.");
        }

        /*
         * Read grouped data and determine appropriate partitions.
         */
        final Job dimSelectionJob = new Job(new Configuration(), String.format(
                "%s-determine_partitions_dimselection-%s", config.getDataSource(), config.getIntervals()));

        dimSelectionJob.getConfiguration().set("io.sort.record.percent", "0.19");

        injectSystemProperties(dimSelectionJob);

        if (!config.getPartitionsSpec().isAssumeGrouped()) {
            // Read grouped data from the groupByJob.
            dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionPostGroupByMapper.class);
            dimSelectionJob.setInputFormatClass(SequenceFileInputFormat.class);
            FileInputFormat.addInputPath(dimSelectionJob, config.makeGroupedDataDir());
        } else {
            // Directly read the source data, since we assume it's already grouped.
            dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionAssumeGroupedMapper.class);
            dimSelectionJob.setInputFormatClass(TextInputFormat.class);
            config.addInputPaths(dimSelectionJob);
        }

        SortableBytes.useSortableBytesAsMapOutputKey(dimSelectionJob);
        dimSelectionJob.setMapOutputValueClass(Text.class);
        dimSelectionJob.setCombinerClass(DeterminePartitionsDimSelectionCombiner.class);
        dimSelectionJob.setReducerClass(DeterminePartitionsDimSelectionReducer.class);
        dimSelectionJob.setOutputKeyClass(BytesWritable.class);
        dimSelectionJob.setOutputValueClass(Text.class);
        dimSelectionJob.setOutputFormatClass(DeterminePartitionsDimSelectionOutputFormat.class);
        dimSelectionJob.setJarByClass(DeterminePartitionsJob.class);

        config.intoConfiguration(dimSelectionJob);
        FileOutputFormat.setOutputPath(dimSelectionJob, config.makeIntermediatePath());

        dimSelectionJob.submit();
        log.info("Job %s submitted, status available at: %s", dimSelectionJob.getJobName(),
                dimSelectionJob.getTrackingURL());

        if (!dimSelectionJob.waitForCompletion(true)) {
            log.error("Job failed: %s", dimSelectionJob.getJobID().toString());
            return false;
        }

        /*
         * Load partitions determined by the previous job.
         */
        log.info("Job completed, loading up partitions for intervals[%s].", config.getSegmentGranularIntervals());
        FileSystem fileSystem = null;
        Map<DateTime, List<HadoopyShardSpec>> shardSpecs = Maps.newTreeMap(DateTimeComparator.getInstance());
        int shardCount = 0;
        for (Interval segmentGranularity : config.getSegmentGranularIntervals()) {
            DateTime bucket = segmentGranularity.getStart();

            final Path partitionInfoPath = config.makeSegmentPartitionInfoPath(new Bucket(0, bucket, 0));
            if (fileSystem == null) {
                fileSystem = partitionInfoPath.getFileSystem(dimSelectionJob.getConfiguration());
            }
            if (fileSystem.exists(partitionInfoPath)) {
                List<ShardSpec> specs = config.jsonMapper.readValue(
                        Utils.openInputStream(dimSelectionJob, partitionInfoPath),
                        new TypeReference<List<ShardSpec>>() {
                        });

                List<HadoopyShardSpec> actualSpecs = Lists.newArrayListWithExpectedSize(specs.size());
                for (int i = 0; i < specs.size(); ++i) {
                    actualSpecs.add(new HadoopyShardSpec(specs.get(i), shardCount++));
                    log.info("DateTime[%s], partition[%d], spec[%s]", bucket, i, actualSpecs.get(i));
                }

                shardSpecs.put(bucket, actualSpecs);
            } else {
                log.info("Path[%s] didn't exist!?", partitionInfoPath);
            }
        }
        config.setShardSpecs(shardSpecs);

        return true;
    } catch (Exception e) {
        throw Throwables.propagate(e);
    }
}
From source file: com.metamx.druid.indexer.HadoopDruidIndexerConfig.java
License: Open Source License

public void intoConfiguration(Job job) {
    Configuration conf = job.getConfiguration();

    try {
        conf.set(CONFIG_PROPERTY, jsonMapper.writeValueAsString(this));
    } catch (IOException e) {
        throw Throwables.propagate(e);
    }
}
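intoConfiguration() covers the driver side: the whole indexer config is serialized to JSON and stashed in the job's Configuration. A hypothetical task-side counterpart (not taken from the source) would read it back in a Mapper's setup(); it assumes CONFIG_PROPERTY and jsonMapper are the same constant and ObjectMapper used above and are accessible as static members, and the mapper's key/value types are placeholders:

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import com.metamx.druid.indexer.HadoopDruidIndexerConfig;

public class ConfigReadingMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
    private HadoopDruidIndexerConfig config;

    @Override
    protected void setup(Context context) throws IOException {
        // context.getConfiguration() returns the same Configuration the driver
        // populated via job.getConfiguration() before submission.
        // ASSUMPTION: CONFIG_PROPERTY and jsonMapper are visible here and match
        // the writer's side in intoConfiguration().
        String json = context.getConfiguration().get(HadoopDruidIndexerConfig.CONFIG_PROPERTY);
        config = HadoopDruidIndexerConfig.jsonMapper.readValue(json, HadoopDruidIndexerConfig.class);
    }
}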
From source file: com.metamx.druid.indexer.HadoopDruidIndexerJob.java
License: Open Source License

private void ensurePaths() {
    // config.addInputPaths() can have side-effects ( boo! :( ), so this stuff needs to be done before anything else
    try {
        Job job = new Job(new Configuration(),
                String.format("%s-determine_partitions-%s", config.getDataSource(), config.getIntervals()));

        job.getConfiguration().set("io.sort.record.percent", "0.19");
        for (String propName : System.getProperties().stringPropertyNames()) {
            Configuration conf = job.getConfiguration();
            if (propName.startsWith("hadoop.")) {
                conf.set(propName.substring("hadoop.".length()), System.getProperty(propName));
            }
        }

        config.addInputPaths(job);
    } catch (IOException e) {
        throw Throwables.propagate(e);
    }
}
From source file: com.metamx.druid.indexer.IndexGeneratorJob.java
License: Open Source License

public boolean run() {
    try {
        Job job = new Job(new Configuration(),
                String.format("%s-index-generator-%s", config.getDataSource(), config.getIntervals()));

        job.getConfiguration().set("io.sort.record.percent", "0.23");

        for (String propName : System.getProperties().stringPropertyNames()) {
            Configuration conf = job.getConfiguration();
            if (propName.startsWith("hadoop.")) {
                conf.set(propName.substring("hadoop.".length()), System.getProperty(propName));
            }
        }

        job.setInputFormatClass(TextInputFormat.class);
        job.setMapperClass(IndexGeneratorMapper.class);
        job.setMapOutputValueClass(Text.class);

        SortableBytes.useSortableBytesAsMapOutputKey(job);

        job.setNumReduceTasks(Iterables.size(config.getAllBuckets()));
        job.setPartitionerClass(IndexGeneratorPartitioner.class);

        job.setReducerClass(IndexGeneratorReducer.class);
        job.setOutputKeyClass(BytesWritable.class);
        job.setOutputValueClass(Text.class);
        job.setOutputFormatClass(IndexGeneratorOutputFormat.class);
        FileOutputFormat.setOutputPath(job, config.makeIntermediatePath());

        config.addInputPaths(job);
        config.intoConfiguration(job);

        job.setJarByClass(IndexGeneratorJob.class);

        job.submit();
        log.info("Job %s submitted, status available at %s", job.getJobName(), job.getTrackingURL());

        boolean success = job.waitForCompletion(true);

        Counter invalidRowCount = job.getCounters()
                .findCounter(HadoopDruidIndexerConfig.IndexJobCounters.INVALID_ROW_COUNTER);
        jobStats.setInvalidRowCount(invalidRowCount.getValue());

        return success;
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
From source file: com.metamx.druid.indexer.path.GranularityPathSpec.java
License: Open Source License

@Override
public Job addInputPaths(HadoopDruidIndexerConfig config, Job job) throws IOException {
    final Set<Interval> intervals = Sets.newTreeSet(Comparators.intervals());
    for (Interval segmentInterval : config.getSegmentGranularIntervals()) {
        for (Interval dataInterval : dataGranularity.getIterable(segmentInterval)) {
            intervals.add(dataInterval);
        }
    }

    Path betaInput = new Path(inputPath);
    FileSystem fs = betaInput.getFileSystem(job.getConfiguration());
    Set<String> paths = Sets.newTreeSet();
    Pattern fileMatcher = Pattern.compile(filePattern);

    DateTimeFormatter customFormatter = null;
    if (pathFormat != null) {
        customFormatter = DateTimeFormat.forPattern(pathFormat);
    }

    for (Interval interval : intervals) {
        DateTime t = interval.getStart();
        String intervalPath = null;
        if (customFormatter != null) {
            intervalPath = customFormatter.print(t);
        } else {
            intervalPath = dataGranularity.toPath(t);
        }

        Path granularPath = new Path(betaInput, intervalPath);
        log.info("Checking path[%s]", granularPath);
        for (FileStatus status : FSSpideringIterator.spiderIterable(fs, granularPath)) {
            final Path filePath = status.getPath();
            if (fileMatcher.matcher(filePath.toString()).matches()) {
                paths.add(filePath.toString());
            }
        }
    }

    for (String path : paths) {
        log.info("Appending path[%s]", path);
        FileInputFormat.addInputPath(job, new Path(path));
    }

    return job;
}
From source file: com.metamx.druid.indexer.path.GranularUnprocessedPathSpec.java
License: Open Source License

@Override
public Job addInputPaths(HadoopDruidIndexerConfig config, Job job) throws IOException {
    // This PathSpec breaks so many abstractions that we might as well break some more
    Preconditions.checkState(config.getGranularitySpec() instanceof UniformGranularitySpec,
            String.format("Cannot use %s without %s", GranularUnprocessedPathSpec.class.getSimpleName(),
                    UniformGranularitySpec.class.getSimpleName()));

    final Path betaInput = new Path(getInputPath());
    final FileSystem fs = betaInput.getFileSystem(job.getConfiguration());
    final Granularity segmentGranularity = ((UniformGranularitySpec) config.getGranularitySpec()).getGranularity();

    Map<DateTime, Long> inputModifiedTimes = new TreeMap<DateTime, Long>(
            Comparators.inverse(Comparators.<Comparable>comparable()));

    for (FileStatus status : FSSpideringIterator.spiderIterable(fs, betaInput)) {
        final DateTime key = segmentGranularity.toDate(status.getPath().toString());
        final Long currVal = inputModifiedTimes.get(key);
        final long mTime = status.getModificationTime();

        inputModifiedTimes.put(key, currVal == null ? mTime : Math.max(currVal, mTime));
    }

    Set<Interval> bucketsToRun = Sets.newTreeSet(Comparators.intervals());
    for (Map.Entry<DateTime, Long> entry : inputModifiedTimes.entrySet()) {
        DateTime timeBucket = entry.getKey();
        long mTime = entry.getValue();

        String bucketOutput = String.format("%s/%s", config.getSegmentOutputDir(),
                segmentGranularity.toPath(timeBucket));
        for (FileStatus fileStatus : FSSpideringIterator.spiderIterable(fs, new Path(bucketOutput))) {
            if (fileStatus.getModificationTime() > mTime) {
                bucketsToRun.add(new Interval(timeBucket, segmentGranularity.increment(timeBucket)));
                break;
            }
        }

        if (bucketsToRun.size() >= maxBuckets) {
            break;
        }
    }

    config.setGranularitySpec(new UniformGranularitySpec(segmentGranularity, Lists.newArrayList(bucketsToRun)));

    return super.addInputPaths(config, job);
}
From source file: com.metamx.milano.pig.MilanoLoadFunc.java
License: Apache License

@Override
public void setLocation(String location, Job job) throws IOException {
    Path basePath = new Path(location);
    FileSystem fileSystem = basePath.getFileSystem(job.getConfiguration());
    Set<Path> paths = new TreeSet<Path>();

    if (fileSystem.getFileStatus(basePath).isDir()) {
        getPaths(basePath, paths, fileSystem);
    } else {
        paths.add(basePath);
    }

    log.info("Setting input to " + paths);
    FileInputFormat.setInputPaths(job, Joiner.on(',').join(paths));
}
From source file: com.metamx.milano.pig.MilanoLoadFunc.java
License: Apache License

/**
 * This builds a Pig ResourceSchema from the input file(s). It relies on the existence of TypeMetadata,
 * and it is how we pass the schema types and names to Pig without having to specify them by hand.
 *
 * @param location As passed to relativeToAbsolutePath
 * @param job      The job.
 *
 * @return Returns a ResourceSchema representing the incoming file(s), or null if TypeMetadata does not exist.
 *
 * @throws IOException Not thrown directly, but thrown from getMessageSchema where it indicates an unsupported type.
 */
@Override
public ResourceSchema getSchema(String location, Job job) throws IOException {
    Configuration conf = job.getConfiguration();
    Properties props = ConfigurationUtil.toProperties(conf);

    // HACK: Here we open the file directly to read the TypeMetadata.
    // HACK: There may be a better, more direct way to do this, but it works for now.
    Path path = new Path(location);
    FileSystem fileSystem = path.getFileSystem(conf);
    FileStatus fileStatus = fileSystem.getFileStatus(path);

    if (fileStatus.isDir()) {
        log.debug(String.format("Path is a directory."));
        path = getFilePath(path, fileSystem);
        if (path == null) {
            return null;
        }
    } else if (!fileSystem.exists(path)) {
        return null;
    }

    MilanoProtoFile.Reader reader = MilanoProtoFile.createReader(fileSystem.open(path));
    typeMetadata = reader.getMetadata();
    reader.close();
    if (typeMetadata == null) {
        return null;
    }

    descriptor = MilanoTool.with(typeMetadata).getDescriptor();

    return new ResourceSchema(getMessageSchema(descriptor));
}
From source file: com.metamx.milano.pig.MilanoStoreFunc.java
License: Apache License

/**
 * This does the setup for the mapper/reducer side.
 *
 * @param location The output path.
 * @param job      The job config.
 *
 * @throws IOException Currently not thrown, but is part of the overridden signature.
 */
@Override
public void setStoreLocation(String location, Job job) throws IOException {
    FileOutputFormat.setOutputPath(job, new Path(location));
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    Properties props = getUDFProps();
    job.getConfiguration().set("com.metamx.milano.proto.descriptor.base64",
            (String) props.get("milano.pig.proto.schema.base64"));
}