List of usage examples for org.apache.hadoop.mapreduce Job getConfiguration
public Configuration getConfiguration()
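The Configuration returned by getConfiguration() is the job's live configuration: properties set on it before submission are serialized with the job and visible to its tasks. A minimal sketch of the typical driver-side pattern (the job name and property key here are arbitrary):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class GetConfigurationExample {
    public static void main(String[] args) throws Exception {
        // Construct a job, then mutate its Configuration in place.
        Job job = new Job(new Configuration(), "example");
        Configuration conf = job.getConfiguration();
        conf.set("io.sort.record.percent", "0.19");

        // Reads back the value set above; in a real driver the job would be
        // configured further and then submitted.
        System.out.println(conf.get("io.sort.record.percent"));
    }
}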
From source file: com.metamx.druid.indexer.DeterminePartitionsJob.java
License: Open Source License

public static void injectSystemProperties(Job job) {
    final Configuration conf = job.getConfiguration();
    for (String propName : System.getProperties().stringPropertyNames()) {
        if (propName.startsWith("hadoop.")) {
            conf.set(propName.substring("hadoop.".length()), System.getProperty(propName));
        }
    }
}
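A hypothetical invocation of the helper above (not part of the source): any JVM system property prefixed with "hadoop." is copied into the job's Configuration with the prefix stripped, so a property set with -Dhadoop.fs.defaultFS=... ends up under fs.defaultFS. The file system URI is an illustrative value.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

import com.metamx.druid.indexer.DeterminePartitionsJob;

public class InjectSystemPropertiesExample {
    public static void main(String[] args) throws Exception {
        // Equivalent to launching the JVM with -Dhadoop.fs.defaultFS=hdfs://namenode:8020
        System.setProperty("hadoop.fs.defaultFS", "hdfs://namenode:8020");

        Job job = new Job(new Configuration(), "example");
        DeterminePartitionsJob.injectSystemProperties(job);

        // The "hadoop." prefix is stripped, so the value lands under "fs.defaultFS".
        System.out.println(job.getConfiguration().get("fs.defaultFS"));
    }
}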
From source file: com.metamx.druid.indexer.DeterminePartitionsJob.java
License: Open Source License

public boolean run() {
    try {
        /*
         * Group by (timestamp, dimensions) so we can correctly count dimension values as they would appear
         * in the final segment.
         */
        if (!config.getPartitionsSpec().isAssumeGrouped()) {
            final Job groupByJob = new Job(new Configuration(), String.format(
                    "%s-determine_partitions_groupby-%s", config.getDataSource(), config.getIntervals()));

            injectSystemProperties(groupByJob);
            groupByJob.setInputFormatClass(TextInputFormat.class);
            groupByJob.setMapperClass(DeterminePartitionsGroupByMapper.class);
            groupByJob.setMapOutputKeyClass(BytesWritable.class);
            groupByJob.setMapOutputValueClass(NullWritable.class);
            groupByJob.setCombinerClass(DeterminePartitionsGroupByReducer.class);
            groupByJob.setReducerClass(DeterminePartitionsGroupByReducer.class);
            groupByJob.setOutputKeyClass(BytesWritable.class);
            groupByJob.setOutputValueClass(NullWritable.class);
            groupByJob.setOutputFormatClass(SequenceFileOutputFormat.class);
            groupByJob.setJarByClass(DeterminePartitionsJob.class);

            config.addInputPaths(groupByJob);
            config.intoConfiguration(groupByJob);
            FileOutputFormat.setOutputPath(groupByJob, config.makeGroupedDataDir());

            groupByJob.submit();
            log.info("Job %s submitted, status available at: %s", groupByJob.getJobName(),
                    groupByJob.getTrackingURL());

            if (!groupByJob.waitForCompletion(true)) {
                log.error("Job failed: %s", groupByJob.getJobID());
                return false;
            }
        } else {
            log.info("Skipping group-by job.");
        }

        /*
         * Read grouped data and determine appropriate partitions.
         */
        final Job dimSelectionJob = new Job(new Configuration(), String.format(
                "%s-determine_partitions_dimselection-%s", config.getDataSource(), config.getIntervals()));

        dimSelectionJob.getConfiguration().set("io.sort.record.percent", "0.19");

        injectSystemProperties(dimSelectionJob);

        if (!config.getPartitionsSpec().isAssumeGrouped()) {
            // Read grouped data from the groupByJob.
            dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionPostGroupByMapper.class);
            dimSelectionJob.setInputFormatClass(SequenceFileInputFormat.class);
            FileInputFormat.addInputPath(dimSelectionJob, config.makeGroupedDataDir());
        } else {
            // Directly read the source data, since we assume it's already grouped.
            dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionAssumeGroupedMapper.class);
            dimSelectionJob.setInputFormatClass(TextInputFormat.class);
            config.addInputPaths(dimSelectionJob);
        }

        SortableBytes.useSortableBytesAsMapOutputKey(dimSelectionJob);
        dimSelectionJob.setMapOutputValueClass(Text.class);
        dimSelectionJob.setCombinerClass(DeterminePartitionsDimSelectionCombiner.class);
        dimSelectionJob.setReducerClass(DeterminePartitionsDimSelectionReducer.class);
        dimSelectionJob.setOutputKeyClass(BytesWritable.class);
        dimSelectionJob.setOutputValueClass(Text.class);
        dimSelectionJob.setOutputFormatClass(DeterminePartitionsDimSelectionOutputFormat.class);
        dimSelectionJob.setJarByClass(DeterminePartitionsJob.class);

        config.intoConfiguration(dimSelectionJob);
        FileOutputFormat.setOutputPath(dimSelectionJob, config.makeIntermediatePath());

        dimSelectionJob.submit();
        log.info("Job %s submitted, status available at: %s", dimSelectionJob.getJobName(),
                dimSelectionJob.getTrackingURL());

        if (!dimSelectionJob.waitForCompletion(true)) {
            log.error("Job failed: %s", dimSelectionJob.getJobID().toString());
            return false;
        }

        /*
         * Load partitions determined by the previous job.
         */
        log.info("Job completed, loading up partitions for intervals[%s].", config.getSegmentGranularIntervals());
        FileSystem fileSystem = null;
        Map<DateTime, List<HadoopyShardSpec>> shardSpecs = Maps.newTreeMap(DateTimeComparator.getInstance());
        int shardCount = 0;
        for (Interval segmentGranularity : config.getSegmentGranularIntervals()) {
            DateTime bucket = segmentGranularity.getStart();

            final Path partitionInfoPath = config.makeSegmentPartitionInfoPath(new Bucket(0, bucket, 0));
            if (fileSystem == null) {
                fileSystem = partitionInfoPath.getFileSystem(dimSelectionJob.getConfiguration());
            }
            if (fileSystem.exists(partitionInfoPath)) {
                List<ShardSpec> specs = config.jsonMapper.readValue(
                        Utils.openInputStream(dimSelectionJob, partitionInfoPath),
                        new TypeReference<List<ShardSpec>>() {
                        });

                List<HadoopyShardSpec> actualSpecs = Lists.newArrayListWithExpectedSize(specs.size());
                for (int i = 0; i < specs.size(); ++i) {
                    actualSpecs.add(new HadoopyShardSpec(specs.get(i), shardCount++));
                    log.info("DateTime[%s], partition[%d], spec[%s]", bucket, i, actualSpecs.get(i));
                }

                shardSpecs.put(bucket, actualSpecs);
            } else {
                log.info("Path[%s] didn't exist!?", partitionInfoPath);
            }
        }
        config.setShardSpecs(shardSpecs);

        return true;
    } catch (Exception e) {
        throw Throwables.propagate(e);
    }
}
From source file: com.metamx.druid.indexer.HadoopDruidIndexerConfig.java
License: Open Source License

public void intoConfiguration(Job job) {
    Configuration conf = job.getConfiguration();

    try {
        conf.set(CONFIG_PROPERTY, jsonMapper.writeValueAsString(this));
    } catch (IOException e) {
        throw Throwables.propagate(e);
    }
}
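intoConfiguration() covers the driver side: the whole indexer config is serialized to JSON and stashed in the job's Configuration. A hypothetical task-side counterpart (not taken from the source) would read it back in a Mapper's setup(); it assumes CONFIG_PROPERTY and jsonMapper are the same constant and ObjectMapper used above and are accessible as static members, and the mapper's key/value types are placeholders:

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import com.metamx.druid.indexer.HadoopDruidIndexerConfig;

public class ConfigReadingMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
    private HadoopDruidIndexerConfig config;

    @Override
    protected void setup(Context context) throws IOException {
        // context.getConfiguration() returns the same Configuration the driver
        // populated via job.getConfiguration() before submission.
        // ASSUMPTION: CONFIG_PROPERTY and jsonMapper are visible here and match
        // the writer's side in intoConfiguration().
        String json = context.getConfiguration().get(HadoopDruidIndexerConfig.CONFIG_PROPERTY);
        config = HadoopDruidIndexerConfig.jsonMapper.readValue(json, HadoopDruidIndexerConfig.class);
    }
}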
From source file: com.metamx.druid.indexer.HadoopDruidIndexerJob.java
License: Open Source License

private void ensurePaths() {
    // config.addInputPaths() can have side-effects ( boo! :( ), so this stuff needs to be done before anything else
    try {
        Job job = new Job(new Configuration(),
                String.format("%s-determine_partitions-%s", config.getDataSource(), config.getIntervals()));

        job.getConfiguration().set("io.sort.record.percent", "0.19");
        for (String propName : System.getProperties().stringPropertyNames()) {
            Configuration conf = job.getConfiguration();
            if (propName.startsWith("hadoop.")) {
                conf.set(propName.substring("hadoop.".length()), System.getProperty(propName));
            }
        }

        config.addInputPaths(job);
    } catch (IOException e) {
        throw Throwables.propagate(e);
    }
}
From source file: com.metamx.druid.indexer.IndexGeneratorJob.java
License: Open Source License

public boolean run() {
    try {
        Job job = new Job(new Configuration(),
                String.format("%s-index-generator-%s", config.getDataSource(), config.getIntervals()));

        job.getConfiguration().set("io.sort.record.percent", "0.23");

        for (String propName : System.getProperties().stringPropertyNames()) {
            Configuration conf = job.getConfiguration();
            if (propName.startsWith("hadoop.")) {
                conf.set(propName.substring("hadoop.".length()), System.getProperty(propName));
            }
        }

        job.setInputFormatClass(TextInputFormat.class);
        job.setMapperClass(IndexGeneratorMapper.class);
        job.setMapOutputValueClass(Text.class);

        SortableBytes.useSortableBytesAsMapOutputKey(job);

        job.setNumReduceTasks(Iterables.size(config.getAllBuckets()));
        job.setPartitionerClass(IndexGeneratorPartitioner.class);

        job.setReducerClass(IndexGeneratorReducer.class);
        job.setOutputKeyClass(BytesWritable.class);
        job.setOutputValueClass(Text.class);
        job.setOutputFormatClass(IndexGeneratorOutputFormat.class);
        FileOutputFormat.setOutputPath(job, config.makeIntermediatePath());

        config.addInputPaths(job);
        config.intoConfiguration(job);

        job.setJarByClass(IndexGeneratorJob.class);

        job.submit();
        log.info("Job %s submitted, status available at %s", job.getJobName(), job.getTrackingURL());

        boolean success = job.waitForCompletion(true);

        Counter invalidRowCount = job.getCounters()
                .findCounter(HadoopDruidIndexerConfig.IndexJobCounters.INVALID_ROW_COUNTER);
        jobStats.setInvalidRowCount(invalidRowCount.getValue());

        return success;
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
From source file: com.metamx.druid.indexer.path.GranularityPathSpec.java
License: Open Source License

@Override
public Job addInputPaths(HadoopDruidIndexerConfig config, Job job) throws IOException {
    final Set<Interval> intervals = Sets.newTreeSet(Comparators.intervals());
    for (Interval segmentInterval : config.getSegmentGranularIntervals()) {
        for (Interval dataInterval : dataGranularity.getIterable(segmentInterval)) {
            intervals.add(dataInterval);
        }
    }

    Path betaInput = new Path(inputPath);
    FileSystem fs = betaInput.getFileSystem(job.getConfiguration());
    Set<String> paths = Sets.newTreeSet();
    Pattern fileMatcher = Pattern.compile(filePattern);

    DateTimeFormatter customFormatter = null;
    if (pathFormat != null) {
        customFormatter = DateTimeFormat.forPattern(pathFormat);
    }

    for (Interval interval : intervals) {
        DateTime t = interval.getStart();
        String intervalPath = null;
        if (customFormatter != null) {
            intervalPath = customFormatter.print(t);
        } else {
            intervalPath = dataGranularity.toPath(t);
        }

        Path granularPath = new Path(betaInput, intervalPath);
        log.info("Checking path[%s]", granularPath);
        for (FileStatus status : FSSpideringIterator.spiderIterable(fs, granularPath)) {
            final Path filePath = status.getPath();
            if (fileMatcher.matcher(filePath.toString()).matches()) {
                paths.add(filePath.toString());
            }
        }
    }

    for (String path : paths) {
        log.info("Appending path[%s]", path);
        FileInputFormat.addInputPath(job, new Path(path));
    }

    return job;
}
From source file: com.metamx.druid.indexer.path.GranularUnprocessedPathSpec.java
License: Open Source License

@Override
public Job addInputPaths(HadoopDruidIndexerConfig config, Job job) throws IOException {
    // This PathSpec breaks so many abstractions that we might as well break some more
    Preconditions.checkState(config.getGranularitySpec() instanceof UniformGranularitySpec,
            String.format("Cannot use %s without %s", GranularUnprocessedPathSpec.class.getSimpleName(),
                    UniformGranularitySpec.class.getSimpleName()));

    final Path betaInput = new Path(getInputPath());
    final FileSystem fs = betaInput.getFileSystem(job.getConfiguration());
    final Granularity segmentGranularity = ((UniformGranularitySpec) config.getGranularitySpec()).getGranularity();

    Map<DateTime, Long> inputModifiedTimes = new TreeMap<DateTime, Long>(
            Comparators.inverse(Comparators.<Comparable>comparable()));

    for (FileStatus status : FSSpideringIterator.spiderIterable(fs, betaInput)) {
        final DateTime key = segmentGranularity.toDate(status.getPath().toString());
        final Long currVal = inputModifiedTimes.get(key);
        final long mTime = status.getModificationTime();

        inputModifiedTimes.put(key, currVal == null ? mTime : Math.max(currVal, mTime));
    }

    Set<Interval> bucketsToRun = Sets.newTreeSet(Comparators.intervals());
    for (Map.Entry<DateTime, Long> entry : inputModifiedTimes.entrySet()) {
        DateTime timeBucket = entry.getKey();
        long mTime = entry.getValue();

        String bucketOutput = String.format("%s/%s", config.getSegmentOutputDir(),
                segmentGranularity.toPath(timeBucket));
        for (FileStatus fileStatus : FSSpideringIterator.spiderIterable(fs, new Path(bucketOutput))) {
            if (fileStatus.getModificationTime() > mTime) {
                bucketsToRun.add(new Interval(timeBucket, segmentGranularity.increment(timeBucket)));
                break;
            }
        }

        if (bucketsToRun.size() >= maxBuckets) {
            break;
        }
    }

    config.setGranularitySpec(new UniformGranularitySpec(segmentGranularity, Lists.newArrayList(bucketsToRun)));

    return super.addInputPaths(config, job);
}
From source file: com.metamx.milano.pig.MilanoLoadFunc.java
License: Apache License

@Override
public void setLocation(String location, Job job) throws IOException {
    Path basePath = new Path(location);
    FileSystem fileSystem = basePath.getFileSystem(job.getConfiguration());
    Set<Path> paths = new TreeSet<Path>();

    if (fileSystem.getFileStatus(basePath).isDir()) {
        getPaths(basePath, paths, fileSystem);
    } else {
        paths.add(basePath);
    }

    log.info("Setting input to " + paths);
    FileInputFormat.setInputPaths(job, Joiner.on(',').join(paths));
}
From source file: com.metamx.milano.pig.MilanoLoadFunc.java
License: Apache License

/**
 * This builds a Pig ResourceSchema from the input file(s). It relies on the existence of TypeMetadata,
 * and it is how we pass the schema types and names to Pig without having to specify them by hand.
 *
 * @param location As passed to relativeToAbsolutePath
 * @param job      The job.
 *
 * @return Returns a ResourceSchema representing the incoming file(s), or null if TypeMetadata does not exist.
 *
 * @throws IOException Not thrown directly, but thrown from getMessageSchema where it indicates an unsupported type.
 */
@Override
public ResourceSchema getSchema(String location, Job job) throws IOException {
    Configuration conf = job.getConfiguration();
    Properties props = ConfigurationUtil.toProperties(conf);

    // HACK: Here we open the file directly to read the TypeMetadata.
    // HACK: There may be a better, more direct way to do this, but it works for now.
    Path path = new Path(location);
    FileSystem fileSystem = path.getFileSystem(conf);
    FileStatus fileStatus = fileSystem.getFileStatus(path);

    if (fileStatus.isDir()) {
        log.debug(String.format("Path is a directory."));
        path = getFilePath(path, fileSystem);
        if (path == null) {
            return null;
        }
    } else if (!fileSystem.exists(path)) {
        return null;
    }

    MilanoProtoFile.Reader reader = MilanoProtoFile.createReader(fileSystem.open(path));
    typeMetadata = reader.getMetadata();
    reader.close();
    if (typeMetadata == null) {
        return null;
    }

    descriptor = MilanoTool.with(typeMetadata).getDescriptor();

    return new ResourceSchema(getMessageSchema(descriptor));
}
From source file: com.metamx.milano.pig.MilanoStoreFunc.java
License: Apache License

/**
 * This does the setup for the mapper/reducer side.
 *
 * @param location The output path.
 * @param job      The job config.
 *
 * @throws IOException Currently not thrown, but is part of the overridden signature.
 */
@Override
public void setStoreLocation(String location, Job job) throws IOException {
    FileOutputFormat.setOutputPath(job, new Path(location));
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    Properties props = getUDFProps();
    job.getConfiguration().set("com.metamx.milano.proto.descriptor.base64",
            (String) props.get("milano.pig.proto.schema.base64"));
}