Example usage for org.apache.hadoop.mapreduce Job setInputFormatClass

Introduction

This page collects example usages of org.apache.hadoop.mapreduce.Job.setInputFormatClass.

Prototype

public void setInputFormatClass(Class<? extends InputFormat> cls) throws IllegalStateException 

Document

Set the InputFormat for the job.
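
Before the project-specific examples below, here is a minimal, self-contained driver added for orientation only; the class name and the /tmp paths are placeholders and are not taken from any of the source files listed under Usage. It shows the call in its simplest setting: an identity, map-only job whose input format is set before submission (after submission the method throws IllegalStateException).

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class SetInputFormatExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "setInputFormatClass example");
        job.setJarByClass(SetInputFormatExample.class);

        // The call this page documents: it must be made before the job is
        // submitted, otherwise an IllegalStateException is thrown.
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        // Identity map-only job: TextInputFormat produces <LongWritable, Text>
        // records, which the base Mapper passes through unchanged.
        job.setMapperClass(Mapper.class);
        job.setNumReduceTasks(0);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        // Placeholder paths - replace with real input/output locations.
        FileInputFormat.addInputPath(job, new Path("/tmp/example/input"));
        FileOutputFormat.setOutputPath(job, new Path("/tmp/example/output"));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}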

Usage

From source file:com.howbuy.hadoop.mr.online.SecondarySort.java

License:Apache License

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: secondarysrot <in> <out>");
        System.exit(2);/*from   www  .j a v a2 s  . co m*/
    }
    Job job = new Job(conf, "secondary sort");
    job.setJarByClass(SecondarySort.class);
    job.setMapperClass(MapClass.class);
    job.setReducerClass(Reduce.class);

    // group and partition by the first int in the pair
    job.setPartitionerClass(FirstPartitioner.class);
    job.setGroupingComparatorClass(FirstGroupingComparator.class);

    // the map output is IntPair, IntWritable
    job.setMapOutputKeyClass(IntPair.class);
    job.setMapOutputValueClass(IntWritable.class);

    // the reduce output is Text, IntWritable
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setInputFormatClass(TextInputFormat.class);
    // job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setNumReduceTasks(3);

    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file:com.iflytek.spider.crawl.CrawlDb.java

License:Apache License

public static Job createJob(Configuration config, Path crawlDb) throws IOException {
    Path newCrawlDb = new Path(crawlDb, Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    Job job = AvroJob.getAvroJob(config);
    job.setJobName("crawldb " + crawlDb);

    Path current = new Path(crawlDb, CURRENT_NAME);
    if (FileSystem.get(config).exists(current)) {
        FileInputFormat.addInputPath(job, current);
    }
    job.setInputFormatClass(AvroPairInputFormat.class);

    job.setMapperClass(CrawlDbFilter.class);
    job.setReducerClass(CrawlDbReducer.class);

    FileOutputFormat.setOutputPath(job, newCrawlDb);
    job.setOutputFormatClass(AvroMapOutputFormat.class);
    job.setOutputKeyClass(String.class);
    job.setOutputValueClass(CrawlDatum.class);

    return job;
}

From source file:com.iflytek.spider.crawl.GeneratorSmart.java

License:Apache License

/**
 * Generate fetchlists in one or more segments. Whether to filter URLs or not
 * is read from the crawl.generate.filter property in the configuration files.
 * If the property is not found, the URLs are filtered. Same for the
 * normalisation.
 * 
 * @param dbDir
 *          Crawl database directory
 * @param segments
 *          Segments directory
 * @param numLists
 *          Number of reduce tasks
 * @param curTime
 *          Current time in milliseconds
 * 
 * @return Path to generated segment or null if no entries were selected
 * 
 * @throws IOException
 *           When an I/O error occurs
 * @throws ClassNotFoundException
 * @throws InterruptedException
 */
public Path[] generate(Path dbDir, Path segments, int numLists, long curTime, boolean force)
        throws IOException, InterruptedException, ClassNotFoundException {
    //getConf().set("mapred.temp.dir", "d:/tmp");
    Path tempDir = new Path(
            getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

    Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
    FileSystem fs = FileSystem.get(getConf());
    LockUtil.createLockFile(fs, lock, force);

    LOG.info("Generator: Selecting best-scoring urls due for fetch.");
    LOG.info("Generator: starting");

    Job job = AvroJob.getAvroJob(getConf());
    if (numLists == -1) { // for politeness make
        numLists = job.getNumReduceTasks(); // a partition per fetch task
    }
    if ("local".equals(job.getConfiguration().get("mapred.job.tracker")) && numLists != 1) {
        // override
        LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
        numLists = 1;
    }
    LOG.info("Generator: with " + numLists + " partition.");
    job.getConfiguration().setLong(GENERATOR_CUR_TIME, curTime);
    // record real generation time
    long generateTime = System.currentTimeMillis();
    job.getConfiguration().setLong(Spider.GENERATE_TIME_KEY, generateTime);

    FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
    job.setInputFormatClass(AvroPairInputFormat.class);

    job.setMapperClass(SelectorMapper.class);
    job.setReducerClass(SelectorReducer.class);

    FileOutputFormat.setOutputPath(job, tempDir);
    //job.setOutputFormatClass(AvroPairOutputFormat.class);
    job.setOutputFormatClass(GeneratorOutputFormat.class);
    job.setOutputKeyClass(Float.class);
    job.setOutputValueClass(SelectorEntry.class);
    // AvroMultipleOutputs.addNamedOutput(job, "seq",
    // AvroPairOutputFormat.class, Float.class, SelectorEntry.class);
    try {
        job.waitForCompletion(true);
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    }

    // read the subdirectories generated in the temp
    // output and turn them into segments
    List<Path> generatedSegments = new ArrayList<Path>();

    FileStatus[] status = fs.listStatus(tempDir);
    try {
        for (FileStatus stat : status) {
            Path subfetchlist = stat.getPath();
            if (!subfetchlist.getName().startsWith("fetchlist-"))
                continue;
            // start a new partition job for this segment
            Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists);

            fs.createNewFile(new Path(newSeg, "generatored"));
            generatedSegments.add(newSeg);
        }
    } catch (Exception e) {
        LOG.warn("Generator: exception while partitioning segments, exiting ...");
        fs.delete(tempDir, true);
        return null;
    }

    if (generatedSegments.size() == 0) {
        LOG.warn("Generator: 0 records selected for fetching, exiting ...");
        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);
        return null;
    }

    if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
        // update the db from tempDir
        Path tempDir2 = new Path(
                getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

        job = AvroJob.getAvroJob(getConf());
        job.setJobName("generate: updatedb " + dbDir);
        job.getConfiguration().setLong(Spider.GENERATE_TIME_KEY, generateTime);
        for (Path segmpaths : generatedSegments) {
            Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
            FileInputFormat.addInputPath(job, subGenDir);
        }
        FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
        job.setInputFormatClass(AvroPairInputFormat.class);
        job.setMapperClass(CrawlDbUpdateMapper.class);
        // job.setReducerClass(CrawlDbUpdater.class);
        job.setOutputFormatClass(AvroMapOutputFormat.class);
        job.setOutputKeyClass(String.class);
        job.setOutputValueClass(CrawlDatum.class);
        FileOutputFormat.setOutputPath(job, tempDir2);
        try {
            job.waitForCompletion(true);
            CrawlDb.install(job, dbDir);
        } catch (IOException e) {
            LockUtil.removeLockFile(fs, lock);
            fs.delete(tempDir, true);
            fs.delete(tempDir2, true);
            throw e;
        }
        fs.delete(tempDir2, true);
    }

    LockUtil.removeLockFile(fs, lock);
    fs.delete(tempDir, true);

    if (LOG.isInfoEnabled()) {
        LOG.info("Generator: done.");
    }
    Path[] patharray = new Path[generatedSegments.size()];
    return generatedSegments.toArray(patharray);
}

From source file:com.iflytek.spider.crawl.GeneratorSmart.java

License:Apache License

private Path partitionSegment(FileSystem fs, Path segmentsDir, Path inputDir, int numLists)
        throws IOException, InterruptedException, ClassNotFoundException {
    // invert again, partition by host/domain/IP, sort by url hash
    if (LOG.isInfoEnabled()) {
        LOG.info("Generator: Partitioning selected urls for politeness:" + inputDir);
    }
    Path segment = new Path(segmentsDir, generateSegmentName());
    Path output = new Path(segment, CrawlDatum.GENERATE_DIR_NAME);

    LOG.info("Generator: segment: " + segment + " with " + numLists + " Fetchers");

    Job job = AvroJob.getAvroJob(getConf());
    job.setJobName("generate: partition " + segment);
    job.getConfiguration().setInt("partition.url.seed", new Random().nextInt());

    FileInputFormat.addInputPath(job, inputDir);
    job.setInputFormatClass(AvroPairInputFormat.class);

    job.setMapperClass(SelectorInverseMapper.class);
    job.setPartitionerClass(AveragePartition.class);
    job.setMapOutputKeyClass(String.class);
    job.setMapOutputValueClass(SelectorEntry.class);
    job.setReducerClass(PartitionReducer.class);
    job.setNumReduceTasks(numLists);

    FileOutputFormat.setOutputPath(job, output);
    job.setOutputFormatClass(AvroPairOutputFormat.class);
    job.setOutputKeyClass(String.class);
    job.setOutputValueClass(CrawlDatum.class);

    job.waitForCompletion(true);
    return segment;
}

From source file:com.iflytek.spider.parse.ParseSegment.java

License:Apache License

public void parse(Path segment) throws IOException, InterruptedException, ClassNotFoundException {

    if (LOG.isInfoEnabled()) {
        LOG.info("Parse: starting");
        LOG.info("Parse: segment: " + segment);
    }

    Job job = AvroJob.getAvroJob(getConf());
    job.setJobName("parse " + segment);

    FileInputFormat.addInputPath(job, new Path(segment, Content.DIR_NAME));
    job.getConfiguration().set(Spider.SEGMENT_NAME_KEY, segment.getName());

    job.setInputFormatClass(AvroPairInputFormat.class);
    job.setMapperClass(ParseMapper.class);

    FileOutputFormat.setOutputPath(job, segment);
    job.setOutputFormatClass(ParseOutputFormat.class);
    job.setOutputKeyClass(String.class);
    job.setOutputValueClass(UnionData.class);

    job.waitForCompletion(true);
    if (LOG.isInfoEnabled()) {
        LOG.info("Parse: done");
    }
}

From source file:com.ikanow.aleph2.analytics.hadoop.assets.Aleph2MultiInputFormatBuilder.java

License:Apache License

/** Sets the input configurations in the job
 * @param job the job to configure
 */
public Job build(final Job job) {

    job.getConfiguration().set(ALEPH2_MULTI_INPUT_FORMAT_JOBS,
            _inputs.keySet().stream().collect(Collectors.joining(",")));
    _inputs.entrySet().stream().forEach(Lambdas.wrap_consumer_u(kv -> {
        try (final Stringifier<Configuration> stringifier = new DefaultStringifier<Configuration>(
                job.getConfiguration(), Configuration.class)) {
            final Configuration new_config = new Configuration(kv.getValue().getConfiguration());
            new_config.set(ALEPH2_MULTI_INPUT_FORMAT_CLAZZ, kv.getValue().getInputFormatClass().getName());
            job.getConfiguration().set(ALEPH2_MULTI_INPUT_FORMAT_PREFIX + kv.getKey(),
                    stringifier.toString(new_config));
        }
    }));
    job.setInputFormatClass(Aleph2MultiInputFormat.class);
    return job;
}

From source file:com.ikanow.aleph2.analytics.hadoop.assets.VerySimpleLocalExample.java

License:Apache License

@SuppressWarnings({ "deprecation", "unchecked", "rawtypes" })
@Test
public void test_localHadoopLaunch()
        throws IOException, IllegalStateException, ClassNotFoundException, InterruptedException {

    // 0) Setup the temp dir 
    final String temp_dir = System.getProperty("java.io.tmpdir") + File.separator;
    //final Path tmp_path = FileContext.getLocalFSFileContext().makeQualified(new Path(temp_dir));
    final Path tmp_path2 = FileContext.getLocalFSFileContext()
            .makeQualified(new Path(temp_dir + "/tmp_output"));
    try {
        FileContext.getLocalFSFileContext().delete(tmp_path2, true);
    } catch (Exception e) {
    } // (just doesn't exist yet)

    // 1) Setup config with local mode
    final Configuration config = new Configuration();
    config.setBoolean("mapred.used.genericoptionsparser", true); // (just stops an annoying warning from appearing)
    config.set("fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem");
    config.set("mapred.job.tracker", "local");
    config.set("fs.defaultFS", "local");
    config.unset("mapreduce.framework.name");

    // If running locally, turn "snappy" off - tomcat isn't pointing its native library path in the right place
    config.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.DefaultCodec");

    // 2) Build job and do more setup using the Job API
    //TODO: not sure why this is deprecated, it doesn't seem to be in v1? We do need to move to JobConf at some point, but I ran into some 
    // issues when trying to do everything I needed to for V1, so seems expedient to start here and migrate away
    final Job hj = new Job(config); // (NOTE: from here, changes to config are ignored)

    // Input format:
    //TODO: fails because of guava issue, looks like we'll need to move to 2.7 and check it works with 2.5.x server?
    //TextInputFormat.addInputPath(hj, tmp_path);
    //hj.setInputFormatClass((Class<? extends InputFormat>) Class.forName ("org.apache.hadoop.mapreduce.lib.input.TextInputFormat"));
    hj.setInputFormatClass(TestInputFormat.class);

    // Output format:
    hj.setOutputFormatClass((Class<? extends OutputFormat>) Class
            .forName("org.apache.hadoop.mapreduce.lib.output.TextOutputFormat"));
    TextOutputFormat.setOutputPath(hj, tmp_path2);

    // Mapper etc (combiner/reducer are similar)
    hj.setMapperClass(TestMapper.class);
    hj.setOutputKeyClass(Text.class);
    hj.setOutputValueClass(Text.class);
    hj.setNumReduceTasks(0); // (disable reducer for now)

    hj.setJar("test");

    try {
        hj.submit();
    } catch (UnsatisfiedLinkError e) {
        throw new RuntimeException(
                "This is a windows/hadoop compatibility problem - adding the hadoop-commons in the misc_test_assets subdirectory to the top of the classpath should resolve it (and does in V1), though I haven't yet made that work with Aleph2",
                e);
    }
    //hj.getJobID().toString();
    while (!hj.isComplete()) {
        Thread.sleep(1000);
    }
    assertTrue("Finished successfully", hj.isSuccessful());
}

From source file:com.ikanow.aleph2.analytics.hadoop.services.BeJobLauncher.java

License:Open Source License

@Override
public Validation<String, Job> runEnhancementJob(final DataBucketBean bucket,
        final Optional<ProcessingTestSpecBean> testSpec) {

    final Configuration config = getHadoopConfig();

    final ClassLoader currentClassloader = Thread.currentThread().getContextClassLoader();
    //(not currently used, but has proven useful in the past)

    try {
        final String contextSignature = _batchEnrichmentContext
                .getEnrichmentContextSignature(Optional.of(bucket), Optional.empty());
        config.set(BatchEnrichmentJob.BE_CONTEXT_SIGNATURE, contextSignature);

        final Optional<Long> debug_max = testSpec
                .flatMap(testSpecVals -> Optional.ofNullable(testSpecVals.requested_num_objects()));

        //then gets applied to all the inputs:
        debug_max.ifPresent(val -> config.set(BatchEnrichmentJob.BE_DEBUG_MAX_SIZE, val.toString()));

        final Aleph2MultiInputFormatBuilder inputBuilder = new Aleph2MultiInputFormatBuilder();

        // Create a separate InputFormat for every input (makes testing life easier)

        Optional.ofNullable(_batchEnrichmentContext.getJob().inputs()).orElse(Collections.emptyList()).stream()
                .forEach(Lambdas.wrap_consumer_u(input -> {
                    final List<String> paths = _batchEnrichmentContext.getAnalyticsContext()
                            .getInputPaths(Optional.of(bucket), _batchEnrichmentContext.getJob(), input);
                    final Job inputJob = Job.getInstance(config);
                    inputJob.setInputFormatClass(BeFileInputFormat.class);
                    paths.stream().forEach(Lambdas
                            .wrap_consumer_u(path -> FileInputFormat.addInputPath(inputJob, new Path(path))));
                    inputBuilder.addInput(UuidUtils.get().getRandomUuid(), inputJob);
                }));

        // (ALEPH-12): other input format types

        // Now do everything else

        final String jobName = BucketUtils.getUniqueSignature(bucket.full_name(),
                Optional.ofNullable(_batchEnrichmentContext.getJob().name()));

        // do not set anything into config past this line (can set job.getConfiguration() elements though - that is what the builder does)
        Job job = Job.getInstance(config, jobName);
        job.setJarByClass(BatchEnrichmentJob.class);

        // Set the classpath

        cacheJars(job, bucket, _batchEnrichmentContext.getAnalyticsContext());

        // (generic mapper - the actual code is run using the classes in the shared libraries)
        job.setMapperClass(BatchEnrichmentJob.BatchEnrichmentMapper.class);

        //TODO: ALEPH-12 handle reducer scenarios
        job.setNumReduceTasks(0);
        //job.setReducerClass(BatchEnrichmentJob.BatchEnrichmentReducer.class);

        // Input format:
        inputBuilder.build(job);

        // Output format (doesn't really do anything, all the actual output code is performed by the mapper via the enrichment context)
        job.setOutputFormatClass(BeFileOutputFormat.class);

        launch(job);
        return Validation.success(job);

    } catch (Throwable t) {
        logger.error("Caught Exception", t);
        return Validation.fail(ErrorUtils.getLongForm("{0}", t));
    } finally {
        Thread.currentThread().setContextClassLoader(currentClassloader);
    }

}

From source file:com.ikanow.aleph2.analytics.r.services.BeJobLauncher.java

License:Apache License

@Override
public Validation<String, Job> runEnhancementJob(final DataBucketBean bucket,
        final Optional<ProcessingTestSpecBean> testSpec) {

    final Configuration config = getHadoopConfig();

    final ClassLoader currentClassloader = Thread.currentThread().getContextClassLoader();
    //(not currently used, but has proven useful in the past)

    final SetOnce<Job> job = new SetOnce<>();
    try {
        final Optional<Long> debug_max = testSpec
                .flatMap(testSpecVals -> Optional.ofNullable(testSpecVals.requested_num_objects()));

        //then gets applied to all the inputs:
        debug_max.ifPresent(val -> config.set(BatchEnrichmentJob.BE_DEBUG_MAX_SIZE, val.toString()));

        final Aleph2MultiInputFormatBuilder inputBuilder = new Aleph2MultiInputFormatBuilder();

        // Validation:

        try {
            final BatchEnrichmentJob.BatchEnrichmentBaseValidator validator = new BatchEnrichmentJob.BatchEnrichmentBaseValidator();
            validator.setDataBucket(bucket);
            validator.setEnrichmentContext(_batchEnrichmentContext);
            validator.setEcMetadata(
                    Optional.ofNullable(bucket.batch_enrichment_configs()).orElse(Collections.emptyList()));
            final List<BasicMessageBean> errs = validator.validate();
            if (errs.stream().anyMatch(b -> !b.success())) {
                return Validation.fail(ErrorUtils.get("Validation errors for {0}: {1}", bucket.full_name(),
                        errs.stream().map(
                                b -> ErrorUtils.get("{0}: {1}", b.success() ? "INFO" : "ERROR", b.message()))
                                .collect(Collectors.joining(";"))));
            }
        } catch (Throwable t) { // we'll log but carry on in this case...(in case there's some classloading shenanigans which won't affect the operation in hadoop)
            logger.error(
                    ErrorUtils.getLongForm("Failed validation, bucket: {1} error: {0}", t, bucket.full_name()));
        }

        // Create a separate InputFormat for every input (makes testing life easier)

        Optional.ofNullable(_batchEnrichmentContext.getJob().inputs()).orElse(Collections.emptyList()).stream()
                .filter(input -> Optional.ofNullable(input.enabled()).orElse(true))
                .forEach(Lambdas.wrap_consumer_u(input -> {
                    // In the debug case, transform the input to add the max record limit
                    final AnalyticThreadJobInputBean input_with_test_settings = BeanTemplateUtils.clone(input)
                            .with(AnalyticThreadJobInputBean::config, BeanTemplateUtils
                                    .clone(Optional.ofNullable(input.config()).orElseGet(() -> BeanTemplateUtils
                                            .build(AnalyticThreadJobInputConfigBean.class).done().get()))
                                    .with(AnalyticThreadJobInputConfigBean::test_record_limit_request, //(if not test, always null; else "input override" or "output default")
                                            debug_max.map(max -> Optionals
                                                    .of(() -> input.config().test_record_limit_request())
                                                    .orElse(max)).orElse(null))
                                    .done())
                            .done();

                    // Get the paths and add them to a list for later
                    final List<String> paths = _batchEnrichmentContext.getAnalyticsContext().getInputPaths(
                            Optional.of(bucket), _batchEnrichmentContext.getJob(), input_with_test_settings);

                    RScriptUtils.addFilePaths(paths);

                    if (!paths.isEmpty()) {

                        logger.info(ErrorUtils.get("Adding storage paths for bucket {0}: {1}",
                                bucket.full_name(), paths.stream().collect(Collectors.joining(";"))));

                        final Job inputJob = Job.getInstance(config);
                        inputJob.setInputFormatClass(BeFileInputFormat.class);
                        paths.stream().forEach(Lambdas.wrap_consumer_u(
                                path -> FileInputFormat.addInputPath(inputJob, new Path(path))));
                        inputBuilder.addInput(UuidUtils.get().getRandomUuid(), inputJob);
                    } else { // not easily available in HDFS directory format, try getting from the context

                        Optional<HadoopAccessContext> input_format_info = _batchEnrichmentContext
                                .getAnalyticsContext().getServiceInput(HadoopAccessContext.class,
                                        Optional.of(bucket), _batchEnrichmentContext.getJob(),
                                        input_with_test_settings);
                        if (!input_format_info.isPresent()) {
                            logger.warn(ErrorUtils.get("Tried but failed to get input format from {0}",
                                    BeanTemplateUtils.toJson(input_with_test_settings)));
                        } else {
                            logger.info(ErrorUtils.get("Adding data service path for bucket {0}: {1}",
                                    bucket.full_name(), input_format_info.get().describe()));

                            final Job inputJob = Job.getInstance(config);
                            inputJob.setInputFormatClass(input_format_info.get().getAccessService()
                                    .either(l -> l.getClass(), r -> r));
                            input_format_info.get().getAccessConfig().ifPresent(map -> {
                                map.entrySet().forEach(kv -> inputJob.getConfiguration().set(kv.getKey(),
                                        kv.getValue().toString()));
                            });

                            inputBuilder.addInput(UuidUtils.get().getRandomUuid(), inputJob);
                        }
                    }

                }));

        // (ALEPH-12): other input format types

        // Now do everything else

        final String contextSignature = _batchEnrichmentContext
                .getEnrichmentContextSignature(Optional.of(bucket), Optional.empty());
        config.set(BatchEnrichmentJob.BE_CONTEXT_SIGNATURE, contextSignature);

        final String jobName = BucketUtils.getUniqueSignature(bucket.full_name(),
                Optional.ofNullable(_batchEnrichmentContext.getJob().name()));

        this.handleHadoopConfigOverrides(bucket, config);

        // do not set anything into config past this line (can set job.getConfiguration() elements though - that is what the builder does)
        job.set(Job.getInstance(config, jobName));
        job.get().setJarByClass(BatchEnrichmentJob.class);
        job.get().setSortComparatorClass(ObjectNodeWritableComparable.Comparator.class); //(avoid deser of json node for intermediate things)

        // Set the classpath

        cacheJars(job.get(), bucket, _batchEnrichmentContext.getAnalyticsContext());

        // (generic mapper - the actual code is run using the classes in the shared libraries)
        job.get().setMapperClass(BatchEnrichmentJob.BatchEnrichmentMapper.class);
        job.get().setMapOutputKeyClass(ObjectNodeWritableComparable.class);
        job.get().setMapOutputValueClass(ObjectNodeWritableComparable.class);

        // (combiner and reducer)
        Optional.ofNullable(bucket.batch_enrichment_configs()).orElse(Collections.emptyList()).stream()
                .filter(cfg -> Optional.ofNullable(cfg.enabled()).orElse(true))
                .filter(cfg -> !Optionals.ofNullable(cfg.grouping_fields()).isEmpty()).findAny().map(cfg -> {
                    final HadoopTechnologyOverrideBean tech_override = BeanTemplateUtils
                            .from(Optional.ofNullable(cfg.technology_override()).orElse(Collections.emptyMap()),
                                    HadoopTechnologyOverrideBean.class)
                            .get();

                    job.get().setNumReduceTasks(Optional.ofNullable(tech_override.num_reducers()).orElse(2));
                    job.get().setReducerClass(BatchEnrichmentJob.BatchEnrichmentReducer.class);

                    if (tech_override.use_combiner()) {
                        job.get().setCombinerClass(BatchEnrichmentJob.BatchEnrichmentCombiner.class);
                    }
                    return Unit.unit();
                }).orElseGet(() -> {
                    job.get().setNumReduceTasks(0);
                    return Unit.unit();
                });

        // job.setReducerClass(BatchEnrichmentJob.BatchEnrichmentReducer.class);

        // Input format:
        inputBuilder.build(job.get());

        // Output format (doesn't really do anything, all the actual output code is performed by the mapper via the enrichment context)
        job.get().setOutputFormatClass(BeFileOutputFormat.class);

        // Submit the job for processing
        launch(job.get());

        // Wait for the job to complete and collect the data
        //            job.get().waitForCompletion(true);

        return Validation.success(job.get());

    } catch (Throwable t) {
        Throwable tt = (t instanceof RuntimeException) ? (null != t.getCause()) ? t.getCause() : t : t;

        if (tt instanceof org.apache.hadoop.mapreduce.lib.input.InvalidInputException) {
            // Probably a benign "no matching paths", so return pithy error
            return Validation.fail(ErrorUtils.get("{0}", tt.getMessage()));
        } else { // General error : Dump the config params to string         
            if (job.isSet()) {
                logger.error(ErrorUtils.get("Error submitting, config= {0}",
                        Optionals.streamOf(job.get().getConfiguration().iterator(), false)
                                .map(kv -> kv.getKey() + ":" + kv.getValue())
                                .collect(Collectors.joining("; "))));
            }
            return Validation.fail(ErrorUtils.getLongForm("{0}", tt));
        }
    } finally {
        Thread.currentThread().setContextClassLoader(currentClassloader);
    }

}

From source file:com.ikanow.aleph2.analytics.spark.utils.SparkTechnologyUtils.java

License:Apache License

/** Builds objects for all the aleph2 inputs and provides a method to use them in context-dependent ways 
 * @param context
 * @param bucket
 * @param job
 * @param config
 * @param per_input_action - user lambda that determines how they are used
 */
public static final void buildAleph2Inputs(final IAnalyticsContext context, final DataBucketBean bucket,
        final AnalyticThreadJobBean job, final Optional<ProcessingTestSpecBean> maybe_test_spec,
        final Configuration config, final Set<String> exclude_names,
        BiConsumer<AnalyticThreadJobInputBean, Job> per_input_action) {
    transformInputBean(Optionals.ofNullable(job.inputs()).stream(), maybe_test_spec)
            .filter(input -> !exclude_names.contains(input.name()))
            .forEach(Lambdas.wrap_consumer_u(input_with_test_settings -> {

                final Optional<IBucketLogger> a2_logger = Optional
                        .ofNullable(context.getLogger(Optional.of(bucket)));

                final List<String> paths = context.getInputPaths(Optional.empty(), job,
                        input_with_test_settings);

                if (!paths.isEmpty()) {

                    _logger.info(ErrorUtils.get("Adding storage paths for bucket {0}: {1}", bucket.full_name(),
                            paths.stream().collect(Collectors.joining(";"))));

                    a2_logger.ifPresent(l -> l.log(Level.INFO, true,
                            () -> ErrorUtils.get("Adding storage paths for bucket {0}: {1}", bucket.full_name(),
                                    paths.stream().collect(Collectors.joining(";"))),
                            () -> SparkTechnologyService.class.getSimpleName() + "."
                                    + Optional.ofNullable(job.name()).orElse("no_name"),
                            () -> "startAnalyticJobOrTest"));

                    //DEBUG
                    //System.out.println(ErrorUtils.get("Adding storage paths for bucket {0}: {1}", bucket.full_name(), paths.stream().collect(Collectors.joining(";"))));   

                    final Job input_job = Job.getInstance(config);
                    input_job.setInputFormatClass(BeFileInputFormat_Pure.class);
                    paths.stream().forEach(Lambdas
                            .wrap_consumer_u(path -> FileInputFormat.addInputPath(input_job, new Path(path))));
                    // (Add the input config in)
                    input_job.getConfiguration().set(HadoopBatchEnrichmentUtils.BE_BUCKET_INPUT_CONFIG,
                            BeanTemplateUtils.toJson(input_with_test_settings).toString());
                    per_input_action.accept(input_with_test_settings, input_job);
                } else { // not easily available in HDFS directory format, try getting from the context

                    Optional<HadoopBatchEnrichmentUtils.HadoopAccessContext> input_format_info = context
                            .getServiceInput(HadoopBatchEnrichmentUtils.HadoopAccessContext.class,
                                    Optional.empty(), job, input_with_test_settings);
                    if (!input_format_info.isPresent()) {
                        _logger.warn(ErrorUtils.get("Tried but failed to get input format from {0}",
                                BeanTemplateUtils.toJson(input_with_test_settings)));

                        a2_logger.ifPresent(l -> l.log(Level.WARN, true,
                                () -> ErrorUtils.get("Tried but failed to get input format from {0}",
                                        BeanTemplateUtils.toJson(input_with_test_settings)),
                                () -> SparkTechnologyService.class.getSimpleName() + "."
                                        + Optional.ofNullable(job.name()).orElse("no_name"),
                                () -> "startAnalyticJobOrTest"));

                        //DEBUG
                        //System.out.println(ErrorUtils.get("Tried but failed to get input format from {0}", BeanTemplateUtils.toJson(input_with_test_settings)));
                    } else {
                        _logger.info(ErrorUtils.get("Adding data service path for bucket {0}: {1}",
                                bucket.full_name(), input_format_info.get().describe()));

                        a2_logger.ifPresent(l -> l.log(Level.INFO, true,
                                () -> ErrorUtils.get("Adding data service path for bucket {0}: {1}",
                                        bucket.full_name(), input_format_info.get().describe()),
                                () -> SparkTechnologyService.class.getSimpleName() + "."
                                        + Optional.ofNullable(job.name()).orElse("no_name"),
                                () -> "startAnalyticJobOrTest"));

                        //DEBUG
                        //System.out.println(ErrorUtils.get("Adding data service path for bucket {0}: {1}", bucket.full_name(),input_format_info.get().describe()));

                        final Job input_job = Job.getInstance(config);
                        input_job.setInputFormatClass(
                                input_format_info.get().getAccessService().either(l -> l.getClass(), r -> r));
                        input_format_info.get().getAccessConfig().ifPresent(map -> {
                            map.entrySet().forEach(kv -> input_job.getConfiguration().set(kv.getKey(),
                                    kv.getValue().toString()));
                        });
                        per_input_action.accept(input_with_test_settings, input_job);
                    }
                }
            }));
}