Example usage for org.apache.hadoop.mapreduce Job setInputFormatClass

Introduction

This page collects example usages of org.apache.hadoop.mapreduce.Job.setInputFormatClass.

Prototype

public void setInputFormatClass(Class<? extends InputFormat> cls) throws IllegalStateException 

Document

Set the InputFormat for the job.
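
Before the project-specific examples below, here is a minimal, self-contained driver added for orientation only; the class name and the /tmp paths are placeholders and are not taken from any of the source files listed under Usage. It shows the call in its simplest setting: an identity, map-only job whose input format is set before submission (after submission the method throws IllegalStateException).

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class SetInputFormatExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "setInputFormatClass example");
        job.setJarByClass(SetInputFormatExample.class);

        // The call this page documents: it must be made before the job is
        // submitted, otherwise an IllegalStateException is thrown.
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        // Identity map-only job: TextInputFormat produces <LongWritable, Text>
        // records, which the base Mapper passes through unchanged.
        job.setMapperClass(Mapper.class);
        job.setNumReduceTasks(0);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        // Placeholder paths - replace with real input/output locations.
        FileInputFormat.addInputPath(job, new Path("/tmp/example/input"));
        FileOutputFormat.setOutputPath(job, new Path("/tmp/example/output"));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}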

Usage

From source file:com.howbuy.hadoop.mr.online.SecondarySort.java

License:Apache License

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: secondarysrot <in> <out>");
        System.exit(2);/*from   www  .j a v a2 s  . co m*/
    }
    Job job = new Job(conf, "secondary sort");
    job.setJarByClass(SecondarySort.class);
    job.setMapperClass(MapClass.class);
    job.setReducerClass(Reduce.class);

    // group and partition by the first int in the pair
    job.setPartitionerClass(FirstPartitioner.class);
    job.setGroupingComparatorClass(FirstGroupingComparator.class);

    // the map output is IntPair, IntWritable
    job.setMapOutputKeyClass(IntPair.class);
    job.setMapOutputValueClass(IntWritable.class);

    // the reduce output is Text, IntWritable
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setInputFormatClass(TextInputFormat.class);
    // job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setNumReduceTasks(3);

    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file:com.iflytek.spider.crawl.CrawlDb.java

License:Apache License

public static Job createJob(Configuration config, Path crawlDb) throws IOException {
    Path newCrawlDb = new Path(crawlDb, Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    Job job = AvroJob.getAvroJob(config);
    job.setJobName("crawldb " + crawlDb);

    Path current = new Path(crawlDb, CURRENT_NAME);
    if (FileSystem.get(config).exists(current)) {
        FileInputFormat.addInputPath(job, current);
    }
    job.setInputFormatClass(AvroPairInputFormat.class);

    job.setMapperClass(CrawlDbFilter.class);
    job.setReducerClass(CrawlDbReducer.class);

    FileOutputFormat.setOutputPath(job, newCrawlDb);
    job.setOutputFormatClass(AvroMapOutputFormat.class);
    job.setOutputKeyClass(String.class);
    job.setOutputValueClass(CrawlDatum.class);

    return job;
}

From source file:com.iflytek.spider.crawl.GeneratorSmart.java

License:Apache License

/**
 * Generate fetchlists in one or more segments. Whether to filter URLs or not
 * is read from the crawl.generate.filter property in the configuration files.
 * If the property is not found, the URLs are filtered. Same for the
 * normalisation.
 * 
 * @param dbDir
 *          Crawl database directory
 * @param segments
 *          Segments directory
 * @param numLists
 *          Number of reduce tasks
 * @param curTime
 *          Current time in milliseconds
 * 
 * @return Path to generated segment or null if no entries were selected
 * 
 * @throws IOException
 *           When an I/O error occurs
 * @throws ClassNotFoundException
 * @throws InterruptedException
 */
public Path[] generate(Path dbDir, Path segments, int numLists, long curTime, boolean force)
        throws IOException, InterruptedException, ClassNotFoundException {
    //getConf().set("mapred.temp.dir", "d:/tmp");
    Path tempDir = new Path(
            getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

    Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
    FileSystem fs = FileSystem.get(getConf());
    LockUtil.createLockFile(fs, lock, force);

    LOG.info("Generator: Selecting best-scoring urls due for fetch.");
    LOG.info("Generator: starting");

    Job job = AvroJob.getAvroJob(getConf());
    if (numLists == -1) { // for politeness make
        numLists = job.getNumReduceTasks(); // a partition per fetch task
    }
    if ("local".equals(job.getConfiguration().get("mapred.job.tracker")) && numLists != 1) {
        // override
        LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
        numLists = 1;
    }
    LOG.info("Generator: with " + numLists + " partition.");
    job.getConfiguration().setLong(GENERATOR_CUR_TIME, curTime);
    // record real generation time
    long generateTime = System.currentTimeMillis();
    job.getConfiguration().setLong(Spider.GENERATE_TIME_KEY, generateTime);

    FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
    job.setInputFormatClass(AvroPairInputFormat.class);

    job.setMapperClass(SelectorMapper.class);
    job.setReducerClass(SelectorReducer.class);

    FileOutputFormat.setOutputPath(job, tempDir);
    //job.setOutputFormatClass(AvroPairOutputFormat.class);
    job.setOutputFormatClass(GeneratorOutputFormat.class);
    job.setOutputKeyClass(Float.class);
    job.setOutputValueClass(SelectorEntry.class);
    // AvroMultipleOutputs.addNamedOutput(job, "seq",
    // AvroPairOutputFormat.class, Float.class, SelectorEntry.class);
    try {
        job.waitForCompletion(true);
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    }

    // read the subdirectories generated in the temp
    // output and turn them into segments
    List<Path> generatedSegments = new ArrayList<Path>();

    FileStatus[] status = fs.listStatus(tempDir);
    try {
        for (FileStatus stat : status) {
            Path subfetchlist = stat.getPath();
            if (!subfetchlist.getName().startsWith("fetchlist-"))
                continue;
            // start a new partition job for this segment
            Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists);

            fs.createNewFile(new Path(newSeg, "generatored"));
            generatedSegments.add(newSeg);
        }
    } catch (Exception e) {
        LOG.warn("Generator: exception while partitioning segments, exiting ...");
        fs.delete(tempDir, true);
        return null;
    }

    if (generatedSegments.size() == 0) {
        LOG.warn("Generator: 0 records selected for fetching, exiting ...");
        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);
        return null;
    }

    if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
        // update the db from tempDir
        Path tempDir2 = new Path(
                getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

        job = AvroJob.getAvroJob(getConf());
        job.setJobName("generate: updatedb " + dbDir);
        job.getConfiguration().setLong(Spider.GENERATE_TIME_KEY, generateTime);
        for (Path segmpaths : generatedSegments) {
            Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
            FileInputFormat.addInputPath(job, subGenDir);
        }
        FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
        job.setInputFormatClass(AvroPairInputFormat.class);
        job.setMapperClass(CrawlDbUpdateMapper.class);
        // job.setReducerClass(CrawlDbUpdater.class);
        job.setOutputFormatClass(AvroMapOutputFormat.class);
        job.setOutputKeyClass(String.class);
        job.setOutputValueClass(CrawlDatum.class);
        FileOutputFormat.setOutputPath(job, tempDir2);
        try {
            job.waitForCompletion(true);
            CrawlDb.install(job, dbDir);
        } catch (IOException e) {
            LockUtil.removeLockFile(fs, lock);
            fs.delete(tempDir, true);
            fs.delete(tempDir2, true);
            throw e;
        }
        fs.delete(tempDir2, true);
    }

    LockUtil.removeLockFile(fs, lock);
    fs.delete(tempDir, true);

    if (LOG.isInfoEnabled()) {
        LOG.info("Generator: done.");
    }
    Path[] patharray = new Path[generatedSegments.size()];
    return generatedSegments.toArray(patharray);
}

From source file:com.iflytek.spider.crawl.GeneratorSmart.java

License:Apache License

private Path partitionSegment(FileSystem fs, Path segmentsDir, Path inputDir, int numLists)
        throws IOException, InterruptedException, ClassNotFoundException {
    // invert again, partition by host/domain/IP, sort by url hash
    if (LOG.isInfoEnabled()) {
        LOG.info("Generator: Partitioning selected urls for politeness:" + inputDir);
    }
    Path segment = new Path(segmentsDir, generateSegmentName());
    Path output = new Path(segment, CrawlDatum.GENERATE_DIR_NAME);

    LOG.info("Generator: segment: " + segment + " with " + numLists + " Fetchers");

    Job job = AvroJob.getAvroJob(getConf());
    job.setJobName("generate: partition " + segment);
    job.getConfiguration().setInt("partition.url.seed", new Random().nextInt());

    FileInputFormat.addInputPath(job, inputDir);
    job.setInputFormatClass(AvroPairInputFormat.class);

    job.setMapperClass(SelectorInverseMapper.class);
    job.setPartitionerClass(AveragePartition.class);
    job.setMapOutputKeyClass(String.class);
    job.setMapOutputValueClass(SelectorEntry.class);
    job.setReducerClass(PartitionReducer.class);
    job.setNumReduceTasks(numLists);

    FileOutputFormat.setOutputPath(job, output);
    job.setOutputFormatClass(AvroPairOutputFormat.class);
    job.setOutputKeyClass(String.class);
    job.setOutputValueClass(CrawlDatum.class);

    job.waitForCompletion(true);
    return segment;
}

From source file:com.iflytek.spider.parse.ParseSegment.java

License:Apache License

public void parse(Path segment) throws IOException, InterruptedException, ClassNotFoundException {

    if (LOG.isInfoEnabled()) {
        LOG.info("Parse: starting");
        LOG.info("Parse: segment: " + segment);
    }

    Job job = AvroJob.getAvroJob(getConf());
    job.setJobName("parse " + segment);

    FileInputFormat.addInputPath(job, new Path(segment, Content.DIR_NAME));
    job.getConfiguration().set(Spider.SEGMENT_NAME_KEY, segment.getName());

    job.setInputFormatClass(AvroPairInputFormat.class);
    job.setMapperClass(ParseMapper.class);

    FileOutputFormat.setOutputPath(job, segment);
    job.setOutputFormatClass(ParseOutputFormat.class);
    job.setOutputKeyClass(String.class);
    job.setOutputValueClass(UnionData.class);

    job.waitForCompletion(true);
    if (LOG.isInfoEnabled()) {
        LOG.info("Parse: done");
    }
}

From source file:com.ikanow.aleph2.analytics.hadoop.assets.Aleph2MultiInputFormatBuilder.java

License:Apache License

/** Sets the input configurations in the job
 * @param job the job to configure
 */
public Job build(final Job job) {

    job.getConfiguration().set(ALEPH2_MULTI_INPUT_FORMAT_JOBS,
            _inputs.keySet().stream().collect(Collectors.joining(",")));
    _inputs.entrySet().stream().forEach(Lambdas.wrap_consumer_u(kv -> {
        try (final Stringifier<Configuration> stringifier = new DefaultStringifier<Configuration>(
                job.getConfiguration(), Configuration.class)) {
            final Configuration new_config = new Configuration(kv.getValue().getConfiguration());
            new_config.set(ALEPH2_MULTI_INPUT_FORMAT_CLAZZ, kv.getValue().getInputFormatClass().getName());
            job.getConfiguration().set(ALEPH2_MULTI_INPUT_FORMAT_PREFIX + kv.getKey(),
                    stringifier.toString(new_config));
        }
    }));
    job.setInputFormatClass(Aleph2MultiInputFormat.class);
    return job;
}

From source file:com.ikanow.aleph2.analytics.hadoop.assets.VerySimpleLocalExample.java

License:Apache License

@SuppressWarnings({ "deprecation", "unchecked", "rawtypes" })
@Test
public void test_localHadoopLaunch()
        throws IOException, IllegalStateException, ClassNotFoundException, InterruptedException {

    // 0) Setup the temp dir 
    final String temp_dir = System.getProperty("java.io.tmpdir") + File.separator;
    //final Path tmp_path = FileContext.getLocalFSFileContext().makeQualified(new Path(temp_dir));
    final Path tmp_path2 = FileContext.getLocalFSFileContext()
            .makeQualified(new Path(temp_dir + "/tmp_output"));
    try {
        FileContext.getLocalFSFileContext().delete(tmp_path2, true);
    } catch (Exception e) {
    } // (just doesn't exist yet)

    // 1) Setup config with local mode
    final Configuration config = new Configuration();
    config.setBoolean("mapred.used.genericoptionsparser", true); // (just stops an annoying warning from appearing)
    config.set("fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem");
    config.set("mapred.job.tracker", "local");
    config.set("fs.defaultFS", "local");
    config.unset("mapreduce.framework.name");

    // If running locally, turn "snappy" off - tomcat isn't pointing its native library path in the right place
    config.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.DefaultCodec");

    // 2) Build job and do more setup using the Job API
    //TODO: not sure why this is deprecated, it doesn't seem to be in v1? We do need to move to JobConf at some point, but I ran into some 
    // issues when trying to do everything I needed to for V1, so seems expedient to start here and migrate away
    final Job hj = new Job(config); // (NOTE: from here, changes to config are ignored)

    // Input format:
    //TODO: fails because of guava issue, looks like we'll need to move to 2.7 and check it works with 2.5.x server?
    //TextInputFormat.addInputPath(hj, tmp_path);
    //hj.setInputFormatClass((Class<? extends InputFormat>) Class.forName ("org.apache.hadoop.mapreduce.lib.input.TextInputFormat"));
    hj.setInputFormatClass(TestInputFormat.class);

    // Output format:
    hj.setOutputFormatClass((Class<? extends OutputFormat>) Class
            .forName("org.apache.hadoop.mapreduce.lib.output.TextOutputFormat"));
    TextOutputFormat.setOutputPath(hj, tmp_path2);

    // Mapper etc (combiner/reducer are similar)
    hj.setMapperClass(TestMapper.class);
    hj.setOutputKeyClass(Text.class);
    hj.setOutputValueClass(Text.class);
    hj.setNumReduceTasks(0); // (disable reducer for now)

    hj.setJar("test");

    try {
        hj.submit();
    } catch (UnsatisfiedLinkError e) {
        throw new RuntimeException(
                "This is a windows/hadoop compatibility problem - adding the hadoop-commons in the misc_test_assets subdirectory to the top of the classpath should resolve it (and does in V1), though I haven't yet made that work with Aleph2",
                e);
    }
    //hj.getJobID().toString();
    while (!hj.isComplete()) {
        Thread.sleep(1000);
    }
    assertTrue("Finished successfully", hj.isSuccessful());
}

From source file:com.ikanow.aleph2.analytics.hadoop.services.BeJobLauncher.java

License:Open Source License

@Override
public Validation<String, Job> runEnhancementJob(final DataBucketBean bucket,
        final Optional<ProcessingTestSpecBean> testSpec) {

    final Configuration config = getHadoopConfig();

    final ClassLoader currentClassloader = Thread.currentThread().getContextClassLoader();
    //(not currently used, but has proven useful in the past)

    try {
        final String contextSignature = _batchEnrichmentContext
                .getEnrichmentContextSignature(Optional.of(bucket), Optional.empty());
        config.set(BatchEnrichmentJob.BE_CONTEXT_SIGNATURE, contextSignature);

        final Optional<Long> debug_max = testSpec
                .flatMap(testSpecVals -> Optional.ofNullable(testSpecVals.requested_num_objects()));

        //then gets applied to all the inputs:
        debug_max.ifPresent(val -> config.set(BatchEnrichmentJob.BE_DEBUG_MAX_SIZE, val.toString()));

        final Aleph2MultiInputFormatBuilder inputBuilder = new Aleph2MultiInputFormatBuilder();

        // Create a separate InputFormat for every input (makes testing life easier)

        Optional.ofNullable(_batchEnrichmentContext.getJob().inputs()).orElse(Collections.emptyList()).stream()
                .forEach(Lambdas.wrap_consumer_u(input -> {
                    final List<String> paths = _batchEnrichmentContext.getAnalyticsContext()
                            .getInputPaths(Optional.of(bucket), _batchEnrichmentContext.getJob(), input);
                    final Job inputJob = Job.getInstance(config);
                    inputJob.setInputFormatClass(BeFileInputFormat.class);
                    paths.stream().forEach(Lambdas
                            .wrap_consumer_u(path -> FileInputFormat.addInputPath(inputJob, new Path(path))));
                    inputBuilder.addInput(UuidUtils.get().getRandomUuid(), inputJob);
                }));

        // (ALEPH-12): other input format types

        // Now do everything else

        final String jobName = BucketUtils.getUniqueSignature(bucket.full_name(),
                Optional.ofNullable(_batchEnrichmentContext.getJob().name()));

        // do not set anything into config past this line (can set job.getConfiguration() elements though - that is what the builder does)
        Job job = Job.getInstance(config, jobName);
        job.setJarByClass(BatchEnrichmentJob.class);

        // Set the classpath

        cacheJars(job, bucket, _batchEnrichmentContext.getAnalyticsContext());

        // (generic mapper - the actual code is run using the classes in the shared libraries)
        job.setMapperClass(BatchEnrichmentJob.BatchEnrichmentMapper.class);

        //TODO: ALEPH-12 handle reducer scenarios
        job.setNumReduceTasks(0);
        //job.setReducerClass(BatchEnrichmentJob.BatchEnrichmentReducer.class);

        // Input format:
        inputBuilder.build(job);

        // Output format (doesn't really do anything, all the actual output code is performed by the mapper via the enrichment context)
        job.setOutputFormatClass(BeFileOutputFormat.class);

        launch(job);
        return Validation.success(job);

    } catch (Throwable t) {
        logger.error("Caught Exception", t);
        return Validation.fail(ErrorUtils.getLongForm("{0}", t));
    } finally {
        Thread.currentThread().setContextClassLoader(currentClassloader);
    }

}

From source file:com.ikanow.aleph2.analytics.r.services.BeJobLauncher.java

License:Apache License

@Override
public Validation<String, Job> runEnhancementJob(final DataBucketBean bucket,
        final Optional<ProcessingTestSpecBean> testSpec) {

    final Configuration config = getHadoopConfig();

    final ClassLoader currentClassloader = Thread.currentThread().getContextClassLoader();
    //(not currently used, but has proven useful in the past)

    final SetOnce<Job> job = new SetOnce<>();
    try {
        final Optional<Long> debug_max = testSpec
                .flatMap(testSpecVals -> Optional.ofNullable(testSpecVals.requested_num_objects()));

        //then gets applied to all the inputs:
        debug_max.ifPresent(val -> config.set(BatchEnrichmentJob.BE_DEBUG_MAX_SIZE, val.toString()));

        final Aleph2MultiInputFormatBuilder inputBuilder = new Aleph2MultiInputFormatBuilder();

        // Validation:

        try {
            final BatchEnrichmentJob.BatchEnrichmentBaseValidator validator = new BatchEnrichmentJob.BatchEnrichmentBaseValidator();
            validator.setDataBucket(bucket);
            validator.setEnrichmentContext(_batchEnrichmentContext);
            validator.setEcMetadata(
                    Optional.ofNullable(bucket.batch_enrichment_configs()).orElse(Collections.emptyList()));
            final List<BasicMessageBean> errs = validator.validate();
            if (errs.stream().anyMatch(b -> !b.success())) {
                return Validation.fail(ErrorUtils.get("Validation errors for {0}: {1}", bucket.full_name(),
                        errs.stream().map(
                                b -> ErrorUtils.get("{0}: {1}", b.success() ? "INFO" : "ERROR", b.message()))
                                .collect(Collectors.joining(";"))));
            }
        } catch (Throwable t) { // we'll log but carry on in this case...(in case there's some classloading shenanigans which won't affect the operation in hadoop)
            logger.error(
                    ErrorUtils.getLongForm("Failed validation, bucket: {1} error: {0}", t, bucket.full_name()));
        }

        // Create a separate InputFormat for every input (makes testing life easier)

        Optional.ofNullable(_batchEnrichmentContext.getJob().inputs()).orElse(Collections.emptyList()).stream()
                .filter(input -> Optional.ofNullable(input.enabled()).orElse(true))
                .forEach(Lambdas.wrap_consumer_u(input -> {
                    // In the debug case, transform the input to add the max record limit
                    final AnalyticThreadJobInputBean input_with_test_settings = BeanTemplateUtils.clone(input)
                            .with(AnalyticThreadJobInputBean::config, BeanTemplateUtils
                                    .clone(Optional.ofNullable(input.config()).orElseGet(() -> BeanTemplateUtils
                                            .build(AnalyticThreadJobInputConfigBean.class).done().get()))
                                    .with(AnalyticThreadJobInputConfigBean::test_record_limit_request, //(if not test, always null; else "input override" or "output default")
                                            debug_max.map(max -> Optionals
                                                    .of(() -> input.config().test_record_limit_request())
                                                    .orElse(max)).orElse(null))
                                    .done())
                            .done();

                    // Get the paths and add them to a list for later
                    final List<String> paths = _batchEnrichmentContext.getAnalyticsContext().getInputPaths(
                            Optional.of(bucket), _batchEnrichmentContext.getJob(), input_with_test_settings);

                    RScriptUtils.addFilePaths(paths);

                    if (!paths.isEmpty()) {

                        logger.info(ErrorUtils.get("Adding storage paths for bucket {0}: {1}",
                                bucket.full_name(), paths.stream().collect(Collectors.joining(";"))));

                        final Job inputJob = Job.getInstance(config);
                        inputJob.setInputFormatClass(BeFileInputFormat.class);
                        paths.stream().forEach(Lambdas.wrap_consumer_u(
                                path -> FileInputFormat.addInputPath(inputJob, new Path(path))));
                        inputBuilder.addInput(UuidUtils.get().getRandomUuid(), inputJob);
                    } else { // not easily available in HDFS directory format, try getting from the context

                        Optional<HadoopAccessContext> input_format_info = _batchEnrichmentContext
                                .getAnalyticsContext().getServiceInput(HadoopAccessContext.class,
                                        Optional.of(bucket), _batchEnrichmentContext.getJob(),
                                        input_with_test_settings);
                        if (!input_format_info.isPresent()) {
                            logger.warn(ErrorUtils.get("Tried but failed to get input format from {0}",
                                    BeanTemplateUtils.toJson(input_with_test_settings)));
                        } else {
                            logger.info(ErrorUtils.get("Adding data service path for bucket {0}: {1}",
                                    bucket.full_name(), input_format_info.get().describe()));

                            final Job inputJob = Job.getInstance(config);
                            inputJob.setInputFormatClass(input_format_info.get().getAccessService()
                                    .either(l -> l.getClass(), r -> r));
                            input_format_info.get().getAccessConfig().ifPresent(map -> {
                                map.entrySet().forEach(kv -> inputJob.getConfiguration().set(kv.getKey(),
                                        kv.getValue().toString()));
                            });

                            inputBuilder.addInput(UuidUtils.get().getRandomUuid(), inputJob);
                        }
                    }

                }));

        // (ALEPH-12): other input format types

        // Now do everything else

        final String contextSignature = _batchEnrichmentContext
                .getEnrichmentContextSignature(Optional.of(bucket), Optional.empty());
        config.set(BatchEnrichmentJob.BE_CONTEXT_SIGNATURE, contextSignature);

        final String jobName = BucketUtils.getUniqueSignature(bucket.full_name(),
                Optional.ofNullable(_batchEnrichmentContext.getJob().name()));

        this.handleHadoopConfigOverrides(bucket, config);

        // do not set anything into config past this line (can set job.getConfiguration() elements though - that is what the builder does)
        job.set(Job.getInstance(config, jobName));
        job.get().setJarByClass(BatchEnrichmentJob.class);
        job.get().setSortComparatorClass(ObjectNodeWritableComparable.Comparator.class); //(avoid deser of json node for intermediate things)

        // Set the classpath

        cacheJars(job.get(), bucket, _batchEnrichmentContext.getAnalyticsContext());

        // (generic mapper - the actual code is run using the classes in the shared libraries)
        job.get().setMapperClass(BatchEnrichmentJob.BatchEnrichmentMapper.class);
        job.get().setMapOutputKeyClass(ObjectNodeWritableComparable.class);
        job.get().setMapOutputValueClass(ObjectNodeWritableComparable.class);

        // (combiner and reducer)
        Optional.ofNullable(bucket.batch_enrichment_configs()).orElse(Collections.emptyList()).stream()
                .filter(cfg -> Optional.ofNullable(cfg.enabled()).orElse(true))
                .filter(cfg -> !Optionals.ofNullable(cfg.grouping_fields()).isEmpty()).findAny().map(cfg -> {
                    final HadoopTechnologyOverrideBean tech_override = BeanTemplateUtils
                            .from(Optional.ofNullable(cfg.technology_override()).orElse(Collections.emptyMap()),
                                    HadoopTechnologyOverrideBean.class)
                            .get();

                    job.get().setNumReduceTasks(Optional.ofNullable(tech_override.num_reducers()).orElse(2));
                    job.get().setReducerClass(BatchEnrichmentJob.BatchEnrichmentReducer.class);

                    if (tech_override.use_combiner()) {
                        job.get().setCombinerClass(BatchEnrichmentJob.BatchEnrichmentCombiner.class);
                    }
                    return Unit.unit();
                }).orElseGet(() -> {
                    job.get().setNumReduceTasks(0);
                    return Unit.unit();
                });

        // job.setReducerClass(BatchEnrichmentJob.BatchEnrichmentReducer.class);

        // Input format:
        inputBuilder.build(job.get());

        // Output format (doesn't really do anything, all the actual output code is performed by the mapper via the enrichment context)
        job.get().setOutputFormatClass(BeFileOutputFormat.class);

        // Submit the job for processing
        launch(job.get());

        // Wait for the job to complete and collect the data
        //            job.get().waitForCompletion(true);

        return Validation.success(job.get());

    } catch (Throwable t) {
        Throwable tt = (t instanceof RuntimeException) ? (null != t.getCause()) ? t.getCause() : t : t;

        if (tt instanceof org.apache.hadoop.mapreduce.lib.input.InvalidInputException) {
            // Probably a benign "no matching paths", so return pithy error
            return Validation.fail(ErrorUtils.get("{0}", tt.getMessage()));
        } else { // General error : Dump the config params to string         
            if (job.isSet()) {
                logger.error(ErrorUtils.get("Error submitting, config= {0}",
                        Optionals.streamOf(job.get().getConfiguration().iterator(), false)
                                .map(kv -> kv.getKey() + ":" + kv.getValue())
                                .collect(Collectors.joining("; "))));
            }
            return Validation.fail(ErrorUtils.getLongForm("{0}", tt));
        }
    } finally {
        Thread.currentThread().setContextClassLoader(currentClassloader);
    }

}

From source file:com.ikanow.aleph2.analytics.spark.utils.SparkTechnologyUtils.java

License:Apache License

/** Builds objects for all the aleph2 inputs and provides a method to use them in context-dependent ways 
 * @param context
 * @param bucket
 * @param job
 * @param config
 * @param per_input_action - user lambda that determines how they are used
 */
public static final void buildAleph2Inputs(final IAnalyticsContext context, final DataBucketBean bucket,
        final AnalyticThreadJobBean job, final Optional<ProcessingTestSpecBean> maybe_test_spec,
        final Configuration config, final Set<String> exclude_names,
        BiConsumer<AnalyticThreadJobInputBean, Job> per_input_action) {
    transformInputBean(Optionals.ofNullable(job.inputs()).stream(), maybe_test_spec)
            .filter(input -> !exclude_names.contains(input.name()))
            .forEach(Lambdas.wrap_consumer_u(input_with_test_settings -> {

                final Optional<IBucketLogger> a2_logger = Optional
                        .ofNullable(context.getLogger(Optional.of(bucket)));

                final List<String> paths = context.getInputPaths(Optional.empty(), job,
                        input_with_test_settings);

                if (!paths.isEmpty()) {

                    _logger.info(ErrorUtils.get("Adding storage paths for bucket {0}: {1}", bucket.full_name(),
                            paths.stream().collect(Collectors.joining(";"))));

                    a2_logger.ifPresent(l -> l.log(Level.INFO, true,
                            () -> ErrorUtils.get("Adding storage paths for bucket {0}: {1}", bucket.full_name(),
                                    paths.stream().collect(Collectors.joining(";"))),
                            () -> SparkTechnologyService.class.getSimpleName() + "."
                                    + Optional.ofNullable(job.name()).orElse("no_name"),
                            () -> "startAnalyticJobOrTest"));

                    //DEBUG
                    //System.out.println(ErrorUtils.get("Adding storage paths for bucket {0}: {1}", bucket.full_name(), paths.stream().collect(Collectors.joining(";"))));   

                    final Job input_job = Job.getInstance(config);
                    input_job.setInputFormatClass(BeFileInputFormat_Pure.class);
                    paths.stream().forEach(Lambdas
                            .wrap_consumer_u(path -> FileInputFormat.addInputPath(input_job, new Path(path))));
                    // (Add the input config in)
                    input_job.getConfiguration().set(HadoopBatchEnrichmentUtils.BE_BUCKET_INPUT_CONFIG,
                            BeanTemplateUtils.toJson(input_with_test_settings).toString());
                    per_input_action.accept(input_with_test_settings, input_job);
                } else { // not easily available in HDFS directory format, try getting from the context

                    Optional<HadoopBatchEnrichmentUtils.HadoopAccessContext> input_format_info = context
                            .getServiceInput(HadoopBatchEnrichmentUtils.HadoopAccessContext.class,
                                    Optional.empty(), job, input_with_test_settings);
                    if (!input_format_info.isPresent()) {
                        _logger.warn(ErrorUtils.get("Tried but failed to get input format from {0}",
                                BeanTemplateUtils.toJson(input_with_test_settings)));

                        a2_logger.ifPresent(l -> l.log(Level.WARN, true,
                                () -> ErrorUtils.get("Tried but failed to get input format from {0}",
                                        BeanTemplateUtils.toJson(input_with_test_settings)),
                                () -> SparkTechnologyService.class.getSimpleName() + "."
                                        + Optional.ofNullable(job.name()).orElse("no_name"),
                                () -> "startAnalyticJobOrTest"));

                        //DEBUG
                        //System.out.println(ErrorUtils.get("Tried but failed to get input format from {0}", BeanTemplateUtils.toJson(input_with_test_settings)));
                    } else {
                        _logger.info(ErrorUtils.get("Adding data service path for bucket {0}: {1}",
                                bucket.full_name(), input_format_info.get().describe()));

                        a2_logger.ifPresent(l -> l.log(Level.INFO, true,
                                () -> ErrorUtils.get("Adding data service path for bucket {0}: {1}",
                                        bucket.full_name(), input_format_info.get().describe()),
                                () -> SparkTechnologyService.class.getSimpleName() + "."
                                        + Optional.ofNullable(job.name()).orElse("no_name"),
                                () -> "startAnalyticJobOrTest"));

                        //DEBUG
                        //System.out.println(ErrorUtils.get("Adding data service path for bucket {0}: {1}", bucket.full_name(),input_format_info.get().describe()));

                        final Job input_job = Job.getInstance(config);
                        input_job.setInputFormatClass(
                                input_format_info.get().getAccessService().either(l -> l.getClass(), r -> r));
                        input_format_info.get().getAccessConfig().ifPresent(map -> {
                            map.entrySet().forEach(kv -> input_job.getConfiguration().set(kv.getKey(),
                                    kv.getValue().toString()));
                        });
                        per_input_action.accept(input_with_test_settings, input_job);
                    }
                }
            }));
}