List of usage examples for org.apache.hadoop.mapreduce.Job#setInputFormatClass
public void setInputFormatClass(Class<? extends InputFormat> cls) throws IllegalStateException
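This method sets the InputFormat implementation for the job and must be called before the job is submitted; calling it afterwards throws IllegalStateException. Before the examples from real projects below, here is a minimal, self-contained sketch of the typical call pattern (class name, mapper choice, and argument paths are illustrative, not from any of the source files):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class MinimalJobSetup {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "minimal example");
        job.setJarByClass(MinimalJobSetup.class);
        // Must be set before submission; throws IllegalStateException once the job is running
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setMapperClass(Mapper.class); // identity mapper
        job.setNumReduceTasks(0); // map-only: map output is written directly
        // TextInputFormat produces LongWritable offsets and Text lines
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0])); // illustrative paths
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}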
From source file:com.howbuy.hadoop.mr.online.SecondarySort.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: secondarysort <in> <out>");
        System.exit(2);
    }
    Job job = new Job(conf, "secondary sort");
    job.setJarByClass(SecondarySort.class);
    job.setMapperClass(MapClass.class);
    job.setReducerClass(Reduce.class);

    // group and partition by the first int in the pair
    job.setPartitionerClass(FirstPartitioner.class);
    job.setGroupingComparatorClass(FirstGroupingComparator.class);

    // the map output is IntPair, IntWritable
    job.setMapOutputKeyClass(IntPair.class);
    job.setMapOutputValueClass(IntWritable.class);

    // the reduce output is Text, IntWritable
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setInputFormatClass(TextInputFormat.class);
    // job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setNumReduceTasks(3);

    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
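Note that the `new Job(conf, ...)` constructor used above is deprecated in Hadoop 2.x; an equivalent setup with the non-deprecated factory method (only the construction call changes, everything else stays the same) would begin:

// Preferred in Hadoop 2.x onwards; replaces the deprecated new Job(conf, ...)
Job job = Job.getInstance(conf, "secondary sort");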
From source file:com.iflytek.spider.crawl.CrawlDb.java
License:Apache License
public static Job createJob(Configuration config, Path crawlDb) throws IOException {
    Path newCrawlDb = new Path(crawlDb, Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    Job job = AvroJob.getAvroJob(config);
    job.setJobName("crawldb " + crawlDb);

    Path current = new Path(crawlDb, CURRENT_NAME);
    if (FileSystem.get(config).exists(current)) {
        FileInputFormat.addInputPath(job, current);
    }
    job.setInputFormatClass(AvroPairInputFormat.class);

    job.setMapperClass(CrawlDbFilter.class);
    job.setReducerClass(CrawlDbReducer.class);

    FileOutputFormat.setOutputPath(job, newCrawlDb);
    job.setOutputFormatClass(AvroMapOutputFormat.class);
    job.setOutputKeyClass(String.class);
    job.setOutputValueClass(CrawlDatum.class);

    return job;
}
From source file:com.iflytek.spider.crawl.GeneratorSmart.java
License:Apache License
/**
 * Generate fetchlists in one or more segments. Whether to filter URLs or not
 * is read from the crawl.generate.filter property in the configuration files.
 * If the property is not found, the URLs are filtered. Same for the
 * normalisation.
 *
 * @param dbDir
 *          Crawl database directory
 * @param segments
 *          Segments directory
 * @param numLists
 *          Number of reduce tasks
 * @param curTime
 *          Current time in milliseconds
 *
 * @return Path to generated segment or null if no entries were selected
 *
 * @throws IOException
 *           When an I/O error occurs
 * @throws ClassNotFoundException
 * @throws InterruptedException
 */
public Path[] generate(Path dbDir, Path segments, int numLists, long curTime, boolean force)
        throws IOException, InterruptedException, ClassNotFoundException {
    //getConf().set("mapred.temp.dir", "d:/tmp");
    Path tempDir = new Path(
            getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

    Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
    FileSystem fs = FileSystem.get(getConf());
    LockUtil.createLockFile(fs, lock, force);

    LOG.info("Generator: Selecting best-scoring urls due for fetch.");
    LOG.info("Generator: starting");

    Job job = AvroJob.getAvroJob(getConf());
    if (numLists == -1) { // for politeness make
        numLists = job.getNumReduceTasks(); // a partition per fetch task
    }
    if ("local".equals(job.getConfiguration().get("mapred.job.tracker")) && numLists != 1) {
        // override
        LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
        numLists = 1;
    }
    LOG.info("Generator: with " + numLists + " partition.");
    job.getConfiguration().setLong(GENERATOR_CUR_TIME, curTime);
    // record real generation time
    long generateTime = System.currentTimeMillis();
    job.getConfiguration().setLong(Spider.GENERATE_TIME_KEY, generateTime);

    FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
    job.setInputFormatClass(AvroPairInputFormat.class);

    job.setMapperClass(SelectorMapper.class);
    job.setReducerClass(SelectorReducer.class);

    FileOutputFormat.setOutputPath(job, tempDir);
    //job.setOutputFormatClass(AvroPairOutputFormat.class);
    job.setOutputFormatClass(GeneratorOutputFormat.class);
    job.setOutputKeyClass(Float.class);
    job.setOutputValueClass(SelectorEntry.class);
    // AvroMultipleOutputs.addNamedOutput(job, "seq",
    //     AvroPairOutputFormat.class, Float.class, SelectorEntry.class);

    try {
        job.waitForCompletion(true);
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    }

    // read the subdirectories generated in the temp
    // output and turn them into segments
    List<Path> generatedSegments = new ArrayList<Path>();

    FileStatus[] status = fs.listStatus(tempDir);
    try {
        for (FileStatus stat : status) {
            Path subfetchlist = stat.getPath();
            if (!subfetchlist.getName().startsWith("fetchlist-"))
                continue;
            // start a new partition job for this segment
            Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists);

            fs.createNewFile(new Path(newSeg, "generatored"));
            generatedSegments.add(newSeg);
        }
    } catch (Exception e) {
        LOG.warn("Generator: exception while partitioning segments, exiting ...");
        fs.delete(tempDir, true);
        return null;
    }

    if (generatedSegments.size() == 0) {
        LOG.warn("Generator: 0 records selected for fetching, exiting ...");
        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);
        return null;
    }

    if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
        // update the db from tempDir
        Path tempDir2 = new Path(
                getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

        job = AvroJob.getAvroJob(getConf());
        job.setJobName("generate: updatedb " + dbDir);
        job.getConfiguration().setLong(Spider.GENERATE_TIME_KEY, generateTime);

        for (Path segmpaths : generatedSegments) {
            Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
            FileInputFormat.addInputPath(job, subGenDir);
        }
        FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
        job.setInputFormatClass(AvroPairInputFormat.class);
        job.setMapperClass(CrawlDbUpdateMapper.class);
        // job.setReducerClass(CrawlDbUpdater.class);
        job.setOutputFormatClass(AvroMapOutputFormat.class);
        job.setOutputKeyClass(String.class);
        job.setOutputValueClass(CrawlDatum.class);
        FileOutputFormat.setOutputPath(job, tempDir2);

        try {
            job.waitForCompletion(true);
            CrawlDb.install(job, dbDir);
        } catch (IOException e) {
            LockUtil.removeLockFile(fs, lock);
            fs.delete(tempDir, true);
            fs.delete(tempDir2, true);
            throw e;
        }
        fs.delete(tempDir2, true);
    }

    LockUtil.removeLockFile(fs, lock);
    fs.delete(tempDir, true);

    if (LOG.isInfoEnabled()) {
        LOG.info("Generator: done.");
    }
    Path[] patharray = new Path[generatedSegments.size()];
    return generatedSegments.toArray(patharray);
}
From source file:com.iflytek.spider.crawl.GeneratorSmart.java
License:Apache License
private Path partitionSegment(FileSystem fs, Path segmentsDir, Path inputDir, int numLists)
        throws IOException, InterruptedException, ClassNotFoundException {
    // invert again, partition by host/domain/IP, sort by url hash
    if (LOG.isInfoEnabled()) {
        LOG.info("Generator: Partitioning selected urls for politeness:" + inputDir);
    }
    Path segment = new Path(segmentsDir, generateSegmentName());
    Path output = new Path(segment, CrawlDatum.GENERATE_DIR_NAME);

    LOG.info("Generator: segment: " + segment + " with " + numLists + " Fetchers");

    Job job = AvroJob.getAvroJob(getConf());
    job.setJobName("generate: partition " + segment);
    job.getConfiguration().setInt("partition.url.seed", new Random().nextInt());

    FileInputFormat.addInputPath(job, inputDir);
    job.setInputFormatClass(AvroPairInputFormat.class);

    job.setMapperClass(SelectorInverseMapper.class);
    job.setPartitionerClass(AveragePartition.class);
    job.setMapOutputKeyClass(String.class);
    job.setMapOutputValueClass(SelectorEntry.class);
    job.setReducerClass(PartitionReducer.class);
    job.setNumReduceTasks(numLists);

    FileOutputFormat.setOutputPath(job, output);
    job.setOutputFormatClass(AvroPairOutputFormat.class);
    job.setOutputKeyClass(String.class);
    job.setOutputValueClass(CrawlDatum.class);

    job.waitForCompletion(true);
    return segment;
}
From source file:com.iflytek.spider.parse.ParseSegment.java
License:Apache License
public void parse(Path segment) throws IOException, InterruptedException, ClassNotFoundException {

    if (LOG.isInfoEnabled()) {
        LOG.info("Parse: starting");
        LOG.info("Parse: segment: " + segment);
    }

    Job job = AvroJob.getAvroJob(getConf());
    job.setJobName("parse " + segment);

    FileInputFormat.addInputPath(job, new Path(segment, Content.DIR_NAME));
    job.getConfiguration().set(Spider.SEGMENT_NAME_KEY, segment.getName());

    job.setInputFormatClass(AvroPairInputFormat.class);
    job.setMapperClass(ParseMapper.class);

    FileOutputFormat.setOutputPath(job, segment);
    job.setOutputFormatClass(ParseOutputFormat.class);
    job.setOutputKeyClass(String.class);
    job.setOutputValueClass(UnionData.class);

    job.waitForCompletion(true);
    if (LOG.isInfoEnabled()) {
        LOG.info("Parse: done");
    }
}
From source file:com.ikanow.aleph2.analytics.hadoop.assets.Aleph2MultiInputFormatBuilder.java
License:Apache License
/** Sets the input configurations in the job
 * @param job
 */
public Job build(final Job job) {
    job.getConfiguration().set(ALEPH2_MULTI_INPUT_FORMAT_JOBS,
            _inputs.keySet().stream().collect(Collectors.joining(",")));
    _inputs.entrySet().stream().forEach(Lambdas.wrap_consumer_u(kv -> {
        try (final Stringifier<Configuration> stringifier = new DefaultStringifier<Configuration>(
                job.getConfiguration(), Configuration.class)) {
            final Configuration new_config = new Configuration(kv.getValue().getConfiguration());
            new_config.set(ALEPH2_MULTI_INPUT_FORMAT_CLAZZ, kv.getValue().getInputFormatClass().getName());
            job.getConfiguration().set(ALEPH2_MULTI_INPUT_FORMAT_PREFIX + kv.getKey(),
                    stringifier.toString(new_config));
        }
    }));
    job.setInputFormatClass(Aleph2MultiInputFormat.class);
    return job;
}
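For context, a typical call sequence for this builder, pieced together from the addInput/build calls in the BeJobLauncher examples later in this list (the input path is illustrative; the UUID key and BeFileInputFormat are as used there):

// One per-input Job carries the InputFormat and paths for that input
final Aleph2MultiInputFormatBuilder inputBuilder = new Aleph2MultiInputFormatBuilder();
final Job inputJob = Job.getInstance(config);
inputJob.setInputFormatClass(BeFileInputFormat.class);
FileInputFormat.addInputPath(inputJob, new Path("/some/input/path")); // illustrative
inputBuilder.addInput(UuidUtils.get().getRandomUuid(), inputJob);
// ... one addInput(...) per logical input, then wire everything into the real job:
inputBuilder.build(job); // sets Aleph2MultiInputFormat as the job's InputFormat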
From source file:com.ikanow.aleph2.analytics.hadoop.assets.VerySimpleLocalExample.java
License:Apache License
@SuppressWarnings({ "deprecation", "unchecked", "rawtypes" })
@Test
public void test_localHadoopLaunch()
        throws IOException, IllegalStateException, ClassNotFoundException, InterruptedException {

    // 0) Setup the temp dir
    final String temp_dir = System.getProperty("java.io.tmpdir") + File.separator;
    //final Path tmp_path = FileContext.getLocalFSFileContext().makeQualified(new Path(temp_dir));
    final Path tmp_path2 = FileContext.getLocalFSFileContext()
            .makeQualified(new Path(temp_dir + "/tmp_output"));
    try {
        FileContext.getLocalFSFileContext().delete(tmp_path2, true);
    } catch (Exception e) {
    } // (just doesn't exist yet)

    // 1) Setup config with local mode
    final Configuration config = new Configuration();
    config.setBoolean("mapred.used.genericoptionsparser", true); // (just stops an annoying warning from appearing)
    config.set("fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem");
    config.set("mapred.job.tracker", "local");
    config.set("fs.defaultFS", "local");
    config.unset("mapreduce.framework.name");
    // If running locally, turn "snappy" off - tomcat isn't pointing its native library path in the right place
    config.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.DefaultCodec");

    // 2) Build job and do more setup using the Job API
    //TODO: not sure why this is deprecated, it doesn't seem to be in v1? We do need to move to JobConf at some point, but I ran into some
    // issues when trying to do everything I needed to for V1, so seems expedient to start here and migrate away
    final Job hj = new Job(config); // (NOTE: from here, changes to config are ignored)

    // Input format:
    //TODO: fails because of guava issue, looks like we'll need to move to 2.7 and check it works with 2.5.x server?
    //TextInputFormat.addInputPath(hj, tmp_path);
    //hj.setInputFormatClass((Class<? extends InputFormat>) Class.forName("org.apache.hadoop.mapreduce.lib.input.TextInputFormat"));
    hj.setInputFormatClass(TestInputFormat.class);

    // Output format:
    hj.setOutputFormatClass((Class<? extends OutputFormat>) Class
            .forName("org.apache.hadoop.mapreduce.lib.output.TextOutputFormat"));
    TextOutputFormat.setOutputPath(hj, tmp_path2);

    // Mapper etc (combiner/reducer are similar)
    hj.setMapperClass(TestMapper.class);
    hj.setOutputKeyClass(Text.class);
    hj.setOutputValueClass(Text.class);
    hj.setNumReduceTasks(0); // (disable reducer for now)

    hj.setJar("test");

    try {
        hj.submit();
    } catch (UnsatisfiedLinkError e) {
        throw new RuntimeException(
                "This is a windows/hadoop compatibility problem - adding the hadoop-commons in the misc_test_assets subdirectory to the top of the classpath should resolve it (and does in V1), though I haven't yet made that work with Aleph2",
                e);
    }
    //hj.getJobID().toString();
    while (!hj.isComplete()) {
        Thread.sleep(1000);
    }
    assertTrue("Finished successfully", hj.isSuccessful());
}
From source file:com.ikanow.aleph2.analytics.hadoop.services.BeJobLauncher.java
License:Open Source License
@Override
public Validation<String, Job> runEnhancementJob(final DataBucketBean bucket,
        final Optional<ProcessingTestSpecBean> testSpec) {

    final Configuration config = getHadoopConfig();

    final ClassLoader currentClassloader = Thread.currentThread().getContextClassLoader();
    //(not currently used, but has proven useful in the past)

    try {
        final String contextSignature = _batchEnrichmentContext
                .getEnrichmentContextSignature(Optional.of(bucket), Optional.empty());
        config.set(BatchEnrichmentJob.BE_CONTEXT_SIGNATURE, contextSignature);

        final Optional<Long> debug_max = testSpec
                .flatMap(testSpecVals -> Optional.ofNullable(testSpecVals.requested_num_objects()));

        //then gets applied to all the inputs:
        debug_max.ifPresent(val -> config.set(BatchEnrichmentJob.BE_DEBUG_MAX_SIZE, val.toString()));

        final Aleph2MultiInputFormatBuilder inputBuilder = new Aleph2MultiInputFormatBuilder();

        // Create a separate InputFormat for every input (makes testing life easier)
        Optional.ofNullable(_batchEnrichmentContext.getJob().inputs()).orElse(Collections.emptyList()).stream()
                .forEach(Lambdas.wrap_consumer_u(input -> {
                    final List<String> paths = _batchEnrichmentContext.getAnalyticsContext()
                            .getInputPaths(Optional.of(bucket), _batchEnrichmentContext.getJob(), input);
                    final Job inputJob = Job.getInstance(config);
                    inputJob.setInputFormatClass(BeFileInputFormat.class);
                    paths.stream().forEach(Lambdas
                            .wrap_consumer_u(path -> FileInputFormat.addInputPath(inputJob, new Path(path))));
                    inputBuilder.addInput(UuidUtils.get().getRandomUuid(), inputJob);
                }));
        // (ALEPH-12): other input format types

        // Now do everything else

        final String jobName = BucketUtils.getUniqueSignature(bucket.full_name(),
                Optional.ofNullable(_batchEnrichmentContext.getJob().name()));

        // do not set anything into config past this line (can set job.getConfiguration() elements though - that is what the builder does)
        Job job = Job.getInstance(config, jobName);
        job.setJarByClass(BatchEnrichmentJob.class);

        // Set the classpath
        cacheJars(job, bucket, _batchEnrichmentContext.getAnalyticsContext());

        // (generic mapper - the actual code is run using the classes in the shared libraries)
        job.setMapperClass(BatchEnrichmentJob.BatchEnrichmentMapper.class);

        //TODO: ALEPH-12 handle reducer scenarios
        job.setNumReduceTasks(0);
        //job.setReducerClass(BatchEnrichmentJob.BatchEnrichmentReducer.class);

        // Input format:
        inputBuilder.build(job);

        // Output format (doesn't really do anything, all the actual output code is performed by the mapper via the enrichment context)
        job.setOutputFormatClass(BeFileOutputFormat.class);

        launch(job);
        return Validation.success(job);

    } catch (Throwable t) {
        logger.error("Caught Exception", t);
        return Validation.fail(ErrorUtils.getLongForm("{0}", t));
    } finally {
        Thread.currentThread().setContextClassLoader(currentClassloader);
    }
}
From source file:com.ikanow.aleph2.analytics.r.services.BeJobLauncher.java
License:Apache License
@Override
public Validation<String, Job> runEnhancementJob(final DataBucketBean bucket,
        final Optional<ProcessingTestSpecBean> testSpec) {

    final Configuration config = getHadoopConfig();

    final ClassLoader currentClassloader = Thread.currentThread().getContextClassLoader();
    //(not currently used, but has proven useful in the past)

    final SetOnce<Job> job = new SetOnce<>();
    try {
        final Optional<Long> debug_max = testSpec
                .flatMap(testSpecVals -> Optional.ofNullable(testSpecVals.requested_num_objects()));

        //then gets applied to all the inputs:
        debug_max.ifPresent(val -> config.set(BatchEnrichmentJob.BE_DEBUG_MAX_SIZE, val.toString()));

        final Aleph2MultiInputFormatBuilder inputBuilder = new Aleph2MultiInputFormatBuilder();

        // Validation:
        try {
            final BatchEnrichmentJob.BatchEnrichmentBaseValidator validator = new BatchEnrichmentJob.BatchEnrichmentBaseValidator();
            validator.setDataBucket(bucket);
            validator.setEnrichmentContext(_batchEnrichmentContext);
            validator.setEcMetadata(
                    Optional.ofNullable(bucket.batch_enrichment_configs()).orElse(Collections.emptyList()));
            final List<BasicMessageBean> errs = validator.validate();
            if (errs.stream().anyMatch(b -> !b.success())) {
                return Validation.fail(ErrorUtils.get("Validation errors for {0}: {1}", bucket.full_name(),
                        errs.stream()
                                .map(b -> ErrorUtils.get("{0}: {1}", b.success() ? "INFO" : "ERROR",
                                        b.message()))
                                .collect(Collectors.joining(";"))));
            }
        } catch (Throwable t) {
            // we'll log but carry on in this case...(in case there's some classloading shenanigans which won't affect the operation in hadoop)
            logger.error(ErrorUtils.getLongForm("Failed validation, bucket: {1} error: {0}", t,
                    bucket.full_name()));
        }

        // Create a separate InputFormat for every input (makes testing life easier)
        Optional.ofNullable(_batchEnrichmentContext.getJob().inputs()).orElse(Collections.emptyList()).stream()
                .filter(input -> Optional.ofNullable(input.enabled()).orElse(true))
                .forEach(Lambdas.wrap_consumer_u(input -> {
                    // In the debug case, transform the input to add the max record limit
                    final AnalyticThreadJobInputBean input_with_test_settings = BeanTemplateUtils.clone(input)
                            .with(AnalyticThreadJobInputBean::config, BeanTemplateUtils
                                    .clone(Optional.ofNullable(input.config()).orElseGet(() -> BeanTemplateUtils
                                            .build(AnalyticThreadJobInputConfigBean.class).done().get()))
                                    .with(AnalyticThreadJobInputConfigBean::test_record_limit_request,
                                            //(if not test, always null; else "input override" or "output default")
                                            debug_max.map(max -> Optionals
                                                    .of(() -> input.config().test_record_limit_request())
                                                    .orElse(max)).orElse(null))
                                    .done())
                            .done();

                    // Get the paths and add them to a list for later
                    final List<String> paths = _batchEnrichmentContext.getAnalyticsContext().getInputPaths(
                            Optional.of(bucket), _batchEnrichmentContext.getJob(), input_with_test_settings);

                    RScriptUtils.addFilePaths(paths);

                    if (!paths.isEmpty()) {
                        logger.info(ErrorUtils.get("Adding storage paths for bucket {0}: {1}",
                                bucket.full_name(), paths.stream().collect(Collectors.joining(";"))));

                        final Job inputJob = Job.getInstance(config);
                        inputJob.setInputFormatClass(BeFileInputFormat.class);
                        paths.stream().forEach(Lambdas.wrap_consumer_u(
                                path -> FileInputFormat.addInputPath(inputJob, new Path(path))));
                        inputBuilder.addInput(UuidUtils.get().getRandomUuid(), inputJob);
                    } else {
                        // not easily available in HDFS directory format, try getting from the context
                        Optional<HadoopAccessContext> input_format_info = _batchEnrichmentContext
                                .getAnalyticsContext().getServiceInput(HadoopAccessContext.class,
                                        Optional.of(bucket), _batchEnrichmentContext.getJob(),
                                        input_with_test_settings);
                        if (!input_format_info.isPresent()) {
                            logger.warn(ErrorUtils.get("Tried but failed to get input format from {0}",
                                    BeanTemplateUtils.toJson(input_with_test_settings)));
                        } else {
                            logger.info(ErrorUtils.get("Adding data service path for bucket {0}: {1}",
                                    bucket.full_name(), input_format_info.get().describe()));

                            final Job inputJob = Job.getInstance(config);
                            inputJob.setInputFormatClass(input_format_info.get().getAccessService()
                                    .either(l -> l.getClass(), r -> r));
                            input_format_info.get().getAccessConfig().ifPresent(map -> {
                                map.entrySet().forEach(kv -> inputJob.getConfiguration().set(kv.getKey(),
                                        kv.getValue().toString()));
                            });
                            inputBuilder.addInput(UuidUtils.get().getRandomUuid(), inputJob);
                        }
                    }
                }));
        // (ALEPH-12): other input format types

        // Now do everything else

        final String contextSignature = _batchEnrichmentContext
                .getEnrichmentContextSignature(Optional.of(bucket), Optional.empty());
        config.set(BatchEnrichmentJob.BE_CONTEXT_SIGNATURE, contextSignature);

        final String jobName = BucketUtils.getUniqueSignature(bucket.full_name(),
                Optional.ofNullable(_batchEnrichmentContext.getJob().name()));

        this.handleHadoopConfigOverrides(bucket, config);

        // do not set anything into config past this line (can set job.getConfiguration() elements though - that is what the builder does)
        job.set(Job.getInstance(config, jobName));
        job.get().setJarByClass(BatchEnrichmentJob.class);

        job.get().setSortComparatorClass(ObjectNodeWritableComparable.Comparator.class); //(avoid deser of json node for intermediate things)

        // Set the classpath
        cacheJars(job.get(), bucket, _batchEnrichmentContext.getAnalyticsContext());

        // (generic mapper - the actual code is run using the classes in the shared libraries)
        job.get().setMapperClass(BatchEnrichmentJob.BatchEnrichmentMapper.class);
        job.get().setMapOutputKeyClass(ObjectNodeWritableComparable.class);
        job.get().setMapOutputValueClass(ObjectNodeWritableComparable.class);

        // (combiner and reducer)
        Optional.ofNullable(bucket.batch_enrichment_configs()).orElse(Collections.emptyList()).stream()
                .filter(cfg -> Optional.ofNullable(cfg.enabled()).orElse(true))
                .filter(cfg -> !Optionals.ofNullable(cfg.grouping_fields()).isEmpty()).findAny().map(cfg -> {
                    final HadoopTechnologyOverrideBean tech_override = BeanTemplateUtils
                            .from(Optional.ofNullable(cfg.technology_override())
                                    .orElse(Collections.emptyMap()), HadoopTechnologyOverrideBean.class)
                            .get();

                    job.get().setNumReduceTasks(Optional.ofNullable(tech_override.num_reducers()).orElse(2));
                    job.get().setReducerClass(BatchEnrichmentJob.BatchEnrichmentReducer.class);

                    if (tech_override.use_combiner()) {
                        job.get().setCombinerClass(BatchEnrichmentJob.BatchEnrichmentCombiner.class);
                    }
                    return Unit.unit();
                }).orElseGet(() -> {
                    job.get().setNumReduceTasks(0);
                    return Unit.unit();
                });
        // job.setReducerClass(BatchEnrichmentJob.BatchEnrichmentReducer.class);

        // Input format:
        inputBuilder.build(job.get());

        // Output format (doesn't really do anything, all the actual output code is performed by the mapper via the enrichment context)
        job.get().setOutputFormatClass(BeFileOutputFormat.class);

        // Submit the job for processing
        launch(job.get());

        // Wait for the job to complete and collect the data
        // job.get().waitForCompletion(true);

        return Validation.success(job.get());

    } catch (Throwable t) {
        Throwable tt = (t instanceof RuntimeException) ? ((null != t.getCause()) ? t.getCause() : t) : t;

        if (tt instanceof org.apache.hadoop.mapreduce.lib.input.InvalidInputException) {
            // Probably a benign "no matching paths", so return pithy error
            return Validation.fail(ErrorUtils.get("{0}", tt.getMessage()));
        } else {
            // General error: dump the config params to string
            if (job.isSet()) {
                logger.error(ErrorUtils.get("Error submitting, config= {0}",
                        Optionals.streamOf(job.get().getConfiguration().iterator(), false)
                                .map(kv -> kv.getKey() + ":" + kv.getValue())
                                .collect(Collectors.joining("; "))));
            }
            return Validation.fail(ErrorUtils.getLongForm("{0}", tt));
        }
    } finally {
        Thread.currentThread().setContextClassLoader(currentClassloader);
    }
}
From source file:com.ikanow.aleph2.analytics.spark.utils.SparkTechnologyUtils.java
License:Apache License
/** Builds objects for all the aleph2 inputs and provides a method to use them in context-dependent ways
 * @param context
 * @param bucket
 * @param job
 * @param config
 * @param per_input_action - user lambda that determines how they are used
 */
public static final void buildAleph2Inputs(final IAnalyticsContext context, final DataBucketBean bucket,
        final AnalyticThreadJobBean job, final Optional<ProcessingTestSpecBean> maybe_test_spec,
        final Configuration config, final Set<String> exclude_names,
        BiConsumer<AnalyticThreadJobInputBean, Job> per_input_action) {
    transformInputBean(Optionals.ofNullable(job.inputs()).stream(), maybe_test_spec)
            .filter(input -> !exclude_names.contains(input.name()))
            .forEach(Lambdas.wrap_consumer_u(input_with_test_settings -> {

                final Optional<IBucketLogger> a2_logger = Optional
                        .ofNullable(context.getLogger(Optional.of(bucket)));

                final List<String> paths = context.getInputPaths(Optional.empty(), job,
                        input_with_test_settings);

                if (!paths.isEmpty()) {

                    _logger.info(ErrorUtils.get("Adding storage paths for bucket {0}: {1}",
                            bucket.full_name(), paths.stream().collect(Collectors.joining(";"))));

                    a2_logger.ifPresent(l -> l.log(Level.INFO, true,
                            () -> ErrorUtils.get("Adding storage paths for bucket {0}: {1}",
                                    bucket.full_name(), paths.stream().collect(Collectors.joining(";"))),
                            () -> SparkTechnologyService.class.getSimpleName() + "."
                                    + Optional.ofNullable(job.name()).orElse("no_name"),
                            () -> "startAnalyticJobOrTest"));

                    //DEBUG
                    //System.out.println(ErrorUtils.get("Adding storage paths for bucket {0}: {1}", bucket.full_name(), paths.stream().collect(Collectors.joining(";"))));

                    final Job input_job = Job.getInstance(config);
                    input_job.setInputFormatClass(BeFileInputFormat_Pure.class);
                    paths.stream().forEach(Lambdas.wrap_consumer_u(
                            path -> FileInputFormat.addInputPath(input_job, new Path(path))));
                    // (Add the input config in)
                    input_job.getConfiguration().set(HadoopBatchEnrichmentUtils.BE_BUCKET_INPUT_CONFIG,
                            BeanTemplateUtils.toJson(input_with_test_settings).toString());
                    per_input_action.accept(input_with_test_settings, input_job);
                } else {
                    // not easily available in HDFS directory format, try getting from the context
                    Optional<HadoopBatchEnrichmentUtils.HadoopAccessContext> input_format_info = context
                            .getServiceInput(HadoopBatchEnrichmentUtils.HadoopAccessContext.class,
                                    Optional.empty(), job, input_with_test_settings);
                    if (!input_format_info.isPresent()) {
                        _logger.warn(ErrorUtils.get("Tried but failed to get input format from {0}",
                                BeanTemplateUtils.toJson(input_with_test_settings)));

                        a2_logger.ifPresent(l -> l.log(Level.WARN, true,
                                () -> ErrorUtils.get("Tried but failed to get input format from {0}",
                                        BeanTemplateUtils.toJson(input_with_test_settings)),
                                () -> SparkTechnologyService.class.getSimpleName() + "."
                                        + Optional.ofNullable(job.name()).orElse("no_name"),
                                () -> "startAnalyticJobOrTest"));

                        //DEBUG
                        //System.out.println(ErrorUtils.get("Tried but failed to get input format from {0}", BeanTemplateUtils.toJson(input_with_test_settings)));
                    } else {
                        _logger.info(ErrorUtils.get("Adding data service path for bucket {0}: {1}",
                                bucket.full_name(), input_format_info.get().describe()));

                        a2_logger.ifPresent(l -> l.log(Level.INFO, true,
                                () -> ErrorUtils.get("Adding data service path for bucket {0}: {1}",
                                        bucket.full_name(), input_format_info.get().describe()),
                                () -> SparkTechnologyService.class.getSimpleName() + "."
                                        + Optional.ofNullable(job.name()).orElse("no_name"),
                                () -> "startAnalyticJobOrTest"));

                        //DEBUG
                        //System.out.println(ErrorUtils.get("Adding data service path for bucket {0}: {1}", bucket.full_name(), input_format_info.get().describe()));

                        final Job input_job = Job.getInstance(config);
                        input_job.setInputFormatClass(
                                input_format_info.get().getAccessService().either(l -> l.getClass(), r -> r));
                        input_format_info.get().getAccessConfig().ifPresent(map -> {
                            map.entrySet().forEach(kv -> input_job.getConfiguration().set(kv.getKey(),
                                    kv.getValue().toString()));
                        });
                        per_input_action.accept(input_with_test_settings, input_job);
                    }
                }
            }));
}