List of usage examples for org.apache.hadoop.mapreduce.Job.getConfiguration()
public Configuration getConfiguration()
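For orientation, here is a minimal sketch of the pattern the harvested examples below share: create the Job from a base Configuration, then set job-scoped properties through getConfiguration(). This is an illustrative sketch, not code from any of the listed projects; the property name my.example.key and the input path are placeholders.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class GetConfigurationSketch {
    public static void main(String[] args) throws Exception {
        Configuration base = new Configuration();
        // Job.getInstance(...) copies "base", so per-job settings made after this
        // point must go through job.getConfiguration(), not the original object.
        Job job = Job.getInstance(base, "getConfiguration-sketch");
        job.setInputFormatClass(TextInputFormat.class);
        FileInputFormat.addInputPath(job, new Path("/tmp/example-input")); // placeholder path
        job.getConfiguration().set("my.example.key", "my-value");          // placeholder property
        System.out.println(job.getConfiguration().get("my.example.key"));
    }
}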
From source file:com.ikanow.aleph2.analytics.r.services.BeJobLauncher.java
License:Apache License
@Override
public Validation<String, Job> runEnhancementJob(final DataBucketBean bucket,
        final Optional<ProcessingTestSpecBean> testSpec) {
    final Configuration config = getHadoopConfig();
    final ClassLoader currentClassloader = Thread.currentThread().getContextClassLoader();
    //(not currently used, but has proven useful in the past)
    final SetOnce<Job> job = new SetOnce<>();
    try {
        final Optional<Long> debug_max = testSpec
                .flatMap(testSpecVals -> Optional.ofNullable(testSpecVals.requested_num_objects()));
        //then gets applied to all the inputs:
        debug_max.ifPresent(val -> config.set(BatchEnrichmentJob.BE_DEBUG_MAX_SIZE, val.toString()));

        final Aleph2MultiInputFormatBuilder inputBuilder = new Aleph2MultiInputFormatBuilder();

        // Validation:
        try {
            final BatchEnrichmentJob.BatchEnrichmentBaseValidator validator =
                    new BatchEnrichmentJob.BatchEnrichmentBaseValidator();
            validator.setDataBucket(bucket);
            validator.setEnrichmentContext(_batchEnrichmentContext);
            validator.setEcMetadata(
                    Optional.ofNullable(bucket.batch_enrichment_configs()).orElse(Collections.emptyList()));
            final List<BasicMessageBean> errs = validator.validate();
            if (errs.stream().anyMatch(b -> !b.success())) {
                return Validation.fail(ErrorUtils.get("Validation errors for {0}: {1}", bucket.full_name(),
                        errs.stream()
                                .map(b -> ErrorUtils.get("{0}: {1}", b.success() ? "INFO" : "ERROR", b.message()))
                                .collect(Collectors.joining(";"))));
            }
        } catch (Throwable t) {
            // we'll log but carry on in this case...(in case there's some classloading shenanigans which won't affect the operation in hadoop)
            logger.error(
                    ErrorUtils.getLongForm("Failed validation, bucket: {1} error: {0}", t, bucket.full_name()));
        }

        // Create a separate InputFormat for every input (makes testing life easier)
        Optional.ofNullable(_batchEnrichmentContext.getJob().inputs()).orElse(Collections.emptyList()).stream()
                .filter(input -> Optional.ofNullable(input.enabled()).orElse(true))
                .forEach(Lambdas.wrap_consumer_u(input -> {
                    // In the debug case, transform the input to add the max record limit
                    final AnalyticThreadJobInputBean input_with_test_settings = BeanTemplateUtils.clone(input)
                            .with(AnalyticThreadJobInputBean::config, BeanTemplateUtils
                                    .clone(Optional.ofNullable(input.config()).orElseGet(() -> BeanTemplateUtils
                                            .build(AnalyticThreadJobInputConfigBean.class).done().get()))
                                    .with(AnalyticThreadJobInputConfigBean::test_record_limit_request,
                                            //(if not test, always null; else "input override" or "output default")
                                            debug_max.map(max -> Optionals
                                                    .of(() -> input.config().test_record_limit_request())
                                                    .orElse(max)).orElse(null))
                                    .done())
                            .done();

                    // Get the paths and add them to a list for later
                    final List<String> paths = _batchEnrichmentContext.getAnalyticsContext().getInputPaths(
                            Optional.of(bucket), _batchEnrichmentContext.getJob(), input_with_test_settings);
                    RScriptUtils.addFilePaths(paths);

                    if (!paths.isEmpty()) {
                        logger.info(ErrorUtils.get("Adding storage paths for bucket {0}: {1}", bucket.full_name(),
                                paths.stream().collect(Collectors.joining(";"))));
                        final Job inputJob = Job.getInstance(config);
                        inputJob.setInputFormatClass(BeFileInputFormat.class);
                        paths.stream().forEach(Lambdas.wrap_consumer_u(
                                path -> FileInputFormat.addInputPath(inputJob, new Path(path))));
                        inputBuilder.addInput(UuidUtils.get().getRandomUuid(), inputJob);
                    } else {
                        // not easily available in HDFS directory format, try getting from the context
                        Optional<HadoopAccessContext> input_format_info = _batchEnrichmentContext
                                .getAnalyticsContext().getServiceInput(HadoopAccessContext.class,
                                        Optional.of(bucket), _batchEnrichmentContext.getJob(),
                                        input_with_test_settings);
                        if (!input_format_info.isPresent()) {
                            logger.warn(ErrorUtils.get("Tried but failed to get input format from {0}",
                                    BeanTemplateUtils.toJson(input_with_test_settings)));
                        } else {
                            logger.info(ErrorUtils.get("Adding data service path for bucket {0}: {1}",
                                    bucket.full_name(), input_format_info.get().describe()));
                            final Job inputJob = Job.getInstance(config);
                            inputJob.setInputFormatClass(input_format_info.get().getAccessService()
                                    .either(l -> l.getClass(), r -> r));
                            input_format_info.get().getAccessConfig().ifPresent(map -> {
                                map.entrySet().forEach(kv -> inputJob.getConfiguration().set(kv.getKey(),
                                        kv.getValue().toString()));
                            });
                            inputBuilder.addInput(UuidUtils.get().getRandomUuid(), inputJob);
                        }
                    }
                }));
        // (ALEPH-12): other input format types

        // Now do everything else
        final String contextSignature = _batchEnrichmentContext
                .getEnrichmentContextSignature(Optional.of(bucket), Optional.empty());
        config.set(BatchEnrichmentJob.BE_CONTEXT_SIGNATURE, contextSignature);
        final String jobName = BucketUtils.getUniqueSignature(bucket.full_name(),
                Optional.ofNullable(_batchEnrichmentContext.getJob().name()));
        this.handleHadoopConfigOverrides(bucket, config);

        // do not set anything into config past this line (can set job.getConfiguration() elements though - that is what the builder does)
        job.set(Job.getInstance(config, jobName));
        job.get().setJarByClass(BatchEnrichmentJob.class);
        job.get().setSortComparatorClass(ObjectNodeWritableComparable.Comparator.class);
        //(avoid deser of json node for intermediate things)

        // Set the classpath
        cacheJars(job.get(), bucket, _batchEnrichmentContext.getAnalyticsContext());

        // (generic mapper - the actual code is run using the classes in the shared libraries)
        job.get().setMapperClass(BatchEnrichmentJob.BatchEnrichmentMapper.class);
        job.get().setMapOutputKeyClass(ObjectNodeWritableComparable.class);
        job.get().setMapOutputValueClass(ObjectNodeWritableComparable.class);

        // (combiner and reducer)
        Optional.ofNullable(bucket.batch_enrichment_configs()).orElse(Collections.emptyList()).stream()
                .filter(cfg -> Optional.ofNullable(cfg.enabled()).orElse(true))
                .filter(cfg -> !Optionals.ofNullable(cfg.grouping_fields()).isEmpty()).findAny().map(cfg -> {
                    final HadoopTechnologyOverrideBean tech_override = BeanTemplateUtils
                            .from(Optional.ofNullable(cfg.technology_override()).orElse(Collections.emptyMap()),
                                    HadoopTechnologyOverrideBean.class)
                            .get();
                    job.get().setNumReduceTasks(Optional.ofNullable(tech_override.num_reducers()).orElse(2));
                    job.get().setReducerClass(BatchEnrichmentJob.BatchEnrichmentReducer.class);
                    if (tech_override.use_combiner()) {
                        job.get().setCombinerClass(BatchEnrichmentJob.BatchEnrichmentCombiner.class);
                    }
                    return Unit.unit();
                }).orElseGet(() -> {
                    job.get().setNumReduceTasks(0);
                    return Unit.unit();
                });
        // job.setReducerClass(BatchEnrichmentJob.BatchEnrichmentReducer.class);

        // Input format:
        inputBuilder.build(job.get());

        // Output format (doesn't really do anything, all the actual output code is performed by the mapper via the enrichment context)
        job.get().setOutputFormatClass(BeFileOutputFormat.class);

        // Submit the job for processing
        launch(job.get());

        // Wait for the job to complete and collect the data
        // job.get().waitForCompletion(true);

        return Validation.success(job.get());
    } catch (Throwable t) {
        Throwable tt = (t instanceof RuntimeException) ? (null != t.getCause()) ? t.getCause() : t : t;
        if (tt instanceof org.apache.hadoop.mapreduce.lib.input.InvalidInputException) {
            // Probably a benign "no matching paths", so return pithy error
            return Validation.fail(ErrorUtils.get("{0}", tt.getMessage()));
        } else {
            // General error: Dump the config params to string
            if (job.isSet()) {
                logger.error(ErrorUtils.get("Error submitting, config= {0}",
                        Optionals.streamOf(job.get().getConfiguration().iterator(), false)
                                .map(kv -> kv.getKey() + ":" + kv.getValue())
                                .collect(Collectors.joining("; "))));
            }
            return Validation.fail(ErrorUtils.getLongForm("{0}", tt));
        }
    } finally {
        Thread.currentThread().setContextClassLoader(currentClassloader);
    }
}
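The example above creates a throwaway Job per input purely to carry that input's settings in its own Configuration. The same idea can be distilled as the following sketch; it is hypothetical (the accessConfig map, the stand-in TextInputFormat, and the method name are illustrative, not Aleph2 APIs):

import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class PerInputJobSketch {
    /** Builds a child Job whose private Configuration carries one input's settings. */
    static Job buildInputJob(Configuration base, String path, Map<String, Object> accessConfig) throws Exception {
        Job inputJob = Job.getInstance(base);                 // copies "base" for this input only
        inputJob.setInputFormatClass(TextInputFormat.class);  // stand-in for the per-input format
        FileInputFormat.addInputPath(inputJob, new Path(path));
        // copy per-input overrides into this job's Configuration, leaving "base" untouched
        accessConfig.forEach((k, v) -> inputJob.getConfiguration().set(k, v.toString()));
        return inputJob;
    }
}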
From source file:com.ikanow.aleph2.analytics.spark.utils.SparkTechnologyUtils.java
License:Apache License
/** Builds objects for all the aleph2 inputs and provides a method to use them in context-dependent ways
 * @param context
 * @param bucket
 * @param job
 * @param config
 * @param per_input_action - user lambda that determines how they are used
 */
public static final void buildAleph2Inputs(final IAnalyticsContext context, final DataBucketBean bucket,
        final AnalyticThreadJobBean job, final Optional<ProcessingTestSpecBean> maybe_test_spec,
        final Configuration config, final Set<String> exclude_names,
        BiConsumer<AnalyticThreadJobInputBean, Job> per_input_action) {
    transformInputBean(Optionals.ofNullable(job.inputs()).stream(), maybe_test_spec)
            .filter(input -> !exclude_names.contains(input.name()))
            .forEach(Lambdas.wrap_consumer_u(input_with_test_settings -> {
                final Optional<IBucketLogger> a2_logger = Optional
                        .ofNullable(context.getLogger(Optional.of(bucket)));

                final List<String> paths = context.getInputPaths(Optional.empty(), job, input_with_test_settings);

                if (!paths.isEmpty()) {
                    _logger.info(ErrorUtils.get("Adding storage paths for bucket {0}: {1}", bucket.full_name(),
                            paths.stream().collect(Collectors.joining(";"))));
                    a2_logger.ifPresent(l -> l.log(Level.INFO, true,
                            () -> ErrorUtils.get("Adding storage paths for bucket {0}: {1}", bucket.full_name(),
                                    paths.stream().collect(Collectors.joining(";"))),
                            () -> SparkTechnologyService.class.getSimpleName() + "."
                                    + Optional.ofNullable(job.name()).orElse("no_name"),
                            () -> "startAnalyticJobOrTest"));
                    //DEBUG
                    //System.out.println(ErrorUtils.get("Adding storage paths for bucket {0}: {1}", bucket.full_name(), paths.stream().collect(Collectors.joining(";"))));

                    final Job input_job = Job.getInstance(config);
                    input_job.setInputFormatClass(BeFileInputFormat_Pure.class);
                    paths.stream().forEach(Lambdas
                            .wrap_consumer_u(path -> FileInputFormat.addInputPath(input_job, new Path(path))));
                    // (Add the input config in)
                    input_job.getConfiguration().set(HadoopBatchEnrichmentUtils.BE_BUCKET_INPUT_CONFIG,
                            BeanTemplateUtils.toJson(input_with_test_settings).toString());
                    per_input_action.accept(input_with_test_settings, input_job);
                } else {
                    // not easily available in HDFS directory format, try getting from the context
                    Optional<HadoopBatchEnrichmentUtils.HadoopAccessContext> input_format_info = context
                            .getServiceInput(HadoopBatchEnrichmentUtils.HadoopAccessContext.class,
                                    Optional.empty(), job, input_with_test_settings);
                    if (!input_format_info.isPresent()) {
                        _logger.warn(ErrorUtils.get("Tried but failed to get input format from {0}",
                                BeanTemplateUtils.toJson(input_with_test_settings)));
                        a2_logger.ifPresent(l -> l.log(Level.WARN, true,
                                () -> ErrorUtils.get("Tried but failed to get input format from {0}",
                                        BeanTemplateUtils.toJson(input_with_test_settings)),
                                () -> SparkTechnologyService.class.getSimpleName() + "."
                                        + Optional.ofNullable(job.name()).orElse("no_name"),
                                () -> "startAnalyticJobOrTest"));
                        //DEBUG
                        //System.out.println(ErrorUtils.get("Tried but failed to get input format from {0}", BeanTemplateUtils.toJson(input_with_test_settings)));
                    } else {
                        _logger.info(ErrorUtils.get("Adding data service path for bucket {0}: {1}",
                                bucket.full_name(), input_format_info.get().describe()));
                        a2_logger.ifPresent(l -> l.log(Level.INFO, true,
                                () -> ErrorUtils.get("Adding data service path for bucket {0}: {1}",
                                        bucket.full_name(), input_format_info.get().describe()),
                                () -> SparkTechnologyService.class.getSimpleName() + "."
                                        + Optional.ofNullable(job.name()).orElse("no_name"),
                                () -> "startAnalyticJobOrTest"));
                        //DEBUG
                        //System.out.println(ErrorUtils.get("Adding data service path for bucket {0}: {1}", bucket.full_name(),input_format_info.get().describe()));

                        final Job input_job = Job.getInstance(config);
                        input_job.setInputFormatClass(
                                input_format_info.get().getAccessService().either(l -> l.getClass(), r -> r));
                        input_format_info.get().getAccessConfig().ifPresent(map -> {
                            map.entrySet().forEach(kv -> input_job.getConfiguration().set(kv.getKey(),
                                    kv.getValue().toString()));
                        });
                        per_input_action.accept(input_with_test_settings, input_job);
                    }
                }
            }));
}
From source file:com.ikanow.infinit.e.processing.custom.launcher.CustomHadoopTaskLauncher.java
License:Open Source License
@SuppressWarnings({ "unchecked", "rawtypes" }) public String runHadoopJob(CustomMapReduceJobPojo job, String tempJarLocation) throws IOException, SAXException, ParserConfigurationException { StringWriter xml = new StringWriter(); String outputCollection = job.outputCollectionTemp;// (non-append mode) if ((null != job.appendResults) && job.appendResults) outputCollection = job.outputCollection; // (append mode, write directly in....) else if (null != job.incrementalMode) job.incrementalMode = false; // (not allowed to be in incremental mode and not update mode) createConfigXML(xml, job.jobtitle, job.inputCollection, InfiniteHadoopUtils.getQueryOrProcessing(job.query, InfiniteHadoopUtils.QuerySpec.INPUTFIELDS), job.isCustomTable, job.getOutputDatabase(), job._id.toString(), outputCollection, job.mapper, job.reducer, job.combiner,/*from ww w . j a va2 s .com*/ InfiniteHadoopUtils.getQueryOrProcessing(job.query, InfiniteHadoopUtils.QuerySpec.QUERY), job.communityIds, job.outputKey, job.outputValue, job.arguments, job.incrementalMode, job.submitterID, job.selfMerge, job.outputCollection, job.appendResults); ClassLoader savedClassLoader = Thread.currentThread().getContextClassLoader(); URLClassLoader child = new URLClassLoader(new URL[] { new File(tempJarLocation).toURI().toURL() }, savedClassLoader); Thread.currentThread().setContextClassLoader(child); // Check version: for now, any infinit.e.data_model with an VersionTest class is acceptable boolean dataModelLoaded = true; try { URLClassLoader versionTest = new URLClassLoader(new URL[] { new File(tempJarLocation).toURI().toURL() }, null); try { Class.forName("com.ikanow.infinit.e.data_model.custom.InfiniteMongoInputFormat", true, versionTest); } catch (ClassNotFoundException e2) { //(this is fine, will use the cached version) dataModelLoaded = false; } if (dataModelLoaded) Class.forName("com.ikanow.infinit.e.data_model.custom.InfiniteMongoVersionTest", true, versionTest); } catch (ClassNotFoundException e1) { throw new RuntimeException( "This JAR is compiled with too old a version of the data-model, please recompile with Jan 2014 (rc2) onwards"); } // Now load the XML into a configuration object: Configuration config = new Configuration(); // Add the client configuration overrides: if (!bLocalMode) { String hadoopConfigPath = props_custom.getHadoopConfigPath() + "/hadoop/"; config.addResource(new Path(hadoopConfigPath + "core-site.xml")); config.addResource(new Path(hadoopConfigPath + "mapred-site.xml")); config.addResource(new Path(hadoopConfigPath + "hadoop-site.xml")); } //TESTED try { DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance(); DocumentBuilder dBuilder = dbFactory.newDocumentBuilder(); Document doc = dBuilder.parse(new ByteArrayInputStream(xml.toString().getBytes())); NodeList nList = doc.getElementsByTagName("property"); for (int temp = 0; temp < nList.getLength(); temp++) { Node nNode = nList.item(temp); if (nNode.getNodeType() == Node.ELEMENT_NODE) { Element eElement = (Element) nNode; String name = getTagValue("name", eElement); String value = getTagValue("value", eElement); if ((null != name) && (null != value)) { config.set(name, value); } } } } catch (Exception e) { throw new IOException(e.getMessage()); } // Some other config defaults: // (not sure if these are actually applied, or derived from the defaults - for some reason they don't appear in CDH's client config) config.set("mapred.map.tasks.speculative.execution", "false"); config.set("mapred.reduce.tasks.speculative.execution", "false"); // 
(default security is ignored here, have it set via HADOOP_TASKTRACKER_CONF in cloudera) // Now run the JAR file try { BasicDBObject advancedConfigurationDbo = null; try { advancedConfigurationDbo = (null != job.query) ? ((BasicDBObject) com.mongodb.util.JSON.parse(job.query)) : (new BasicDBObject()); } catch (Exception e) { advancedConfigurationDbo = new BasicDBObject(); } boolean esMode = advancedConfigurationDbo.containsField("qt") && !job.isCustomTable; if (esMode && !job.inputCollection.equals("doc_metadata.metadata")) { throw new RuntimeException( "Infinit.e Queries are only supported on doc_metadata - use MongoDB queries instead."); } config.setBoolean("mapred.used.genericoptionsparser", true); // (just stops an annoying warning from appearing) if (bLocalMode) { // local job tracker and FS mode config.set("mapred.job.tracker", "local"); config.set("fs.default.name", "local"); } else { if (bTestMode) { // run job tracker locally but FS mode remotely config.set("mapred.job.tracker", "local"); } else { // normal job tracker String trackerUrl = HadoopUtils.getXMLProperty( props_custom.getHadoopConfigPath() + "/hadoop/mapred-site.xml", "mapred.job.tracker"); config.set("mapred.job.tracker", trackerUrl); } String fsUrl = HadoopUtils.getXMLProperty( props_custom.getHadoopConfigPath() + "/hadoop/core-site.xml", "fs.default.name"); config.set("fs.default.name", fsUrl); } if (!dataModelLoaded && !(bTestMode || bLocalMode)) { // If running distributed and no data model loaded then add ourselves Path jarToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/lib/", "infinit.e.data_model.jar", config); DistributedCache.addFileToClassPath(jarToCache, config); jarToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/lib/", "infinit.e.processing.custom.library.jar", config); DistributedCache.addFileToClassPath(jarToCache, config); } //TESTED // Debug scripts (only if they exist), and only in non local/test mode if (!bLocalMode && !bTestMode) { try { Path scriptToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/scripts/", "custom_map_error_handler.sh", config); config.set("mapred.map.task.debug.script", "custom_map_error_handler.sh " + job.jobtitle); config.set("mapreduce.map.debug.script", "custom_map_error_handler.sh " + job.jobtitle); DistributedCache.createSymlink(config); DistributedCache.addCacheFile(scriptToCache.toUri(), config); } catch (Exception e) { } // just carry on try { Path scriptToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/scripts/", "custom_reduce_error_handler.sh", config); config.set("mapred.reduce.task.debug.script", "custom_reduce_error_handler.sh " + job.jobtitle); config.set("mapreduce.reduce.debug.script", "custom_reduce_error_handler.sh " + job.jobtitle); DistributedCache.createSymlink(config); DistributedCache.addCacheFile(scriptToCache.toUri(), config); } catch (Exception e) { } // just carry on } //TODO (???): TOTEST // (need to do these 2 things here before the job is created, at which point the config class has been copied across) //1) Class<?> mapperClazz = Class.forName(job.mapper, true, child); if (ICustomInfiniteInternalEngine.class.isAssignableFrom(mapperClazz)) { // Special case: internal custom engine, so gets an additional integration hook ICustomInfiniteInternalEngine preActivities = (ICustomInfiniteInternalEngine) mapperClazz .newInstance(); preActivities.preTaskActivities(job._id, job.communityIds, config, !(bTestMode || bLocalMode)); } //TESTED //2) if 
(job.inputCollection.equalsIgnoreCase("file.binary_shares")) { // Need to download the GridFSZip file try { Path jarToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/lib/unbundled/", "GridFSZipFile.jar", config); DistributedCache.addFileToClassPath(jarToCache, config); } catch (Throwable t) { } // (this is fine, will already be on the classpath .. otherwise lots of other stuff will be failing all over the place!) } if (job.inputCollection.equals("records")) { InfiniteElasticsearchHadoopUtils.handleElasticsearchInput(job, config, advancedConfigurationDbo); //(won't run under 0.19 so running with "records" should cause all sorts of exceptions) } //TESTED (by hand) if (bTestMode || bLocalMode) { // If running locally, turn "snappy" off - tomcat isn't pointing its native library path in the right place config.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.DefaultCodec"); } // Manually specified caches List<URL> localJarCaches = InfiniteHadoopUtils.handleCacheList(advancedConfigurationDbo.get("$caches"), job, config, props_custom); Job hj = new Job(config); // (NOTE: from here, changes to config are ignored) try { if (null != localJarCaches) { if (bLocalMode || bTestMode) { Method method = URLClassLoader.class.getDeclaredMethod("addURL", new Class[] { URL.class }); method.setAccessible(true); method.invoke(child, localJarCaches.toArray()); } //TOTEST (tested logically) } Class<?> classToLoad = Class.forName(job.mapper, true, child); hj.setJarByClass(classToLoad); if (job.inputCollection.equalsIgnoreCase("filesystem")) { String inputPath = null; try { inputPath = MongoDbUtil.getProperty(advancedConfigurationDbo, "file.url"); if (!inputPath.endsWith("/")) { inputPath = inputPath + "/"; } } catch (Exception e) { } if (null == inputPath) { throw new RuntimeException("Must specify 'file.url' if reading from filesystem."); } inputPath = InfiniteHadoopUtils.authenticateInputDirectory(job, inputPath); InfiniteFileInputFormat.addInputPath(hj, new Path(inputPath + "*/*")); // (that extra bit makes it recursive) InfiniteFileInputFormat.setMaxInputSplitSize(hj, 33554432); // (32MB) InfiniteFileInputFormat.setInfiniteInputPathFilter(hj, config); hj.setInputFormatClass((Class<? extends InputFormat>) Class.forName( "com.ikanow.infinit.e.data_model.custom.InfiniteFileInputFormat", true, child)); } else if (job.inputCollection.equalsIgnoreCase("file.binary_shares")) { String[] oidStrs = null; try { String inputPath = MongoDbUtil.getProperty(advancedConfigurationDbo, "file.url"); Pattern oidExtractor = Pattern.compile("inf://share/([^/]+)"); Matcher m = oidExtractor.matcher(inputPath); if (m.find()) { oidStrs = m.group(1).split("\\s*,\\s*"); } else { throw new RuntimeException( "file.url must be in format inf://share/<oid-list>/<string>: " + inputPath); } InfiniteHadoopUtils.authenticateShareList(job, oidStrs); } catch (Exception e) { throw new RuntimeException( "Authentication error: " + e.getMessage() + ": " + advancedConfigurationDbo, e); } hj.getConfiguration().setStrings("mapred.input.dir", oidStrs); hj.setInputFormatClass((Class<? extends InputFormat>) Class.forName( "com.ikanow.infinit.e.data_model.custom.InfiniteShareInputFormat", true, child)); } else if (job.inputCollection.equals("records")) { hj.setInputFormatClass((Class<? extends InputFormat>) Class .forName("com.ikanow.infinit.e.data_model.custom.InfiniteEsInputFormat", true, child)); } else { if (esMode) { hj.setInputFormatClass((Class<? 
extends InputFormat>) Class.forName( "com.ikanow.infinit.e.processing.custom.utils.InfiniteElasticsearchMongoInputFormat", true, child)); } else { hj.setInputFormatClass((Class<? extends InputFormat>) Class.forName( "com.ikanow.infinit.e.data_model.custom.InfiniteMongoInputFormat", true, child)); } } if ((null != job.exportToHdfs) && job.exportToHdfs) { //TODO (INF-2469): Also, if the output key is BSON then also run as text (but output as JSON?) Path outPath = InfiniteHadoopUtils.ensureOutputDirectory(job, props_custom); if ((null != job.outputKey) && (null != job.outputValue) && job.outputKey.equalsIgnoreCase("org.apache.hadoop.io.text") && job.outputValue.equalsIgnoreCase("org.apache.hadoop.io.text")) { // (slight hack before I sort out the horrendous job class - if key/val both text and exporting to HDFS then output as Text) hj.setOutputFormatClass((Class<? extends OutputFormat>) Class .forName("org.apache.hadoop.mapreduce.lib.output.TextOutputFormat", true, child)); TextOutputFormat.setOutputPath(hj, outPath); } //TESTED else { hj.setOutputFormatClass((Class<? extends OutputFormat>) Class.forName( "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat", true, child)); SequenceFileOutputFormat.setOutputPath(hj, outPath); } //TESTED } else { // normal case, stays in MongoDB hj.setOutputFormatClass((Class<? extends OutputFormat>) Class.forName( "com.ikanow.infinit.e.data_model.custom.InfiniteMongoOutputFormat", true, child)); } hj.setMapperClass((Class<? extends Mapper>) mapperClazz); String mapperOutputKeyOverride = advancedConfigurationDbo.getString("$mapper_key_class", null); if (null != mapperOutputKeyOverride) { hj.setMapOutputKeyClass(Class.forName(mapperOutputKeyOverride)); } //TESTED String mapperOutputValueOverride = advancedConfigurationDbo.getString("$mapper_value_class", null); if (null != mapperOutputValueOverride) { hj.setMapOutputValueClass(Class.forName(mapperOutputValueOverride)); } //TESTED if ((null != job.reducer) && !job.reducer.startsWith("#") && !job.reducer.equalsIgnoreCase("null") && !job.reducer.equalsIgnoreCase("none")) { hj.setReducerClass((Class<? extends Reducer>) Class.forName(job.reducer, true, child)); // Variable reducers: if (null != job.query) { try { hj.setNumReduceTasks(advancedConfigurationDbo.getInt("$reducers", 1)); } catch (Exception e) { try { // (just check it's not a string that is a valid int) hj.setNumReduceTasks( Integer.parseInt(advancedConfigurationDbo.getString("$reducers", "1"))); } catch (Exception e2) { } } } //TESTED } else { hj.setNumReduceTasks(0); } if ((null != job.combiner) && !job.combiner.startsWith("#") && !job.combiner.equalsIgnoreCase("null") && !job.combiner.equalsIgnoreCase("none")) { hj.setCombinerClass((Class<? extends Reducer>) Class.forName(job.combiner, true, child)); } hj.setOutputKeyClass(Class.forName(job.outputKey, true, child)); hj.setOutputValueClass(Class.forName(job.outputValue, true, child)); hj.setJobName(job.jobtitle); currJobName = job.jobtitle; } catch (Error e) { // (messing about with class loaders = lots of chances for errors!) 
throw new RuntimeException(e.getMessage(), e); } if (bTestMode || bLocalMode) { hj.submit(); currThreadId = null; Logger.getRootLogger().addAppender(this); currLocalJobId = hj.getJobID().toString(); currLocalJobErrs.setLength(0); while (!hj.isComplete()) { Thread.sleep(1000); } Logger.getRootLogger().removeAppender(this); if (hj.isSuccessful()) { if (this.currLocalJobErrs.length() > 0) { return "local_done: " + this.currLocalJobErrs.toString(); } else { return "local_done"; } } else { return "Error: " + this.currLocalJobErrs.toString(); } } else { hj.submit(); String jobId = hj.getJobID().toString(); return jobId; } } catch (Exception e) { e.printStackTrace(); Thread.currentThread().setContextClassLoader(savedClassLoader); return "Error: " + InfiniteHadoopUtils.createExceptionMessage(e); } finally { Thread.currentThread().setContextClassLoader(savedClassLoader); } }
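As the inline comments in the two Ikanow examples note ("do not set anything into config past this line", "from here, changes to config are ignored"), constructing the Job copies the supplied Configuration, so later writes to the original object are not seen by the job. A minimal, hypothetical sketch of that behaviour (the property name demo.key is a placeholder):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class ConfigCopySketch {
    public static void main(String[] args) throws Exception {
        Configuration config = new Configuration();
        Job job = Job.getInstance(config);               // snapshots "config" at this point
        config.set("demo.key", "set-too-late");          // NOT visible to the job
        job.getConfiguration().set("demo.key", "set-on-the-job"); // visible to the job
        System.out.println(job.getConfiguration().get("demo.key")); // prints "set-on-the-job"
    }
}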
From source file:com.impetus.code.examples.hadoop.cassandra.wordcount.WordCount.java
License:Apache License
public int run(String[] args) throws Exception {
    String outputReducerType = "cassandra";
    if (args != null && args[0].startsWith(OUTPUT_REDUCER_VAR)) {
        String[] s = args[0].split("=");
        if (s != null && s.length == 2)
            outputReducerType = s[1];
    }
    logger.info("output reducer type: " + outputReducerType);

    for (int i = 0; i < WordCountSetup.TEST_COUNT; i++) {
        String columnName = "text" + i;
        getConf().set(CONF_COLUMN_NAME, columnName);

        Job job = new Job(getConf(), "wordcount");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(TokenizerMapper.class);

        if (outputReducerType.equalsIgnoreCase("filesystem")) {
            job.setCombinerClass(ReducerToFilesystem.class);
            job.setReducerClass(ReducerToFilesystem.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH_PREFIX + i));
        } else {
            job.setReducerClass(ReducerToCassandra.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(IntWritable.class);
            job.setOutputKeyClass(ByteBuffer.class);
            job.setOutputValueClass(List.class);
            job.setOutputFormatClass(ColumnFamilyOutputFormat.class);
            ConfigHelper.setOutputColumnFamily(job.getConfiguration(), KEYSPACE, OUTPUT_COLUMN_FAMILY);
        }

        job.setInputFormatClass(ColumnFamilyInputFormat.class);
        ConfigHelper.setRpcPort(job.getConfiguration(), "9160");
        ConfigHelper.setInitialAddress(job.getConfiguration(), "localhost");
        ConfigHelper.setPartitioner(job.getConfiguration(), "org.apache.cassandra.dht.RandomPartitioner");
        ConfigHelper.setInputColumnFamily(job.getConfiguration(), KEYSPACE, INPUT_COLUMN_FAMILY);

        SlicePredicate predicate = new SlicePredicate()
                .setColumn_names(Arrays.asList(ByteBufferUtil.bytes(columnName)));
        ConfigHelper.setInputSlicePredicate(job.getConfiguration(), predicate);

        job.waitForCompletion(true);
    }
    return 0;
}
From source file:com.impetus.code.examples.hadoop.cassandra.wordcount.WordCountCounters.java
License:Apache License
public int run(String[] args) throws Exception {
    Job job = new Job(getConf(), "wordcountcounters");
    job.setJarByClass(WordCountCounters.class);
    job.setMapperClass(SumMapper.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);
    FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH_PREFIX));

    job.setInputFormatClass(ColumnFamilyInputFormat.class);
    ConfigHelper.setRpcPort(job.getConfiguration(), "9160");
    ConfigHelper.setInitialAddress(job.getConfiguration(), "localhost");
    ConfigHelper.setPartitioner(job.getConfiguration(), "org.apache.cassandra.dht.RandomPartitioner");
    ConfigHelper.setInputColumnFamily(job.getConfiguration(), WordCount.KEYSPACE,
            WordCountCounters.COUNTER_COLUMN_FAMILY);

    SlicePredicate predicate = new SlicePredicate()
            .setSlice_range(new SliceRange().setStart(ByteBufferUtil.EMPTY_BYTE_BUFFER)
                    .setFinish(ByteBufferUtil.EMPTY_BYTE_BUFFER).setCount(100));
    ConfigHelper.setInputSlicePredicate(job.getConfiguration(), predicate);

    job.waitForCompletion(true);
    return 0;
}
From source file:com.infochimps.hadoop.pig.hbase.StaticFamilyStorage.java
License:Apache License
@Override
public void setLocation(String location, Job job) throws IOException {
    m_conf = job.getConfiguration();
    HBaseConfiguration.addHbaseResources(m_conf);

    if (m_conf.get(HAS_BEEN_UPLOADED) == null) {
        HadoopUtils.uploadLocalFile(new Path(LOCAL_SCHEME + hbaseConfig_), new Path(HBASE_CONFIG_HDFS_PATH),
                m_conf);
        HadoopUtils.shipIfNotShipped(new Path(HBASE_CONFIG_HDFS_PATH), m_conf);
        m_conf.set(HAS_BEEN_UPLOADED, "true");
    }

    String taskConfig = HadoopUtils.fetchFromCache((new File(hbaseConfig_)).getName(), m_conf);
    if (taskConfig == null)
        taskConfig = hbaseConfig_;
    m_conf.addResource(new Path(LOCAL_SCHEME + taskConfig));

    TableMapReduceUtil.addDependencyJars(job.getConfiguration(), org.apache.hadoop.hbase.client.HTable.class,
            com.google.common.collect.Lists.class, org.apache.zookeeper.ZooKeeper.class);

    String tablename = location;
    if (location.startsWith("hbase://")) {
        tablename = location.substring(8);
    }
    if (m_table == null) {
        m_table = new HTable(m_conf, tablename);
    }
    m_table.setScannerCaching(caching_);
    m_conf.set(TableInputFormat.INPUT_TABLE, tablename);

    // Set up scan if it is not already set up.
    if (m_conf.get(TableInputFormat.SCAN) != null) {
        return;
    }

    for (ColumnInfo columnInfo : columnInfo_) {
        // do we have a column family, or a column?
        if (columnInfo.isColumnMap()) {
            scan.addFamily(columnInfo.getColumnFamily());
        } else {
            scan.addColumn(columnInfo.getColumnFamily(), columnInfo.getColumnName());
        }
    }

    if (requiredFieldList != null) {
        Properties p = UDFContext.getUDFContext().getUDFProperties(this.getClass(),
                new String[] { contextSignature });
        p.setProperty(contextSignature + "_projectedFields", ObjectSerializer.serialize(requiredFieldList));
    }
    m_conf.set(TableInputFormat.SCAN, convertScanToString(scan));
}
From source file:com.infochimps.hadoop.pig.hbase.StaticFamilyStorage.java
License:Apache License
@Override
public void setStoreLocation(String location, Job job) throws IOException {
    if (location.startsWith("hbase://")) {
        job.getConfiguration().set(HBaseTableOutputFormat.OUTPUT_TABLE, location.substring(8));
    } else {
        job.getConfiguration().set(HBaseTableOutputFormat.OUTPUT_TABLE, location);
    }

    Properties props = UDFContext.getUDFContext().getUDFProperties(getClass(),
            new String[] { contextSignature });
    if (!props.containsKey(contextSignature + "_schema")) {
        props.setProperty(contextSignature + "_schema", ObjectSerializer.serialize(schema_));
    }

    m_conf = HBaseConfiguration.addHbaseResources(job.getConfiguration());
    if (m_conf.get(HAS_BEEN_UPLOADED) == null) {
        HadoopUtils.uploadLocalFile(new Path(LOCAL_SCHEME + hbaseConfig_), new Path(HBASE_CONFIG_HDFS_PATH),
                m_conf);
        HadoopUtils.shipIfNotShipped(new Path(HBASE_CONFIG_HDFS_PATH), m_conf);
        m_conf.set(HAS_BEEN_UPLOADED, "true");
    }
}
From source file:com.inmobi.conduit.distcp.ConduitDistCp.java
License:Apache License
@Override
protected Path createInputFileListing(Job job) throws IOException {
    // get the file path where copy listing file has to be saved
    Path fileListingPath = getFileListingPath();
    Configuration config = job.getConfiguration();

    SequenceFile.Writer fileListWriter = null;
    try {
        fileListWriter = SequenceFile.createWriter(fileListingPath.getFileSystem(config), config,
                fileListingPath, Text.class, FileStatus.class, SequenceFile.CompressionType.NONE);
        for (Map.Entry<String, FileStatus> entry : fileListingMap.entrySet()) {
            FileStatus status = FileUtil.getFileStatus(entry.getValue(), buffer, in);
            fileListWriter.append(new Text(entry.getKey()), status);

            // Create a sync point after each entry. This will ensure that SequenceFile
            // Reader can work at file entry level granularity, given that SequenceFile
            // Reader reads from the starting of sync point.
            fileListWriter.sync();

            totalBytesToCopy += entry.getValue().getLen();
            totalPaths++;
        }
    } finally {
        if (fileListWriter != null) {
            fileListWriter.close();
        }
    }

    LOG.info("Number of paths considered for copy: " + totalPaths);
    LOG.info("Number of bytes considered for copy: " + totalBytesToCopy
            + " (Actual number of bytes copied depends on whether any files are "
            + "skipped or overwritten.)");

    // set distcp configurations
    config.set(DistCpConstants.CONF_LABEL_LISTING_FILE_PATH, fileListingPath.toString());
    config.setLong(DistCpConstants.CONF_LABEL_TOTAL_BYTES_TO_BE_COPIED, totalBytesToCopy);
    config.setLong(DistCpConstants.CONF_LABEL_TOTAL_NUMBER_OF_RECORDS, totalPaths);
    return fileListingPath;
}
From source file:com.inmobi.conduit.distcp.tools.DistCp.java
License:Apache License
/**
 * Implements the core-execution. Creates the file-list for copy,
 * and launches the Hadoop-job, to do the copy.
 * @return Job handle
 * @throws Exception, on failure.
 */
public Job execute() throws Exception {
    assert inputOptions != null;
    assert getConf() != null;

    Job job = null;
    try {
        metaFolder = createMetaFolderPath();
        jobFS = metaFolder.getFileSystem(getConf());

        job = createJob();
        createInputFileListing(job);

        job.submit();
        submitted = true;
    } finally {
        if (!submitted) {
            cleanup();
        }
    }

    String jobID = getJobID(job);
    job.getConfiguration().set(DistCpConstants.CONF_LABEL_DISTCP_JOB_ID, jobID);

    LOG.info("DistCp job-id: " + jobID);
    LOG.info("DistCp job may be tracked at: " + job.getTrackingURL());
    LOG.info("To cancel, run the following command:\thadoop job -kill " + jobID);

    long jobStartTime = System.nanoTime();
    if (inputOptions.shouldBlock() && !job.waitForCompletion(true)) {
        updateJobTimeInNanos(jobStartTime);
        throw new IOException("DistCp failure: Job " + jobID + " has failed. ");
    }
    updateJobTimeInNanos(jobStartTime);
    return job;
}
From source file:com.inmobi.conduit.distcp.tools.DistCp.java
License:Apache License
/**
 * Create Job object for submitting it, with all the configuration
 *
 * @return Reference to job object.
 * @throws IOException - Exception if any
 */
protected Job createJob() throws IOException {
    String jobName = "distcp";
    String userChosenName = getConf().get("mapred.job.name");
    if (userChosenName != null)
        jobName += ": " + userChosenName;
    Job job = new Job(getConf(), jobName);
    job.setInputFormatClass(DistCpUtils.getStrategy(getConf(), inputOptions));
    job.setJarByClass(CopyMapper.class);
    configureOutputFormat(job);

    job.setMapperClass(CopyMapper.class);
    job.setReducerClass(Reducer.class);
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(CopyOutputFormat.class);
    job.getConfiguration().set("mapred.map.tasks.speculative.execution", "false");
    job.getConfiguration().set(DistCpConstants.CONF_LABEL_NUM_MAPS, String.valueOf(inputOptions.getMaxMaps()));

    if (inputOptions.getSslConfigurationFile() != null) {
        setupSSLConfig(job.getConfiguration());
    }

    inputOptions.appendToConf(job.getConfiguration());
    return job;
}