Example usage for org.apache.hadoop.mapreduce Job getConfiguration

List of usage examples for org.apache.hadoop.mapreduce Job getConfiguration

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce.Job.getConfiguration().

Prototype

public Configuration getConfiguration() 

Document

Return the configuration for the job.
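Before the real-world examples below, here is a minimal sketch of the common pattern (the property name "example.custom.key" and the class name GetConfigurationExample are illustrative, not taken from any of the source files): the Job copies the Configuration it is created from, so per-job settings made after Job.getInstance(...) must go through job.getConfiguration() rather than the original Configuration object.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class GetConfigurationExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        // Job.getInstance(...) takes a copy of conf; from here on, per-job
        // settings have to be made via job.getConfiguration(), not via conf.
        Job job = Job.getInstance(conf, "get-configuration-example");

        // "example.custom.key" is a made-up property name for illustration;
        // "mapreduce.map.speculative" is a standard Hadoop 2 property.
        job.getConfiguration().set("example.custom.key", "some-value");
        job.getConfiguration().setBoolean("mapreduce.map.speculative", false);

        job.setJarByClass(GetConfigurationExample.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

The same pattern recurs throughout the examples below, e.g. inputJob.getConfiguration().set(...) to pass per-input settings, or ConfigHelper.setRpcPort(job.getConfiguration(), ...) to configure the Cassandra input format.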

Usage

From source file:com.ikanow.aleph2.analytics.r.services.BeJobLauncher.java

License:Apache License

@Override
public Validation<String, Job> runEnhancementJob(final DataBucketBean bucket,
        final Optional<ProcessingTestSpecBean> testSpec) {

    final Configuration config = getHadoopConfig();

    final ClassLoader currentClassloader = Thread.currentThread().getContextClassLoader();
    //(not currently used, but has proven useful in the past)

    final SetOnce<Job> job = new SetOnce<>();
    try {
        final Optional<Long> debug_max = testSpec
                .flatMap(testSpecVals -> Optional.ofNullable(testSpecVals.requested_num_objects()));

        //then gets applied to all the inputs:
        debug_max.ifPresent(val -> config.set(BatchEnrichmentJob.BE_DEBUG_MAX_SIZE, val.toString()));

        final Aleph2MultiInputFormatBuilder inputBuilder = new Aleph2MultiInputFormatBuilder();

        // Validation:

        try {
            final BatchEnrichmentJob.BatchEnrichmentBaseValidator validator = new BatchEnrichmentJob.BatchEnrichmentBaseValidator();
            validator.setDataBucket(bucket);
            validator.setEnrichmentContext(_batchEnrichmentContext);
            validator.setEcMetadata(
                    Optional.ofNullable(bucket.batch_enrichment_configs()).orElse(Collections.emptyList()));
            final List<BasicMessageBean> errs = validator.validate();
            if (errs.stream().anyMatch(b -> !b.success())) {
                return Validation.fail(ErrorUtils.get("Validation errors for {0}: {1}", bucket.full_name(),
                        errs.stream().map(
                                b -> ErrorUtils.get("{0}: {1}", b.success() ? "INFO" : "ERROR", b.message()))
                                .collect(Collectors.joining(";"))));
            }
        } catch (Throwable t) { // we'll log but carry on in this case...(in case there's some classloading shenanigans which won't affect the operation in hadoop)
            logger.error(
                    ErrorUtils.getLongForm("Failed validation, bucket: {1} error: {0}", t, bucket.full_name()));
        }

        // Create a separate InputFormat for every input (makes testing life easier)

        Optional.ofNullable(_batchEnrichmentContext.getJob().inputs()).orElse(Collections.emptyList()).stream()
                .filter(input -> Optional.ofNullable(input.enabled()).orElse(true))
                .forEach(Lambdas.wrap_consumer_u(input -> {
                    // In the debug case, transform the input to add the max record limit
                    final AnalyticThreadJobInputBean input_with_test_settings = BeanTemplateUtils.clone(input)
                            .with(AnalyticThreadJobInputBean::config, BeanTemplateUtils
                                    .clone(Optional.ofNullable(input.config()).orElseGet(() -> BeanTemplateUtils
                                            .build(AnalyticThreadJobInputConfigBean.class).done().get()))
                                    .with(AnalyticThreadJobInputConfigBean::test_record_limit_request, //(if not test, always null; else "input override" or "output default")
                                            debug_max.map(max -> Optionals
                                                    .of(() -> input.config().test_record_limit_request())
                                                    .orElse(max)).orElse(null))
                                    .done())
                            .done();

                    // Get the paths and add them to a list for later
                    final List<String> paths = _batchEnrichmentContext.getAnalyticsContext().getInputPaths(
                            Optional.of(bucket), _batchEnrichmentContext.getJob(), input_with_test_settings);

                    RScriptUtils.addFilePaths(paths);

                    if (!paths.isEmpty()) {

                        logger.info(ErrorUtils.get("Adding storage paths for bucket {0}: {1}",
                                bucket.full_name(), paths.stream().collect(Collectors.joining(";"))));

                        final Job inputJob = Job.getInstance(config);
                        inputJob.setInputFormatClass(BeFileInputFormat.class);
                        paths.stream().forEach(Lambdas.wrap_consumer_u(
                                path -> FileInputFormat.addInputPath(inputJob, new Path(path))));
                        inputBuilder.addInput(UuidUtils.get().getRandomUuid(), inputJob);
                    } else { // not easily available in HDFS directory format, try getting from the context

                        Optional<HadoopAccessContext> input_format_info = _batchEnrichmentContext
                                .getAnalyticsContext().getServiceInput(HadoopAccessContext.class,
                                        Optional.of(bucket), _batchEnrichmentContext.getJob(),
                                        input_with_test_settings);
                        if (!input_format_info.isPresent()) {
                            logger.warn(ErrorUtils.get("Tried but failed to get input format from {0}",
                                    BeanTemplateUtils.toJson(input_with_test_settings)));
                        } else {
                            logger.info(ErrorUtils.get("Adding data service path for bucket {0}: {1}",
                                    bucket.full_name(), input_format_info.get().describe()));

                            final Job inputJob = Job.getInstance(config);
                            inputJob.setInputFormatClass(input_format_info.get().getAccessService()
                                    .either(l -> l.getClass(), r -> r));
                            input_format_info.get().getAccessConfig().ifPresent(map -> {
                                map.entrySet().forEach(kv -> inputJob.getConfiguration().set(kv.getKey(),
                                        kv.getValue().toString()));
                            });

                            inputBuilder.addInput(UuidUtils.get().getRandomUuid(), inputJob);
                        }
                    }

                }));

        // (ALEPH-12): other input format types

        // Now do everything else

        final String contextSignature = _batchEnrichmentContext
                .getEnrichmentContextSignature(Optional.of(bucket), Optional.empty());
        config.set(BatchEnrichmentJob.BE_CONTEXT_SIGNATURE, contextSignature);

        final String jobName = BucketUtils.getUniqueSignature(bucket.full_name(),
                Optional.ofNullable(_batchEnrichmentContext.getJob().name()));

        this.handleHadoopConfigOverrides(bucket, config);

        // do not set anything into config past this line (can set job.getConfiguration() elements though - that is what the builder does)
        job.set(Job.getInstance(config, jobName));
        job.get().setJarByClass(BatchEnrichmentJob.class);
        job.get().setSortComparatorClass(ObjectNodeWritableComparable.Comparator.class); //(avoid deser of json node for intermediate things)

        // Set the classpath

        cacheJars(job.get(), bucket, _batchEnrichmentContext.getAnalyticsContext());

        // (generic mapper - the actual code is run using the classes in the shared libraries)
        job.get().setMapperClass(BatchEnrichmentJob.BatchEnrichmentMapper.class);
        job.get().setMapOutputKeyClass(ObjectNodeWritableComparable.class);
        job.get().setMapOutputValueClass(ObjectNodeWritableComparable.class);

        // (combiner and reducer)
        Optional.ofNullable(bucket.batch_enrichment_configs()).orElse(Collections.emptyList()).stream()
                .filter(cfg -> Optional.ofNullable(cfg.enabled()).orElse(true))
                .filter(cfg -> !Optionals.ofNullable(cfg.grouping_fields()).isEmpty()).findAny().map(cfg -> {
                    final HadoopTechnologyOverrideBean tech_override = BeanTemplateUtils
                            .from(Optional.ofNullable(cfg.technology_override()).orElse(Collections.emptyMap()),
                                    HadoopTechnologyOverrideBean.class)
                            .get();

                    job.get().setNumReduceTasks(Optional.ofNullable(tech_override.num_reducers()).orElse(2));
                    job.get().setReducerClass(BatchEnrichmentJob.BatchEnrichmentReducer.class);

                    if (tech_override.use_combiner()) {
                        job.get().setCombinerClass(BatchEnrichmentJob.BatchEnrichmentCombiner.class);
                    }
                    return Unit.unit();
                }).orElseGet(() -> {
                    job.get().setNumReduceTasks(0);
                    return Unit.unit();
                });

        // job.setReducerClass(BatchEnrichmentJob.BatchEnrichmentReducer.class);

        // Input format:
        inputBuilder.build(job.get());

        // Output format (doesn't really do anything, all the actual output code is performed by the mapper via the enrichment context)
        job.get().setOutputFormatClass(BeFileOutputFormat.class);

        // Submit the job for processing
        launch(job.get());

        // Wait for the job to complete and collect the data
        //            job.get().waitForCompletion(true);

        return Validation.success(job.get());

    } catch (Throwable t) {
        Throwable tt = (t instanceof RuntimeException) ? (null != t.getCause()) ? t.getCause() : t : t;

        if (tt instanceof org.apache.hadoop.mapreduce.lib.input.InvalidInputException) {
            // Probably a benign "no matching paths", so return pithy error
            return Validation.fail(ErrorUtils.get("{0}", tt.getMessage()));
        } else { // General error : Dump the config params to string         
            if (job.isSet()) {
                logger.error(ErrorUtils.get("Error submitting, config= {0}",
                        Optionals.streamOf(job.get().getConfiguration().iterator(), false)
                                .map(kv -> kv.getKey() + ":" + kv.getValue())
                                .collect(Collectors.joining("; "))));
            }
            return Validation.fail(ErrorUtils.getLongForm("{0}", tt));
        }
    } finally {
        Thread.currentThread().setContextClassLoader(currentClassloader);
    }

}

From source file:com.ikanow.aleph2.analytics.spark.utils.SparkTechnologyUtils.java

License:Apache License

/** Builds objects for all the aleph2 inputs and provides a method to use them in context-dependent ways 
 * @param context
 * @param bucket
 * @param job
 * @param config
 * @param per_input_action - user lambda that determines how they are used
 */
public static final void buildAleph2Inputs(final IAnalyticsContext context, final DataBucketBean bucket,
        final AnalyticThreadJobBean job, final Optional<ProcessingTestSpecBean> maybe_test_spec,
        final Configuration config, final Set<String> exclude_names,
        BiConsumer<AnalyticThreadJobInputBean, Job> per_input_action) {
    transformInputBean(Optionals.ofNullable(job.inputs()).stream(), maybe_test_spec)
            .filter(input -> !exclude_names.contains(input.name()))
            .forEach(Lambdas.wrap_consumer_u(input_with_test_settings -> {

                final Optional<IBucketLogger> a2_logger = Optional
                        .ofNullable(context.getLogger(Optional.of(bucket)));

                final List<String> paths = context.getInputPaths(Optional.empty(), job,
                        input_with_test_settings);

                if (!paths.isEmpty()) {

                    _logger.info(ErrorUtils.get("Adding storage paths for bucket {0}: {1}", bucket.full_name(),
                            paths.stream().collect(Collectors.joining(";"))));

                    a2_logger.ifPresent(l -> l.log(Level.INFO, true,
                            () -> ErrorUtils.get("Adding storage paths for bucket {0}: {1}", bucket.full_name(),
                                    paths.stream().collect(Collectors.joining(";"))),
                            () -> SparkTechnologyService.class.getSimpleName() + "."
                                    + Optional.ofNullable(job.name()).orElse("no_name"),
                            () -> "startAnalyticJobOrTest"));

                    //DEBUG
                    //System.out.println(ErrorUtils.get("Adding storage paths for bucket {0}: {1}", bucket.full_name(), paths.stream().collect(Collectors.joining(";"))));   

                    final Job input_job = Job.getInstance(config);
                    input_job.setInputFormatClass(BeFileInputFormat_Pure.class);
                    paths.stream().forEach(Lambdas
                            .wrap_consumer_u(path -> FileInputFormat.addInputPath(input_job, new Path(path))));
                    // (Add the input config in)
                    input_job.getConfiguration().set(HadoopBatchEnrichmentUtils.BE_BUCKET_INPUT_CONFIG,
                            BeanTemplateUtils.toJson(input_with_test_settings).toString());
                    per_input_action.accept(input_with_test_settings, input_job);
                } else { // not easily available in HDFS directory format, try getting from the context

                    Optional<HadoopBatchEnrichmentUtils.HadoopAccessContext> input_format_info = context
                            .getServiceInput(HadoopBatchEnrichmentUtils.HadoopAccessContext.class,
                                    Optional.empty(), job, input_with_test_settings);
                    if (!input_format_info.isPresent()) {
                        _logger.warn(ErrorUtils.get("Tried but failed to get input format from {0}",
                                BeanTemplateUtils.toJson(input_with_test_settings)));

                        a2_logger.ifPresent(l -> l.log(Level.WARN, true,
                                () -> ErrorUtils.get("Tried but failed to get input format from {0}",
                                        BeanTemplateUtils.toJson(input_with_test_settings)),
                                () -> SparkTechnologyService.class.getSimpleName() + "."
                                        + Optional.ofNullable(job.name()).orElse("no_name"),
                                () -> "startAnalyticJobOrTest"));

                        //DEBUG
                        //System.out.println(ErrorUtils.get("Tried but failed to get input format from {0}", BeanTemplateUtils.toJson(input_with_test_settings)));
                    } else {
                        _logger.info(ErrorUtils.get("Adding data service path for bucket {0}: {1}",
                                bucket.full_name(), input_format_info.get().describe()));

                        a2_logger.ifPresent(l -> l.log(Level.INFO, true,
                                () -> ErrorUtils.get("Adding data service path for bucket {0}: {1}",
                                        bucket.full_name(), input_format_info.get().describe()),
                                () -> SparkTechnologyService.class.getSimpleName() + "."
                                        + Optional.ofNullable(job.name()).orElse("no_name"),
                                () -> "startAnalyticJobOrTest"));

                        //DEBUG
                        //System.out.println(ErrorUtils.get("Adding data service path for bucket {0}: {1}", bucket.full_name(),input_format_info.get().describe()));

                        final Job input_job = Job.getInstance(config);
                        input_job.setInputFormatClass(
                                input_format_info.get().getAccessService().either(l -> l.getClass(), r -> r));
                        input_format_info.get().getAccessConfig().ifPresent(map -> {
                            map.entrySet().forEach(kv -> input_job.getConfiguration().set(kv.getKey(),
                                    kv.getValue().toString()));
                        });
                        per_input_action.accept(input_with_test_settings, input_job);
                    }
                }
            }));
}

From source file:com.ikanow.infinit.e.processing.custom.launcher.CustomHadoopTaskLauncher.java

License:Open Source License

@SuppressWarnings({ "unchecked", "rawtypes" })
public String runHadoopJob(CustomMapReduceJobPojo job, String tempJarLocation)
        throws IOException, SAXException, ParserConfigurationException {
    StringWriter xml = new StringWriter();
    String outputCollection = job.outputCollectionTemp;// (non-append mode) 
    if ((null != job.appendResults) && job.appendResults)
        outputCollection = job.outputCollection; // (append mode, write directly in....)
    else if (null != job.incrementalMode)
        job.incrementalMode = false; // (not allowed to be in incremental mode and not update mode)

    createConfigXML(xml, job.jobtitle, job.inputCollection,
            InfiniteHadoopUtils.getQueryOrProcessing(job.query, InfiniteHadoopUtils.QuerySpec.INPUTFIELDS),
            job.isCustomTable, job.getOutputDatabase(), job._id.toString(), outputCollection, job.mapper,
            job.reducer, job.combiner,
            InfiniteHadoopUtils.getQueryOrProcessing(job.query, InfiniteHadoopUtils.QuerySpec.QUERY),
            job.communityIds, job.outputKey, job.outputValue, job.arguments, job.incrementalMode,
            job.submitterID, job.selfMerge, job.outputCollection, job.appendResults);

    ClassLoader savedClassLoader = Thread.currentThread().getContextClassLoader();

    URLClassLoader child = new URLClassLoader(new URL[] { new File(tempJarLocation).toURI().toURL() },
            savedClassLoader);
    Thread.currentThread().setContextClassLoader(child);

    // Check version: for now, any infinit.e.data_model with a VersionTest class is acceptable
    boolean dataModelLoaded = true;
    try {
        URLClassLoader versionTest = new URLClassLoader(new URL[] { new File(tempJarLocation).toURI().toURL() },
                null);
        try {
            Class.forName("com.ikanow.infinit.e.data_model.custom.InfiniteMongoInputFormat", true, versionTest);
        } catch (ClassNotFoundException e2) {
            //(this is fine, will use the cached version)
            dataModelLoaded = false;
        }
        if (dataModelLoaded)
            Class.forName("com.ikanow.infinit.e.data_model.custom.InfiniteMongoVersionTest", true, versionTest);
    } catch (ClassNotFoundException e1) {
        throw new RuntimeException(
                "This JAR is compiled with too old a version of the data-model, please recompile with Jan 2014 (rc2) onwards");
    }

    // Now load the XML into a configuration object: 
    Configuration config = new Configuration();
    // Add the client configuration overrides:
    if (!bLocalMode) {
        String hadoopConfigPath = props_custom.getHadoopConfigPath() + "/hadoop/";
        config.addResource(new Path(hadoopConfigPath + "core-site.xml"));
        config.addResource(new Path(hadoopConfigPath + "mapred-site.xml"));
        config.addResource(new Path(hadoopConfigPath + "hadoop-site.xml"));
    } //TESTED

    try {
        DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
        DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
        Document doc = dBuilder.parse(new ByteArrayInputStream(xml.toString().getBytes()));
        NodeList nList = doc.getElementsByTagName("property");

        for (int temp = 0; temp < nList.getLength(); temp++) {
            Node nNode = nList.item(temp);
            if (nNode.getNodeType() == Node.ELEMENT_NODE) {
                Element eElement = (Element) nNode;
                String name = getTagValue("name", eElement);
                String value = getTagValue("value", eElement);
                if ((null != name) && (null != value)) {
                    config.set(name, value);
                }
            }
        }
    } catch (Exception e) {
        throw new IOException(e.getMessage());
    }

    // Some other config defaults:
    // (not sure if these are actually applied, or derived from the defaults - for some reason they don't appear in CDH's client config)
    config.set("mapred.map.tasks.speculative.execution", "false");
    config.set("mapred.reduce.tasks.speculative.execution", "false");
    // (default security is ignored here, have it set via HADOOP_TASKTRACKER_CONF in cloudera)

    // Now run the JAR file
    try {
        BasicDBObject advancedConfigurationDbo = null;
        try {
            advancedConfigurationDbo = (null != job.query)
                    ? ((BasicDBObject) com.mongodb.util.JSON.parse(job.query))
                    : (new BasicDBObject());
        } catch (Exception e) {
            advancedConfigurationDbo = new BasicDBObject();
        }
        boolean esMode = advancedConfigurationDbo.containsField("qt") && !job.isCustomTable;
        if (esMode && !job.inputCollection.equals("doc_metadata.metadata")) {
            throw new RuntimeException(
                    "Infinit.e Queries are only supported on doc_metadata - use MongoDB queries instead.");
        }

        config.setBoolean("mapred.used.genericoptionsparser", true); // (just stops an annoying warning from appearing)
        if (bLocalMode) { // local job tracker and FS mode
            config.set("mapred.job.tracker", "local");
            config.set("fs.default.name", "local");
        } else {
            if (bTestMode) { // run job tracker locally but FS mode remotely
                config.set("mapred.job.tracker", "local");
            } else { // normal job tracker
                String trackerUrl = HadoopUtils.getXMLProperty(
                        props_custom.getHadoopConfigPath() + "/hadoop/mapred-site.xml", "mapred.job.tracker");
                config.set("mapred.job.tracker", trackerUrl);
            }
            String fsUrl = HadoopUtils.getXMLProperty(
                    props_custom.getHadoopConfigPath() + "/hadoop/core-site.xml", "fs.default.name");
            config.set("fs.default.name", fsUrl);
        }
        if (!dataModelLoaded && !(bTestMode || bLocalMode)) { // If running distributed and no data model loaded then add ourselves
            Path jarToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/lib/",
                    "infinit.e.data_model.jar", config);
            DistributedCache.addFileToClassPath(jarToCache, config);
            jarToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/lib/",
                    "infinit.e.processing.custom.library.jar", config);
            DistributedCache.addFileToClassPath(jarToCache, config);
        } //TESTED

        // Debug scripts (only if they exist), and only in non local/test mode
        if (!bLocalMode && !bTestMode) {

            try {
                Path scriptToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/scripts/",
                        "custom_map_error_handler.sh", config);
                config.set("mapred.map.task.debug.script", "custom_map_error_handler.sh " + job.jobtitle);
                config.set("mapreduce.map.debug.script", "custom_map_error_handler.sh " + job.jobtitle);
                DistributedCache.createSymlink(config);
                DistributedCache.addCacheFile(scriptToCache.toUri(), config);
            } catch (Exception e) {
            } // just carry on

            try {
                Path scriptToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/scripts/",
                        "custom_reduce_error_handler.sh", config);
                config.set("mapred.reduce.task.debug.script", "custom_reduce_error_handler.sh " + job.jobtitle);
                config.set("mapreduce.reduce.debug.script", "custom_reduce_error_handler.sh " + job.jobtitle);
                DistributedCache.createSymlink(config);
                DistributedCache.addCacheFile(scriptToCache.toUri(), config);
            } catch (Exception e) {
            } // just carry on

        } //TODO (???): TOTEST

        // (need to do these 2 things here before the job is created, at which point the config class has been copied across)
        //1)
        Class<?> mapperClazz = Class.forName(job.mapper, true, child);
        if (ICustomInfiniteInternalEngine.class.isAssignableFrom(mapperClazz)) { // Special case: internal custom engine, so gets an additional integration hook
            ICustomInfiniteInternalEngine preActivities = (ICustomInfiniteInternalEngine) mapperClazz
                    .newInstance();
            preActivities.preTaskActivities(job._id, job.communityIds, config, !(bTestMode || bLocalMode));
        } //TESTED
          //2)
        if (job.inputCollection.equalsIgnoreCase("file.binary_shares")) {
            // Need to download the GridFSZip file
            try {
                Path jarToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/lib/unbundled/",
                        "GridFSZipFile.jar", config);
                DistributedCache.addFileToClassPath(jarToCache, config);
            } catch (Throwable t) {
            } // (this is fine, will already be on the classpath .. otherwise lots of other stuff will be failing all over the place!)            
        }

        if (job.inputCollection.equals("records")) {

            InfiniteElasticsearchHadoopUtils.handleElasticsearchInput(job, config, advancedConfigurationDbo);

            //(won't run under 0.19 so running with "records" should cause all sorts of exceptions)

        } //TESTED (by hand)         

        if (bTestMode || bLocalMode) { // If running locally, turn "snappy" off - tomcat isn't pointing its native library path in the right place
            config.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.DefaultCodec");
        }

        // Manually specified caches
        List<URL> localJarCaches = InfiniteHadoopUtils.handleCacheList(advancedConfigurationDbo.get("$caches"),
                job, config, props_custom);

        Job hj = new Job(config); // (NOTE: from here, changes to config are ignored)
        try {

            if (null != localJarCaches) {
                if (bLocalMode || bTestMode) {
                    Method method = URLClassLoader.class.getDeclaredMethod("addURL", new Class[] { URL.class });
                    method.setAccessible(true);
                    method.invoke(child, localJarCaches.toArray());

                } //TOTEST (tested logically)
            }
            Class<?> classToLoad = Class.forName(job.mapper, true, child);
            hj.setJarByClass(classToLoad);

            if (job.inputCollection.equalsIgnoreCase("filesystem")) {
                String inputPath = null;
                try {
                    inputPath = MongoDbUtil.getProperty(advancedConfigurationDbo, "file.url");
                    if (!inputPath.endsWith("/")) {
                        inputPath = inputPath + "/";
                    }
                } catch (Exception e) {
                }
                if (null == inputPath) {
                    throw new RuntimeException("Must specify 'file.url' if reading from filesystem.");
                }
                inputPath = InfiniteHadoopUtils.authenticateInputDirectory(job, inputPath);

                InfiniteFileInputFormat.addInputPath(hj, new Path(inputPath + "*/*")); // (that extra bit makes it recursive)
                InfiniteFileInputFormat.setMaxInputSplitSize(hj, 33554432); // (32MB)
                InfiniteFileInputFormat.setInfiniteInputPathFilter(hj, config);
                hj.setInputFormatClass((Class<? extends InputFormat>) Class.forName(
                        "com.ikanow.infinit.e.data_model.custom.InfiniteFileInputFormat", true, child));
            } else if (job.inputCollection.equalsIgnoreCase("file.binary_shares")) {

                String[] oidStrs = null;
                try {
                    String inputPath = MongoDbUtil.getProperty(advancedConfigurationDbo, "file.url");
                    Pattern oidExtractor = Pattern.compile("inf://share/([^/]+)");
                    Matcher m = oidExtractor.matcher(inputPath);
                    if (m.find()) {
                        oidStrs = m.group(1).split("\\s*,\\s*");

                    } else {
                        throw new RuntimeException(
                                "file.url must be in format inf://share/<oid-list>/<string>: " + inputPath);
                    }
                    InfiniteHadoopUtils.authenticateShareList(job, oidStrs);
                } catch (Exception e) {
                    throw new RuntimeException(
                            "Authentication error: " + e.getMessage() + ": " + advancedConfigurationDbo, e);
                }

                hj.getConfiguration().setStrings("mapred.input.dir", oidStrs);
                hj.setInputFormatClass((Class<? extends InputFormat>) Class.forName(
                        "com.ikanow.infinit.e.data_model.custom.InfiniteShareInputFormat", true, child));
            } else if (job.inputCollection.equals("records")) {
                hj.setInputFormatClass((Class<? extends InputFormat>) Class
                        .forName("com.ikanow.infinit.e.data_model.custom.InfiniteEsInputFormat", true, child));
            } else {
                if (esMode) {
                    hj.setInputFormatClass((Class<? extends InputFormat>) Class.forName(
                            "com.ikanow.infinit.e.processing.custom.utils.InfiniteElasticsearchMongoInputFormat",
                            true, child));
                } else {
                    hj.setInputFormatClass((Class<? extends InputFormat>) Class.forName(
                            "com.ikanow.infinit.e.data_model.custom.InfiniteMongoInputFormat", true, child));
                }
            }
            if ((null != job.exportToHdfs) && job.exportToHdfs) {

                //TODO (INF-2469): Also, if the output key is BSON then also run as text (but output as JSON?)

                Path outPath = InfiniteHadoopUtils.ensureOutputDirectory(job, props_custom);

                if ((null != job.outputKey) && (null != job.outputValue)
                        && job.outputKey.equalsIgnoreCase("org.apache.hadoop.io.text")
                        && job.outputValue.equalsIgnoreCase("org.apache.hadoop.io.text")) {
                    // (slight hack before I sort out the horrendous job class - if key/val both text and exporting to HDFS then output as Text)
                    hj.setOutputFormatClass((Class<? extends OutputFormat>) Class
                            .forName("org.apache.hadoop.mapreduce.lib.output.TextOutputFormat", true, child));
                    TextOutputFormat.setOutputPath(hj, outPath);
                } //TESTED
                else {
                    hj.setOutputFormatClass((Class<? extends OutputFormat>) Class.forName(
                            "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat", true, child));
                    SequenceFileOutputFormat.setOutputPath(hj, outPath);
                } //TESTED
            } else { // normal case, stays in MongoDB
                hj.setOutputFormatClass((Class<? extends OutputFormat>) Class.forName(
                        "com.ikanow.infinit.e.data_model.custom.InfiniteMongoOutputFormat", true, child));
            }
            hj.setMapperClass((Class<? extends Mapper>) mapperClazz);
            String mapperOutputKeyOverride = advancedConfigurationDbo.getString("$mapper_key_class", null);
            if (null != mapperOutputKeyOverride) {
                hj.setMapOutputKeyClass(Class.forName(mapperOutputKeyOverride));
            } //TESTED 

            String mapperOutputValueOverride = advancedConfigurationDbo.getString("$mapper_value_class", null);
            if (null != mapperOutputValueOverride) {
                hj.setMapOutputValueClass(Class.forName(mapperOutputValueOverride));
            } //TESTED 

            if ((null != job.reducer) && !job.reducer.startsWith("#") && !job.reducer.equalsIgnoreCase("null")
                    && !job.reducer.equalsIgnoreCase("none")) {
                hj.setReducerClass((Class<? extends Reducer>) Class.forName(job.reducer, true, child));
                // Variable reducers:
                if (null != job.query) {
                    try {
                        hj.setNumReduceTasks(advancedConfigurationDbo.getInt("$reducers", 1));
                    } catch (Exception e) {
                        try {
                            // (just check it's not a string that is a valid int)
                            hj.setNumReduceTasks(
                                    Integer.parseInt(advancedConfigurationDbo.getString("$reducers", "1")));
                        } catch (Exception e2) {
                        }
                    }
                } //TESTED
            } else {
                hj.setNumReduceTasks(0);
            }
            if ((null != job.combiner) && !job.combiner.startsWith("#")
                    && !job.combiner.equalsIgnoreCase("null") && !job.combiner.equalsIgnoreCase("none")) {
                hj.setCombinerClass((Class<? extends Reducer>) Class.forName(job.combiner, true, child));
            }
            hj.setOutputKeyClass(Class.forName(job.outputKey, true, child));
            hj.setOutputValueClass(Class.forName(job.outputValue, true, child));

            hj.setJobName(job.jobtitle);
            currJobName = job.jobtitle;
        } catch (Error e) { // (messing about with class loaders = lots of chances for errors!)
            throw new RuntimeException(e.getMessage(), e);
        }
        if (bTestMode || bLocalMode) {
            hj.submit();
            currThreadId = null;
            Logger.getRootLogger().addAppender(this);
            currLocalJobId = hj.getJobID().toString();
            currLocalJobErrs.setLength(0);
            while (!hj.isComplete()) {
                Thread.sleep(1000);
            }
            Logger.getRootLogger().removeAppender(this);
            if (hj.isSuccessful()) {
                if (this.currLocalJobErrs.length() > 0) {
                    return "local_done: " + this.currLocalJobErrs.toString();
                } else {
                    return "local_done";
                }
            } else {
                return "Error: " + this.currLocalJobErrs.toString();
            }
        } else {
            hj.submit();
            String jobId = hj.getJobID().toString();
            return jobId;
        }
    } catch (Exception e) {
        e.printStackTrace();
        Thread.currentThread().setContextClassLoader(savedClassLoader);
        return "Error: " + InfiniteHadoopUtils.createExceptionMessage(e);
    } finally {
        Thread.currentThread().setContextClassLoader(savedClassLoader);
    }
}

From source file:com.impetus.code.examples.hadoop.cassandra.wordcount.WordCount.java

License:Apache License

public int run(String[] args) throws Exception {
    String outputReducerType = "cassandra";
    if (args != null && args[0].startsWith(OUTPUT_REDUCER_VAR)) {
        String[] s = args[0].split("=");
        if (s != null && s.length == 2)
            outputReducerType = s[1];
    }
    logger.info("output reducer type: " + outputReducerType);

    for (int i = 0; i < WordCountSetup.TEST_COUNT; i++) {
        String columnName = "text" + i;
        getConf().set(CONF_COLUMN_NAME, columnName);

        Job job = new Job(getConf(), "wordcount");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(TokenizerMapper.class);

        if (outputReducerType.equalsIgnoreCase("filesystem")) {
            job.setCombinerClass(ReducerToFilesystem.class);
            job.setReducerClass(ReducerToFilesystem.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH_PREFIX + i));
        } else {
            job.setReducerClass(ReducerToCassandra.class);

            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(IntWritable.class);
            job.setOutputKeyClass(ByteBuffer.class);
            job.setOutputValueClass(List.class);

            job.setOutputFormatClass(ColumnFamilyOutputFormat.class);

            ConfigHelper.setOutputColumnFamily(job.getConfiguration(), KEYSPACE, OUTPUT_COLUMN_FAMILY);
        }

        job.setInputFormatClass(ColumnFamilyInputFormat.class);

        ConfigHelper.setRpcPort(job.getConfiguration(), "9160");
        ConfigHelper.setInitialAddress(job.getConfiguration(), "localhost");
        ConfigHelper.setPartitioner(job.getConfiguration(), "org.apache.cassandra.dht.RandomPartitioner");
        ConfigHelper.setInputColumnFamily(job.getConfiguration(), KEYSPACE, INPUT_COLUMN_FAMILY);
        SlicePredicate predicate = new SlicePredicate()
                .setColumn_names(Arrays.asList(ByteBufferUtil.bytes(columnName)));
        ConfigHelper.setInputSlicePredicate(job.getConfiguration(), predicate);

        job.waitForCompletion(true);
    }
    return 0;
}

From source file:com.impetus.code.examples.hadoop.cassandra.wordcount.WordCountCounters.java

License:Apache License

public int run(String[] args) throws Exception {
    Job job = new Job(getConf(), "wordcountcounters");
    job.setJarByClass(WordCountCounters.class);
    job.setMapperClass(SumMapper.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);
    FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH_PREFIX));

    job.setInputFormatClass(ColumnFamilyInputFormat.class);

    ConfigHelper.setRpcPort(job.getConfiguration(), "9160");
    ConfigHelper.setInitialAddress(job.getConfiguration(), "localhost");
    ConfigHelper.setPartitioner(job.getConfiguration(), "org.apache.cassandra.dht.RandomPartitioner");
    ConfigHelper.setInputColumnFamily(job.getConfiguration(), WordCount.KEYSPACE,
            WordCountCounters.COUNTER_COLUMN_FAMILY);
    SlicePredicate predicate = new SlicePredicate()
            .setSlice_range(new SliceRange().setStart(ByteBufferUtil.EMPTY_BYTE_BUFFER)
                    .setFinish(ByteBufferUtil.EMPTY_BYTE_BUFFER).setCount(100));
    ConfigHelper.setInputSlicePredicate(job.getConfiguration(), predicate);

    job.waitForCompletion(true);
    return 0;
}

From source file:com.infochimps.hadoop.pig.hbase.StaticFamilyStorage.java

License:Apache License

@Override
public void setLocation(String location, Job job) throws IOException {
    m_conf = job.getConfiguration();

    HBaseConfiguration.addHbaseResources(m_conf);
    if (m_conf.get(HAS_BEEN_UPLOADED) == null) {
        HadoopUtils.uploadLocalFile(new Path(LOCAL_SCHEME + hbaseConfig_), new Path(HBASE_CONFIG_HDFS_PATH),
                m_conf);
        HadoopUtils.shipIfNotShipped(new Path(HBASE_CONFIG_HDFS_PATH), m_conf);
        m_conf.set(HAS_BEEN_UPLOADED, "true");
    }
    String taskConfig = HadoopUtils.fetchFromCache((new File(hbaseConfig_)).getName(), m_conf);
    if (taskConfig == null)
        taskConfig = hbaseConfig_;
    m_conf.addResource(new Path(LOCAL_SCHEME + taskConfig));

    TableMapReduceUtil.addDependencyJars(job.getConfiguration(), org.apache.hadoop.hbase.client.HTable.class,
            com.google.common.collect.Lists.class, org.apache.zookeeper.ZooKeeper.class);

    String tablename = location;
    if (location.startsWith("hbase://")) {
        tablename = location.substring(8);
    }
    if (m_table == null) {
        m_table = new HTable(m_conf, tablename);
    }
    m_table.setScannerCaching(caching_);
    m_conf.set(TableInputFormat.INPUT_TABLE, tablename);

    // Set up scan if it is not already set up.
    if (m_conf.get(TableInputFormat.SCAN) != null) {
        return;
    }

    for (ColumnInfo columnInfo : columnInfo_) {
        // do we have a column family, or a column?
        if (columnInfo.isColumnMap()) {
            scan.addFamily(columnInfo.getColumnFamily());
        } else {
            scan.addColumn(columnInfo.getColumnFamily(), columnInfo.getColumnName());
        }

    }
    if (requiredFieldList != null) {
        Properties p = UDFContext.getUDFContext().getUDFProperties(this.getClass(),
                new String[] { contextSignature });
        p.setProperty(contextSignature + "_projectedFields", ObjectSerializer.serialize(requiredFieldList));
    }
    m_conf.set(TableInputFormat.SCAN, convertScanToString(scan));
}

From source file:com.infochimps.hadoop.pig.hbase.StaticFamilyStorage.java

License:Apache License

@Override
public void setStoreLocation(String location, Job job) throws IOException {
    if (location.startsWith("hbase://")) {
        job.getConfiguration().set(HBaseTableOutputFormat.OUTPUT_TABLE, location.substring(8));
    } else {
        job.getConfiguration().set(HBaseTableOutputFormat.OUTPUT_TABLE, location);
    }
    Properties props = UDFContext.getUDFContext().getUDFProperties(getClass(),
            new String[] { contextSignature });
    if (!props.containsKey(contextSignature + "_schema")) {
        props.setProperty(contextSignature + "_schema", ObjectSerializer.serialize(schema_));
    }
    m_conf = HBaseConfiguration.addHbaseResources(job.getConfiguration());
    if (m_conf.get(HAS_BEEN_UPLOADED) == null) {
        HadoopUtils.uploadLocalFile(new Path(LOCAL_SCHEME + hbaseConfig_), new Path(HBASE_CONFIG_HDFS_PATH),
                m_conf);
        HadoopUtils.shipIfNotShipped(new Path(HBASE_CONFIG_HDFS_PATH), m_conf);
        m_conf.set(HAS_BEEN_UPLOADED, "true");
    }
}

From source file:com.inmobi.conduit.distcp.ConduitDistCp.java

License:Apache License

@Override
protected Path createInputFileListing(Job job) throws IOException {
    // get the file path where copy listing file has to be saved
    Path fileListingPath = getFileListingPath();
    Configuration config = job.getConfiguration();

    SequenceFile.Writer fileListWriter = null;
    try {
        fileListWriter = SequenceFile.createWriter(fileListingPath.getFileSystem(config), config,
                fileListingPath, Text.class, FileStatus.class, SequenceFile.CompressionType.NONE);

        for (Map.Entry<String, FileStatus> entry : fileListingMap.entrySet()) {
            FileStatus status = FileUtil.getFileStatus(entry.getValue(), buffer, in);
            fileListWriter.append(new Text(entry.getKey()), status);

            // Create a sync point after each entry. This will ensure that SequenceFile
            // Reader can work at file entry level granularity, given that SequenceFile
            // Reader reads from the starting of sync point.
            fileListWriter.sync();

            totalBytesToCopy += entry.getValue().getLen();
            totalPaths++;
        }
    } finally {
        if (fileListWriter != null) {
            fileListWriter.close();
        }
    }

    LOG.info("Number of paths considered for copy: " + totalPaths);
    LOG.info("Number of bytes considered for copy: " + totalBytesToCopy
            + " (Actual number of bytes copied depends on whether any files are " + "skipped or overwritten.)");

    // set distcp configurations
    config.set(DistCpConstants.CONF_LABEL_LISTING_FILE_PATH, fileListingPath.toString());
    config.setLong(DistCpConstants.CONF_LABEL_TOTAL_BYTES_TO_BE_COPIED, totalBytesToCopy);
    config.setLong(DistCpConstants.CONF_LABEL_TOTAL_NUMBER_OF_RECORDS, totalPaths);

    return fileListingPath;
}

From source file:com.inmobi.conduit.distcp.tools.DistCp.java

License:Apache License

/**
 * Implements the core-execution. Creates the file-list for copy,
 * and launches the Hadoop-job, to do the copy.
 * @return Job handle
 * @throws Exception, on failure.
 */
public Job execute() throws Exception {
    assert inputOptions != null;
    assert getConf() != null;

    Job job = null;
    try {
        metaFolder = createMetaFolderPath();
        jobFS = metaFolder.getFileSystem(getConf());

        job = createJob();
        createInputFileListing(job);

        job.submit();
        submitted = true;
    } finally {
        if (!submitted) {
            cleanup();
        }
    }

    String jobID = getJobID(job);
    job.getConfiguration().set(DistCpConstants.CONF_LABEL_DISTCP_JOB_ID, jobID);

    LOG.info("DistCp job-id: " + jobID);
    LOG.info("DistCp job may be tracked at: " + job.getTrackingURL());
    LOG.info("To cancel, run the following command:\thadoop job -kill " + jobID);

    long jobStartTime = System.nanoTime();
    if (inputOptions.shouldBlock() && !job.waitForCompletion(true)) {
        updateJobTimeInNanos(jobStartTime);
        throw new IOException("DistCp failure: Job " + jobID + " has failed. ");
    }
    updateJobTimeInNanos(jobStartTime);
    return job;
}

From source file:com.inmobi.conduit.distcp.tools.DistCp.java

License:Apache License

/**
 * Create Job object for submitting it, with all the configuration
 *
 * @return Reference to job object.
 * @throws IOException - Exception if any
 */
protected Job createJob() throws IOException {
    String jobName = "distcp";
    String userChosenName = getConf().get("mapred.job.name");
    if (userChosenName != null)
        jobName += ": " + userChosenName;
    Job job = new Job(getConf(), jobName);
    job.setInputFormatClass(DistCpUtils.getStrategy(getConf(), inputOptions));
    job.setJarByClass(CopyMapper.class);
    configureOutputFormat(job);

    job.setMapperClass(CopyMapper.class);
    job.setReducerClass(Reducer.class);
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(CopyOutputFormat.class);
    job.getConfiguration().set("mapred.map.tasks.speculative.execution", "false");
    job.getConfiguration().set(DistCpConstants.CONF_LABEL_NUM_MAPS, String.valueOf(inputOptions.getMaxMaps()));

    if (inputOptions.getSslConfigurationFile() != null) {
        setupSSLConfig(job.getConfiguration());
    }

    inputOptions.appendToConf(job.getConfiguration());
    return job;
}