Example usage for org.apache.hadoop.mapreduce Job addCacheArchive

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce.Job#addCacheArchive.

Prototype

public void addCacheArchive(URI uri) 

Document

Add an archive to be localized.
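
A minimal, self-contained sketch of the call; the HDFS path and the "#libs" link name below are placeholders, not taken from the examples that follow:

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class AddCacheArchiveSketch {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "cache-archive-sketch");
        // The archive is copied to each task node and unpacked there. The
        // optional URI fragment ("#libs") names the symlink created in the
        // task working directory; without it, the archive's file name is used.
        job.addCacheArchive(new URI("hdfs:///apps/shared/libs.tgz#libs"));
        // ... configure mapper/reducer and input/output paths, then submit.
    }
}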

Usage

From source file: be.ugent.intec.halvade.MapReduceRunner.java

License: Open Source License

protected int runPass1RNAJob(Configuration pass1Conf, String tmpOutDir)
        throws IOException, InterruptedException, ClassNotFoundException, URISyntaxException {
    HalvadeConf.setIsPass2(pass1Conf, false);
    HalvadeResourceManager.setJobResources(halvadeOpts, pass1Conf, HalvadeResourceManager.RNA_SHMEM_PASS1, true,
            halvadeOpts.useBamInput);
    Job pass1Job = Job.getInstance(pass1Conf, "Halvade pass 1 RNA pipeline");
    pass1Job.addCacheArchive(new URI(halvadeOpts.halvadeBinaries));
    pass1Job.setJarByClass(be.ugent.intec.halvade.hadoop.mapreduce.HalvadeMapper.class);
    FileSystem fs = FileSystem.get(new URI(halvadeOpts.in), pass1Conf);
    try {
        if (fs.getFileStatus(new Path(halvadeOpts.in)).isDirectory()) {
            // add every file in directory
            FileStatus[] files = fs.listStatus(new Path(halvadeOpts.in));
            for (FileStatus file : files) {
                if (!file.isDirectory()) {
                    FileInputFormat.addInputPath(pass1Job, file.getPath());
                }
            }
        } else {
            FileInputFormat.addInputPath(pass1Job, new Path(halvadeOpts.in));
        }
    } catch (IOException | IllegalArgumentException e) {
        Logger.EXCEPTION(e);
    }

    FileSystem outFs = FileSystem.get(new URI(tmpOutDir), pass1Conf);
    boolean skipPass1 = false;
    if (outFs.exists(new Path(tmpOutDir))) {
        // check if genome already exists
        skipPass1 = outFs.exists(new Path(tmpOutDir + "/_SUCCESS"));
        if (skipPass1)
            Logger.DEBUG("pass1 genome already created, skipping pass 1");
        else {
            Logger.INFO("The output directory \'" + tmpOutDir + "\' already exists.");
            Logger.INFO("ERROR: Please remove this directory before trying again.");
            System.exit(-2);
        }
    }
    if (!skipPass1) {
        FileOutputFormat.setOutputPath(pass1Job, new Path(tmpOutDir));
        pass1Job.setMapperClass(be.ugent.intec.halvade.hadoop.mapreduce.StarAlignPassXMapper.class);

        pass1Job.setInputFormatClass(HalvadeTextInputFormat.class);
        pass1Job.setMapOutputKeyClass(GenomeSJ.class);
        pass1Job.setMapOutputValueClass(Text.class);

        pass1Job.setSortComparatorClass(GenomeSJSortComparator.class);
        pass1Job.setGroupingComparatorClass(GenomeSJGroupingComparator.class);
        pass1Job.setNumReduceTasks(1);
        pass1Job.setReducerClass(be.ugent.intec.halvade.hadoop.mapreduce.RebuildStarGenomeReducer.class);
        pass1Job.setOutputKeyClass(LongWritable.class);
        pass1Job.setOutputValueClass(Text.class);

        return runTimedJob(pass1Job, "Halvade pass 1 Job");
    } else
        return 0;
}

From source file: be.ugent.intec.halvade.MapReduceRunner.java

License: Open Source License

protected int runHalvadeJob(Configuration halvadeConf, String tmpOutDir, int jobType)
        throws IOException, URISyntaxException, InterruptedException, ClassNotFoundException {
    String pipeline = "";
    if (jobType == HalvadeResourceManager.RNA_SHMEM_PASS2) {
        HalvadeConf.setIsPass2(halvadeConf, true);
        HalvadeResourceManager.setJobResources(halvadeOpts, halvadeConf, jobType, false,
                halvadeOpts.useBamInput);
        pipeline = RNA_PASS2;
    } else if (jobType == HalvadeResourceManager.DNA) {
        HalvadeResourceManager.setJobResources(halvadeOpts, halvadeConf, jobType, false,
                halvadeOpts.useBamInput);
        pipeline = DNA;
    }
    HalvadeConf.setOutDir(halvadeConf, tmpOutDir);
    FileSystem outFs = FileSystem.get(new URI(tmpOutDir), halvadeConf);
    if (outFs.exists(new Path(tmpOutDir))) {
        Logger.INFO("The output directory \'" + tmpOutDir + "\' already exists.");
        Logger.INFO("ERROR: Please remove this directory before trying again.");
        System.exit(-2);
    }
    if (halvadeOpts.useBamInput)
        setHeaderFile(halvadeOpts.in, halvadeConf);

    Job halvadeJob = Job.getInstance(halvadeConf, "Halvade" + pipeline);
    halvadeJob.addCacheArchive(new URI(halvadeOpts.halvadeBinaries));
    halvadeJob.setJarByClass(be.ugent.intec.halvade.hadoop.mapreduce.HalvadeMapper.class);
    addInputFiles(halvadeOpts.in, halvadeConf, halvadeJob);
    FileOutputFormat.setOutputPath(halvadeJob, new Path(tmpOutDir));

    if (jobType == HalvadeResourceManager.RNA_SHMEM_PASS2) {
        halvadeJob.setMapperClass(be.ugent.intec.halvade.hadoop.mapreduce.StarAlignPassXMapper.class);
        halvadeJob.setReducerClass(be.ugent.intec.halvade.hadoop.mapreduce.RnaGATKReducer.class);
    } else if (jobType == HalvadeResourceManager.DNA) {
        halvadeJob.setMapperClass(halvadeOpts.alignmentTools[halvadeOpts.aln]);
        halvadeJob.setReducerClass(be.ugent.intec.halvade.hadoop.mapreduce.DnaGATKReducer.class);
    }

    halvadeJob.setMapOutputKeyClass(ChromosomeRegion.class);
    halvadeJob.setMapOutputValueClass(SAMRecordWritable.class);
    halvadeJob.setInputFormatClass(HalvadeTextInputFormat.class);
    halvadeJob.setOutputKeyClass(Text.class);
    if (halvadeOpts.mergeBam) {
        halvadeJob.setSortComparatorClass(SimpleChrRegionComparator.class);
        halvadeJob.setOutputValueClass(SAMRecordWritable.class);
    } else {
        halvadeJob.setPartitionerClass(ChrRgPartitioner.class);
        halvadeJob.setSortComparatorClass(ChrRgSortComparator.class);
        halvadeJob.setGroupingComparatorClass(ChrRgGroupingComparator.class);
        halvadeJob.setOutputValueClass(VariantContextWritable.class);
    }

    if (halvadeOpts.justAlign)
        halvadeJob.setNumReduceTasks(0);
    else if (halvadeOpts.mergeBam) {
        halvadeJob.setReducerClass(be.ugent.intec.halvade.hadoop.mapreduce.BamMergeReducer.class);
        halvadeJob.setNumReduceTasks(1);
    } else
        halvadeJob.setNumReduceTasks(halvadeOpts.reduces);

    if (halvadeOpts.useBamInput) {
        halvadeJob.setMapperClass(be.ugent.intec.halvade.hadoop.mapreduce.AlignedBamMapper.class);
        halvadeJob.setInputFormatClass(BAMInputFormat.class);
    }

    return runTimedJob(halvadeJob, "Halvade Job");
}

From source file: co.cask.cdap.internal.app.runtime.batch.MapReduceRuntimeService.java

License: Apache License

@Override
protected void startUp() throws Exception {
    // Creates a temporary directory locally for storing all generated files.
    File tempDir = createTempDirectory();
    cleanupTask = createCleanupTask(tempDir);

    try {
        Job job = createJob(new File(tempDir, "mapreduce"));
        Configuration mapredConf = job.getConfiguration();

        classLoader = new MapReduceClassLoader(injector, cConf, mapredConf,
                context.getProgram().getClassLoader(), context.getPlugins(), context.getPluginInstantiator());
        cleanupTask = createCleanupTask(cleanupTask, classLoader);

        mapredConf.setClassLoader(new WeakReferenceDelegatorClassLoader(classLoader));
        ClassLoaders.setContextClassLoader(mapredConf.getClassLoader());

        context.setJob(job);

        beforeSubmit(job);

        // Localize additional resources that users have requested via BasicMapReduceContext.localize methods
        Map<String, String> localizedUserResources = localizeUserResources(job, tempDir);

        // Override user-defined job name, since we set it and depend on the name.
        // https://issues.cask.co/browse/CDAP-2441
        String jobName = job.getJobName();
        if (!jobName.isEmpty()) {
            LOG.warn("Job name {} is being overridden.", jobName);
        }
        job.setJobName(getJobName(context));

        // Create a temporary location for storing all generated files through the LocationFactory.
        Location tempLocation = createTempLocationDirectory();
        cleanupTask = createCleanupTask(cleanupTask, tempLocation);

        // For local mode, everything is in the configuration classloader already, hence no need to create new jar
        if (!MapReduceTaskContextProvider.isLocal(mapredConf)) {
            // After calling beforeSubmit, we know what plugins are needed for the program, hence construct the proper
            // ClassLoader from here and use it for setting up the job
            Location pluginArchive = createPluginArchive(tempLocation);
            if (pluginArchive != null) {
                job.addCacheArchive(pluginArchive.toURI());
                mapredConf.set(Constants.Plugin.ARCHIVE, pluginArchive.getName());
            }
        }

        // set resources for the job
        TaskType.MAP.setResources(mapredConf, context.getMapperResources());
        TaskType.REDUCE.setResources(mapredConf, context.getReducerResources());

        // replace user's Mapper & Reducer's with our wrappers in job config
        MapperWrapper.wrap(job);
        ReducerWrapper.wrap(job);

        // packaging job jar which includes cdap classes with dependencies
        File jobJar = buildJobJar(job, tempDir);
        job.setJar(jobJar.toURI().toString());

        Location programJar = programJarLocation;
        if (!MapReduceTaskContextProvider.isLocal(mapredConf)) {
            // Copy and localize the program jar in distributed mode
            programJar = copyProgramJar(tempLocation);
            job.addCacheFile(programJar.toURI());

            List<String> classpath = new ArrayList<>();

            // Localize logback.xml
            Location logbackLocation = createLogbackJar(tempLocation);
            if (logbackLocation != null) {
                job.addCacheFile(logbackLocation.toURI());
                classpath.add(logbackLocation.getName());
            }

            // Generate and localize the launcher jar to control the classloader of MapReduce containers processes
            classpath.add("job.jar/lib/*");
            classpath.add("job.jar/classes");
            Location launcherJar = createLauncherJar(
                    Joiner.on(",").join(MapReduceContainerHelper.getMapReduceClassPath(mapredConf, classpath)),
                    tempLocation);
            job.addCacheFile(launcherJar.toURI());

            // The only thing in the container classpath is the launcher.jar
            // The MapReduceContainerLauncher inside the launcher.jar will create a MapReduceClassLoader and launch
            // the actual MapReduce AM/Task from it.
            // We explicitly localize the mr-framework, but do not put it on the classpath.
            URI frameworkURI = MapReduceContainerHelper.getFrameworkURI(mapredConf);
            if (frameworkURI != null) {
                job.addCacheArchive(frameworkURI);
            }

            mapredConf.unset(MRJobConfig.MAPREDUCE_APPLICATION_FRAMEWORK_PATH);
            mapredConf.set(MRJobConfig.MAPREDUCE_APPLICATION_CLASSPATH, launcherJar.getName());
            mapredConf.set(YarnConfiguration.YARN_APPLICATION_CLASSPATH, launcherJar.getName());
        }

        MapReduceContextConfig contextConfig = new MapReduceContextConfig(mapredConf);
        // We start long-running tx to be used by mapreduce job tasks.
        Transaction tx = txClient.startLong();
        try {
            // We remember tx, so that we can re-use it in mapreduce tasks
            CConfiguration cConfCopy = cConf;
            contextConfig.set(context, cConfCopy, tx, programJar.toURI(), localizedUserResources);

            LOG.info("Submitting MapReduce Job: {}", context);
            // submits job and returns immediately. Shouldn't need to set context ClassLoader.
            job.submit();

            this.job = job;
            this.transaction = tx;
        } catch (Throwable t) {
            Transactions.invalidateQuietly(txClient, tx);
            throw t;
        }
    } catch (Throwable t) {
        LOG.error("Exception when submitting MapReduce Job: {}", context, t);
        cleanupTask.run();
        throw t;
    }
}
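
A note on the framework archive localized in the distributed-mode branch above: the URI returned by MapReduceContainerHelper.getFrameworkURI presumably comes from Hadoop's mapreduce.application.framework.path setting, which by convention carries a fragment (for example ...mapreduce.tar.gz#mr-framework) naming the link under which the tarball appears in the container. The code then unsets MRJobConfig.MAPREDUCE_APPLICATION_FRAMEWORK_PATH, presumably so the MapReduce client does not localize the same archive a second time, and points both application classpath settings at the launcher jar instead.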

From source file: co.cask.cdap.internal.app.runtime.batch.MapReduceRuntimeService.java

License: Apache License

/**
 * Localizes resources requested by users in the MapReduce Program's beforeSubmit phase.
 * In Local mode, also copies resources to a temporary directory.
 *
 * @param job the {@link Job} for this MapReduce program
 * @param targetDir in local mode, a temporary directory to copy the resources to
 * @return a {@link Map} of resource name to the resource path. The resource path will be absolute in local mode,
 * while it will just contain the file name in distributed mode.
 */
private Map<String, String> localizeUserResources(Job job, File targetDir) throws IOException {
    Map<String, String> localizedResources = new HashMap<>();
    Map<String, LocalizeResource> resourcesToLocalize = context.getResourcesToLocalize();
    for (Map.Entry<String, LocalizeResource> entry : resourcesToLocalize.entrySet()) {
        String localizedFilePath;
        String name = entry.getKey();
        Configuration mapredConf = job.getConfiguration();
        if (MapReduceTaskContextProvider.isLocal(mapredConf)) {
            // in local mode, also copy the resources to localize into a temporary directory
            localizedFilePath = LocalizationUtils.localizeResource(entry.getKey(), entry.getValue(), targetDir)
                    .getAbsolutePath();
        } else {
            URI uri = entry.getValue().getURI();
            // in distributed mode, use the MapReduce Job object to localize resources
            URI actualURI;
            try {
                actualURI = new URI(uri.getScheme(), uri.getAuthority(), uri.getPath(), uri.getQuery(), name);
            } catch (URISyntaxException e) {
                // Most of the URI is constructed from the passed URI. So ideally, this should not happen.
                // If it does though, there is nothing that clients can do to recover, so not propagating a checked exception.
                throw Throwables.propagate(e);
            }
            if (entry.getValue().isArchive()) {
                job.addCacheArchive(actualURI);
            } else {
                job.addCacheFile(actualURI);
            }
            localizedFilePath = name;
        }
        localizedResources.put(name, localizedFilePath);
    }
    return localizedResources;
}
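
One detail worth calling out in the distributed branch above: the five-argument java.net.URI constructor places name in the fragment position, and the distributed cache uses the fragment as the link name under which the resource is localized in the task working directory. A minimal sketch of the same trick, with a hypothetical path and link name:

    // Localize hdfs:///data/dict.zip so tasks see it as ./dict
    URI withLinkName = new URI("hdfs", null, "/data/dict.zip", null, "dict");
    job.addCacheArchive(withLinkName); // the unpacked archive appears under the link "dict"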

From source file: com.architecting.ch07.MapReduceIndexerTool.java

License: Apache License

/** API for Java clients; visible for testing; may become a public API eventually */
int run(Options options) throws Exception {
    if (getConf().getBoolean("isMR1", false) && "local".equals(getConf().get("mapred.job.tracker"))) {
        throw new IllegalStateException(
                "Running with LocalJobRunner (i.e. all of Hadoop inside a single JVM) is not supported "
                        + "because LocalJobRunner does not (yet) implement the Hadoop Distributed Cache feature, "
                        + "which is required for passing files via --files and --libjars");
    }

    long programStartTime = System.nanoTime();
    getConf().setInt(SolrOutputFormat.SOLR_RECORD_WRITER_MAX_SEGMENTS, options.maxSegments);

    // switch off a false warning about allegedly not implementing Tool
    // also see http://hadoop.6.n7.nabble.com/GenericOptionsParser-warning-td8103.html
    // also see https://issues.apache.org/jira/browse/HADOOP-8183
    getConf().setBoolean("mapred.used.genericoptionsparser", true);

    if (options.log4jConfigFile != null) {
        Utils.setLogConfigFile(options.log4jConfigFile, getConf());
        addDistributedCacheFile(options.log4jConfigFile, getConf());
    }

    Configuration config = HBaseConfiguration.create();
    Job job = Job.getInstance(config);
    job.setJarByClass(getClass());

    // To be able to run this example from Eclipse, we need to make sure
    // the built jar is distributed to the map-reduce tasks from the
    // local file system.
    job.addCacheArchive(new URI("file:///home/cloudera/ahae/target/ahae.jar"));

    FileSystem fs = options.outputDir.getFileSystem(job.getConfiguration());
    if (fs.exists(options.outputDir) && !delete(options.outputDir, true, fs)) {
        return -1;
    }
    Path outputResultsDir = new Path(options.outputDir, RESULTS_DIR);
    Path outputReduceDir = new Path(options.outputDir, "reducers");

    int reducers = 1;

    Scan scan = new Scan();
    scan.addFamily(CF);
    // tag::SETUP[]
    scan.setCaching(500); // <1>
    scan.setCacheBlocks(false); // <2>

    TableMapReduceUtil.initTableMapperJob( // <3>
            options.inputTable, // Input HBase table name
            scan, // Scan instance to control what to index
            HBaseAvroToSOLRMapper.class, // Mapper to parse cells content.
            Text.class, // Mapper output key
            SolrInputDocumentWritable.class, // Mapper output value
            job);

    FileOutputFormat.setOutputPath(job, outputReduceDir);

    job.setJobName(getClass().getName() + "/" + Utils.getShortClassName(HBaseAvroToSOLRMapper.class));
    job.setReducerClass(SolrReducer.class); // <4>
    job.setPartitionerClass(SolrCloudPartitioner.class); // <5>
    job.getConfiguration().set(SolrCloudPartitioner.ZKHOST, options.zkHost);
    job.getConfiguration().set(SolrCloudPartitioner.COLLECTION, options.collection);
    job.getConfiguration().setInt(SolrCloudPartitioner.SHARDS, options.shards);

    job.setOutputFormatClass(SolrOutputFormat.class);
    SolrOutputFormat.setupSolrHomeCache(options.solrHomeDir, job);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(SolrInputDocumentWritable.class);
    job.setSpeculativeExecution(false);
    // end::SETUP[]
    job.setNumReduceTasks(reducers); // Set the number of reducers based on the number of shards we have.
    if (!waitForCompletion(job, true)) {
        return -1;// job failed
    }

    // -------------------------------------------------------------------------------------------------------------------------------------

    assert reducers == options.shards;

    // normalize output shard dir prefix, i.e.
    // rename part-r-00000 to part-00000 (stems from zero tree merge iterations)
    // rename part-m-00000 to part-00000 (stems from > 0 tree merge iterations)
    for (FileStatus stats : fs.listStatus(outputReduceDir)) {
        String dirPrefix = SolrOutputFormat.getOutputName(job);
        Path srcPath = stats.getPath();
        if (stats.isDirectory() && srcPath.getName().startsWith(dirPrefix)) {
            String dstName = dirPrefix + srcPath.getName().substring(dirPrefix.length() + "-m".length());
            Path dstPath = new Path(srcPath.getParent(), dstName);
            if (!rename(srcPath, dstPath, fs)) {
                return -1;
            }
        }
    }

    // publish results dir
    if (!rename(outputReduceDir, outputResultsDir, fs)) {
        return -1;
    }

    if (options.goLive && !new GoLive().goLive(options, listSortedOutputShardDirs(job, outputResultsDir, fs))) {
        return -1;
    }

    goodbye(job, programStartTime);
    return 0;
}
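
Note the file:///home/cloudera/... URI above: a file: archive is fetched from the local filesystem of whichever node localizes it, so this only works when the jar exists at that path on every machine. That is fine for the single-node sandbox this example targets, but on a real cluster the jar would normally be uploaded to HDFS first.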

From source file: com.linkedin.pinot.hadoop.job.SegmentCreationJob.java

License: Apache License

private void addDepsJarToDistributedCache(Path path, Job job) throws IOException {
    LOGGER.info("Trying to add all the deps jar files from directory: {}", path);
    FileSystem fs = FileSystem.get(getConf());
    FileStatus[] fileStatusArr = fs.listStatus(path);
    for (FileStatus fileStatus : fileStatusArr) {
        if (fileStatus.isDirectory()) {
            addDepsJarToDistributedCache(fileStatus.getPath(), job);
        } else {
            Path depJarPath = fileStatus.getPath();
            if (depJarPath.getName().endsWith(".jar")) {
                LOGGER.info("Adding deps jar files: {}", path);
                job.addCacheArchive(path.toUri());
            }
        }
    }
}
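
Since these dependencies are plain jars rather than archives that need unpacking, an alternative worth knowing (a sketch under an assumed path, not how this job actually ships its jars) is Job.addFileToClassPath, which both localizes the jar and adds it to the task classpath:

    // Localize a dependency jar and put it on the task classpath in one call
    // (the path is hypothetical).
    job.addFileToClassPath(new Path("/apps/deps/some-dep.jar"));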

From source file: gov.nasa.jpl.memex.pooledtimeseries.MeanChiSquareDistanceCalculation.java

License: Apache License

public static void main(String[] args) throws Exception {
    System.loadLibrary(Core.NATIVE_LIBRARY_NAME);

    Configuration baseConf = new Configuration();
    baseConf.set("mapreduce.job.maps", "96");
    baseConf.set("mapred.tasktracker.map.tasks.maximum", "96");

    JobConf conf = new JobConf();
    System.out.println("Before Map:" + conf.getNumMapTasks());
    conf.setNumMapTasks(96);
    System.out.println("After Map:" + conf.getNumMapTasks());

    Job job = Job.getInstance(baseConf);
    job.setJarByClass(MeanChiSquareDistanceCalculation.class);

    job.setJobName("mean_chi_square_calculation");
    System.out.println("Job ID" + job.getJobID());
    System.out.println("Track:" + baseConf.get("mapred.job.tracker"));
    System.out.println("Job Name" + job.getJobName());
    System.out.println(baseConf.get("mapreduce.job.maps"));
    System.out.println("Caching video-metric-bak.tgz");
    job.addCacheArchive(new URI("/user/pts/video-metric-bak.tgz"));
    URI[] cacheArchives = job.getCacheArchives();
    if (cacheArchives != null && cacheArchives.length > 0) {
        System.out.println("Cache archive ->" + cacheArchives[0]);
    }
    System.out.println("Cached video-metric-bak.tgz");

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(DoubleWritable.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(DoubleWritable.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);

    job.waitForCompletion(true);

}
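
On the task side, the archive added above is unpacked on each node and linked into the task working directory under its file name, since no fragment was given. A minimal mapper sketch of reading it, assuming (hypothetically) that the metric files sit at the top level of the tarball:

import java.io.File;
import java.io.IOException;

import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class CachedArchiveMapper extends Mapper<LongWritable, Text, IntWritable, DoubleWritable> {
    private File metricDir;

    @Override
    protected void setup(Context context) throws IOException {
        // The archive registered via addCacheArchive("/user/pts/video-metric-bak.tgz")
        // is unpacked on each node; a link named after the archive in the task
        // working directory points at the unpacked contents.
        metricDir = new File("video-metric-bak.tgz");
        if (!metricDir.exists()) {
            throw new IOException("Localized archive not found: " + metricDir);
        }
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) {
        // ... read reference data from metricDir, emit (IntWritable, DoubleWritable) pairs ...
    }
}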