Example usage for org.apache.hadoop.mapreduce Job addCacheArchive

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce.Job#addCacheArchive.

Prototype

public void addCacheArchive(URI uri) 

Document

Add an archive to be localized.
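
A minimal, self-contained sketch of the call; the HDFS path and the "#libs" link name below are placeholders, not taken from the examples that follow:

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class AddCacheArchiveSketch {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "cache-archive-sketch");
        // The archive is copied to each task node and unpacked there. The
        // optional URI fragment ("#libs") names the symlink created in the
        // task working directory; without it, the archive's file name is used.
        job.addCacheArchive(new URI("hdfs:///apps/shared/libs.tgz#libs"));
        // ... configure mapper/reducer and input/output paths, then submit.
    }
}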

Usage

From source file: be.ugent.intec.halvade.MapReduceRunner.java

License: Open Source License

protected int runPass1RNAJob(Configuration pass1Conf, String tmpOutDir)
        throws IOException, InterruptedException, ClassNotFoundException, URISyntaxException {
    HalvadeConf.setIsPass2(pass1Conf, false);
    HalvadeResourceManager.setJobResources(halvadeOpts, pass1Conf, HalvadeResourceManager.RNA_SHMEM_PASS1, true,
            halvadeOpts.useBamInput);
    Job pass1Job = Job.getInstance(pass1Conf, "Halvade pass 1 RNA pipeline");
    pass1Job.addCacheArchive(new URI(halvadeOpts.halvadeBinaries));
    pass1Job.setJarByClass(be.ugent.intec.halvade.hadoop.mapreduce.HalvadeMapper.class);
    FileSystem fs = FileSystem.get(new URI(halvadeOpts.in), pass1Conf);
    try {
        if (fs.getFileStatus(new Path(halvadeOpts.in)).isDirectory()) {
            // add every file in directory
            FileStatus[] files = fs.listStatus(new Path(halvadeOpts.in));
            for (FileStatus file : files) {
                if (!file.isDirectory()) {
                    FileInputFormat.addInputPath(pass1Job, file.getPath());
                }
            }
        } else {
            FileInputFormat.addInputPath(pass1Job, new Path(halvadeOpts.in));
        }
    } catch (IOException | IllegalArgumentException e) {
        Logger.EXCEPTION(e);
    }

    FileSystem outFs = FileSystem.get(new URI(tmpOutDir), pass1Conf);
    boolean skipPass1 = false;
    if (outFs.exists(new Path(tmpOutDir))) {
        // check if genome already exists
        skipPass1 = outFs.exists(new Path(tmpOutDir + "/_SUCCESS"));
        if (skipPass1)
            Logger.DEBUG("pass1 genome already created, skipping pass 1");
        else {
            Logger.INFO("The output directory \'" + tmpOutDir + "\' already exists.");
            Logger.INFO("ERROR: Please remove this directory before trying again.");
            System.exit(-2);
        }
    }
    if (!skipPass1) {
        FileOutputFormat.setOutputPath(pass1Job, new Path(tmpOutDir));
        pass1Job.setMapperClass(be.ugent.intec.halvade.hadoop.mapreduce.StarAlignPassXMapper.class);

        pass1Job.setInputFormatClass(HalvadeTextInputFormat.class);
        pass1Job.setMapOutputKeyClass(GenomeSJ.class);
        pass1Job.setMapOutputValueClass(Text.class);

        pass1Job.setSortComparatorClass(GenomeSJSortComparator.class);
        pass1Job.setGroupingComparatorClass(GenomeSJGroupingComparator.class);
        pass1Job.setNumReduceTasks(1);
        pass1Job.setReducerClass(be.ugent.intec.halvade.hadoop.mapreduce.RebuildStarGenomeReducer.class);
        pass1Job.setOutputKeyClass(LongWritable.class);
        pass1Job.setOutputValueClass(Text.class);

        return runTimedJob(pass1Job, "Halvade pass 1 Job");
    } else
        return 0;
}

From source file: be.ugent.intec.halvade.MapReduceRunner.java

License: Open Source License

protected int runHalvadeJob(Configuration halvadeConf, String tmpOutDir, int jobType)
        throws IOException, URISyntaxException, InterruptedException, ClassNotFoundException {
    String pipeline = "";
    if (jobType == HalvadeResourceManager.RNA_SHMEM_PASS2) {
        HalvadeConf.setIsPass2(halvadeConf, true);
        HalvadeResourceManager.setJobResources(halvadeOpts, halvadeConf, jobType, false,
                halvadeOpts.useBamInput);
        pipeline = RNA_PASS2;
    } else if (jobType == HalvadeResourceManager.DNA) {
        HalvadeResourceManager.setJobResources(halvadeOpts, halvadeConf, jobType, false,
                halvadeOpts.useBamInput);
        pipeline = DNA;
    }
    HalvadeConf.setOutDir(halvadeConf, tmpOutDir);
    FileSystem outFs = FileSystem.get(new URI(tmpOutDir), halvadeConf);
    if (outFs.exists(new Path(tmpOutDir))) {
        Logger.INFO("The output directory \'" + tmpOutDir + "\' already exists.");
        Logger.INFO("ERROR: Please remove this directory before trying again.");
        System.exit(-2);
    }
    if (halvadeOpts.useBamInput)
        setHeaderFile(halvadeOpts.in, halvadeConf);

    Job halvadeJob = Job.getInstance(halvadeConf, "Halvade" + pipeline);
    halvadeJob.addCacheArchive(new URI(halvadeOpts.halvadeBinaries));
    halvadeJob.setJarByClass(be.ugent.intec.halvade.hadoop.mapreduce.HalvadeMapper.class);
    addInputFiles(halvadeOpts.in, halvadeConf, halvadeJob);
    FileOutputFormat.setOutputPath(halvadeJob, new Path(tmpOutDir));

    if (jobType == HalvadeResourceManager.RNA_SHMEM_PASS2) {
        halvadeJob.setMapperClass(be.ugent.intec.halvade.hadoop.mapreduce.StarAlignPassXMapper.class);
        halvadeJob.setReducerClass(be.ugent.intec.halvade.hadoop.mapreduce.RnaGATKReducer.class);
    } else if (jobType == HalvadeResourceManager.DNA) {
        halvadeJob.setMapperClass(halvadeOpts.alignmentTools[halvadeOpts.aln]);
        halvadeJob.setReducerClass(be.ugent.intec.halvade.hadoop.mapreduce.DnaGATKReducer.class);
    }

    halvadeJob.setMapOutputKeyClass(ChromosomeRegion.class);
    halvadeJob.setMapOutputValueClass(SAMRecordWritable.class);
    halvadeJob.setInputFormatClass(HalvadeTextInputFormat.class);
    halvadeJob.setOutputKeyClass(Text.class);
    if (halvadeOpts.mergeBam) {
        halvadeJob.setSortComparatorClass(SimpleChrRegionComparator.class);
        halvadeJob.setOutputValueClass(SAMRecordWritable.class);
    } else {
        halvadeJob.setPartitionerClass(ChrRgPartitioner.class);
        halvadeJob.setSortComparatorClass(ChrRgSortComparator.class);
        halvadeJob.setGroupingComparatorClass(ChrRgGroupingComparator.class);
        halvadeJob.setOutputValueClass(VariantContextWritable.class);
    }

    if (halvadeOpts.justAlign)
        halvadeJob.setNumReduceTasks(0);
    else if (halvadeOpts.mergeBam) {
        halvadeJob.setReducerClass(be.ugent.intec.halvade.hadoop.mapreduce.BamMergeReducer.class);
        halvadeJob.setNumReduceTasks(1);
    } else
        halvadeJob.setNumReduceTasks(halvadeOpts.reduces);

    if (halvadeOpts.useBamInput) {
        halvadeJob.setMapperClass(be.ugent.intec.halvade.hadoop.mapreduce.AlignedBamMapper.class);
        halvadeJob.setInputFormatClass(BAMInputFormat.class);
    }

    return runTimedJob(halvadeJob, "Halvade Job");
}

From source file: co.cask.cdap.internal.app.runtime.batch.MapReduceRuntimeService.java

License: Apache License

@Override
protected void startUp() throws Exception {
    // Creates a temporary directory locally for storing all generated files.
    File tempDir = createTempDirectory();
    cleanupTask = createCleanupTask(tempDir);

    try {
        Job job = createJob(new File(tempDir, "mapreduce"));
        Configuration mapredConf = job.getConfiguration();

        classLoader = new MapReduceClassLoader(injector, cConf, mapredConf,
                context.getProgram().getClassLoader(), context.getPlugins(), context.getPluginInstantiator());
        cleanupTask = createCleanupTask(cleanupTask, classLoader);

        mapredConf.setClassLoader(new WeakReferenceDelegatorClassLoader(classLoader));
        ClassLoaders.setContextClassLoader(mapredConf.getClassLoader());

        context.setJob(job);

        beforeSubmit(job);

        // Localize additional resources that users have requested via BasicMapReduceContext.localize methods
        Map<String, String> localizedUserResources = localizeUserResources(job, tempDir);

        // Override user-defined job name, since we set it and depend on the name.
        // https://issues.cask.co/browse/CDAP-2441
        String jobName = job.getJobName();
        if (!jobName.isEmpty()) {
            LOG.warn("Job name {} is being overridden.", jobName);
        }
        job.setJobName(getJobName(context));

        // Create a temporary location for storing all generated files through the LocationFactory.
        Location tempLocation = createTempLocationDirectory();
        cleanupTask = createCleanupTask(cleanupTask, tempLocation);

        // For local mode, everything is in the configuration classloader already, hence no need to create new jar
        if (!MapReduceTaskContextProvider.isLocal(mapredConf)) {
            // After calling beforeSubmit, we know what plugins are needed for the program, hence construct the proper
            // ClassLoader from here and use it for setting up the job
            Location pluginArchive = createPluginArchive(tempLocation);
            if (pluginArchive != null) {
                job.addCacheArchive(pluginArchive.toURI());
                mapredConf.set(Constants.Plugin.ARCHIVE, pluginArchive.getName());
            }
        }

        // set resources for the job
        TaskType.MAP.setResources(mapredConf, context.getMapperResources());
        TaskType.REDUCE.setResources(mapredConf, context.getReducerResources());

        // replace user's Mapper & Reducer's with our wrappers in job config
        MapperWrapper.wrap(job);
        ReducerWrapper.wrap(job);

        // packaging job jar which includes cdap classes with dependencies
        File jobJar = buildJobJar(job, tempDir);
        job.setJar(jobJar.toURI().toString());

        Location programJar = programJarLocation;
        if (!MapReduceTaskContextProvider.isLocal(mapredConf)) {
            // Copy and localize the program jar in distributed mode
            programJar = copyProgramJar(tempLocation);
            job.addCacheFile(programJar.toURI());

            List<String> classpath = new ArrayList<>();

            // Localize logback.xml
            Location logbackLocation = createLogbackJar(tempLocation);
            if (logbackLocation != null) {
                job.addCacheFile(logbackLocation.toURI());
                classpath.add(logbackLocation.getName());
            }

            // Generate and localize the launcher jar to control the classloader of MapReduce containers processes
            classpath.add("job.jar/lib/*");
            classpath.add("job.jar/classes");
            Location launcherJar = createLauncherJar(
                    Joiner.on(",").join(MapReduceContainerHelper.getMapReduceClassPath(mapredConf, classpath)),
                    tempLocation);
            job.addCacheFile(launcherJar.toURI());

            // The only thing in the container classpath is the launcher.jar
            // The MapReduceContainerLauncher inside the launcher.jar will create a MapReduceClassLoader and launch
            // the actual MapReduce AM/Task from it.
            // We explicitly localize the mr-framework, but do not put it on the classpath.
            URI frameworkURI = MapReduceContainerHelper.getFrameworkURI(mapredConf);
            if (frameworkURI != null) {
                job.addCacheArchive(frameworkURI);
            }

            mapredConf.unset(MRJobConfig.MAPREDUCE_APPLICATION_FRAMEWORK_PATH);
            mapredConf.set(MRJobConfig.MAPREDUCE_APPLICATION_CLASSPATH, launcherJar.getName());
            mapredConf.set(YarnConfiguration.YARN_APPLICATION_CLASSPATH, launcherJar.getName());
        }

        MapReduceContextConfig contextConfig = new MapReduceContextConfig(mapredConf);
        // We start long-running tx to be used by mapreduce job tasks.
        Transaction tx = txClient.startLong();
        try {
            // We remember tx, so that we can re-use it in mapreduce tasks
            CConfiguration cConfCopy = cConf;
            contextConfig.set(context, cConfCopy, tx, programJar.toURI(), localizedUserResources);

            LOG.info("Submitting MapReduce Job: {}", context);
            // submits job and returns immediately. Shouldn't need to set context ClassLoader.
            job.submit();

            this.job = job;
            this.transaction = tx;
        } catch (Throwable t) {
            Transactions.invalidateQuietly(txClient, tx);
            throw t;
        }
    } catch (Throwable t) {
        LOG.error("Exception when submitting MapReduce Job: {}", context, t);
        cleanupTask.run();
        throw t;
    }
}
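
A note on the framework archive localized in the distributed-mode branch above: the URI returned by MapReduceContainerHelper.getFrameworkURI presumably comes from Hadoop's mapreduce.application.framework.path setting, which by convention carries a fragment (for example ...mapreduce.tar.gz#mr-framework) naming the link under which the tarball appears in the container. The code then unsets MRJobConfig.MAPREDUCE_APPLICATION_FRAMEWORK_PATH, presumably so the MapReduce client does not localize the same archive a second time, and points both application classpath settings at the launcher jar instead.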

From source file: co.cask.cdap.internal.app.runtime.batch.MapReduceRuntimeService.java

License: Apache License

/**
 * Localizes resources requested by users in the MapReduce Program's beforeSubmit phase.
 * In Local mode, also copies resources to a temporary directory.
 *
 * @param job the {@link Job} for this MapReduce program
 * @param targetDir in local mode, a temporary directory to copy the resources to
 * @return a {@link Map} of resource name to the resource path. The resource path will be absolute in local mode,
 * while it will just contain the file name in distributed mode.
 */
private Map<String, String> localizeUserResources(Job job, File targetDir) throws IOException {
    Map<String, String> localizedResources = new HashMap<>();
    Map<String, LocalizeResource> resourcesToLocalize = context.getResourcesToLocalize();
    for (Map.Entry<String, LocalizeResource> entry : resourcesToLocalize.entrySet()) {
        String localizedFilePath;
        String name = entry.getKey();
        Configuration mapredConf = job.getConfiguration();
        if (MapReduceTaskContextProvider.isLocal(mapredConf)) {
            // in local mode, also copy the resources to localize into a temporary directory
            localizedFilePath = LocalizationUtils.localizeResource(entry.getKey(), entry.getValue(), targetDir)
                    .getAbsolutePath();
        } else {
            URI uri = entry.getValue().getURI();
            // in distributed mode, use the MapReduce Job object to localize resources
            URI actualURI;
            try {
                actualURI = new URI(uri.getScheme(), uri.getAuthority(), uri.getPath(), uri.getQuery(), name);
            } catch (URISyntaxException e) {
                // Most of the URI is constructed from the passed URI. So ideally, this should not happen.
                // If it does though, there is nothing that clients can do to recover, so not propagating a checked exception.
                throw Throwables.propagate(e);
            }
            if (entry.getValue().isArchive()) {
                job.addCacheArchive(actualURI);
            } else {
                job.addCacheFile(actualURI);
            }
            localizedFilePath = name;
        }
        localizedResources.put(name, localizedFilePath);
    }
    return localizedResources;
}
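
One detail worth calling out in the distributed branch above: the five-argument java.net.URI constructor places name in the fragment position, and the distributed cache uses the fragment as the link name under which the resource is localized in the task working directory. A minimal sketch of the same trick, with a hypothetical path and link name:

    // Localize hdfs:///data/dict.zip so tasks see it as ./dict
    URI withLinkName = new URI("hdfs", null, "/data/dict.zip", null, "dict");
    job.addCacheArchive(withLinkName); // the unpacked archive appears under the link "dict"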

From source file: com.architecting.ch07.MapReduceIndexerTool.java

License: Apache License

/** API for Java clients; visible for testing; may become a public API eventually */
int run(Options options) throws Exception {
    if (getConf().getBoolean("isMR1", false) && "local".equals(getConf().get("mapred.job.tracker"))) {
        throw new IllegalStateException(
                "Running with LocalJobRunner (i.e. all of Hadoop inside a single JVM) is not supported "
                        + "because LocalJobRunner does not (yet) implement the Hadoop Distributed Cache feature, "
                        + "which is required for passing files via --files and --libjars");
    }

    long programStartTime = System.nanoTime();
    getConf().setInt(SolrOutputFormat.SOLR_RECORD_WRITER_MAX_SEGMENTS, options.maxSegments);

    // switch off a false warning about allegedly not implementing Tool
    // also see http://hadoop.6.n7.nabble.com/GenericOptionsParser-warning-td8103.html
    // also see https://issues.apache.org/jira/browse/HADOOP-8183
    getConf().setBoolean("mapred.used.genericoptionsparser", true);

    if (options.log4jConfigFile != null) {
        Utils.setLogConfigFile(options.log4jConfigFile, getConf());
        addDistributedCacheFile(options.log4jConfigFile, getConf());
    }

    Configuration config = HBaseConfiguration.create();
    Job job = Job.getInstance(config);
    job.setJarByClass(getClass());

    // To be able to run this example from Eclipse, we need to make sure
    // the built jar is distributed to the map-reduce tasks from the
    // local file system.
    job.addCacheArchive(new URI("file:///home/cloudera/ahae/target/ahae.jar"));

    FileSystem fs = options.outputDir.getFileSystem(job.getConfiguration());
    if (fs.exists(options.outputDir) && !delete(options.outputDir, true, fs)) {
        return -1;
    }
    Path outputResultsDir = new Path(options.outputDir, RESULTS_DIR);
    Path outputReduceDir = new Path(options.outputDir, "reducers");

    int reducers = 1;

    Scan scan = new Scan();
    scan.addFamily(CF);
    // tag::SETUP[]
    scan.setCaching(500); // <1>
    scan.setCacheBlocks(false); // <2>

    TableMapReduceUtil.initTableMapperJob( // <3>
            options.inputTable, // Input HBase table name
            scan, // Scan instance to control what to index
            HBaseAvroToSOLRMapper.class, // Mapper to parse cells content.
            Text.class, // Mapper output key
            SolrInputDocumentWritable.class, // Mapper output value
            job);

    FileOutputFormat.setOutputPath(job, outputReduceDir);

    job.setJobName(getClass().getName() + "/" + Utils.getShortClassName(HBaseAvroToSOLRMapper.class));
    job.setReducerClass(SolrReducer.class); // <4>
    job.setPartitionerClass(SolrCloudPartitioner.class); // <5>
    job.getConfiguration().set(SolrCloudPartitioner.ZKHOST, options.zkHost);
    job.getConfiguration().set(SolrCloudPartitioner.COLLECTION, options.collection);
    job.getConfiguration().setInt(SolrCloudPartitioner.SHARDS, options.shards);

    job.setOutputFormatClass(SolrOutputFormat.class);
    SolrOutputFormat.setupSolrHomeCache(options.solrHomeDir, job);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(SolrInputDocumentWritable.class);
    job.setSpeculativeExecution(false);
    // end::SETUP[]
    job.setNumReduceTasks(reducers); // Set the number of reducers based on the number of shards we have.
    if (!waitForCompletion(job, true)) {
        return -1;// job failed
    }

    // -------------------------------------------------------------------------------------------------------------------------------------

    assert reducers == options.shards;

    // normalize output shard dir prefix, i.e.
    // rename part-r-00000 to part-00000 (stems from zero tree merge iterations)
    // rename part-m-00000 to part-00000 (stems from > 0 tree merge iterations)
    for (FileStatus stats : fs.listStatus(outputReduceDir)) {
        String dirPrefix = SolrOutputFormat.getOutputName(job);
        Path srcPath = stats.getPath();
        if (stats.isDirectory() && srcPath.getName().startsWith(dirPrefix)) {
            String dstName = dirPrefix + srcPath.getName().substring(dirPrefix.length() + "-m".length());
            Path dstPath = new Path(srcPath.getParent(), dstName);
            if (!rename(srcPath, dstPath, fs)) {
                return -1;
            }
        }
    }

    // publish results dir
    if (!rename(outputReduceDir, outputResultsDir, fs)) {
        return -1;
    }

    if (options.goLive && !new GoLive().goLive(options, listSortedOutputShardDirs(job, outputResultsDir, fs))) {
        return -1;
    }

    goodbye(job, programStartTime);
    return 0;
}
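
Note the file:///home/cloudera/... URI above: a file: archive is fetched from the local filesystem of whichever node localizes it, so this only works when the jar exists at that path on every machine. That is fine for the single-node sandbox this example targets, but on a real cluster the jar would normally be uploaded to HDFS first.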

From source file: com.linkedin.pinot.hadoop.job.SegmentCreationJob.java

License: Apache License

private void addDepsJarToDistributedCache(Path path, Job job) throws IOException {
    LOGGER.info("Trying to add all the deps jar files from directory: {}", path);
    FileSystem fs = FileSystem.get(getConf());
    FileStatus[] fileStatusArr = fs.listStatus(path);
    for (FileStatus fileStatus : fileStatusArr) {
        if (fileStatus.isDirectory()) {
            addDepsJarToDistributedCache(fileStatus.getPath(), job);
        } else {
            Path depJarPath = fileStatus.getPath();
            if (depJarPath.getName().endsWith(".jar")) {
                LOGGER.info("Adding deps jar files: {}", path);
                job.addCacheArchive(path.toUri());
            }
        }
    }
}
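
Since these dependencies are plain jars rather than archives that need unpacking, an alternative worth knowing (a sketch under an assumed path, not how this job actually ships its jars) is Job.addFileToClassPath, which both localizes the jar and adds it to the task classpath:

    // Localize a dependency jar and put it on the task classpath in one call
    // (the path is hypothetical).
    job.addFileToClassPath(new Path("/apps/deps/some-dep.jar"));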

From source file: gov.nasa.jpl.memex.pooledtimeseries.MeanChiSquareDistanceCalculation.java

License: Apache License

public static void main(String[] args) throws Exception {
    System.loadLibrary(Core.NATIVE_LIBRARY_NAME);

    Configuration baseConf = new Configuration();
    baseConf.set("mapreduce.job.maps", "96");
    baseConf.set("mapred.tasktracker.map.tasks.maximum", "96");

    JobConf conf = new JobConf();
    System.out.println("Before Map:" + conf.getNumMapTasks());
    conf.setNumMapTasks(96);
    System.out.println("After Map:" + conf.getNumMapTasks());

    Job job = Job.getInstance(baseConf);
    job.setJarByClass(MeanChiSquareDistanceCalculation.class);

    job.setJobName("mean_chi_square_calculation");
    System.out.println("Job ID" + job.getJobID());
    System.out.println("Track:" + baseConf.get("mapred.job.tracker"));
    System.out.println("Job Name" + job.getJobName());
    System.out.println(baseConf.get("mapreduce.job.maps"));
    System.out.println("Caching video-metric-bak.tgz");
    job.addCacheArchive(new URI("/user/pts/video-metric-bak.tgz"));
    URI[] cacheArchives = job.getCacheArchives();
    if (cacheArchives != null && cacheArchives.length > 0) {
        System.out.println("Cache archive ->" + cacheArchives[0]);
    }
    System.out.println("Cached video-metric-bak.tgz");

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(DoubleWritable.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(DoubleWritable.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);

    job.waitForCompletion(true);

}
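
On the task side, the archive added above is unpacked on each node and linked into the task working directory under its file name, since no fragment was given. A minimal mapper sketch of reading it, assuming (hypothetically) that the metric files sit at the top level of the tarball:

import java.io.File;
import java.io.IOException;

import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class CachedArchiveMapper extends Mapper<LongWritable, Text, IntWritable, DoubleWritable> {
    private File metricDir;

    @Override
    protected void setup(Context context) throws IOException {
        // The archive registered via addCacheArchive("/user/pts/video-metric-bak.tgz")
        // is unpacked on each node; a link named after the archive in the task
        // working directory points at the unpacked contents.
        metricDir = new File("video-metric-bak.tgz");
        if (!metricDir.exists()) {
            throw new IOException("Localized archive not found: " + metricDir);
        }
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) {
        // ... read reference data from metricDir, emit (IntWritable, DoubleWritable) pairs ...
    }
}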