Example usage for org.apache.hadoop.mapreduce.lib.chain ChainMapper addMapper

Introduction

On this page you can find example usage of org.apache.hadoop.mapreduce.lib.chain ChainMapper addMapper.

Prototype

public static void addMapper(Job job, Class<? extends Mapper> klass, Class<?> inputKeyClass,
        Class<?> inputValueClass, Class<?> outputKeyClass, Class<?> outputValueClass, Configuration mapperConf)
        throws IOException 

Document

Adds a Mapper class to the chain mapper.
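
Each call registers one Mapper in the chain: the first mapper consumes the job's input key/value types, and every subsequent mapper must declare input key/value classes that match the previous mapper's output classes. The sketch below shows the typical pattern; the TokenizerMapper, UpperCaseMapper and IntSumReducer classes are hypothetical and not taken from the examples on this page.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.chain.ChainMapper;
import org.apache.hadoop.mapreduce.lib.chain.ChainReducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class ChainExample {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "chain example");
        job.setJarByClass(ChainExample.class);

        // First mapper reads the job input: (LongWritable, Text) -> (Text, IntWritable).
        // TokenizerMapper is a hypothetical Mapper implementation.
        ChainMapper.addMapper(job, TokenizerMapper.class, LongWritable.class, Text.class,
                Text.class, IntWritable.class, new Configuration(false));

        // Second mapper: its input classes (Text, IntWritable) must match the
        // previous mapper's output classes. UpperCaseMapper is hypothetical.
        ChainMapper.addMapper(job, UpperCaseMapper.class, Text.class, IntWritable.class,
                Text.class, IntWritable.class, new Configuration(false));

        // The reducer is set through ChainReducer, which also allows further
        // mappers to be chained after it. IntSumReducer is hypothetical.
        ChainReducer.setReducer(job, IntSumReducer.class, Text.class, IntWritable.class,
                Text.class, IntWritable.class, new Configuration(false));

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Instead of a fresh Configuration per mapper, the job's own configuration can be passed as the mapperConf argument, as the examples below do with job.getConfiguration().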

Usage

From source file: com.zinnia.nectar.regression.hadoop.primitive.jobs.YDiffJob.java

License: Apache License

public Double call() throws Exception {
    JobControl jobControl = new JobControl("YDiff job");

    Job job = new Job();
    job.setJarByClass(YDiffJob.class);

    ChainMapper.addMapper(job, FieldSeperator.FieldSeperationMapper.class, DoubleWritable.class, Text.class,
            NullWritable.class, Text.class, job.getConfiguration());
    ChainMapper.addMapper(job, YDiffMapper.class, NullWritable.class, Text.class, Text.class,
            DoubleWritable.class, job.getConfiguration());

    String fieldSpec = getFieldSpecForColumns();
    job.getConfiguration().set("fields.spec", fieldSpec);
    job.getConfiguration().setStrings("paramValues", paramValues);
    job.setReducerClass(DoubleSumReducer.class);
    FileInputFormat.addInputPath(job, new Path(inputFilePath));
    FileOutputFormat.setOutputPath(job, new Path(outputFilePath));
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(DoubleWritable.class);
    job.setInputFormatClass(TextInputFormat.class);

    ControlledJob controlledJob = new ControlledJob(job.getConfiguration());
    jobControl.addJob(controlledJob);
    Thread thread = new Thread(jobControl);
    thread.start();
    while (!jobControl.allFinished()) {
        Thread.sleep(10000);
    }
    jobControl.stop();
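    // Read the computed value back from the reducer output
    // (assumes a single reduce task writing part-r-00000)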
    FileSystem fs = FileSystem.get(job.getConfiguration());
    FSDataInputStream in = fs.open(new Path(outputFilePath + "/part-r-00000"));
    BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(in));
    String valueLine = bufferedReader.readLine();
    String[] fields = valueLine.split("\t");
    double value = Double.parseDouble(fields[1]);
    bufferedReader.close();
    in.close();
    return value;
}

From source file: fr.ens.biologie.genomique.eoulsan.modules.mapping.hadoop.FilterAndMapReadsHadoopModule.java

License: LGPL

private Job createJobConf(final Configuration parentConf, final TaskContext context, final String dataName,
        final DataFile inFile, final List<String> filenames, final boolean pairedEnd,
        final DataFormat inputFormat, final FastqFormat fastqFormat, final DataFile genomeIndexFile,
        final DataFile outFile) throws IOException {

    final Configuration jobConf = new Configuration(parentConf);

    // Set input path
    final Path inputPath = new Path(inFile.getSource());

    // Set counter group
    jobConf.set(CommonHadoop.COUNTER_GROUP_KEY, getCounterGroup());

    //
    // Reads filters parameters
    //

    // Set fastq format
    jobConf.set(ReadsFilterMapper.FASTQ_FORMAT_KEY, fastqFormat.getName());

    // Set read filter parameters
    addParametersToJobConf(getReadFilterParameters(), READ_FILTER_PARAMETER_KEY_PREFIX, jobConf);

    //
    // Reads mapping parameters
    //

    // Set mapper name
    jobConf.set(ReadsMapperMapper.MAPPER_NAME_KEY, getMapperName());

    // Set mapper version
    jobConf.set(ReadsMapperMapper.MAPPER_VERSION_KEY, getMapperVersion());

    // Set mapper flavor
    jobConf.set(ReadsMapperMapper.MAPPER_FLAVOR_KEY, getMapperFlavor());

    // Set pair end or single end mode
    jobConf.set(ReadsMapperMapper.PAIR_END_KEY, Boolean.toString(pairedEnd));

    // Set the number of threads for the mapper
    if (getMapperHadoopThreads() > 0) {
        jobConf.set(ReadsMapperMapper.MAPPER_THREADS_KEY, "" + getMapperHadoopThreads());
    }

    // Set mapper arguments
    if (getMapperArguments() != null) {
        jobConf.set(ReadsMapperMapper.MAPPER_ARGS_KEY, getMapperArguments());
    }

    // Set Mapper fastq format
    jobConf.set(ReadsMapperMapper.FASTQ_FORMAT_KEY, "" + fastqFormat);

    // Set mapper index checksum
    jobConf.set(ReadsMapperMapper.INDEX_CHECKSUM_KEY, "" + computeZipCheckSum(genomeIndexFile, parentConf));

    // timeout
    jobConf.set("mapreduce.task.timeout", "" + HADOOP_TIMEOUT);

    // Don't reuse JVM
    jobConf.set("mapreduce.job.jvm.numtasks", "" + 1);

    // Set the memory required by the reads mapper
    jobConf.set("mapreduce.map.memory.mb", "" + getMapperHadoopMemoryRequired());

    // Set the memory required by the JVM (BWA needs more memory than the
    // other mappers for buffering named pipes)
    jobConf.set("mapreduce.map.java.opts", "-Xmx4096M");

    // Set ZooKeeper client configuration
    setZooKeeperJobConfiguration(jobConf, context);

    //
    // Alignment filtering
    //

    // Set SAM filter parameters
    addParametersToJobConf(getAlignmentsFilterParameters(), MAP_FILTER_PARAMETER_KEY_PREFIX, jobConf);

    //
    // Job creation
    //

    // Create the job and its name
    final Job job = Job.getInstance(jobConf,
            "Filter and map reads (" + dataName + ", " + Joiner.on(", ").join(filenames) + ")");

    // Set the jar
    job.setJarByClass(ReadsFilterHadoopModule.class);

    // Set input path
    FileInputFormat.addInputPath(job, inputPath);

    // Add genome mapper index to distributed cache

    // Set genome index reference path in the distributed cache
    final Path genomeIndex = new Path(genomeIndexFile.getSource());
    job.addCacheFile(genomeIndex.toUri());

    // Set the input format
    if (inputFormat == READS_FASTQ) {
        job.setInputFormatClass(FastqInputFormat.class);
    } else {
        job.setInputFormatClass(KeyValueTextInputFormat.class);
    }

    // Set the Mappers classes using a chain mapper
    ChainMapper.addMapper(job, ReadsFilterMapper.class, Text.class, Text.class, Text.class, Text.class,
            jobConf);
    ChainMapper.addMapper(job, ReadsMapperMapper.class, Text.class, Text.class, Text.class, Text.class,
            jobConf);
    ChainMapper.addMapper(job, SAMFilterMapper.class, Text.class, Text.class, Text.class, Text.class, jobConf);

    // Set the reducer class
    job.setReducerClass(SAMFilterReducer.class);

    // Set the output format
    job.setOutputFormatClass(SAMOutputFormat.class);

    // Set the output key class
    job.setOutputKeyClass(Text.class);

    // Set the output value class
    job.setOutputValueClass(Text.class);

    // Set output path
    FileOutputFormat.setOutputPath(job, new Path(outFile.getSource()));

    return job;
}

From source file: org.apache.jena.hadoop.rdf.stats.jobs.JobFactory.java

License: Apache License

public static Job getTripleGraphSizesJob(Configuration config, String[] inputPaths, String outputPath)
        throws IOException {
    Job job = Job.getInstance(config);
    job.setJarByClass(JobFactory.class);
    job.setJobName("RDF Triples Graph Sizes");

    // Map/Reduce classes
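    // The chain converts each triple into a quad in the default graph, then
    // counts the quads per graph to compute the graph sizes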
    ChainMapper.addMapper(job, TriplesToQuadsConstantGraphMapper.class, LongWritable.class,
            TripleWritable.class, LongWritable.class, QuadWritable.class, config);
    ChainMapper.addMapper(job, QuadGraphCountMapper.class, LongWritable.class, QuadWritable.class,
            NodeWritable.class, LongWritable.class, config);
    job.setMapOutputKeyClass(NodeWritable.class);
    job.setMapOutputValueClass(LongWritable.class);
    job.setReducerClass(NodeCountReducer.class);

    // Input and Output
    job.setInputFormatClass(TriplesInputFormat.class);
    job.setOutputFormatClass(NTriplesNodeOutputFormat.class);
    FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job;
}