Example usage for org.apache.hadoop.mapreduce.lib.chain ChainMapper addMapper

Introduction

On this page you can find example usage of org.apache.hadoop.mapreduce.lib.chain ChainMapper addMapper.

Prototype

public static void addMapper(Job job, Class<? extends Mapper> klass, Class<?> inputKeyClass,
        Class<?> inputValueClass, Class<?> outputKeyClass, Class<?> outputValueClass, Configuration mapperConf)
        throws IOException 

Document

Adds a Mapper class to the chain mapper.
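
Each call registers one Mapper in the chain: the first mapper consumes the job's input key/value types, and every subsequent mapper must declare input key/value classes that match the previous mapper's output classes. The sketch below shows the typical pattern; the TokenizerMapper, UpperCaseMapper and IntSumReducer classes are hypothetical and not taken from the examples on this page.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.chain.ChainMapper;
import org.apache.hadoop.mapreduce.lib.chain.ChainReducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class ChainExample {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "chain example");
        job.setJarByClass(ChainExample.class);

        // First mapper reads the job input: (LongWritable, Text) -> (Text, IntWritable).
        // TokenizerMapper is a hypothetical Mapper implementation.
        ChainMapper.addMapper(job, TokenizerMapper.class, LongWritable.class, Text.class,
                Text.class, IntWritable.class, new Configuration(false));

        // Second mapper: its input classes (Text, IntWritable) must match the
        // previous mapper's output classes. UpperCaseMapper is hypothetical.
        ChainMapper.addMapper(job, UpperCaseMapper.class, Text.class, IntWritable.class,
                Text.class, IntWritable.class, new Configuration(false));

        // The reducer is set through ChainReducer, which also allows further
        // mappers to be chained after it. IntSumReducer is hypothetical.
        ChainReducer.setReducer(job, IntSumReducer.class, Text.class, IntWritable.class,
                Text.class, IntWritable.class, new Configuration(false));

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Instead of a fresh Configuration per mapper, the job's own configuration can be passed as the mapperConf argument, as the examples below do with job.getConfiguration().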

Usage

From source file: com.zinnia.nectar.regression.hadoop.primitive.jobs.YDiffJob.java

License: Apache License

public Double call() throws Exception {
    JobControl jobControl = new JobControl("YDiff job");

    Job job = new Job();
    job.setJarByClass(YDiffJob.class);

    ChainMapper.addMapper(job, FieldSeperator.FieldSeperationMapper.class, DoubleWritable.class, Text.class,
            NullWritable.class, Text.class, job.getConfiguration());
    ChainMapper.addMapper(job, YDiffMapper.class, NullWritable.class, Text.class, Text.class,
            DoubleWritable.class, job.getConfiguration());

    String fieldSpec = getFieldSpecForColumns();
    job.getConfiguration().set("fields.spec", fieldSpec);
    job.getConfiguration().setStrings("paramValues", paramValues);
    job.setReducerClass(DoubleSumReducer.class);
    FileInputFormat.addInputPath(job, new Path(inputFilePath));
    FileOutputFormat.setOutputPath(job, new Path(outputFilePath));
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(DoubleWritable.class);
    job.setInputFormatClass(TextInputFormat.class);

    ControlledJob controlledJob = new ControlledJob(job.getConfiguration());
    jobControl.addJob(controlledJob);
    Thread thread = new Thread(jobControl);
    thread.start();
    while (!jobControl.allFinished()) {
        Thread.sleep(10000);
    }
    jobControl.stop();
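    // Read the computed value back from the reducer output
    // (assumes a single reduce task writing part-r-00000)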
    FileSystem fs = FileSystem.get(job.getConfiguration());
    FSDataInputStream in = fs.open(new Path(outputFilePath + "/part-r-00000"));
    BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(in));
    String valueLine = bufferedReader.readLine();
    String[] fields = valueLine.split("\t");
    double value = Double.parseDouble(fields[1]);
    bufferedReader.close();
    in.close();
    return value;
}

From source file: fr.ens.biologie.genomique.eoulsan.modules.mapping.hadoop.FilterAndMapReadsHadoopModule.java

License: LGPL

private Job createJobConf(final Configuration parentConf, final TaskContext context, final String dataName,
        final DataFile inFile, final List<String> filenames, final boolean pairedEnd,
        final DataFormat inputFormat, final FastqFormat fastqFormat, final DataFile genomeIndexFile,
        final DataFile outFile) throws IOException {

    final Configuration jobConf = new Configuration(parentConf);

    // Set input path
    final Path inputPath = new Path(inFile.getSource());

    // Set counter group
    jobConf.set(CommonHadoop.COUNTER_GROUP_KEY, getCounterGroup());

    //
    // Reads filters parameters
    //

    // Set fastq format
    jobConf.set(ReadsFilterMapper.FASTQ_FORMAT_KEY, fastqFormat.getName());

    // Set read filter parameters
    addParametersToJobConf(getReadFilterParameters(), READ_FILTER_PARAMETER_KEY_PREFIX, jobConf);

    //
    // Reads mapping parameters
    //

    // Set mapper name
    jobConf.set(ReadsMapperMapper.MAPPER_NAME_KEY, getMapperName());

    // Set mapper version
    jobConf.set(ReadsMapperMapper.MAPPER_VERSION_KEY, getMapperVersion());

    // Set mapper flavor
    jobConf.set(ReadsMapperMapper.MAPPER_FLAVOR_KEY, getMapperFlavor());

    // Set pair end or single end mode
    jobConf.set(ReadsMapperMapper.PAIR_END_KEY, Boolean.toString(pairedEnd));

    // Set the number of threads for the mapper
    if (getMapperHadoopThreads() > 0) {
        jobConf.set(ReadsMapperMapper.MAPPER_THREADS_KEY, "" + getMapperHadoopThreads());
    }

    // Set mapper arguments
    if (getMapperArguments() != null) {
        jobConf.set(ReadsMapperMapper.MAPPER_ARGS_KEY, getMapperArguments());
    }

    // Set Mapper fastq format
    jobConf.set(ReadsMapperMapper.FASTQ_FORMAT_KEY, "" + fastqFormat);

    // Set mapper index checksum
    jobConf.set(ReadsMapperMapper.INDEX_CHECKSUM_KEY, "" + computeZipCheckSum(genomeIndexFile, parentConf));

    // timeout
    jobConf.set("mapreduce.task.timeout", "" + HADOOP_TIMEOUT);

    // Don't reuse JVM
    jobConf.set("mapreduce.job.jvm.numtasks", "" + 1);

    // Set the memory required by the reads mapper
    jobConf.set("mapreduce.map.memory.mb", "" + getMapperHadoopMemoryRequired());

    // Set the memory required by the JVM (BWA needs more memory than the
    // other mappers for buffering named pipes)
    jobConf.set("mapreduce.map.java.opts", "-Xmx4096M");

    // Set ZooKeeper client configuration
    setZooKeeperJobConfiguration(jobConf, context);

    //
    // Alignment filtering
    //

    // Set SAM filter parameters
    addParametersToJobConf(getAlignmentsFilterParameters(), MAP_FILTER_PARAMETER_KEY_PREFIX, jobConf);

    //
    // Job creation
    //

    // Create the job and its name
    final Job job = Job.getInstance(jobConf,
            "Filter and map reads (" + dataName + ", " + Joiner.on(", ").join(filenames) + ")");

    // Set the jar
    job.setJarByClass(ReadsFilterHadoopModule.class);

    // Set input path
    FileInputFormat.addInputPath(job, inputPath);

    // Add genome mapper index to distributed cache

    // Set genome index reference path in the distributed cache
    final Path genomeIndex = new Path(genomeIndexFile.getSource());
    job.addCacheFile(genomeIndex.toUri());

    // Set the input format
    if (inputFormat == READS_FASTQ) {
        job.setInputFormatClass(FastqInputFormat.class);
    } else {
        job.setInputFormatClass(KeyValueTextInputFormat.class);
    }

    // Set the Mappers classes using a chain mapper
    ChainMapper.addMapper(job, ReadsFilterMapper.class, Text.class, Text.class, Text.class, Text.class,
            jobConf);
    ChainMapper.addMapper(job, ReadsMapperMapper.class, Text.class, Text.class, Text.class, Text.class,
            jobConf);
    ChainMapper.addMapper(job, SAMFilterMapper.class, Text.class, Text.class, Text.class, Text.class, jobConf);

    // Set the reducer class
    job.setReducerClass(SAMFilterReducer.class);

    // Set the output format
    job.setOutputFormatClass(SAMOutputFormat.class);

    // Set the output key class
    job.setOutputKeyClass(Text.class);

    // Set the output value class
    job.setOutputValueClass(Text.class);

    // Set output path
    FileOutputFormat.setOutputPath(job, new Path(outFile.getSource()));

    return job;
}

From source file: org.apache.jena.hadoop.rdf.stats.jobs.JobFactory.java

License: Apache License

public static Job getTripleGraphSizesJob(Configuration config, String[] inputPaths, String outputPath)
        throws IOException {
    Job job = Job.getInstance(config);
    job.setJarByClass(JobFactory.class);
    job.setJobName("RDF Triples Graph Sizes");

    // Map/Reduce classes
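    // The chain converts each triple into a quad in the default graph, then
    // counts the quads per graph to compute the graph sizes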
    ChainMapper.addMapper(job, TriplesToQuadsConstantGraphMapper.class, LongWritable.class,
            TripleWritable.class, LongWritable.class, QuadWritable.class, config);
    ChainMapper.addMapper(job, QuadGraphCountMapper.class, LongWritable.class, QuadWritable.class,
            NodeWritable.class, LongWritable.class, config);
    job.setMapOutputKeyClass(NodeWritable.class);
    job.setMapOutputValueClass(LongWritable.class);
    job.setReducerClass(NodeCountReducer.class);

    // Input and Output
    job.setInputFormatClass(TriplesInputFormat.class);
    job.setOutputFormatClass(NTriplesNodeOutputFormat.class);
    FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job;
}