List of usage examples for org.apache.hadoop.mapreduce.lib.chain.ChainMapper.addMapper
public static void addMapper(Job job, Class<? extends Mapper> klass, Class<?> inputKeyClass, Class<?> inputValueClass, Class<?> outputKeyClass, Class<?> outputValueClass, Configuration mapperConf) throws IOException
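Before the project-specific examples below, here is a minimal, self-contained sketch of the pattern this method supports: a [MAP+ / REDUCE] pipeline in which records flow through the chained mappers in the order they are added. The PassThroughMapper and UpperCaseMapper classes and the command-line paths are hypothetical, introduced only for illustration; note that each mapper's declared input key/value classes must match the output key/value classes of the mapper added before it.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.chain.ChainMapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class ChainExample {

    // First mapper in the chain (hypothetical): passes each input line through
    // unchanged. TextInputFormat (the default) supplies LongWritable/Text pairs.
    public static class PassThroughMapper extends Mapper<LongWritable, Text, LongWritable, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            context.write(key, value);
        }
    }

    // Second mapper (hypothetical): upper-cases the line. Its input key/value
    // classes (LongWritable/Text) match the previous mapper's output classes.
    public static class UpperCaseMapper extends Mapper<LongWritable, Text, LongWritable, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            context.write(key, new Text(value.toString().toUpperCase()));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "chain example");
        job.setJarByClass(ChainExample.class);

        // Each call appends one mapper to the chain; records flow through the
        // mappers in the order they are added.
        ChainMapper.addMapper(job, PassThroughMapper.class, LongWritable.class, Text.class,
                LongWritable.class, Text.class, new Configuration(false));
        ChainMapper.addMapper(job, UpperCaseMapper.class, LongWritable.class, Text.class,
                LongWritable.class, Text.class, new Configuration(false));

        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Passing a fresh Configuration(false) to each addMapper call keeps per-mapper settings isolated from the job-wide configuration, which is the idiom the ChainMapper javadoc itself uses. The examples below instead pass the job's own configuration to every mapper, which is also valid when no per-mapper overrides are needed.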
From source file:com.zinnia.nectar.regression.hadoop.primitive.jobs.YDiffJob.java
License:Apache License
public Double call() throws Exception {
    JobControl jobControl = new JobControl("YDiff job");
    Job job = new Job();
    job.setJarByClass(YDiffJob.class);

    // Chain two mappers: the field-separation mapper feeds the YDiff mapper.
    ChainMapper.addMapper(job, FieldSeperator.FieldSeperationMapper.class, DoubleWritable.class,
            Text.class, NullWritable.class, Text.class, job.getConfiguration());
    ChainMapper.addMapper(job, YDiffMapper.class, NullWritable.class, Text.class, Text.class,
            DoubleWritable.class, job.getConfiguration());

    String fieldSpec = getFieldSpecForColumns();
    job.getConfiguration().set("fields.spec", fieldSpec);
    job.getConfiguration().setStrings("paramValues", paramValues);
    job.setReducerClass(DoubleSumReducer.class);

    FileInputFormat.addInputPath(job, new Path(inputFilePath));
    FileOutputFormat.setOutputPath(job, new Path(outputFilePath));
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(DoubleWritable.class);
    job.setInputFormatClass(TextInputFormat.class);

    // Run the job through JobControl and poll until it finishes.
    ControlledJob controlledJob = new ControlledJob(job.getConfiguration());
    jobControl.addJob(controlledJob);
    Thread thread = new Thread(jobControl);
    thread.start();
    while (!jobControl.allFinished()) {
        Thread.sleep(10000);
    }
    jobControl.stop();

    // Read the single reducer output value back from HDFS.
    FileSystem fs = FileSystem.get(job.getConfiguration());
    FSDataInputStream in = fs.open(new Path(outputFilePath + "/part-r-00000"));
    BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(in));
    String valueLine = bufferedReader.readLine();
    String[] fields = valueLine.split("\t");
    double value = Double.parseDouble(fields[1]);
    bufferedReader.close();
    in.close();
    return value;
}
From source file:fr.ens.biologie.genomique.eoulsan.modules.mapping.hadoop.FilterAndMapReadsHadoopModule.java
License:LGPL
private Job createJobConf(final Configuration parentConf, final TaskContext context, final String dataName,
        final DataFile inFile, final List<String> filenames, final boolean pairedEnd,
        final DataFormat inputFormat, final FastqFormat fastqFormat, final DataFile genomeIndexFile,
        final DataFile outFile) throws IOException {

    final Configuration jobConf = new Configuration(parentConf);

    // Set input path
    final Path inputPath = new Path(inFile.getSource());

    // Set counter group
    jobConf.set(CommonHadoop.COUNTER_GROUP_KEY, getCounterGroup());

    //
    // Reads filters parameters
    //

    // Set fastq format
    jobConf.set(ReadsFilterMapper.FASTQ_FORMAT_KEY, fastqFormat.getName());

    // Set read filter parameters
    addParametersToJobConf(getReadFilterParameters(), READ_FILTER_PARAMETER_KEY_PREFIX, jobConf);

    //
    // Reads mapping parameters
    //

    // Set mapper name
    jobConf.set(ReadsMapperMapper.MAPPER_NAME_KEY, getMapperName());

    // Set mapper version
    jobConf.set(ReadsMapperMapper.MAPPER_VERSION_KEY, getMapperVersion());

    // Set mapper flavor
    jobConf.set(ReadsMapperMapper.MAPPER_FLAVOR_KEY, getMapperFlavor());

    // Set paired-end or single-end mode
    jobConf.set(ReadsMapperMapper.PAIR_END_KEY, Boolean.toString(pairedEnd));

    // Set the number of threads for the mapper
    if (getMapperHadoopThreads() < 0) {
        jobConf.set(ReadsMapperMapper.MAPPER_THREADS_KEY, "" + getMapperHadoopThreads());
    }

    // Set mapper arguments
    if (getMapperArguments() != null) {
        jobConf.set(ReadsMapperMapper.MAPPER_ARGS_KEY, getMapperArguments());
    }

    // Set mapper fastq format
    jobConf.set(ReadsMapperMapper.FASTQ_FORMAT_KEY, "" + fastqFormat);

    // Set mapper index checksum
    jobConf.set(ReadsMapperMapper.INDEX_CHECKSUM_KEY, "" + computeZipCheckSum(genomeIndexFile, parentConf));

    // Set task timeout
    jobConf.set("mapreduce.task.timeout", "" + HADOOP_TIMEOUT);

    // Don't reuse JVM
    jobConf.set("mapreduce.job.jvm.numtasks", "" + 1);

    // Set the memory required by the reads mapper
    jobConf.set("mapreduce.map.memory.mb", "" + getMapperHadoopMemoryRequired());

    // Set the memory required by the JVM (BWA needs more memory than the
    // other mappers for buffering named pipes)
    jobConf.set("mapreduce.map.java.opts", "-Xmx4096M");

    // Set ZooKeeper client configuration
    setZooKeeperJobConfiguration(jobConf, context);

    //
    // Alignment filtering
    //

    // Set SAM filter parameters
    addParametersToJobConf(getAlignmentsFilterParameters(), MAP_FILTER_PARAMETER_KEY_PREFIX, jobConf);

    //
    // Job creation
    //

    // Create the job and its name
    final Job job = Job.getInstance(jobConf,
            "Filter and map reads (" + dataName + ", " + Joiner.on(", ").join(filenames) + ")");

    // Set the jar
    job.setJarByClass(ReadsFilterHadoopModule.class);

    // Set input path
    FileInputFormat.addInputPath(job, inputPath);

    // Add the genome mapper index to the distributed cache
    final Path genomeIndex = new Path(genomeIndexFile.getSource());
    job.addCacheFile(genomeIndex.toUri());

    // Set the input format
    if (inputFormat == READS_FASTQ) {
        job.setInputFormatClass(FastqInputFormat.class);
    } else {
        job.setInputFormatClass(KeyValueTextInputFormat.class);
    }

    // Set the mapper classes using a chain mapper; all three mappers
    // consume and produce Text/Text pairs, so the chain types line up.
    ChainMapper.addMapper(job, ReadsFilterMapper.class, Text.class, Text.class, Text.class, Text.class, jobConf);
    ChainMapper.addMapper(job, ReadsMapperMapper.class, Text.class, Text.class, Text.class, Text.class, jobConf);
    ChainMapper.addMapper(job, SAMFilterMapper.class, Text.class, Text.class, Text.class, Text.class, jobConf);

    // Set the reducer class
    job.setReducerClass(SAMFilterReducer.class);

    // Set the output format
    job.setOutputFormatClass(SAMOutputFormat.class);

    // Set the output key class
    job.setOutputKeyClass(Text.class);

    // Set the output value class
    job.setOutputValueClass(Text.class);

    // Set output path
    FileOutputFormat.setOutputPath(job, new Path(outFile.getSource()));

    return job;
}
From source file:org.apache.jena.hadoop.rdf.stats.jobs.JobFactory.java
License:Apache License
public static Job getTripleGraphSizesJob(Configuration config, String[] inputPaths, String outputPath)
        throws IOException {
    Job job = Job.getInstance(config);
    job.setJarByClass(JobFactory.class);
    job.setJobName("RDF Triples Graph Sizes");

    // Map/Reduce classes
    ChainMapper.addMapper(job, TriplesToQuadsConstantGraphMapper.class, LongWritable.class,
            TripleWritable.class, LongWritable.class, QuadWritable.class, config);
    ChainMapper.addMapper(job, QuadGraphCountMapper.class, LongWritable.class, QuadWritable.class,
            NodeWritable.class, LongWritable.class, config);
    job.setMapOutputKeyClass(NodeWritable.class);
    job.setMapOutputValueClass(LongWritable.class);
    job.setReducerClass(NodeCountReducer.class);

    // Input and Output
    job.setInputFormatClass(TriplesInputFormat.class);
    job.setOutputFormatClass(NTriplesNodeOutputFormat.class);
    FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job;
}