List of usage examples for org.apache.hadoop.mapreduce.Job
Job(Configuration conf) throws IOException
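Every example on this page constructs the job with new Job(conf), i.e. the public Job(Configuration conf) constructor. That constructor is deprecated in Hadoop 2.x and later in favor of the static factory Job.getInstance(Configuration). A minimal, self-contained sketch of the pattern the examples share (the class name and the identity Mapper are placeholders, not taken from any example below):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

    public class MinimalJobExample {
        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            Job job = new Job(conf);                 // deprecated constructor used throughout this page
            // Job job = Job.getInstance(conf);      // preferred factory method since Hadoop 2.x
            job.setJarByClass(MinimalJobExample.class);
            job.setMapperClass(Mapper.class);        // identity mapper, just to keep the sketch self-contained
            job.setNumReduceTasks(0);                // map-only job
            job.setInputFormatClass(TextInputFormat.class);
            job.setOutputFormatClass(TextOutputFormat.class);
            job.setOutputKeyClass(LongWritable.class);
            job.setOutputValueClass(Text.class);
            TextInputFormat.addInputPath(job, new Path(args[0]));
            TextOutputFormat.setOutputPath(job, new Path(args[1]));
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        }
    }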
From source file:com.ml.hadoop.nlp.DocumentProcessor.java
License:Apache License
/**
 * Convert the input documents into a token array using {@link StringTuple}. The input documents have to be
 * in the {@link org.apache.hadoop.io.SequenceFile} format.
 *
 * @param input
 *          input directory of the documents in {@link org.apache.hadoop.io.SequenceFile} format
 * @param output
 *          output directory where the {@link StringTuple} token array of each document has to be created
 * @param analyzerClass
 *          the Lucene {@link Analyzer} for tokenizing the UTF-8 text
 */
public static void tokenizeDocuments(Path input, Class<? extends Analyzer> analyzerClass, Path output,
        Configuration baseConf) throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration(baseConf);
    // this conf parameter needs to be set to enable serialisation of conf values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.set(ANALYZER_CLASS, analyzerClass.getName());

    Job job = new Job(conf);
    job.setJobName("DocumentProcessor::DocumentTokenizer: input-folder: " + input);
    job.setJarByClass(DocumentProcessor.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(StringTuple.class);
    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(SequenceFileTokenizerMapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setNumReduceTasks(0);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    HadoopUtil.delete(conf, output);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}
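A possible call site for the method above (a sketch, not from the original source; the paths are placeholders, and StandardAnalyzer stands in for whatever Lucene Analyzer the application uses, which must be instantiable by the mapper):

    Configuration conf = new Configuration();
    Path input = new Path("/tmp/sequence-docs");      // hypothetical input directory of SequenceFiles
    Path output = new Path("/tmp/tokenized-docs");    // hypothetical output directory
    DocumentProcessor.tokenizeDocuments(input, StandardAnalyzer.class, output, conf);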
From source file:com.mongodb.hadoop.input.DelegatingInputFormat.java
License:Apache License
@SuppressWarnings("unchecked")
public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException {
    Configuration conf = job.getConfiguration();
    Job jobCopy = new Job(conf);
    List<InputSplit> splits = new ArrayList<InputSplit>();
    Map<Path, InputFormat> formatMap = MongoMultipleInputs.getInputFormatMap(job);
    Map<Path, Class<? extends Mapper>> mapperMap = MongoMultipleInputs.getMapperTypeMap(job);

    for (Entry<Path, InputFormat> entry : formatMap.entrySet()) {
        InputFormat formatClass = (InputFormat) ReflectionUtils.newInstance(entry.getValue().getClass(), conf);
        Class<? extends Mapper> mapperClass = mapperMap.get(entry.getKey());
        try {
            // Try the Mongo-specific overload that computes splits for a single path.
            List<InputSplit> pathSplits = ((MongoInputFormat) formatClass).getSplits(jobCopy, entry.getKey());
            for (InputSplit pathSplit : pathSplits) {
                splits.add(TaggedInputSplitGenerator.getTaggedInputSplit(pathSplit, conf,
                        formatClass.getClass(), mapperClass));
            }
        } catch (ClassCastException e) {
            // Not a MongoInputFormat; fall back to the generic getSplits(JobContext).
            List<InputSplit> pathSplits = formatClass.getSplits(jobCopy);
            for (InputSplit pathSplit : pathSplits) {
                splits.add(TaggedInputSplitGenerator.getTaggedInputSplit(pathSplit, conf,
                        formatClass.getClass(), mapperClass));
            }
        }
    }
    return splits;
}
From source file:com.moz.fiji.mapreduce.framework.MapReduceJobBuilder.java
License:Apache License
/**
 * Builds a runnable MapReduce job.
 *
 * @return A configured MapReduce job, ready to be run.
 * @throws IOException If there is an error.
 */
public final FijiMapReduceJob build() throws IOException {
    Preconditions.checkNotNull(mConf, "Must set the job base configuration using .withConf()");
    final Job job = new Job(mConf);
    configureJob(job);
    return build(job);
}
From source file:com.moz.fiji.mapreduce.IntegrationTestFijiTableInputFormat.java
License:Apache License
public Job setupJob(String jobName, Path outputFile, Class<? extends Mapper> mapperClass,
        Class<? extends Reducer> reducerClass, EntityId startKey, EntityId limitKey, FijiRowFilter filter)
        throws Exception {
    final Job job = new Job(createConfiguration());
    final Configuration conf = job.getConfiguration();

    // Get settings for test.
    final FijiDataRequest request = FijiDataRequest.builder()
            .addColumns(ColumnsDef.create().add("info", "name").add("info", "email")).build();

    job.setJarByClass(IntegrationTestFijiTableInputFormat.class);

    // Setup the InputFormat.
    FijiTableInputFormat.configureJob(job, getFooTable().getURI(), request, startKey, limitKey, filter);
    job.setInputFormatClass(HBaseFijiTableInputFormat.class);

    // Duplicate functionality from MapReduceJobBuilder, since we are not using it here:
    final List<Path> jarFiles = Lists.newArrayList();
    final FileSystem fs = FileSystem.getLocal(conf);
    for (String cpEntry : System.getProperty("java.class.path").split(":")) {
        if (cpEntry.endsWith(".jar")) {
            jarFiles.add(fs.makeQualified(new Path(cpEntry)));
        }
    }
    DistributedCacheJars.addJarsToDistributedCache(job, jarFiles);

    // Create a test job.
    job.setJobName(jobName);

    // Setup the OutputFormat.
    TextOutputFormat.setOutputPath(job, outputFile.getParent());
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // Set the mapper class.
    if (null != mapperClass) {
        job.setMapperClass(mapperClass);
    }

    // Set the reducer class.
    if (null != reducerClass) {
        job.setReducerClass(reducerClass);
    }

    return job;
}
From source file:com.moz.fiji.mapreduce.lib.reduce.KeyPassThroughReducer.java
License:Apache License
/** {@inheritDoc} */
@Override
public Schema getAvroKeyWriterSchema() throws IOException {
    Class<? extends Mapper<?, ?, ?, ?>> mapperClass;
    try {
        mapperClass = new Job(getConf()).getMapperClass();
    } catch (ClassNotFoundException e) {
        throw new IOException("Mapper class was not configured. "
                + "Could not infer avro key writer schema.", e);
    }
    Mapper<?, ?, ?, ?> mapper = ReflectionUtils.newInstance(mapperClass, getConf());
    if (mapper instanceof AvroKeyWriter) {
        LOG.info("Mapper is an AvroKeyWriter. Using the same schema for Reducer output keys.");
        return ((AvroKeyWriter) mapper).getAvroKeyWriterSchema();
    }
    return null;
}
From source file:com.moz.fiji.mapreduce.reducer.IdentityReducer.java
License:Apache License
/** {@inheritDoc} */
@Override
public Schema getAvroValueWriterSchema() throws IOException {
    Class<? extends Mapper<?, ?, ?, ?>> mapperClass;
    try {
        mapperClass = new Job(getConf()).getMapperClass();
    } catch (ClassNotFoundException e) {
        throw new IOException("Mapper class was not configured. "
                + "Could not infer avro value writer schema.", e);
    }
    Mapper<?, ?, ?, ?> mapper = ReflectionUtils.newInstance(mapperClass, getConf());
    if (mapper instanceof AvroValueWriter) {
        LOG.info("Mapper is an AvroValueWriter. Using the same schema for Reducer output values.");
        return ((AvroValueWriter) mapper).getAvroValueWriterSchema();
    }
    return null;
}
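Both of the preceding examples rely on the same trick: a throwaway Job is constructed around the current Configuration solely so that getMapperClass() can resolve the configured mapper class name into a Class object (throwing ClassNotFoundException if no mapper was configured). A minimal sketch of the pattern in isolation (the class name is a placeholder, and the identity Mapper stands in for a real application mapper):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;

    public class MapperLookupSketch {
        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            Job setup = new Job(conf);
            setup.setMapperClass(Mapper.class);   // normally done at job-definition time
            // A throwaway Job wrapping the same Configuration recovers the configured class:
            Class<? extends Mapper<?, ?, ?, ?>> mapperClass =
                    new Job(setup.getConfiguration()).getMapperClass();
            System.out.println(mapperClass.getName());   // prints org.apache.hadoop.mapreduce.Mapper
        }
    }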
From source file:com.mozilla.hadoop.Backup.java
License:Apache License
/**
 * @param args
 * @return
 * @throws IOException
 * @throws ParseException
 */
public Job initJob(String[] args) throws IOException, ParseException {
    Path inputPath = null;
    Path loadPath = null;
    String outputPath = null;
    boolean useSpecifiedPaths = false;
    for (int idx = 0; idx < args.length; idx++) {
        if ("-f".equals(args[idx])) {
            useSpecifiedPaths = true;
            loadPath = new Path(args[++idx]);
        } else if (idx == args.length - 1) {
            outputPath = args[idx];
        } else {
            inputPath = new Path(args[idx]);
        }
    }
    Path mrOutputPath = new Path(NAME + "-results");

    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    conf.set("backup.input.path", inputPath.toString());
    conf.set("backup.output.path", outputPath);

    FileSystem inputFs = null;
    FileSystem outputFs = null;
    Path[] inputSources = null;
    try {
        inputFs = FileSystem.get(inputPath.toUri(), new Configuration());
        outputFs = FileSystem.get(getConf());
        if (useSpecifiedPaths) {
            inputSources = createInputSources(loadPaths(outputFs, loadPath), outputFs);
        } else {
            inputSources = createInputSources(getPaths(inputFs, inputPath, 0, 2), outputFs);
        }
    } finally {
        checkAndClose(inputFs);
        checkAndClose(outputFs);
    }

    Job job = new Job(getConf());
    job.setJobName(NAME);
    job.setJarByClass(Backup.class);
    job.setMapperClass(BackupMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setNumReduceTasks(0);
    job.setInputFormatClass(TextInputFormat.class);
    for (Path source : inputSources) {
        System.out.println("Adding input path: " + source.toString());
        FileInputFormat.addInputPath(job, source);
    }
    FileOutputFormat.setOutputPath(job, mrOutputPath);

    return job;
}
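One note on the configuration key used above: mapred.map.tasks.speculative.execution is the pre-Hadoop-2 property name. On current releases the equivalent key is mapreduce.map.speculative; the old name still takes effect only through Hadoop's deprecated-key mapping.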
From source file:com.mozilla.socorro.hadoop.CrashCountToHbase.java
License:LGPL
/**
 * @param args
 * @return
 * @throws IOException
 */
public Job initJob(String[] args) throws IOException {
    Job job = new Job(getConf());
    job.setJobName(NAME);
    job.setJarByClass(CrashCountToHbase.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));

    job.setMapperClass(CrashCountToHBaseMapper.class);
    job.setReducerClass(CrashCountToHBaseReducer.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    return job;
}
From source file:com.mozilla.socorro.hadoop.CrashReportJob.java
License:LGPL
/**
 * @return the configured crash-report job
 * @throws IOException
 * @throws ParseException
 */
public static Job initJob(String jobName, Configuration conf, Class<?> mainClass,
        Class<? extends TableMapper> mapperClass, Class<? extends Reducer> combinerClass,
        Class<? extends Reducer> reducerClass, Map<byte[], byte[]> columns,
        Class<? extends WritableComparable> keyOut, Class<? extends Writable> valueOut, Path outputPath)
        throws IOException, ParseException {
    // Set both start/end time and start/stop row
    Calendar startCal = Calendar.getInstance();
    Calendar endCal = Calendar.getInstance();

    SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMdd");
    String startDateStr = conf.get(START_DATE);
    String endDateStr = conf.get(END_DATE);
    if (!StringUtils.isBlank(startDateStr)) {
        startCal.setTime(sdf.parse(startDateStr));
    }
    if (!StringUtils.isBlank(endDateStr)) {
        endCal.setTime(sdf.parse(endDateStr));
    }

    conf.setLong(START_TIME, startCal.getTimeInMillis());
    conf.setLong(END_TIME, DateUtil.getEndTimeAtResolution(endCal.getTimeInMillis(), Calendar.DATE));

    Job job = new Job(conf);
    job.setJobName(jobName);
    job.setJarByClass(mainClass);

    // input table configuration
    Scan[] scans = MultiScanTableMapReduceUtil.generateScans(startCal, endCal, columns, 100, false);
    MultiScanTableMapReduceUtil.initMultiScanTableMapperJob(TABLE_NAME_CRASH_REPORTS, scans, mapperClass,
            keyOut, valueOut, job);

    if (combinerClass != null) {
        job.setCombinerClass(combinerClass);
    }

    if (reducerClass != null) {
        job.setReducerClass(reducerClass);
    } else {
        job.setNumReduceTasks(0);
    }

    FileOutputFormat.setOutputPath(job, outputPath);

    return job;
}
From source file:com.netflix.Aegisthus.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    job.setJarByClass(Aegisthus.class);
    CommandLine cl = getOptions(args);
    if (cl == null) {
        return 1;
    }
    job.setInputFormatClass(AegisthusInputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setMapperClass(Map.class);
    job.setReducerClass(CassReducer.class);

    List<Path> paths = Lists.newArrayList();
    if (cl.hasOption(OPT_INPUT)) {
        for (String input : cl.getOptionValues(OPT_INPUT)) {
            paths.add(new Path(input));
        }
    }
    if (cl.hasOption(OPT_INPUTDIR)) {
        paths.addAll(getDataFiles(job.getConfiguration(), cl.getOptionValue(OPT_INPUTDIR)));
    }
    TextInputFormat.setInputPaths(job, paths.toArray(new Path[0]));
    TextOutputFormat.setOutputPath(job, new Path(cl.getOptionValue(OPT_OUTPUT)));

    job.submit();
    System.out.println(job.getJobID());
    System.out.println(job.getTrackingURL());
    boolean success = job.waitForCompletion(true);
    return success ? 0 : 1;
}
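Calling job.submit() before job.waitForCompletion(true), as above, is safe: submit() starts the job asynchronously so the job ID and tracking URL can be printed right away, and waitForCompletion() only submits the job itself if it has not already been submitted, then blocks until it finishes.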