Example usage for the org.apache.hadoop.mapreduce.Job constructor Job

Introduction

On this page you can find example usages of the org.apache.hadoop.mapreduce.Job constructor Job.

Prototype

Job(JobConf conf) throws IOException

Source Link

Usage

From source file:com.ml.hadoop.nlp.DocumentProcessor.java

License:Apache License

/**
 * Convert the input documents into token array using the {@link StringTuple} The input documents has to be
 * in the {@link org.apache.hadoop.io.SequenceFile} format
 * /* w w w  .j ava2  s.  c  o m*/
 * @param input
 *          input directory of the documents in {@link org.apache.hadoop.io.SequenceFile} format
 * @param output
 *          output directory were the {@link StringTuple} token array of each document has to be created
 * @param analyzerClass
 *          The Lucene {@link Analyzer} for tokenizing the UTF-8 text
 */
public static void tokenizeDocuments(Path input, Class<? extends Analyzer> analyzerClass, Path output,
        Configuration baseConf) throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration(baseConf);
    // this conf parameter needs to be set enable serialisation of conf values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.set(ANALYZER_CLASS, analyzerClass.getName());

    Job job = new Job(conf);
    job.setJobName("DocumentProcessor::DocumentTokenizer: input-folder: " + input);
    job.setJarByClass(DocumentProcessor.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(StringTuple.class);
    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(SequenceFileTokenizerMapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setNumReduceTasks(0);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    HadoopUtil.delete(conf, output);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }

}

From source file:com.mongodb.hadoop.input.DelegatingInputFormat.java

License:Apache License

/**
 * Collects the input splits of every path registered in the map returned by
 * {@code MongoMultipleInputs.getInputFormatMap(job)}.
 *
 * <p>For each path a fresh instance of the registered input format class is created. A
 * {@link MongoInputFormat} is asked for the splits of that specific path; any other format is asked for
 * its splits without a path. Every split is then wrapped by
 * {@code TaggedInputSplitGenerator.getTaggedInputSplit} together with the format class and the mapper
 * class registered for the path.
 *
 * @param job the job context whose configuration and registered inputs are read
 * @return the tagged splits of all registered paths, in map iteration order
 * @throws IOException if creating the job copy or computing splits fails
 * @throws InterruptedException if split computation is interrupted
 */
@SuppressWarnings("unchecked")
public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException {
    Configuration conf = job.getConfiguration();
    Job jobCopy = new Job(conf);
    List<InputSplit> splits = new ArrayList<InputSplit>();
    Map<Path, InputFormat> formatMap = MongoMultipleInputs.getInputFormatMap(job);
    Map<Path, Class<? extends Mapper>> mapperMap = MongoMultipleInputs.getMapperTypeMap(job);

    for (Entry<Path, InputFormat> entry : formatMap.entrySet()) {
        Path path = entry.getKey();
        InputFormat formatClass = (InputFormat) ReflectionUtils.newInstance(entry.getValue().getClass(), conf);
        Class<? extends Mapper> mapperClass = mapperMap.get(path);

        // Decide with instanceof instead of catching ClassCastException: a ClassCastException raised
        // while computing or tagging splits must not be mistaken for "not a MongoInputFormat", which
        // would re-run the fallback after some splits were already added.
        List<InputSplit> pathSplits;
        if (formatClass instanceof MongoInputFormat) {
            pathSplits = ((MongoInputFormat) formatClass).getSplits(jobCopy, path);
        } else {
            pathSplits = formatClass.getSplits(jobCopy);
        }

        for (InputSplit pathSplit : pathSplits) {
            splits.add(TaggedInputSplitGenerator.getTaggedInputSplit(pathSplit, conf,
                    formatClass.getClass(), mapperClass));
        }
    }
    return splits;
}

From source file:com.moz.fiji.mapreduce.framework.MapReduceJobBuilder.java

License:Apache License

/**
 * Builds a runnable MapReduce job./*from  ww w  .  ja v  a  2s . c om*/
 *
 * @return A configured MapReduce job, ready to be run.
 * @throws IOException If there is an error.
 */
public final FijiMapReduceJob build() throws IOException {
    Preconditions.checkNotNull(mConf, "Must set the job base configuration using .withConf()");
    final Job job = new Job(mConf);
    configureJob(job);
    return build(job);
}

From source file:com.moz.fiji.mapreduce.IntegrationTestFijiTableInputFormat.java

License:Apache License

/**
 * Creates and configures a test job that reads the table returned by {@code getFooTable()} through
 * {@link HBaseFijiTableInputFormat} and writes {@link Text} keys and values with
 * {@link TextOutputFormat}.
 *
 * <p>The data request covers the columns {@code info:name} and {@code info:email}. Every {@code .jar}
 * entry of the JVM class path is added to the distributed cache.
 *
 * @param jobName name given to the job
 * @param outputFile output file; the job's output path is set to its parent directory
 * @param mapperClass mapper to use, or {@code null} to leave the job's mapper unset
 * @param reducerClass reducer to use, or {@code null} to leave the job's reducer unset
 * @param startKey start key passed to {@code FijiTableInputFormat.configureJob}
 * @param limitKey limit key passed to {@code FijiTableInputFormat.configureJob}
 * @param filter row filter passed to {@code FijiTableInputFormat.configureJob}
 * @return the configured job, not yet submitted
 * @throws Exception if any step of the setup fails
 */
public Job setupJob(String jobName, Path outputFile, Class<? extends Mapper> mapperClass,
        Class<? extends Reducer> reducerClass, EntityId startKey, EntityId limitKey, FijiRowFilter filter)
        throws Exception {
    final Job job = new Job(createConfiguration());
    final Configuration conf = job.getConfiguration();

    // Get settings for test.
    final FijiDataRequest request = FijiDataRequest.builder()
            .addColumns(ColumnsDef.create().add("info", "name").add("info", "email")).build();

    job.setJarByClass(IntegrationTestFijiTableInputFormat.class);

    // Setup the InputFormat.
    FijiTableInputFormat.configureJob(job, getFooTable().getURI(), request, startKey, limitKey, filter);
    job.setInputFormatClass(HBaseFijiTableInputFormat.class);

    // Duplicate functionality from MapReduceJobBuilder, since we are not using it here:
    final List<Path> jarFiles = Lists.newArrayList();
    final FileSystem fs = FileSystem.getLocal(conf);
    // Split on the platform's path separator (":" on Unix, ";" on Windows) rather than a
    // hard-coded ":", which would break class path entries on platforms that use ";".
    for (String cpEntry : System.getProperty("java.class.path").split(java.io.File.pathSeparator)) {
        if (cpEntry.endsWith(".jar")) {
            jarFiles.add(fs.makeQualified(new Path(cpEntry)));
        }
    }
    DistributedCacheJars.addJarsToDistributedCache(job, jarFiles);

    // Create a test job.
    job.setJobName(jobName);

    // Setup the OutputFormat.
    TextOutputFormat.setOutputPath(job, outputFile.getParent());
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // Set the mapper class.
    if (null != mapperClass) {
        job.setMapperClass(mapperClass);
    }
    // Set the reducer class.
    if (null != reducerClass) {
        job.setReducerClass(reducerClass);
    }

    return job;
}

From source file:com.moz.fiji.mapreduce.lib.reduce.KeyPassThroughReducer.java

License:Apache License

/**
 * {@inheritDoc}
 *
 * <p>Instantiates the mapper class configured on the job and, when that mapper is an
 * {@link AvroKeyWriter}, returns the mapper's key writer schema; otherwise returns {@code null}.
 */
@Override
public Schema getAvroKeyWriterSchema() throws IOException {
    final Class<? extends Mapper<?, ?, ?, ?>> configuredMapper;
    try {
        configuredMapper = new Job(getConf()).getMapperClass();
    } catch (ClassNotFoundException cnfe) {
        throw new IOException("Mapper class was not configured. Could not infer avro key writer schema.",
                cnfe);
    }

    final Mapper<?, ?, ?, ?> mapperInstance = ReflectionUtils.newInstance(configuredMapper, getConf());
    if (!(mapperInstance instanceof AvroKeyWriter)) {
        // The mapper does not declare a key schema, so there is nothing to reuse.
        return null;
    }
    LOG.info("Mapper is an AvroKeyWriter. Using the same schema for Reducer output keys.");
    return ((AvroKeyWriter) mapperInstance).getAvroKeyWriterSchema();
}

From source file:com.moz.fiji.mapreduce.reducer.IdentityReducer.java

License:Apache License

/**
 * {@inheritDoc}
 *
 * <p>Instantiates the mapper class configured on the job and, when that mapper is an
 * {@link AvroValueWriter}, returns the mapper's value writer schema; otherwise returns {@code null}.
 */
@Override
public Schema getAvroValueWriterSchema() throws IOException {
    final Class<? extends Mapper<?, ?, ?, ?>> configuredMapper;
    try {
        configuredMapper = new Job(getConf()).getMapperClass();
    } catch (ClassNotFoundException cnfe) {
        throw new IOException("Mapper class was not configured. Could not infer avro value writer schema.",
                cnfe);
    }

    final Mapper<?, ?, ?, ?> mapperInstance = ReflectionUtils.newInstance(configuredMapper, getConf());
    if (!(mapperInstance instanceof AvroValueWriter)) {
        // The mapper does not declare a value schema, so there is nothing to reuse.
        return null;
    }
    LOG.info("Mapper is an AvroValueWriter. Using the same schema for Reducer output values.");
    return ((AvroValueWriter) mapperInstance).getAvroValueWriterSchema();
}

From source file:com.mozilla.hadoop.Backup.java

License:Apache License

/**
 * Parses the command-line arguments and builds the map-only backup job.
 *
 * <p>Arguments are interpreted positionally: {@code -f <file>} names a file of paths to load, the final
 * argument is the output path, and any other argument is taken as the input path (the last such
 * argument wins). The input and output paths are stored in the configuration under
 * {@code backup.input.path} and {@code backup.output.path}; the MapReduce output itself goes to
 * {@code NAME + "-results"}.
 *
 * @param args command-line arguments as described above
 * @return the configured job, not yet submitted
 * @throws IOException if a file system cannot be opened or the job cannot be created
 * @throws ParseException declared by the signature; not thrown directly by the statements in this
 *         method
 * @throws IllegalArgumentException if {@code -f} has no value, or the input or output path is missing
 */
public Job initJob(String[] args) throws IOException, ParseException {

    Path inputPath = null;
    Path loadPath = null;
    String outputPath = null;
    boolean useSpecifiedPaths = false;
    for (int idx = 0; idx < args.length; idx++) {
        if ("-f".equals(args[idx])) {
            // Fail with a clear message instead of an ArrayIndexOutOfBoundsException.
            if (idx + 1 >= args.length) {
                throw new IllegalArgumentException("Option -f requires a file argument");
            }
            useSpecifiedPaths = true;
            loadPath = new Path(args[++idx]);
        } else if (idx == args.length - 1) {
            outputPath = args[idx];
        } else {
            inputPath = new Path(args[idx]);
        }
    }

    // Both paths are dereferenced unconditionally below; report their absence explicitly rather
    // than letting a NullPointerException surface from inputPath.toString() or conf.set(...).
    if (inputPath == null) {
        throw new IllegalArgumentException("An input path must be specified before the output path");
    }
    if (outputPath == null) {
        throw new IllegalArgumentException("An output path must be specified as the last argument");
    }

    Path mrOutputPath = new Path(NAME + "-results");

    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    conf.set("backup.input.path", inputPath.toString());
    conf.set("backup.output.path", outputPath);

    FileSystem inputFs = null;
    FileSystem outputFs = null;
    Path[] inputSources = null;
    try {
        inputFs = FileSystem.get(inputPath.toUri(), new Configuration());
        outputFs = FileSystem.get(getConf());
        if (useSpecifiedPaths) {
            inputSources = createInputSources(loadPaths(outputFs, loadPath), outputFs);
        } else {
            inputSources = createInputSources(getPaths(inputFs, inputPath, 0, 2), outputFs);
        }
    } finally {
        // NOTE(review): FileSystem.get may hand out cached, shared instances - confirm that closing
        // them here cannot affect later users of the same file system.
        checkAndClose(inputFs);
        checkAndClose(outputFs);
    }

    Job job = new Job(getConf());
    job.setJobName(NAME);
    job.setJarByClass(Backup.class);

    job.setMapperClass(BackupMapper.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // Map-only job.
    job.setNumReduceTasks(0);

    job.setInputFormatClass(TextInputFormat.class);

    for (Path source : inputSources) {
        System.out.println("Adding input path: " + source.toString());
        FileInputFormat.addInputPath(job, source);
    }

    FileOutputFormat.setOutputPath(job, mrOutputPath);

    return job;
}

From source file:com.mozilla.socorro.hadoop.CrashCountToHbase.java

License:LGPL

/**
 * @param args/*from  ww  w. j  a  v  a 2s.  com*/
 * @return
 * @throws IOException
 * @throws ParseException
 */
public Job initJob(String[] args) throws IOException {
    Job job = new Job(getConf());
    job.setJobName(NAME);
    job.setJarByClass(CrashCountToHbase.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));

    job.setMapperClass(CrashCountToHBaseMapper.class);
    job.setReducerClass(CrashCountToHBaseReducer.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    return job;
}

From source file:com.mozilla.socorro.hadoop.CrashReportJob.java

License:LGPL

/**
 * @param args/*from w w  w .j  a va  2s  .co m*/
 * @return
 * @throws IOException
 * @throws ParseException
 */
public static Job initJob(String jobName, Configuration conf, Class<?> mainClass,
        Class<? extends TableMapper> mapperClass, Class<? extends Reducer> combinerClass,
        Class<? extends Reducer> reducerClass, Map<byte[], byte[]> columns,
        Class<? extends WritableComparable> keyOut, Class<? extends Writable> valueOut, Path outputPath)
        throws IOException, ParseException {
    // Set both start/end time and start/stop row
    Calendar startCal = Calendar.getInstance();
    Calendar endCal = Calendar.getInstance();

    SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMdd");

    String startDateStr = conf.get(START_DATE);
    String endDateStr = conf.get(END_DATE);
    if (!StringUtils.isBlank(startDateStr)) {
        startCal.setTime(sdf.parse(startDateStr));
    }
    if (!StringUtils.isBlank(endDateStr)) {
        endCal.setTime(sdf.parse(endDateStr));
    }

    conf.setLong(START_TIME, startCal.getTimeInMillis());
    conf.setLong(END_TIME, DateUtil.getEndTimeAtResolution(endCal.getTimeInMillis(), Calendar.DATE));

    Job job = new Job(conf);
    job.setJobName(jobName);
    job.setJarByClass(mainClass);

    // input table configuration
    Scan[] scans = MultiScanTableMapReduceUtil.generateScans(startCal, endCal, columns, 100, false);
    MultiScanTableMapReduceUtil.initMultiScanTableMapperJob(TABLE_NAME_CRASH_REPORTS, scans, mapperClass,
            keyOut, valueOut, job);

    if (combinerClass != null) {
        job.setCombinerClass(combinerClass);
    }

    if (reducerClass != null) {
        job.setReducerClass(reducerClass);
    } else {
        job.setNumReduceTasks(0);
    }

    FileOutputFormat.setOutputPath(job, outputPath);

    return job;
}

From source file:com.netflix.Aegisthus.java

License:Apache License

/**
 * Configures, submits and waits for the job described by the command line.
 *
 * <p>Input paths are collected from the {@code OPT_INPUT} values and from the data files found under
 * {@code OPT_INPUTDIR}; output goes to the {@code OPT_OUTPUT} value. After submission the job id and
 * tracking URL are printed to standard output.
 *
 * @param args command-line arguments
 * @return {@code 0} if the job succeeded; {@code 1} if the options could not be obtained or the job
 *         failed
 * @throws Exception if job setup, submission or execution fails
 */
@Override
public int run(String[] args) throws Exception {
    final Job job = new Job(getConf());
    job.setJarByClass(Aegisthus.class);

    final CommandLine options = getOptions(args);
    if (options == null) {
        return 1;
    }

    // Input side.
    job.setInputFormatClass(AegisthusInputFormat.class);
    job.setMapperClass(Map.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    // Output side.
    job.setReducerClass(CassReducer.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    // Gather explicit input files first, then everything found under the input directory.
    final List<Path> inputPaths = Lists.newArrayList();
    if (options.hasOption(OPT_INPUT)) {
        for (String explicitInput : options.getOptionValues(OPT_INPUT)) {
            inputPaths.add(new Path(explicitInput));
        }
    }
    if (options.hasOption(OPT_INPUTDIR)) {
        inputPaths.addAll(getDataFiles(job.getConfiguration(), options.getOptionValue(OPT_INPUTDIR)));
    }
    TextInputFormat.setInputPaths(job, inputPaths.toArray(new Path[0]));
    TextOutputFormat.setOutputPath(job, new Path(options.getOptionValue(OPT_OUTPUT)));

    job.submit();
    System.out.println(job.getJobID());
    System.out.println(job.getTrackingURL());

    if (job.waitForCompletion(true)) {
        return 0;
    }
    return 1;
}