Example usage for org.apache.hadoop.mapreduce.lib.output LazyOutputFormat setOutputFormatClass

Introduction

In this page you can find the example usage for org.apache.hadoop.mapreduce.lib.output LazyOutputFormat setOutputFormatClass.

Prototype

@SuppressWarnings("unchecked")
public static void setOutputFormatClass(Job job, Class<? extends OutputFormat> theClass)

Source Link

Document

Set the underlying output format for LazyOutputFormat.

Usage

From source file:com.baynote.kafka.hadoop.KafkaJobBuilder.java

License:Apache License

/**
 * Creates a {@link Job} based on how {@code this} {@link KafkaJobBuilder} has been configured. There are no
 * side-effects on {@code this} instance when you call this method, so you can call it multiple times.
 * //  www . ja  v a  2  s. com
 * @param conf
 *            the job conf.
 * @return a fully configured {@link Job}.
 * @throws Exception error
 * @throws IllegalArgumentException
 *             if any required parameters are not set.
 */
public Job configureJob(final Configuration conf) throws Exception {
    validateSettings();
    final Job job = Job.getInstance(conf, getDefaultedJobName());

    // set queue inputs
    if (getQueueMappers().size() == 1) {
        job.setInputFormatClass(KafkaInputFormat.class);
        final TopicConf topicConf = Iterables.getOnlyElement(getQueueMappers());
        KafkaInputFormat.setTopic(job, topicConf.getTopic());
        KafkaInputFormat.setConsumerGroup(job, topicConf.getConsumerGroup());
        job.setMapperClass(topicConf.getMapper());
    } else {
        job.setInputFormatClass(MultipleKafkaInputFormat.class);
        for (final TopicConf topicConf : getQueueMappers()) {
            MultipleKafkaInputFormat.addTopic(job, topicConf.getTopic(), topicConf.getConsumerGroup(),
                    topicConf.getMapper());
        }
    }

    if (getMapOutputKeyClass() != null) {
        job.setMapOutputKeyClass(getMapOutputKeyClass());
    }

    if (getMapOutputValueClass() != null) {
        job.setMapOutputValueClass(getMapOutputValueClass());
    }

    if (getReducerClass() == null) {
        job.setNumReduceTasks(0);
    } else {
        job.setReducerClass(getReducerClass());
        job.setNumReduceTasks(getNumReduceTasks());
    }

    if (getPartitionerClass() != null) {
        job.setPartitionerClass(getPartitionerClass());
    }

    // set output
    job.setOutputFormatClass(getOutputFormatClass());
    job.setOutputKeyClass(getOutputKeyClass());
    job.setOutputValueClass(getOutputValueClass());
    if (getOutputFormat() == SupportedOutputFormat.TEXT_FILE) {
        TextOutputFormat.setOutputPath(job, getDefaultedOutputPath());
    } else if (getOutputFormat() == SupportedOutputFormat.SEQUENCE_FILE) {
        SequenceFileOutputFormat.setOutputPath(job, getDefaultedOutputPath());
    }

    if (usingS3()) {
        job.getConfiguration().set("fs.s3n.awsAccessKeyId", getS3AccessKey());
        job.getConfiguration().set("fs.s3n.awsSecretAccessKey", getS3SecretyKey());
        job.getConfiguration().set("fs.s3.awsAccessKeyId", getS3AccessKey());
        job.getConfiguration().set("fs.s3.awsSecretAccessKey", getS3SecretyKey());
    }

    if (isLazyOutputFormat()) {
        LazyOutputFormat.setOutputFormatClass(job, getOutputFormatClass());
    }

    // setup kafka input format specifics
    KafkaInputFormat.setZkConnect(job, getZkConnect());
    KafkaInputFormat.setKafkaFetchSizeBytes(job, getKafkaFetchSizeBytes());

    job.setSpeculativeExecution(false);
    job.setJarByClass(getClass());

    // memory settings for mappers
    if (!Strings.isNullOrEmpty(getTaskMemorySettings())) {
        job.getConfiguration().set("mapred.child.java.opts", getTaskMemorySettings());
    }

    return job;
}

From source file:com.bizosys.hsearch.kv.indexer.KVIndexer.java

License:Apache License

private static int runJob(int jobTypeI, Job job, FieldMapping fm, String input, String output,
        int scannerCacheSize, String filter) throws IOException, InterruptedException, ClassNotFoundException {

    int jobStatus = -1;

    switch (jobTypeI) {
    case SF2HB: {

        IdSearchLog.l.info("Starting Job for SF2HB input field separator " + KVIndexer.FIELD_SEPARATOR
                + " using hbase table : " + fm.tableName + " and output folder " + output);

        FileInputFormat.addInputPath(job, new Path(input));

        job.setMapperClass(KVMapperFile.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(BytesWritable.class);

        job.setReducerClass(KVReducerHBase.class);
        TableMapReduceUtil.initTableReducerJob(fm.tableName, KVReducerHBase.class, job);
        jobStatus = job.waitForCompletion(true) ? 0 : 1;
        return jobStatus;
    }/*  ww w .  j  a va 2  s  .c o  m*/
    case SF2MF: {

        IdSearchLog.l.info("Starting Job for SF2MF input field separator " + KVIndexer.FIELD_SEPARATOR
                + " using hbase table : " + fm.tableName + " and output folder " + output);

        FileInputFormat.addInputPath(job, new Path(input));

        job.setMapperClass(KVMapperFile.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(BytesWritable.class);

        job.setReducerClass(KVReducerMapFile.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);
        LazyOutputFormat.setOutputFormatClass(job, NullOutputFormat.class);

        jobStatus = job.waitForCompletion(true) ? 0 : 1;
        return jobStatus;

    }
    case SF2HF: {

        /*
         * First creates map file and then convert to hfile.
         * create intermediate dir for map file output
         * 
         */

        String intermediateFolder = output + "_intermediate";
        Path intermediateOutpurDir = new Path(intermediateFolder);

        IdSearchLog.l.info("Starting Job for SF2HF input field separator " + KVIndexer.FIELD_SEPARATOR
                + " using hbase table : " + fm.tableName + " and intremediate output folder "
                + intermediateFolder + " final output dir " + output);

        //reset the output folder to intermediate folder
        Configuration conf = job.getConfiguration();
        conf.set(OUTPUT_FOLDER, intermediateFolder);
        int jobT = JobTypeMapping.get("SF2MF");
        jobStatus = runJob(jobT, job, fm, input, intermediateFolder, scannerCacheSize, filter);

        if (jobStatus == 0) {

            Configuration hfileConf = HBaseConfiguration.create();
            hfileConf.set(XML_FILE_PATH, conf.get(XML_FILE_PATH));
            Job hfileJob = Job.getInstance(hfileConf, "Creating Hfile");
            String dataInputPath = intermediateFolder + "/" + MapFile.DATA_FILE_NAME;
            jobT = JobTypeMapping.get("IMF2HF");
            jobStatus = runJob(jobT, hfileJob, fm, dataInputPath, output, scannerCacheSize, filter);
        }

        //delete intermediate dir
        FileSystem.get(conf).delete(intermediateOutpurDir, true);
        //delete the empty _SUCCESS folder
        FileSystem.get(conf).delete(new Path(output, "_SUCCESS"), true);

        return jobStatus;
    }
    case HB2HB: {

        if (fm.tableName.equals(input)) {
            throw new IOException("Input table and index table can not be same");
        }

        Scan scan = new Scan();
        scan.setCaching(scannerCacheSize);
        scan.setCacheBlocks(false);
        scan.addFamily(fm.familyName.getBytes());
        if (null != filter) {
            if (filter.trim().length() > 0) {
                int index = filter.indexOf('=');
                scan.setFilter(new SingleColumnValueFilter(fm.familyName.getBytes(),
                        filter.substring(0, index).getBytes(), CompareOp.EQUAL,
                        filter.substring(index + 1).getBytes()));
            }
        }

        TableMapReduceUtil.initTableMapperJob(input, // input table
                scan, // Scan instance to control CF and attribute selection
                KVMapperHBase.class, // mapper class
                Text.class, // mapper output key
                BytesWritable.class, // mapper output value
                job);

        TableMapReduceUtil.initTableReducerJob(fm.tableName, // output table
                KVReducerHBase.class, // reducer class
                job);

        jobStatus = job.waitForCompletion(true) ? 0 : 1;
        return jobStatus;

    }
    case HB2HF: {

        String intermediateFolder = output + "_intermediate";
        Path intermediateOutpurDir = new Path(intermediateFolder);

        IdSearchLog.l.info("Starting Job for HB2HF input field separator " + KVIndexer.FIELD_SEPARATOR
                + " using hbase table : " + fm.tableName + " and intremediate output folder "
                + intermediateFolder + " final output dir " + output);

        //reset the output folder to intermediate folder
        Configuration conf = job.getConfiguration();
        conf.set(OUTPUT_FOLDER, intermediateFolder);
        int jobT = JobTypeMapping.get("HB2MF");
        jobStatus = runJob(jobT, job, fm, input, intermediateFolder, scannerCacheSize, filter);

        if (jobStatus == 0) {

            Configuration hfileConf = HBaseConfiguration.create();
            hfileConf.set(XML_FILE_PATH, conf.get(XML_FILE_PATH));
            Job hfileJob = Job.getInstance(hfileConf, "Creating Hfile");
            String dataInputPath = intermediateFolder + "/" + MapFile.DATA_FILE_NAME;
            jobT = JobTypeMapping.get("IMF2HF");
            jobStatus = runJob(jobT, hfileJob, fm, dataInputPath, output, scannerCacheSize, filter);
        }

        //delete intermediate dir
        FileSystem.get(conf).delete(intermediateOutpurDir, true);
        //delete the empty _SUCCESS folder
        FileSystem.get(conf).delete(new Path(output, "_SUCCESS"), true);

        return jobStatus;
    }
    case HB2MF: {

        if (fm.tableName.equals(input)) {
            throw new IOException("Input table and index table can not be same");
        }

        Scan scan = new Scan();
        scan.setCaching(scannerCacheSize);
        scan.setCacheBlocks(false);
        scan.addFamily(fm.familyName.getBytes());

        if (null != filter) {
            if (filter.trim().length() > 0) {
                int index = filter.indexOf('=');
                scan.setFilter(new SingleColumnValueFilter(fm.familyName.getBytes(),
                        filter.substring(0, index).getBytes(), CompareOp.EQUAL,
                        filter.substring(index + 1).getBytes()));
            }
        }

        TableMapReduceUtil.initTableMapperJob(input, // input table
                scan, // Scan instance to control CF and attribute selection
                KVMapperHBase.class, // mapper class
                Text.class, // mapper output key
                BytesWritable.class, // mapper output value
                job);

        job.setReducerClass(KVReducerMapFile.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);
        LazyOutputFormat.setOutputFormatClass(job, NullOutputFormat.class);

        jobStatus = job.waitForCompletion(true) ? 0 : 1;
        return jobStatus;
    }
    case IMF2HF: {

        Path finalOutputDir = new Path(output);
        job.setJarByClass(KVIndexer.class);
        job.setMapperClass(KVMapperHFile.class);

        job.setInputFormatClass(SequenceFileInputFormat.class);
        SequenceFileInputFormat.addInputPath(job, new Path(input));
        FileOutputFormat.setOutputPath(job, finalOutputDir);

        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(KeyValue.class);

        HTable hTable = new HTable(job.getConfiguration(), fm.tableName);
        HFileOutputFormat.configureIncrementalLoad(job, hTable);

        jobStatus = job.waitForCompletion(true) ? 0 : 1;
        return jobStatus;
    }

    default:
        throw new IOException("Invalid Jobtype " + jobTypeI);
    }
}

From source file:com.bizosys.hsearch.kv.indexing.KVIndexer.java

License:Apache License

private static int runJob(int jobTypeI, Job job, FieldMapping fm, String input, String output,
        int scannerCacheSize, String filter) throws IOException, InterruptedException, ClassNotFoundException {

    int jobStatus = -1;

    switch (jobTypeI) {
    case SF2HB: {

        IdSearchLog.l.info("Starting Job for SF2HB input field separator " + KVIndexer.FIELD_SEPARATOR
                + " using hbase table : " + fm.tableName + " and output folder " + output);

        FileInputFormat.addInputPath(job, new Path(input));

        job.setMapperClass(KVMapperFile.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setMapOutputKeyClass(TextPair.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(KVReducerHBase.class);
        TableMapReduceUtil.initTableReducerJob(fm.tableName, KVReducerHBase.class, job);
        jobStatus = job.waitForCompletion(true) ? 0 : 1;
        return jobStatus;
    }/*w  ww  . j  a v  a 2s . c o m*/
    case SF2HF: {

        //First creates map file and then convert to hfile.
        //create intermediate dir for map file output

        String intermediateFolder = output + "_intermediate";
        Path intermediateOutpurDir = new Path(intermediateFolder);

        IdSearchLog.l.info("Starting Job for SF2HF input field separator " + KVIndexer.FIELD_SEPARATOR
                + " using hbase table : " + fm.tableName + " and intremediate output folder "
                + intermediateFolder + " final output dir " + output);

        //reset the output folder to intermediate folder
        Configuration conf = job.getConfiguration();
        conf.set(OUTPUT_FOLDER, intermediateFolder);
        int jobT = JobTypeMapping.get("SF2MF");
        jobStatus = runJob(jobT, job, fm, input, intermediateFolder, scannerCacheSize, filter);

        if (jobStatus == 0) {

            Configuration hfileConf = HBaseConfiguration.create();
            hfileConf.set(XML_FILE_PATH, conf.get(XML_FILE_PATH));
            Job hfileJob = Job.getInstance(hfileConf, "Creating Hfile");
            String dataInputPath = intermediateFolder + "/" + MapFile.DATA_FILE_NAME;
            jobT = JobTypeMapping.get("IMF2HF");
            jobStatus = runJob(jobT, hfileJob, fm, dataInputPath, output, scannerCacheSize, filter);
        }

        //delete intermediate dir
        FileSystem.get(conf).delete(intermediateOutpurDir, true);
        //delete the empty _SUCCESS folder
        FileSystem.get(conf).delete(new Path(output, "_SUCCESS"), true);

        return jobStatus;
    }
    case SF2MF: {

        IdSearchLog.l.info("Starting Job for SF2MF input field separator " + KVIndexer.FIELD_SEPARATOR
                + " using hbase table : " + fm.tableName + " and output folder " + output);

        FileInputFormat.addInputPath(job, new Path(input));

        job.setMapperClass(KVMapperFile.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setMapOutputKeyClass(TextPair.class);
        job.setMapOutputValueClass(Text.class);

        job.setSortComparatorClass(TextPair.FirstComparator.class);

        job.setReducerClass(KVReducerMapFile.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(ImmutableBytesWritable.class);
        LazyOutputFormat.setOutputFormatClass(job, NullOutputFormat.class);

        jobStatus = job.waitForCompletion(true) ? 0 : 1;
        return jobStatus;

    }
    case MF2HB: {

        job.setMapperClass(KVMapperMapFile.class);
        job.setInputFormatClass(SequenceFileAsTextInputFormat.class);
        job.setMapOutputKeyClass(TextPair.class);
        job.setMapOutputValueClass(Text.class);
        SequenceFileAsTextInputFormat.addInputPath(job, new Path(input));

        job.setReducerClass(KVReducerHBase.class);
        TableMapReduceUtil.initTableReducerJob(fm.tableName, KVReducerHBase.class, job);

        jobStatus = job.waitForCompletion(true) ? 0 : 1;
        return jobStatus;
    }
    case MF2HF: {

        String intermediateFolder = output + "_intermediate";
        Path intermediateOutpurDir = new Path(intermediateFolder);

        IdSearchLog.l.info("Starting Job for HB2HF input field separator " + KVIndexer.FIELD_SEPARATOR
                + " using hbase table : " + fm.tableName + " and intremediate output folder "
                + intermediateFolder + " final output dir " + output);

        //reset the output folder to intermediate folder
        Configuration conf = job.getConfiguration();
        conf.set(OUTPUT_FOLDER, intermediateFolder);
        int jobT = JobTypeMapping.get("MF2MF");
        jobStatus = runJob(jobT, job, fm, input, intermediateFolder, scannerCacheSize, filter);

        if (jobStatus == 0) {

            Configuration hfileConf = HBaseConfiguration.create();
            hfileConf.set(XML_FILE_PATH, conf.get(XML_FILE_PATH));
            Job hfileJob = Job.getInstance(hfileConf, "Creating Hfile");
            String dataInputPath = intermediateFolder + "/" + MapFile.DATA_FILE_NAME;
            jobT = JobTypeMapping.get("IMF2HF");
            jobStatus = runJob(jobT, hfileJob, fm, dataInputPath, output, scannerCacheSize, filter);
        }

        //delete intermediate dir
        FileSystem.get(conf).delete(intermediateOutpurDir, true);
        //delete the empty _SUCCESS folder
        FileSystem.get(conf).delete(new Path(output, "_SUCCESS"), true);

        return jobStatus;
    }
    case MF2MF: {

        job.setMapperClass(KVMapperMapFile.class);
        job.setInputFormatClass(SequenceFileAsTextInputFormat.class);
        job.setMapOutputKeyClass(TextPair.class);
        job.setMapOutputValueClass(Text.class);
        SequenceFileAsTextInputFormat.addInputPath(job, new Path(input));

        job.setReducerClass(KVReducerMapFile.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(ImmutableBytesWritable.class);
        LazyOutputFormat.setOutputFormatClass(job, NullOutputFormat.class);

        jobStatus = job.waitForCompletion(true) ? 0 : 1;
        return jobStatus;
    }
    case HB2HB: {

        if (fm.tableName.equals(input)) {
            throw new IOException("Input table and index table can not be same");
        }

        Scan scan = new Scan();
        scan.setCaching(scannerCacheSize);
        scan.setCacheBlocks(false);
        scan.addFamily(fm.familyName.getBytes());
        if (null != filter) {
            if (filter.trim().length() > 0) {
                int index = filter.indexOf('=');
                scan.setFilter(new SingleColumnValueFilter(fm.familyName.getBytes(),
                        filter.substring(0, index).getBytes(), CompareOp.EQUAL,
                        filter.substring(index + 1).getBytes()));
            }
        }

        TableMapReduceUtil.initTableMapperJob(input, // input table
                scan, // Scan instance to control CF and attribute selection
                KVMapperHBase.class, // mapper class
                TextPair.class, // mapper output key
                Text.class, // mapper output value
                job);

        TableMapReduceUtil.initTableReducerJob(fm.tableName, // output table
                KVReducerHBase.class, // reducer class
                job);

        jobStatus = job.waitForCompletion(true) ? 0 : 1;
        return jobStatus;

    }
    case HB2HF: {

        String intermediateFolder = output + "_intermediate";
        Path intermediateOutpurDir = new Path(intermediateFolder);

        IdSearchLog.l.info("Starting Job for HB2HF input field separator " + KVIndexer.FIELD_SEPARATOR
                + " using hbase table : " + fm.tableName + " and intremediate output folder "
                + intermediateFolder + " final output dir " + output);

        //reset the output folder to intermediate folder
        Configuration conf = job.getConfiguration();
        conf.set(OUTPUT_FOLDER, intermediateFolder);
        int jobT = JobTypeMapping.get("HB2MF");
        jobStatus = runJob(jobT, job, fm, input, intermediateFolder, scannerCacheSize, filter);

        if (jobStatus == 0) {

            Configuration hfileConf = HBaseConfiguration.create();
            hfileConf.set(XML_FILE_PATH, conf.get(XML_FILE_PATH));
            Job hfileJob = Job.getInstance(hfileConf, "Creating Hfile");
            String dataInputPath = intermediateFolder + "/" + MapFile.DATA_FILE_NAME;
            jobT = JobTypeMapping.get("IMF2HF");
            jobStatus = runJob(jobT, hfileJob, fm, dataInputPath, output, scannerCacheSize, filter);
        }

        //delete intermediate dir
        FileSystem.get(conf).delete(intermediateOutpurDir, true);
        //delete the empty _SUCCESS folder
        FileSystem.get(conf).delete(new Path(output, "_SUCCESS"), true);

        return jobStatus;
    }
    case HB2MF: {

        if (fm.tableName.equals(input)) {
            throw new IOException("Input table and index table can not be same");
        }

        Scan scan = new Scan();
        scan.setCaching(scannerCacheSize);
        scan.setCacheBlocks(false);
        scan.addFamily(fm.familyName.getBytes());

        if (null != filter) {
            if (filter.trim().length() > 0) {
                int index = filter.indexOf('=');
                scan.setFilter(new SingleColumnValueFilter(fm.familyName.getBytes(),
                        filter.substring(0, index).getBytes(), CompareOp.EQUAL,
                        filter.substring(index + 1).getBytes()));
            }
        }

        TableMapReduceUtil.initTableMapperJob(input, // input table
                scan, // Scan instance to control CF and attribute selection
                KVMapperHBase.class, // mapper class
                TextPair.class, // mapper output key
                Text.class, // mapper output value
                job);

        job.setReducerClass(KVReducerMapFile.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(ImmutableBytesWritable.class);
        LazyOutputFormat.setOutputFormatClass(job, NullOutputFormat.class);

        jobStatus = job.waitForCompletion(true) ? 0 : 1;
        return jobStatus;
    }
    case IMF2HF: {

        Path finalOutputDir = new Path(output);
        job.setJarByClass(KVIndexer.class);
        job.setMapperClass(KVMapperHFile.class);

        job.setInputFormatClass(SequenceFileInputFormat.class);
        SequenceFileInputFormat.addInputPath(job, new Path(input));
        FileOutputFormat.setOutputPath(job, finalOutputDir);

        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(KeyValue.class);

        HTable hTable = new HTable(job.getConfiguration(), fm.tableName);
        HFileOutputFormat.configureIncrementalLoad(job, hTable);

        jobStatus = job.waitForCompletion(true) ? 0 : 1;
        return jobStatus;
    }

    default:
        throw new IOException("Invalid Jobtype " + jobTypeI);
    }
}

From source file:com.linkedin.thirdeye.hadoop.derivedcolumn.transformation.DerivedColumnTransformationPhaseJob.java

License:Apache License

public Job run() throws Exception {
    Job job = Job.getInstance(getConf());
    job.setJobName(name);/*from  ww w.j  a v a2 s. c o  m*/
    job.setJarByClass(DerivedColumnTransformationPhaseJob.class);

    Configuration configuration = job.getConfiguration();
    FileSystem fs = FileSystem.get(configuration);

    // Input Path
    String inputPathDir = getAndSetConfiguration(configuration, DERIVED_COLUMN_TRANSFORMATION_PHASE_INPUT_PATH);
    LOGGER.info("Input path dir: " + inputPathDir);
    for (String inputPath : inputPathDir.split(",")) {
        LOGGER.info("Adding input:" + inputPath);
        Path input = new Path(inputPath);
        FileInputFormat.addInputPath(job, input);
    }

    // Topk path
    String topkPath = getAndSetConfiguration(configuration, DERIVED_COLUMN_TRANSFORMATION_PHASE_TOPK_PATH);
    LOGGER.info("Topk path : " + topkPath);

    // Output path
    Path outputPath = new Path(
            getAndSetConfiguration(configuration, DERIVED_COLUMN_TRANSFORMATION_PHASE_OUTPUT_PATH));
    LOGGER.info("Output path dir: " + outputPath.toString());
    if (fs.exists(outputPath)) {
        fs.delete(outputPath, true);
    }
    FileOutputFormat.setOutputPath(job, outputPath);

    // Schema
    Schema avroSchema = ThirdeyeAvroUtils.getSchema(inputPathDir);
    LOGGER.info("Schema : {}", avroSchema.toString(true));

    // ThirdEyeConfig
    String metricTypesProperty = ThirdeyeAvroUtils.getMetricTypesProperty(
            props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_NAMES.toString()),
            props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString()), avroSchema);
    props.setProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString(), metricTypesProperty);
    ThirdEyeConfig thirdeyeConfig = ThirdEyeConfig.fromProperties(props);
    job.getConfiguration().set(DERIVED_COLUMN_TRANSFORMATION_PHASE_THIRDEYE_CONFIG.toString(),
            OBJECT_MAPPER.writeValueAsString(thirdeyeConfig));
    LOGGER.info("ThirdEyeConfig {}", thirdeyeConfig.encode());

    // New schema
    Schema outputSchema = newSchema(thirdeyeConfig);
    job.getConfiguration().set(DERIVED_COLUMN_TRANSFORMATION_PHASE_OUTPUT_SCHEMA.toString(),
            outputSchema.toString());

    // Map config
    job.setMapperClass(DerivedColumnTransformationPhaseMapper.class);
    job.setInputFormatClass(AvroKeyInputFormat.class);
    job.setMapOutputKeyClass(AvroKey.class);
    job.setMapOutputValueClass(NullWritable.class);
    AvroJob.setOutputKeySchema(job, outputSchema);
    LazyOutputFormat.setOutputFormatClass(job, AvroKeyOutputFormat.class);
    AvroMultipleOutputs.addNamedOutput(job, "avro", AvroKeyOutputFormat.class, outputSchema);

    job.setNumReduceTasks(0);

    job.waitForCompletion(true);

    return job;
}

From source file:com.pagerankcalculator.TwitterPageRank.java

/**
 * Graph Parsing/* w w  w.ja  v a  2 s . c  o m*/
 * Memasukan data mentah dan melakukan inisialisasi pagerank
 * 
 * @param in file data masukan
 * @param out direktori output
 */
public int parseGraph(String in, String out) throws IOException, InterruptedException, ClassNotFoundException {

    Job job = Job.getInstance(getConf());
    job.setJobName("[" + TwitterPageRank.AUTHOR + "]: Job#1 Parsing Graph");
    job.setJarByClass(TwitterPageRank.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(GraphParsingMapper.class);
    job.setReducerClass(GraphParsingReducer.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setNumReduceTasks(TwitterPageRank.NUM_REDUCE_TASKS);

    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    Path inputFilePath = new Path(in);
    Path outputFilePath = new Path(out);

    FileInputFormat.addInputPath(job, inputFilePath);
    FileOutputFormat.setOutputPath(job, outputFilePath);

    FileSystem fs = FileSystem.newInstance(getConf());

    if (fs.exists(outputFilePath)) {
        fs.delete(outputFilePath, true);
    }

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:com.pagerankcalculator.TwitterPageRank.java

public int calculatePagerank(String in, String out, int iteration)
        throws IOException, InterruptedException, ClassNotFoundException {
    Job job = Job.getInstance(getConf());
    job.setJobName("[" + TwitterPageRank.AUTHOR + "]: Job#2 Iteration-" + iteration + " Calculating Page Rank");
    job.setJarByClass(TwitterPageRank.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(PageRankCalculationMapper.class);
    job.setReducerClass(PageRankCalculationReducer.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setNumReduceTasks(TwitterPageRank.NUM_REDUCE_TASKS);

    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    Path inputFilePath = new Path(in);
    Path outputFilePath = new Path(out);

    FileInputFormat.addInputPath(job, inputFilePath);
    FileOutputFormat.setOutputPath(job, outputFilePath);

    FileSystem fs = FileSystem.newInstance(getConf());

    if (fs.exists(outputFilePath)) {
        fs.delete(outputFilePath, true);
    }/*from w  w  w. ja  va 2 s .  c o m*/

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:com.pagerankcalculator.TwitterPageRank.java

public int sortPagerank(String in, String out)
        throws IOException, InterruptedException, ClassNotFoundException {
    Job job = Job.getInstance(getConf());
    job.setJobName("[" + TwitterPageRank.AUTHOR + "]: Job#3 Sorting Page Rank");
    job.setJarByClass(TwitterPageRank.class);

    job.setMapOutputKeyClass(DoubleWritable.class);
    job.setMapOutputValueClass(Text.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(PageRankSortingMapper.class);
    job.setReducerClass(PageRankSortingReducer.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setNumReduceTasks(1);//from   w  w w .  j  av a 2 s.  c o m

    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    job.setSortComparatorClass(DoubleSortDescComparator.class);

    Path inputFilePath = new Path(in);
    Path outputFilePath = new Path(out);

    FileInputFormat.addInputPath(job, inputFilePath);
    FileOutputFormat.setOutputPath(job, outputFilePath);

    FileSystem fs = FileSystem.newInstance(getConf());

    if (fs.exists(outputFilePath)) {
        fs.delete(outputFilePath, true);
    }

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:com.rw.legion.DefaultJob.java

License:Apache License

/**
 * Main method.//from  w  w w .  j a va  2s .  co  m
 * 
 * @param args  Arguments should be: 1) input path, 2) output path, 3)
 * location of Legion objective file.
 */
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();

    // Load the Legion objective from the JSON doc.
    Path path = new Path(args[2]);
    FileSystem fs = FileSystem.get(new URI(args[2]), conf);
    BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(path)));
    String json = "";

    String line = br.readLine();

    while (line != null) {
        json += line;
        line = br.readLine();
    }

    br.close();

    /*
     *  Save the JSON for the Legion objective to the Hadoop configuration,
     *  so we can access it in other containers.
     */
    conf.setStrings("legion_objective", json);

    // De-serialize the objective so we can access the settings here.
    LegionObjective legionObjective = ObjectiveDeserializer.deserialize(json);

    // Start configuring the MapReduce job.
    Job hadoopJob = Job.getInstance(conf, "Legion");

    hadoopJob.setJarByClass(DefaultJob.class);
    hadoopJob.setMapperClass(DefaultMapper.class);
    LazyOutputFormat.setOutputFormatClass(hadoopJob, TextOutputFormat.class);

    // Compress the output to speed things up.
    TextOutputFormat.setCompressOutput(hadoopJob, true);
    TextOutputFormat.setOutputCompressorClass(hadoopJob, GzipCodec.class);

    // What input format do we use?

    try {
        @SuppressWarnings("unchecked")
        Class<? extends FileInputFormat<NullWritable, LegionRecord>> inputClass = (Class<? extends FileInputFormat<NullWritable, LegionRecord>>) Class
                .forName(legionObjective.getInputFormat());

        hadoopJob.setInputFormatClass(inputClass);
    } catch (Exception e) {
        throw new JsonParseException(
                "Problem loading input format " + "class '" + legionObjective.getInputFormat() + "'");
    }

    // Should we set a max combined size?

    if (legionObjective.getMaxCombinedSize() != null) {
        CombineFileInputFormat.setMaxInputSplitSize(hadoopJob, legionObjective.getMaxCombinedSize());
    }

    /* 
     * These are just static convenience methods, so it doesn't matter if
     * they come from the wrong class.
     */
    FileInputFormat.setInputDirRecursive(hadoopJob, true);
    FileInputFormat.addInputPath(hadoopJob, new Path(args[0]));

    FileOutputFormat.setOutputPath(hadoopJob, new Path(args[1]));

    // Since a Legion objective can specify multiple output tables.
    for (OutputTable outputTable : legionObjective.getOutputTables()) {
        MultipleOutputs.addNamedOutput(hadoopJob, outputTable.getTitle(), TextOutputFormat.class,
                NullWritable.class, Text.class);
    }

    MultipleOutputs.addNamedOutput(hadoopJob, "skipped", TextOutputFormat.class, NullWritable.class,
            Text.class);

    hadoopJob.waitForCompletion(true);
}

From source file:com.twitter.algebra.nmf.ColPartitionJob.java

License:Apache License

/**
 * Partition A on columns, where A refers to the path that contain a matrix in
 * {@link SequenceFileInputFormat}. Refer to {@link ColPartitionJob} for
 * further details.//  ww w. j  av  a  2s.c  o m
 * 
 * @param conf the initial configuration
 * @param matrixInputPath the path to the input matrix A
 * @param matrixOutputPath the path of the resulting partitioned matrix
 * @param numInputRows rows
 * @param numInputCols cols
 * @param numColPartitions the hint for the desired number of column
 *          partitions
 * @return the running job
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public Job run(Configuration conf, Path matrixInputPath, Path matrixOutputPath, int numInputRows,
        int numInputCols, int numColPartitions)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf = new Configuration(conf);

    FileSystem fs = FileSystem.get(matrixOutputPath.toUri(), conf);
    int numReducers = NMFCommon.getNumberOfReduceSlots(conf, "colpartition");

    int colPartSize = getColPartitionSize(numInputCols, numColPartitions);
    numColPartitions = (int) Math.ceil(numInputCols / (double) colPartSize);

    if (numReducers < numColPartitions)
        numReducers = numColPartitions;

    NMFCommon.setNumberOfMapSlots(conf, fs, matrixInputPath, "colpartition");

    conf.setInt(NUM_ORIG_ROWS_KEY, numInputRows);
    conf.setInt(NUM_ORIG_COLS_KEY, numInputCols);
    conf.setInt(NUM_COL_PARTITIONS, numColPartitions);
    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(ColPartitionJob.class);
    job.setJobName(ColPartitionJob.class.getSimpleName());

    matrixOutputPath = fs.makeQualified(matrixOutputPath);

    MultipleInputs.addInputPath(job, matrixInputPath, SequenceFileInputFormat.class);

    FileOutputFormat.setOutputPath(job, matrixOutputPath);
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(ElementWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);

    RowColPartitioner.setPartitioner(job, RowColPartitioner.ElementRowColPartitioner.class, numInputRows,
            numInputCols, numColPartitions);

    job.setReducerClass(MyReducer.class);
    job.setNumReduceTasks(numReducers);

    //    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.submit();
    boolean res = job.waitForCompletion(true);
    if (!res)
        throw new IOException("Job failed!");
    return job;
}

From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.ConfigurationHelper.java

License:Apache License

/**
 * Job configurator//from  w  w  w .  j  a  v a2  s .  c o  m
 *
 * @param job                      job instance
 * @param jarByClass               class of the jar
 * @param mapperClass              mapper
 * @param reducerClass             reducer
 * @param commaSeparatedInputFiles input paths
 * @param outputPath               output
 * @throws IOException I/O exception
 */
public static void configureJob(Job job, Class<?> jarByClass, Class<? extends Mapper> mapperClass,
        Class<? extends Reducer> reducerClass, String commaSeparatedInputFiles, String outputPath)
        throws IOException {
    job.setJarByClass(jarByClass);
    job.setJobName(jarByClass.getName());

    // mapper
    job.setMapperClass(mapperClass);

    // reducer
    job.setReducerClass(reducerClass);

    // input-output is warc
    job.setInputFormatClass(WARCInputFormat.class);
    // prevent producing empty files
    LazyOutputFormat.setOutputFormatClass(job, WARCOutputFormat.class);

    // intermediate data
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(WARCWritable.class);

    // output data
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(WARCWritable.class);

    // set output compression to GZip
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
}