Example usage for org.apache.hadoop.mapred JobConf setOutputFormat

Introduction

In this page you can find the example usage for org.apache.hadoop.mapred JobConf setOutputFormat.

Prototype

public void setOutputFormat(Class<? extends OutputFormat> theClass)

Source Link

Document

Set the OutputFormat implementation for the map-reduce job.

Usage

From source file:cascading.hcatalog.HCatScheme.java

License:Apache License

@Override
public void sinkConfInit(FlowProcess<JobConf> flowProcess, Tap<JobConf, RecordReader, OutputCollector> tap,
        JobConf conf) {
    conf.setOutputFormat(outputFormat);
    conf.set(HiveProps.HIVE_COLUMNS, (String) tableMetadata.get(HiveProps.HIVE_COLUMNS));
    conf.set(HiveProps.HIVE_COLUMN_TYPES, (String) tableMetadata.get(HiveProps.HIVE_COLUMN_TYPES));
    createSerDe(conf);/* w  w  w.j  av a2 s  .  co m*/
}

From source file:cascading.hive.ORCFile.java

License:Apache License

@Override
public void sinkConfInit(FlowProcess<JobConf> flowProcess, Tap<JobConf, RecordReader, OutputCollector> tap,
        JobConf conf) {
    conf.setOutputKeyClass(NullWritable.class);
    conf.setOutputValueClass(OrcSerde.OrcSerdeRow.class);
    conf.setOutputFormat(OrcOutputFormat.class);
}

From source file:cascading.hive.RCFile.java

License:Apache License

@Override
public void sinkConfInit(FlowProcess<JobConf> flowProcess, Tap<JobConf, RecordReader, OutputCollector> tap,
        JobConf conf) {
    conf.setOutputKeyClass(WritableComparable.class);
    conf.setOutputValueClass(BytesRefArrayWritable.class);
    conf.setOutputFormat(RCFileOutputFormat.class);
    conf.set(HiveProps.HIVE_COLUMN_NUMBER, String.valueOf(getSinkFields().size()));
}

From source file:cascading.jdbc.db.DBOutputFormat.java

License:Apache License

/**
 * Initializes the reduce-part of the job with the appropriate output settings
 *
 * @param job                 The job/*  ww  w. j a  v a 2 s.  c  o m*/
 * @param dbOutputFormatClass
 * @param tableName           The table to insert data into
 * @param fieldNames          The field names in the table. If unknown, supply the appropriate
 */
public static void setOutput(JobConf job, Class<? extends DBOutputFormat> dbOutputFormatClass, String tableName,
        String[] fieldNames, String[] updateFields, int batchSize) {
    if (dbOutputFormatClass == null)
        job.setOutputFormat(DBOutputFormat.class);
    else
        job.setOutputFormat(dbOutputFormatClass);

    // writing doesn't always happen in reduce
    job.setReduceSpeculativeExecution(false);
    job.setMapSpeculativeExecution(false);

    DBConfiguration dbConf = new DBConfiguration(job);

    dbConf.setOutputTableName(tableName);
    dbConf.setOutputFieldNames(fieldNames);

    if (updateFields != null)
        dbConf.setOutputUpdateFieldNames(updateFields);

    if (batchSize != -1)
        dbConf.setBatchStatementsNum(batchSize);
}

From source file:cascading.scheme.SequenceFile.java

License:Open Source License

@Override
public void sinkInit(Tap tap, JobConf conf) {
    conf.setOutputKeyClass(Tuple.class); // supports TapCollector
    conf.setOutputValueClass(Tuple.class); // supports TapCollector
    conf.setOutputFormat(SequenceFileOutputFormat.class);
}

From source file:cascading.scheme.TextLine.java

License:Open Source License

@Override
public void sinkInit(Tap tap, JobConf conf) throws IOException {
    if (tap.getQualifiedPath(conf).toString().endsWith(".zip"))
        throw new IllegalStateException("cannot write zip files: " + FileOutputFormat.getOutputPath(conf));

    if (getSinkCompression() == Compress.DISABLE)
        conf.setBoolean("mapred.output.compress", false);
    else if (getSinkCompression() == Compress.ENABLE)
        conf.setBoolean("mapred.output.compress", true);

    conf.setOutputKeyClass(Text.class); // be explicit
    conf.setOutputValueClass(Text.class); // be explicit
    conf.setOutputFormat(TextOutputFormat.class);
}

From source file:clusteringblocks.ClusteringBlocks.java

public int run(String[] args) throws Exception {
    Configuration conf = getConf();

    JobConf job = new JobConf(conf, ClusteringBlocks.class);

    Path in = new Path(args[0]);
    Path out = new Path(args[1]);
    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);

    job.setJobName("ClusteringBlocks");
    job.setMapperClass(MapClass.class);
    job.setReducerClass(Reduce.class);

    job.setInputFormat(KeyValueTextInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    //   job.set("key.value.separator.in.input.line", "");

    JobClient.runJob(job);//from   w w w.ja  v  a2s  .c  o  m

    return 0;
}

From source file:cn.edu.hfut.dmic.webcollectorcluster.fetcher.Fetcher.java

@Override
public int run(String[] args) throws Exception {
    JobConf jc = new JobConf(getConf());
    jc.setJarByClass(Fetcher.class);
    jc.setInputFormat(SequenceFileInputFormat.class);
    Path input = new Path(args[0], "current");
    Path output = new Path(args[1]);
    Configuration conf = CrawlerConfiguration.create();
    FileSystem fs = output.getFileSystem(conf);
    if (fs.exists(output)) {
        fs.delete(output);/*from  w  w w  .j  av  a2  s . c  o m*/
    }
    FileInputFormat.addInputPath(jc, input);
    FileOutputFormat.setOutputPath(jc, output);

    jc.setMapOutputKeyClass(Text.class);
    jc.setMapOutputValueClass(WebWritable.class);

    jc.setMapRunnerClass(Fetcher.class);
    jc.setOutputFormat(FetcherOutputFormat.class);

    JobClient.runJob(jc);
    return 0;
}

From source file:cn.edu.xmu.dm.mapreduce.Sort.java

License:Apache License

/**
 * The main driver for sort program. Invoke this method to submit the
 * map/reduce job./*from  w  w w. ja  v a2s  .  co m*/
 * 
 * @throws IOException
 *             When there is communication problems with the job tracker.
 */
public int run(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = new Job(conf, "Sorter");
    job.setJarByClass(Sort.class);

    JobConf jobConf = new JobConf(getConf(), Sort.class);
    jobConf.setJobName("sorter");

    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(IdentityReducer.class);

    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.getClusterStatus();
    int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.9);
    String sort_reduces = jobConf.get("test.sort.reduces_per_host");
    if (sort_reduces != null) {
        num_reduces = cluster.getTaskTrackers() * Integer.parseInt(sort_reduces);
    }
    Class<? extends InputFormat> inputFormatClass = SequenceFileInputFormat.class;
    Class<? extends OutputFormat> outputFormatClass = SequenceFileOutputFormat.class;
    Class<? extends WritableComparable> outputKeyClass = BytesWritable.class;
    Class<? extends Writable> outputValueClass = BytesWritable.class;
    List<String> otherArgs = new ArrayList<String>();
    InputSampler.Sampler<K, V> sampler = null;
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                jobConf.setNumMapTasks(Integer.parseInt(args[++i]));
            } else if ("-r".equals(args[i])) {
                num_reduces = Integer.parseInt(args[++i]);
            } else if ("-inFormat".equals(args[i])) {
                inputFormatClass = Class.forName(args[++i]).asSubclass(InputFormat.class);
            } else if ("-outFormat".equals(args[i])) {
                outputFormatClass = Class.forName(args[++i]).asSubclass(OutputFormat.class);
            } else if ("-outKey".equals(args[i])) {
                outputKeyClass = Class.forName(args[++i]).asSubclass(WritableComparable.class);
            } else if ("-outValue".equals(args[i])) {
                outputValueClass = Class.forName(args[++i]).asSubclass(Writable.class);
            } else if ("-totalOrder".equals(args[i])) {
                double pcnt = Double.parseDouble(args[++i]);
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits)
                    maxSplits = Integer.MAX_VALUE;
                sampler = new InputSampler.RandomSampler<K, V>(pcnt, numSamples, maxSplits);
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }

    // Set user-supplied (possibly default) job configs
    jobConf.setNumReduceTasks(num_reduces);

    jobConf.setInputFormat(inputFormatClass);
    jobConf.setOutputFormat(outputFormatClass);

    jobConf.setOutputKeyClass(outputKeyClass);
    jobConf.setOutputValueClass(outputValueClass);

    // Make sure there are exactly 2 parameters left.
    if (otherArgs.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 2.");
        return printUsage();
    }
    FileInputFormat.setInputPaths(jobConf, otherArgs.get(0));
    FileOutputFormat.setOutputPath(jobConf, new Path(otherArgs.get(1)));

    if (sampler != null) {
        System.out.println("Sampling input to effect total-order sort...");
        jobConf.setPartitionerClass(TotalOrderPartitioner.class);
        Path inputDir = FileInputFormat.getInputPaths(jobConf)[0];
        inputDir = inputDir.makeQualified(inputDir.getFileSystem(jobConf));
        Path partitionFile = new Path(inputDir, "_sortPartitioning");
        TotalOrderPartitioner.setPartitionFile(jobConf, partitionFile);
        InputSampler.<K, V>writePartitionFile(jobConf, sampler);
        URI partitionUri = new URI(partitionFile.toString() + "#" + "_sortPartitioning");
        DistributedCache.addCacheFile(partitionUri, jobConf);
        DistributedCache.createSymlink(jobConf);
    }

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
            + FileInputFormat.getInputPaths(jobConf)[0] + " into " + FileOutputFormat.getOutputPath(jobConf)
            + " with " + num_reduces + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    jobResult = JobClient.runJob(jobConf);
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return 0;
}

From source file:co.nubetech.hiho.job.DBQueryInputJob.java

License:Apache License

public void runJobs(Configuration conf, int jobCounter) throws IOException {

    try {/*from   ww  w . j av a2s.co m*/
        checkMandatoryConfs(conf);
    } catch (HIHOException e1) {
        e1.printStackTrace();
        throw new IOException(e1);
    }

    Job job = new Job(conf);
    for (Entry<String, String> entry : conf) {
        logger.warn("key, value " + entry.getKey() + "=" + entry.getValue());
    }

    // logger.debug("Number of maps " +
    // conf.getInt("mapred.map.tasks", 1));
    // conf.setInt(JobContext.NUM_MAPS,
    // conf.getInt("mapreduce.job.maps", 1));
    // job.getConfiguration().setInt("mapred.map.tasks", 4);
    job.getConfiguration().setInt(MRJobConfig.NUM_MAPS, conf.getInt(HIHOConf.NUMBER_MAPPERS, 1));
    logger.warn("Number of maps " + conf.getInt(MRJobConfig.NUM_MAPS, 1));

    job.setJobName("Import job");
    job.setJarByClass(DBQueryInputJob.class);

    String strategy = conf.get(HIHOConf.INPUT_OUTPUT_STRATEGY);
    OutputStrategyEnum os = OutputStrategyEnum.value(strategy);
    if (os == null) {
        throw new IllegalArgumentException("Wrong value of output strategy. Please correct");
    }
    if (os != OutputStrategyEnum.AVRO) {
        switch (os) {

        case DUMP: {
            // job.setMapperClass(DBImportMapper.class);
            break;
        }
        /*
         * case AVRO: { job.setMapperClass(DBInputAvroMapper.class); //
         * need avro in cp // job.setJarByClass(Schema.class); // need
         * jackson which is needed by avro - ugly! //
         * job.setJarByClass(ObjectMapper.class);
         * job.setMapOutputKeyClass(NullWritable.class);
         * job.setMapOutputValueClass(AvroValue.class);
         * job.setOutputKeyClass(NullWritable.class);
         * job.setOutputValueClass(AvroValue.class);
         * job.setOutputFormatClass(AvroOutputFormat.class);
         * 
         * AvroOutputFormat.setOutputPath(job, new
         * Path(getConf().get(HIHOConf.INPUT_OUTPUT_PATH))); break; }
         */
        case DELIMITED: {
            job.setMapperClass(DBInputDelimMapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Text.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
            job.setOutputFormatClass(NoKeyOnlyValueOutputFormat.class);

            NoKeyOnlyValueOutputFormat.setOutputPath(job, new Path(getConf().get(HIHOConf.INPUT_OUTPUT_PATH)));
        }
        case JSON: {
            // job.setMapperClass(DBImportJsonMapper.class);
            // job.setJarByClass(ObjectMapper.class);
            break;
        }
        default: {
            job.setMapperClass(DBInputDelimMapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Text.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
            job.setOutputFormatClass(NoKeyOnlyValueOutputFormat.class);

            NoKeyOnlyValueOutputFormat.setOutputPath(job, new Path(getConf().get(HIHOConf.INPUT_OUTPUT_PATH)));
            break;
        }
        }

        String inputQuery = conf.get(DBConfiguration.INPUT_QUERY);
        String inputBoundingQuery = conf.get(DBConfiguration.INPUT_BOUNDING_QUERY);
        logger.debug("About to set the params");
        DBQueryInputFormat.setInput(job, inputQuery, inputBoundingQuery, params);
        logger.debug("Set the params");

        job.setNumReduceTasks(0);

        try {
            // job.setJarByClass(Class.forName(conf.get(
            // org.apache.hadoop.mapred.lib.db.DBConfiguration.DRIVER_CLASS_PROPERTY)));
            logger.debug("OUTPUT format class is " + job.getOutputFormatClass());

            /*
             * org.apache.hadoop.mapreduce.OutputFormat<?, ?> output =
             * ReflectionUtils.newInstance(job.getOutputFormatClass(),
             * job.getConfiguration()); output.checkOutputSpecs(job);
             */
            logger.debug("Class is " + ReflectionUtils
                    .newInstance(job.getOutputFormatClass(), job.getConfiguration()).getClass().getName());
            job.waitForCompletion(false);
            if (conf.get(HIHOConf.INPUT_OUTPUT_LOADTO) != null) {
                generateHiveScript(conf, job, jobCounter);
                generatePigScript(conf, job);
            }

        }
        /*
         * catch (HIHOException h) { h.printStackTrace(); }
         */
        catch (Exception e) {
            e.printStackTrace();
        } catch (HIHOException e) {
            e.printStackTrace();
        }
    }
    // avro to be handled differently, thanks to all the incompatibilities
    // in the apis.
    else {
        String inputQuery = conf.get(DBConfiguration.INPUT_QUERY);
        String inputBoundingQuery = conf.get(DBConfiguration.INPUT_BOUNDING_QUERY);
        logger.debug("About to set the params");
        // co.nubetech.apache.hadoop.mapred.DBQueryInputFormat.setInput(job,
        // inputQuery, inputBoundingQuery, params);
        logger.debug("Set the params");

        JobConf jobConf = new JobConf(conf);

        try {
            GenericDBWritable queryWritable = getDBWritable(jobConf);
            Schema pair = DBMapper.getPairSchema(queryWritable.getColumns());

            AvroJob.setMapOutputSchema(jobConf, pair);
            GenericRecordAvroOutputFormat.setOutputPath(jobConf,
                    new Path(getConf().get(HIHOConf.INPUT_OUTPUT_PATH)));

            co.nubetech.apache.hadoop.mapred.DBQueryInputFormat.setInput(jobConf, inputQuery,
                    inputBoundingQuery, params);
            jobConf.setInputFormat(co.nubetech.apache.hadoop.mapred.DBQueryInputFormat.class);
            jobConf.setMapperClass(DBInputAvroMapper.class);
            jobConf.setMapOutputKeyClass(NullWritable.class);
            jobConf.setMapOutputValueClass(AvroValue.class);
            jobConf.setOutputKeyClass(NullWritable.class);
            jobConf.setOutputValueClass(Text.class);
            jobConf.setOutputFormat(GenericRecordAvroOutputFormat.class);
            jobConf.setJarByClass(DBQueryInputJob.class);
            jobConf.setStrings("io.serializations",
                    "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization,org.apache.avro.mapred.AvroSerialization");
            jobConf.setNumReduceTasks(0);
            /*
             * jobConf.setOutputFormat(org.apache.hadoop.mapred.
             * SequenceFileOutputFormat.class);
             * org.apache.hadoop.mapred.SequenceFileOutputFormat
             * .setOutputPath(jobConf, new
             * Path(getConf().get(HIHOConf.INPUT_OUTPUT_PATH)));
             */
            JobClient.runJob(jobConf);
        } catch (Throwable e) {
            e.printStackTrace();
        }

    }

}