Example usage for org.apache.hadoop.mapreduce Job setOutputFormatClass

Introduction

On this page you can find example usage for org.apache.hadoop.mapreduce Job setOutputFormatClass.

Prototype

public void setOutputFormatClass(Class<? extends OutputFormat> cls) throws IllegalStateException 

Document

Set the OutputFormat for the job.
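
Before the per-project examples below, here is a minimal, self-contained sketch of the call. It is an illustrative skeleton, not taken from any of the examples: the class name and the command-line input/output paths are placeholders. Note that setOutputFormatClass must be called before the job is submitted; afterwards it throws IllegalStateException.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class MinimalOutputFormatExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "minimal-output-format-example");
        job.setJarByClass(MinimalOutputFormatExample.class);

        // Declare how the job reads its input and writes its output.
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class); // must happen before submission

        // Map-only pass-through: the identity mapper re-emits the
        // LongWritable offset / Text line pairs produced by TextInputFormat.
        job.setNumReduceTasks(0);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}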

Usage

From source file:com.soteradefense.dga.louvain.mapreduce.CommunityCompression.java

License:Apache License

public int run(String[] args) throws Exception {
    Configuration mrConf = this.getConf();
    for (java.util.Map.Entry<String, String> entry : dgaConfiguration.getSystemProperties().entrySet()) {
        mrConf.set(entry.getKey(), entry.getValue());
    }

    Job job = Job.getInstance(mrConf);
    job.setJarByClass(CommunityCompression.class);
    Path in = new Path(inputPath);
    Path out = new Path(outputPath);

    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);
    job.setJobName("CommunityCompression");

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LouvainVertexWritable.class);

    job.setMapperClass(CommunityCompression.Map.class);
    job.setReducerClass(CommunityCompression.Reduce.class);

    logger.debug("Running Mapreduce step with job configuration: {}", job);

    return job.waitForCompletion(false) ? 0 : 1;
}

From source file:com.soteradefense.dga.louvain.mapreduce.LouvainTableSynthesizer.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = null;
    try {
        int iteration = 0;
        if (!basePath.endsWith("/"))
            basePath = basePath + "/";
        String inputPath = basePath + GIRAPH_FOLDER_BASE_NAME + FILE_NAME_SEPARATOR + iteration;
        String joinPath = basePath + GIRAPH_FOLDER_BASE_NAME + FILE_NAME_SEPARATOR + (iteration + 1);
        String outputPath = basePath + TABLE_BASE_NAME + FILE_NAME_SEPARATOR + iteration;
        Configuration mrConf = this.getConf();
        job = Job.getInstance(mrConf);

        for (Map.Entry<String, String> entry : dgaConfiguration.getSystemProperties().entrySet()) {
            mrConf.set(entry.getKey(), entry.getValue());
        }

        FileSystem fs = FileSystem.get(job.getConfiguration());
        boolean nextFileExists = fs.exists(new Path(joinPath));
        while (nextFileExists) {
            System.out.println("Processing " + inputPath + " and " + joinPath);
            job = Job.getInstance(mrConf);
            job.setJobName("Louvain Table Synthesizer " + iteration);

            job.setJarByClass(LouvainTableSynthesizer.class);

            job.setMapperClass(LouvainTableSynthesizerMapper.class);
            job.setReducerClass(LouvainTableSynthesizerReducer.class);

            job.setInputFormatClass(TextInputFormat.class);
            job.setOutputFormatClass(TextOutputFormat.class);

            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Text.class);

            //Reducer Output
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(NullWritable.class);

            //Add both input folders
            Path in = new Path(inputPath);
            Path joinIn = new Path(joinPath);
            Path out = new Path(outputPath);
            FileInputFormat.addInputPath(job, in);
            FileInputFormat.addInputPath(job, joinIn);
            FileOutputFormat.setOutputPath(job, out);

            job.waitForCompletion(true);
            //Set the new temp input path
            inputPath = outputPath;
            iteration++;
            outputPath = basePath + TABLE_BASE_NAME + FILE_NAME_SEPARATOR + iteration;
            joinPath = basePath + GIRAPH_FOLDER_BASE_NAME + FILE_NAME_SEPARATOR + (iteration + 1);
            nextFileExists = fs.exists(new Path(joinPath));
        }

    } catch (IOException e) {
        e.printStackTrace();
        return -1;
    } catch (InterruptedException e) {
        e.printStackTrace();
        return -1;
    } catch (ClassNotFoundException e) {
        e.printStackTrace();
        return -1;
    }
    return 0;
}

From source file:com.soteradefense.dga.LouvainRunner.java

License:Apache License

private int runMapreduceJob(String inputPath, String outputPath, DGAConfiguration conf) throws Exception {
    Configuration mrConf = new Configuration();
    for (Map.Entry<String, String> entry : conf.getSystemProperties().entrySet()) {
        mrConf.set(entry.getKey(), entry.getValue());
    }

    Job job = Job.getInstance(mrConf);
    job.setJarByClass(LouvainRunner.class);
    Path in = new Path(inputPath);
    Path out = new Path(outputPath);

    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);
    job.setJobName("CommunityCompression");

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LouvainVertexWritable.class);

    job.setMapperClass(CommunityCompression.Map.class);
    job.setReducerClass(CommunityCompression.Reduce.class);

    logger.debug("Running Mapreduce step with job configuration: {}", job);

    return job.waitForCompletion(false) ? 0 : 1;
}

From source file:com.splicemachine.derby.stream.spark.SparkExportDataSetWriter.java

License:Apache License

@Override
public DataSet<LocatedRow> write() throws StandardException {
    Configuration conf = new Configuration(HConfiguration.unwrapDelegate());
    ByteDataOutput bdo = new ByteDataOutput();
    Job job;
    String encoded;

    try {
        bdo.writeObject(exportFunction);
        encoded = Base64.encodeBase64String(bdo.toByteArray());
        conf.set("exportFunction", encoded);
        job = Job.getInstance(conf);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    job.setOutputKeyClass(Void.class);
    job.setOutputValueClass(LocatedRow.class);
    job.setOutputFormatClass(SparkDataSet.EOutputFormat.class);
    job.getConfiguration().set("mapred.output.dir", directory);

    JavaRDD<V> cached = rdd.cache();
    long writtenRows = cached.count();
    rdd.keyBy(new NullFunction<V>()).setName(String.format("Export Directory: %s", directory))
            .saveAsNewAPIHadoopDataset(job.getConfiguration());
    cached.unpersist();

    ValueRow valueRow = new ValueRow(2);
    valueRow.setColumn(1, new SQLLongint(writtenRows));
    valueRow.setColumn(2, new SQLInteger(0));
    return new SparkDataSet<>(
            SpliceSpark.getContext().parallelize(Collections.singletonList(new LocatedRow(valueRow)), 1));
}

From source file:com.splicemachine.mrio.api.SpliceTableMapReduceUtil.java

License:Apache License

/**
 * Use this before submitting a TableReduce job. It will
 * appropriately set up the JobConf.
 *
 * @param table  The output Splice table name; the format should be Schema.tableName.
 * @param reducer  The reducer class to use.
 * @param job  The current job to adjust.  Make sure the passed job is
 * carrying all necessary configuration.
 * @param partitioner  Partitioner to use. Pass <code>null</code> to use the
 * default partitioner.
 * @param quorumAddress Distant cluster to write to; default is null for
 * output to the cluster that is designated in <code>hbase-site.xml</code>.
 * Set this String to the zookeeper ensemble of an alternate remote cluster
 * when you want the reduce to write to a cluster other than the
 * default; e.g. when copying tables between clusters, the source would be
 * designated by <code>hbase-site.xml</code> and this param would have the
 * ensemble address of the remote cluster.  The format to pass is particular.
 * Pass <code>&lt;hbase.zookeeper.quorum&gt;:&lt;hbase.zookeeper.client.port&gt;:&lt;zookeeper.znode.parent&gt;</code>
 * such as <code>server,server2,server3:2181:/hbase</code>.
 * @param serverClass redefined hbase.regionserver.class
 * @param serverImpl redefined hbase.regionserver.client
 * @param addDependencyJars upload HBase jars and jars for any of the configured
 *           job classes via the distributed cache (tmpjars).
 * @throws IOException When determining the region count fails.
 * @throws SQLException
 */
public static void initTableReducerJob(String table, Class<? extends Reducer> reducer, Job job,
        Class partitioner, String quorumAddress, String serverClass, String serverImpl,
        boolean addDependencyJars, Class<? extends OutputFormat> outputformatClass) throws IOException {

    Configuration conf = job.getConfiguration();
    job.setOutputFormatClass(outputformatClass);
    if (reducer != null)
        job.setReducerClass(reducer);
    conf.set(MRConstants.SPLICE_OUTPUT_TABLE_NAME, table);
    if (sqlUtil == null)
        sqlUtil = SMSQLUtil.getInstance(conf.get(MRConstants.SPLICE_JDBC_STR));
    // If passed a quorum/ensemble address, pass it on to TableOutputFormat.
    String hbaseTableID = null;
    try {
        hbaseTableID = sqlUtil.getConglomID(table);
    } catch (SQLException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
        throw new IOException(e);
    }
    conf.set(MRConstants.HBASE_OUTPUT_TABLE_NAME, table);

    if (quorumAddress != null) {
        // Calling this will validate the format
        HBasePlatformUtils.validateClusterKey(quorumAddress);
        conf.set(TableOutputFormat.QUORUM_ADDRESS, quorumAddress);
    }
    if (serverClass != null && serverImpl != null) {
        conf.set(TableOutputFormat.REGION_SERVER_CLASS, serverClass);
        conf.set(TableOutputFormat.REGION_SERVER_IMPL, serverImpl);

    }
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(Object.class);
    if (partitioner == HRegionPartitioner.class) {
        job.setPartitionerClass(HRegionPartitioner.class);
        // TODO Where are the keys?
        int regions = getReduceNumberOfRegions(hbaseTableID);
        if (job.getNumReduceTasks() > regions) {
            job.setNumReduceTasks(regions);
        }
    } else if (partitioner != null) {
        job.setPartitionerClass(partitioner);
    }

    if (addDependencyJars) {
        addDependencyJars(job);
    }

    //initCredentials(job);
}
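
For orientation, a hypothetical call to this helper might look like the sketch below. The table name, reducer class, and quorum string are placeholders (MyReducer does not exist in the source), and TableOutputFormat stands in for whichever OutputFormat the job should receive via setOutputFormatClass:

    Job job = Job.getInstance(conf, "splice-table-reduce");
    SpliceTableMapReduceUtil.initTableReducerJob(
            "MYSCHEMA.MYTABLE",                     // output Splice table as Schema.tableName
            MyReducer.class,                        // hypothetical reducer class
            job,
            HRegionPartitioner.class,               // or null to keep the default partitioner
            "server1,server2,server3:2181:/hbase",  // remote quorum, or null for the local cluster
            null,                                   // serverClass: leave null to keep defaults
            null,                                   // serverImpl
            true,                                   // ship dependency jars via tmpjars
            TableOutputFormat.class);               // passed straight to job.setOutputFormatClass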

From source file:com.spotify.hdfs2cass.crunch.cql.CQLTarget.java

License:Open Source License

@Override
public void configureForMapReduce(final Job job, final PType<?> pType, final Path outputPath,
        final String name) {

    if (name == null) {
        throw new CrunchRuntimeException("'name' arguments should not be null. We don't know why tho");
    }

    FileOutputFormat.setOutputPath(job, outputPath);
    job.setOutputFormatClass(CrunchCqlBulkOutputFormat.class);

    JobConf conf = new JobConf();
    params.configure(conf);

    for (Map.Entry<String, String> e : extraConf.entrySet()) {
        conf.set(e.getKey(), e.getValue());
    }

    FormatBundle<CrunchCqlBulkOutputFormat> bundle = FormatBundle.forOutput(CrunchCqlBulkOutputFormat.class);
    for (Map.Entry<String, String> e : conf) {
        bundle.set(e.getKey(), e.getValue());
    }

    Configuration jobConfiguration = job.getConfiguration();

    // we don't know why exactly this is needed, but without it, the actual streaming will not
    // see the throttling and buffer size arguments
    params.configure(jobConfiguration);

    CrunchConfigHelper.setOutputColumnFamily(jobConfiguration, params.getKeyspace(), params.getColumnFamily());
    CrunchCqlBulkOutputFormat.setColumnFamilySchema(jobConfiguration, params.getColumnFamily(),
            params.getSchema());
    CrunchCqlBulkOutputFormat.setColumnFamilyInsertStatement(jobConfiguration, params.getColumnFamily(),
            params.getStatement());

    String[] colNames = params.getColumnNames();
    for (int i = 0; i < colNames.length; i++) {
        CrunchCqlBulkOutputFormat.setColumnIndex(jobConfiguration, params.getColumnFamily(), colNames[i], i);
    }

    CrunchOutputs.addNamedOutput(job, name, bundle, ByteBuffer.class, List.class);
}

From source file:com.spotify.hdfs2cass.crunch.thrift.ThriftTarget.java

License:Open Source License

@Override
public void configureForMapReduce(final Job job, final PType<?> pType, final Path outputPath,
        final String name) {

    if (name == null) {
        throw new CrunchRuntimeException("'name' arguments should not be null. We don't know why tho");
    }

    FileOutputFormat.setOutputPath(job, outputPath);
    job.setOutputFormatClass(CrunchBulkOutputFormat.class);

    JobConf conf = new JobConf();
    params.configure(conf);

    for (Map.Entry<String, String> e : extraConf.entrySet()) {
        conf.set(e.getKey(), e.getValue());
    }

    FormatBundle<CrunchBulkOutputFormat> bundle = FormatBundle.forOutput(CrunchBulkOutputFormat.class);
    for (Map.Entry<String, String> e : conf) {
        bundle.set(e.getKey(), e.getValue());
    }

    Configuration jobConfiguration = job.getConfiguration();

    // we don't know why exactly this is needed, but without it, the actual streaming will not
    // see the throttling and buffer size arguments
    params.configure(jobConfiguration);

    CrunchConfigHelper.setOutputColumnFamily(jobConfiguration, params.getKeyspace(), params.getColumnFamily());

    CrunchOutputs.addNamedOutput(job, name, bundle, ByteBuffer.class, List.class);
}

From source file:com.springsource.insight.plugin.hadoop.WordCount.java

License:Open Source License

public int run(String[] args) throws Exception {
    String INPUT = "src/test/resources";
    String OUTPUT = "target/out";

    Configuration conf = new Configuration();
    File targetFolder = FileUtil.detectTargetFolder(getClass());
    if (targetFolder == null) {
        throw new IllegalStateException("Cannot detect target folder");
    }
    File tempFolder = new File(targetFolder, "temp");
    conf.set("hadoop.tmp.dir", tempFolder.getAbsolutePath());

    Job job = new Job(conf, "wordcount");
    job.setJarByClass(WordCount.class);

    job.setMapperClass(WordCountMapper.class);
    job.setCombinerClass(WordCountReducer.class);
    job.setReducerClass(WordCountReducer.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    FileUtils.deleteDirectory(new File(OUTPUT)); // delete old output data
    FileInputFormat.addInputPath(job, new Path(INPUT));
    FileOutputFormat.setOutputPath(job, new Path(OUTPUT));

    return job.waitForCompletion(true) ? 0 : -1;
}

From source file:com.streamsets.pipeline.stage.destination.mapreduce.jobtype.avroconvert.AvroConversionBaseCreator.java

License:Apache License

@Override
public Job call() throws Exception {
    // We're explicitly disabling speculative execution
    conf.set("mapreduce.map.speculative", "false");
    conf.set("mapreduce.map.maxattempts", "1");

    conf.set("mapreduce.job.user.classpath.first", "true");
    conf.set("mapreduce.task.classpath.user.precedence", "true");
    conf.set("mapreduce.task.classpath.first", "true");

    addNecessaryJarsToJob(conf);

    Job job = Job.getInstance(conf);

    // IO formats
    job.setInputFormatClass(getInputFormatClass());
    job.setOutputFormatClass(NullOutputFormat.class);

    // Mapper & job output
    job.setMapperClass(getMapperClass());
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(NullWritable.class);

    // It's a map-only job
    job.setNumReduceTasks(0);

    // General configuration
    job.setJarByClass(getClass());

    return job;
}

From source file:com.stride.cartrek.core.hbase.RowKeyDistributorTestBase.java

License:Apache License

private void testMapReduceInternal(long origKeyPrefix, Scan scan, int numValues, int startWithValue,
        int seekIntervalMinValue, int seekIntervalMaxValue)
        throws IOException, InterruptedException, ClassNotFoundException {
    int valuesCountInSeekInterval = writeTestData(origKeyPrefix, numValues, startWithValue,
            seekIntervalMinValue, seekIntervalMaxValue);

    // Reading data
    Configuration conf = testingUtility.getConfiguration();
    Job job = new Job(conf, "testMapReduceInternal()-Job");
    job.setJarByClass(this.getClass());
    TableMapReduceUtil.initTableMapperJob(TABLE_NAME, scan, RowCounterMapper.class,
            ImmutableBytesWritable.class, Result.class, job);

    // Substituting standard TableInputFormat which was set in
    // TableMapReduceUtil.initTableMapperJob(...)
    job.setInputFormatClass(WdTableInputFormat.class);
    keyDistributor.addInfo(job.getConfiguration());

    job.setOutputFormatClass(NullOutputFormat.class);
    job.setNumReduceTasks(0);

    boolean succeeded = job.waitForCompletion(true);
    Assert.assertTrue(succeeded);

    long mapInputRecords = job.getCounters().findCounter(RowCounterMapper.Counters.ROWS).getValue();
    Assert.assertEquals(valuesCountInSeekInterval, mapInputRecords);
}