Example usage for org.apache.hadoop.mapreduce Job setOutputFormatClass

List of usage examples for org.apache.hadoop.mapreduce Job setOutputFormatClass

Introduction

In this page you can find the example usage for org.apache.hadoop.mapreduce Job setOutputFormatClass.

Prototype

public void setOutputFormatClass(Class<? extends OutputFormat> cls) throws IllegalStateException 

Source Link

Document

Set the OutputFormat for the job.

Usage

From source file:co.nubetech.hiho.mapreduce.lib.db.apache.DBOutputFormat.java

License:Apache License

private static DBConfiguration setOutput(Job job, JobConf jobConf, String tableName) throws IOException {
    job.setOutputFormatClass(DBOutputFormat.class);
    jobConf.setReduceSpeculativeExecution(false);

    DBConfiguration dbConf = new DBConfiguration(job.getConfiguration());

    dbConf.setOutputTableName(tableName);
    return dbConf;
}

From source file:co.nubetech.hiho.mapreduce.lib.db.GenericDBOutputFormat.java

License:Apache License

public static void setOutput(Job job, String tableName, String columnNames) throws IOException {
    job.setOutputFormatClass(GenericDBOutputFormat.class);
    DBConfiguration dbConf = new DBConfiguration(job.getConfiguration());
    dbConf.setOutputTableName(tableName);
    dbConf.setOutputFieldNames(columnNames);

    String dbDriver = job.getConfiguration().get(DBConfiguration.DRIVER_CLASS_PROPERTY);
    String connString = job.getConfiguration().get(DBConfiguration.URL_PROPERTY);
    String username = job.getConfiguration().get(DBConfiguration.USERNAME_PROPERTY);
    String password = job.getConfiguration().get(DBConfiguration.PASSWORD_PROPERTY);

    Connection conn;/* w  w  w .  j a  va2 s. c  om*/
    PreparedStatement stmt;
    try {
        Class.forName(dbDriver).newInstance();
        conn = DriverManager.getConnection(connString, username, password);
        String query = "select " + columnNames + " from " + tableName;
        stmt = conn.prepareStatement(query);
        ResultSetMetaData meta = stmt.getMetaData();
        ArrayList<ColumnInfo> columnInfo = populateColumnInfo(meta);
        String jsonString = getJsonStringOfColumnInfo(columnInfo);
        job.getConfiguration().set(HIHOConf.COLUMN_INFO, jsonString);
        logger.debug("columnInfo is: " + job.getConfiguration().get(HIHOConf.COLUMN_INFO));
        stmt.close();
        conn.close();
    } catch (Exception e) {
        e.printStackTrace();
        throw new IOException(e);
    }

}

From source file:co.nubetech.hiho.merge.MergeJob.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    populateConfiguration(args);// w  w  w  .  jav a 2 s .c o m
    try {
        checkMandatoryConfs();
    } catch (HIHOException e1) {
        e1.printStackTrace();
        throw new Exception(e1);
    }

    Class inputFormatClass = Class.forName(inputFormat);
    Class outputFormatClass = Class.forName(outputFormat);
    Class inputKeyClass = Class.forName(inputKeyClassName);
    Class inputValueClass = Class.forName(inputValueClassName);

    Configuration conf = getConf();
    conf.set(HIHOConf.MERGE_OLD_PATH, oldPath);
    conf.set(HIHOConf.MERGE_NEW_PATH, newPath);

    Job job = new Job(conf);
    job.setJobName("Merge job");
    job.setJarByClass(MergeJob.class);

    if (mergeBy.equals("key")) {
        job.setMapperClass(MergeKeyMapper.class);
        job.setReducerClass(MergeKeyReducer.class);

    } else if (mergeBy.equals("value")) {
        job.setMapperClass(MergeValueMapper.class);
        job.setReducerClass(MergeValueReducer.class);
    }

    job.setInputFormatClass(inputFormatClass);
    DelimitedTextInputFormat.setProperties(job, delimiter, column);
    job.setMapOutputKeyClass(HihoTuple.class);
    job.setMapOutputValueClass(HihoValue.class);

    job.setOutputKeyClass(inputKeyClass);
    job.setOutputValueClass(inputValueClass);
    FileInputFormat.setInputPaths(job, oldPath + "," + newPath);
    job.setOutputFormatClass(outputFormatClass);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    try {
        logger.debug("Output format class is " + job.getOutputFormatClass());
        logger.debug("Class is " + ReflectionUtils
                .newInstance(job.getOutputFormatClass(), job.getConfiguration()).getClass().getName());
        job.waitForCompletion(false);
        if (job.isComplete()) {
            Counters counters = job.getCounters();
            totalRecordsOld = counters.findCounter(MergeRecordCounter.TOTAL_RECORDS_OLD).getValue();
            totalRecordsNew = counters.findCounter(MergeRecordCounter.TOTAL_RECORDS_NEW).getValue();
            badRecords = counters.findCounter(MergeRecordCounter.BAD_RECORD).getValue();
            output = counters.findCounter(MergeRecordCounter.OUTPUT).getValue();
            logger.info("Total old records read are: " + totalRecordsOld);
            logger.info("Total new records read are: " + totalRecordsNew);
            logger.info("Bad Records are: " + badRecords);
            logger.info("Output records are: " + output);
        }
    } catch (Exception e) {
        e.printStackTrace();
    }

    return 0;
}

From source file:co.nubetech.hiho.similarity.ngram.NGramJob.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    populateConfiguration(args);//from  w  w w. j a  va  2s . c  om
    try {
        checkMandatoryConfs();
    } catch (HIHOException e1) {
        e1.printStackTrace();
        throw new Exception(e1);
    }
    Job job = new Job(conf);
    job.setJobName("NGram job");
    job.setJarByClass(NGramJob.class);

    Class inputFormatClass = Class.forName("org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat");
    Class outputFormatClass = Class.forName("org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat");
    // org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat
    // org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
    Class inputKeyClass = Class.forName("org.apache.hadoop.io.Text");
    Class inputValueClass = Class.forName("org.apache.hadoop.io.Text");
    Class outputKeyClass = Class.forName("co.nubetech.hiho.similarity.ngram.ValuePair");
    Class outputValueClass = Class.forName("org.apache.hadoop.io.IntWritable");

    job.setMapperClass(NGramMapper.class);
    job.setReducerClass(NGramReducer.class);

    job.setInputFormatClass(inputFormatClass);
    job.setMapOutputKeyClass(inputKeyClass);
    job.setMapOutputValueClass(inputValueClass);

    job.setOutputKeyClass(outputKeyClass);
    job.setOutputValueClass(outputValueClass);
    job.setOutputFormatClass(outputFormatClass);

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, new Path("outputOfNGramJob"));

    int ret = 0;
    try {
        ret = job.waitForCompletion(true) ? 0 : 1;
    } catch (Exception e) {
        e.printStackTrace();
    }
    return ret;
}

From source file:co.nubetech.hiho.similarity.ngram.ScoreJob.java

License:Apache License

@Override
public int run(String[] arg0) throws Exception {
    Configuration conf = getConf();
    Job job = new Job(conf);
    job.setJobName("Score job");
    job.setJarByClass(ScoreJob.class);

    Class inputFormatClass = Class.forName("org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat");
    Class outputFormatClass = Class.forName("org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat");
    // org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat
    // org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
    Class inputKeyClass = Class.forName("co.nubetech.hiho.similarity.ngram.ValuePair");
    Class inputValueClass = Class.forName("org.apache.hadoop.io.IntWritable");
    Class outputKeyClass = Class.forName("co.nubetech.hiho.similarity.ngram.ValuePair");
    Class outputValueClass = Class.forName("org.apache.hadoop.io.LongWritable");

    job.setMapperClass(ScoreMapper.class);
    job.setReducerClass(ScoreReducer.class);

    job.setInputFormatClass(inputFormatClass);
    job.setMapOutputKeyClass(inputKeyClass);
    job.setMapOutputValueClass(inputValueClass);

    job.setOutputKeyClass(outputKeyClass);
    job.setOutputValueClass(outputValueClass);
    job.setOutputFormatClass(outputFormatClass);

    FileInputFormat.setInputPaths(job, "outputOfNGramJob");
    FileOutputFormat.setOutputPath(job, new Path("outputOfScoreJob"));

    int ret = 0;//from   w  w  w. j  a v a  2s  .  c o  m
    try {
        ret = job.waitForCompletion(true) ? 0 : 1;
    } catch (Exception e) {
        e.printStackTrace();
    }
    return ret;
}

From source file:com.abel.hwfs.custom.output.SetSizeDBOutputFormat.java

License:Apache License

private static MyDBConfiguration setOutput(Job job, String tableName) throws IOException {
    job.setOutputFormatClass(SetSizeDBOutputFormat.class);
    job.setReduceSpeculativeExecution(false);

    MyDBConfiguration dbConf = new MyDBConfiguration(job.getConfiguration());

    dbConf.setOutputTableName(tableName);
    return dbConf;
}

From source file:com.accumulobook.advanced.mapreduce.MapReduceFilesExample.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    Job job = Job.getInstance(this.getConf());
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(WordCount.WordCountMapper.class);
    job.setCombinerClass(WordCount.WordCountCombiner.class);
    job.setReducerClass(WordCount.WordCountReducer.class);

    // clone the articles table
    ZooKeeperInstance inst = new ZooKeeperInstance(args[0], args[1]);
    Connector conn = inst.getConnector(args[2], new PasswordToken(args[3]));

    conn.tableOperations().clone(WikipediaConstants.ARTICLES_TABLE, WikipediaConstants.ARTICLES_TABLE_CLONE,
            true, Collections.EMPTY_MAP, Collections.EMPTY_SET);

    // take cloned table offline, waiting until the operation is complete
    boolean wait = true;
    conn.tableOperations().offline(WikipediaConstants.ARTICLES_TABLE_CLONE, wait);

    ClientConfiguration zkiConfig = new ClientConfiguration().withInstance(args[0]).withZkHosts(args[1]);

    // input/* w ww.  j a  v a 2 s. c  om*/
    job.setInputFormatClass(AccumuloInputFormat.class);
    AccumuloInputFormat.setInputTableName(job, WikipediaConstants.ARTICLES_TABLE_CLONE);
    List<Pair<Text, Text>> columns = new ArrayList<>();
    columns.add(new Pair(WikipediaConstants.CONTENTS_FAMILY_TEXT, new Text("")));

    AccumuloInputFormat.fetchColumns(job, columns);
    AccumuloInputFormat.setZooKeeperInstance(job, zkiConfig);
    AccumuloInputFormat.setConnectorInfo(job, args[2], new PasswordToken(args[3]));

    // configure to use underlying RFiles
    AccumuloInputFormat.setOfflineTableScan(job, true);

    // output
    job.setOutputFormatClass(AccumuloOutputFormat.class);

    BatchWriterConfig bwConfig = new BatchWriterConfig();

    AccumuloOutputFormat.setBatchWriterOptions(job, bwConfig);
    AccumuloOutputFormat.setZooKeeperInstance(job, zkiConfig);
    AccumuloOutputFormat.setConnectorInfo(job, args[2], new PasswordToken(args[3]));
    AccumuloOutputFormat.setDefaultTableName(job, WikipediaConstants.WORD_COUNT_TABLE);
    AccumuloOutputFormat.setCreateTables(job, true);

    job.setJarByClass(WordCount.class);

    job.waitForCompletion(true);
    //job.submit();

    return 0;
}

From source file:com.accumulobook.advanced.mapreduce.WordCount.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    Job job = Job.getInstance(new Configuration());
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(WordCountMapper.class);
    job.setCombinerClass(WordCountCombiner.class);
    job.setReducerClass(WordCountReducer.class);

    // input/*w ww .jav  a2 s .com*/
    job.setInputFormatClass(AccumuloInputFormat.class);

    ClientConfiguration zkiConfig = new ClientConfiguration().withInstance(args[0]).withZkHosts(args[1]);

    AccumuloInputFormat.setInputTableName(job, WikipediaConstants.ARTICLES_TABLE);
    List<Pair<Text, Text>> columns = new ArrayList<>();
    columns.add(new Pair(WikipediaConstants.CONTENTS_FAMILY_TEXT, new Text("")));

    AccumuloInputFormat.fetchColumns(job, columns);
    AccumuloInputFormat.setZooKeeperInstance(job, zkiConfig);
    AccumuloInputFormat.setConnectorInfo(job, args[2], new PasswordToken(args[3]));

    // output
    job.setOutputFormatClass(AccumuloOutputFormat.class);

    BatchWriterConfig config = new BatchWriterConfig();

    AccumuloOutputFormat.setBatchWriterOptions(job, config);
    AccumuloOutputFormat.setZooKeeperInstance(job, zkiConfig);
    AccumuloOutputFormat.setConnectorInfo(job, args[2], new PasswordToken(args[3]));
    AccumuloOutputFormat.setDefaultTableName(job, WikipediaConstants.WORD_COUNT_TABLE);
    AccumuloOutputFormat.setCreateTables(job, true);

    job.setJarByClass(WordCount.class);

    job.submit();
    return 0;
}

From source file:com.ailk.oci.ocnosql.tools.load.single.SingleColumnImportTsv.java

License:Apache License

/**
 * Configure a MapReduce Job to perform an incremental load into the given
 * table. This//  w w w . j ava2s  .  c  om
 * <ul>
 *   <li>Inspects the table to configure a total order partitioner</li>
 *   <li>Uploads the partitions file to the cluster and adds it to the DistributedCache</li>
 *   <li>Sets the number of reduce tasks to match the current number of regions</li>
 *   <li>Sets the output key/value class to match HFileOutputFormat's requirements</li>
 *   <li>Sets the reducer up to perform the appropriate sorting (either KeyValueSortReducer or
 *     PutSortReducer)</li>
 * </ul>
 * The user should be sure to set the map output value class to either KeyValue or Put before
 * running this function.
 */
public static void configureIncrementalLoad(Job job, HTable table) throws IOException {
    Configuration conf = job.getConfiguration();
    Class<? extends Partitioner> topClass;
    try {
        topClass = getTotalOrderPartitionerClass();
    } catch (ClassNotFoundException e) {
        throw new IOException("Failed getting TotalOrderPartitioner", e);
    }
    //partition
    job.setPartitionerClass(topClass);
    //Set the key class for the job output data
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    //Set the value class for job outputs
    job.setOutputValueClass(KeyValue.class);
    //outputformatHfile
    job.setOutputFormatClass(HFileOutputFormat2.class);

    // Based on the configured map output class, set the correct reducer to properly
    // sort the incoming values.
    // TODO it would be nice to pick one or the other of these formats.
    if (KeyValue.class.equals(job.getMapOutputValueClass())) {
        job.setReducerClass(KeyValueSortReducer.class);
    } else if (Put.class.equals(job.getMapOutputValueClass())) {
        job.setReducerClass(SingleColumnReducer.class);
    } else {
        LOG.warn("Unknown map output value type:" + job.getMapOutputValueClass());
    }

    LOG.info("Looking up current regions for table " + table);
    //?regionstarkey
    List<ImmutableBytesWritable> startKeys = getRegionStartKeys(table);
    LOG.info("Configuring " + startKeys.size() + " reduce partitions " + "to match current region count");

    //?region?reduce?
    job.setNumReduceTasks(startKeys.size());

    Path partitionsPath = new Path(job.getWorkingDirectory(), "partitions_" + UUID.randomUUID());
    LOG.info("Writing partition information to " + partitionsPath);

    FileSystem fs = partitionsPath.getFileSystem(conf);
    writePartitions(conf, partitionsPath, startKeys);
    partitionsPath.makeQualified(fs);

    URI cacheUri;
    try {
        // Below we make explicit reference to the bundled TOP.  Its cheating.
        // We are assume the define in the hbase bundled TOP is as it is in
        // hadoop (whether 0.20 or 0.22, etc.)
        /*
          cacheUri = new URI(partitionsPath.toString() + "#" +
            org.apache.hadoop.hbase.mapreduce.hadoopbackport.TotalOrderPartitioner.DEFAULT_PATH);
            */
        cacheUri = new URI(partitionsPath.toString() + "#" + TotalOrderPartitioner.DEFAULT_PATH);
    } catch (URISyntaxException e) {
        throw new IOException(e);
    }
    DistributedCache.addCacheFile(cacheUri, conf);
    DistributedCache.createSymlink(conf);

    // Set compression algorithms based on column families
    configureCompression(table, conf);

    TableMapReduceUtil.addDependencyJars(job);
    LOG.info("Incremental table output configured.");
}

From source file:com.alexholmes.hadooputils.combine.avro.mapreduce.CombineAvroKeyValueInputFormatTest.java

License:Apache License

@Test
public void testKeyValueInput() throws ClassNotFoundException, IOException, InterruptedException {
    // Create a test input file.
    File inputFile = createInputFile();

    // Configure the job input.
    Job job = new Job();
    FileInputFormat.setInputPaths(job, new Path(inputFile.getAbsolutePath()));
    job.setInputFormatClass(CombineAvroKeyValueInputFormat.class);
    AvroJob.setInputKeySchema(job, Schema.create(Schema.Type.INT));
    AvroJob.setInputValueSchema(job, Schema.create(Schema.Type.STRING));

    // Configure a mapper.
    job.setMapperClass(IndexMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    // Configure a reducer.
    job.setReducerClass(IndexReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(AvroValue.class);
    AvroJob.setOutputValueSchema(job, Schema.createArray(Schema.create(Schema.Type.INT)));

    // Configure the output format.
    job.setOutputFormatClass(AvroKeyValueOutputFormat.class);
    Path outputPath = new Path(mTempDir.getRoot().getPath(), "out-index");
    FileOutputFormat.setOutputPath(job, outputPath);

    // Run the job.
    assertTrue(job.waitForCompletion(true));

    // Verify that the output Avro container file as the expected data.
    File avroFile = new File(outputPath.toString(), "part-r-00000.avro");
    DatumReader<GenericRecord> datumReader = new SpecificDatumReader<GenericRecord>(AvroKeyValue
            .getSchema(Schema.create(Schema.Type.STRING), Schema.createArray(Schema.create(Schema.Type.INT))));
    DataFileReader<GenericRecord> avroFileReader = new DataFileReader<GenericRecord>(avroFile, datumReader);
    assertTrue(avroFileReader.hasNext());

    AvroKeyValue<CharSequence, List<Integer>> appleRecord = new AvroKeyValue<CharSequence, List<Integer>>(
            avroFileReader.next());/*from ww  w.  jav a 2s . c o  m*/
    assertNotNull(appleRecord.get());
    assertEquals("apple", appleRecord.getKey().toString());
    List<Integer> appleDocs = appleRecord.getValue();
    assertEquals(3, appleDocs.size());
    assertTrue(appleDocs.contains(1));
    assertTrue(appleDocs.contains(2));
    assertTrue(appleDocs.contains(3));

    assertTrue(avroFileReader.hasNext());
    AvroKeyValue<CharSequence, List<Integer>> bananaRecord = new AvroKeyValue<CharSequence, List<Integer>>(
            avroFileReader.next());
    assertNotNull(bananaRecord.get());
    assertEquals("banana", bananaRecord.getKey().toString());
    List<Integer> bananaDocs = bananaRecord.getValue();
    assertEquals(2, bananaDocs.size());
    assertTrue(bananaDocs.contains(1));
    assertTrue(bananaDocs.contains(2));

    assertTrue(avroFileReader.hasNext());
    AvroKeyValue<CharSequence, List<Integer>> carrotRecord = new AvroKeyValue<CharSequence, List<Integer>>(
            avroFileReader.next());
    assertEquals("carrot", carrotRecord.getKey().toString());
    List<Integer> carrotDocs = carrotRecord.getValue();
    assertEquals(1, carrotDocs.size());
    assertTrue(carrotDocs.contains(1));

    assertFalse(avroFileReader.hasNext());
    avroFileReader.close();
}