List of usage examples for org.apache.hadoop.mapreduce Job getConfiguration
public Configuration getConfiguration()
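Before the examples, a minimal sketch of the basic pattern, assuming a standard Hadoop 2.x driver (the property keys are illustrative, not part of any real job):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class GetConfigurationSketch {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "example-job");
        // getConfiguration() returns the live Configuration backing the job;
        // anything set on it is serialized and shipped with the job at submit time.
        Configuration conf = job.getConfiguration();
        conf.set("example.custom.property", "value"); // illustrative key
        conf.setInt("example.custom.level", 5);       // illustrative key
    }
}

This is the pattern every example below follows: a static helper takes the Job, reaches its Configuration through getConfiguration(), and writes settings that the job's tasks read back on the cluster.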
From source file:com.ci.backports.avro.mapreduce.AvroOutputFormat.java
License:Apache License
/** Enable output compression using the deflate codec and specify its level. */
public static void setDeflateLevel(Job job, int level) {
    FileOutputFormat.setCompressOutput(job, true);
    job.getConfiguration().setInt(org.apache.avro.mapred.AvroOutputFormat.DEFLATE_LEVEL_KEY, level);
}
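A hedged sketch of how a driver might call this helper (the job wiring shown is illustrative):

import com.ci.backports.avro.mapreduce.AvroOutputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class AvroCompressionDriver {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "avro-output");
        // Deflate levels run 1-9; 6 is a common balance of speed and size.
        AvroOutputFormat.setDeflateLevel(job, 6);
        // ... set mapper, schema, and paths here, then job.waitForCompletion(true)
    }
}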
From source file:com.ci.backports.hadoop.hbase.ZHFileOutputFormat.java
License:Apache License
/**
 * Configure a MapReduce Job to perform an incremental load into the given
 * table. This
 * <ul>
 *   <li>Inspects the table to configure a total order partitioner</li>
 *   <li>Uploads the partitions file to the cluster and adds it to the DistributedCache</li>
 *   <li>Sets the number of reduce tasks to match the current number of regions</li>
 *   <li>Sets the output key/value class to match ZHFileOutputFormat's requirements</li>
 *   <li>Sets the reducer up to perform the appropriate sorting (either KeyValueSortReducer or
 *       ZPutSortReducer)</li>
 * </ul>
 * The user should be sure to set the map output value class to either KeyValue or Put before
 * running this function.
 */
public static void configureIncrementalLoad(Job job, HTable table) throws IOException {
    Configuration conf = job.getConfiguration();
    job.setPartitionerClass(TotalOrderPartitioner.class);
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(KeyValue.class);
    job.setOutputFormatClass(ZHFileOutputFormat.class);

    // Based on the configured map output class, set the correct reducer to properly
    // sort the incoming values.
    // TODO it would be nice to pick one or the other of these formats.
    if (KeyValue.class.equals(job.getMapOutputValueClass())) {
        job.setReducerClass(KeyValueSortReducer.class);
    } else if (Put.class.equals(job.getMapOutputValueClass())) {
        job.setReducerClass(ZPutSortReducer.class);
    } else {
        LOG.warn("Unknown map output value type:" + job.getMapOutputValueClass());
    }

    LOG.info("Looking up current regions for table " + table);
    List<ImmutableBytesWritable> startKeys = getRegionStartKeys(table);
    LOG.info("Configuring " + startKeys.size() + " reduce partitions "
        + "to match current region count");
    job.setNumReduceTasks(startKeys.size());

    Path partitionsPath = new Path(job.getWorkingDirectory(),
        "partitions_" + System.currentTimeMillis());
    LOG.info("Writing partition information to " + partitionsPath);

    FileSystem fs = partitionsPath.getFileSystem(conf);
    writePartitions(conf, partitionsPath, startKeys);
    partitionsPath.makeQualified(fs);

    URI cacheUri;
    try {
        cacheUri = new URI(partitionsPath.toString() + "#" + TotalOrderPartitioner.DEFAULT_PATH);
    } catch (URISyntaxException e) {
        throw new IOException(e);
    }
    DistributedCache.addCacheFile(cacheUri, conf);
    DistributedCache.createSymlink(conf);

    LOG.info("Incremental table output configured.");
}
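A sketch of a driver using this helper, assuming the pre-1.0 HBase client API that the example itself targets (the table name is hypothetical):

import com.ci.backports.hadoop.hbase.ZHFileOutputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.mapreduce.Job;

public class BulkLoadDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "zhfile-bulk-load");
        // The helper reads the map output value class, so set it first;
        // KeyValue here means it will select KeyValueSortReducer.
        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(KeyValue.class);
        HTable table = new HTable(conf, "my_table"); // hypothetical table name
        ZHFileOutputFormat.configureIncrementalLoad(job, table);
        job.waitForCompletion(true);
    }
}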
From source file:com.citic.zxyjs.zwlscx.mapreduce.lib.input.HFileOutputFormatBase.java
License:Apache License
/**
 * Configure a MapReduce Job to perform an incremental load into the given
 * table. This
 * <ul>
 *   <li>Inspects the table to configure a total order partitioner</li>
 *   <li>Uploads the partitions file to the cluster and adds it to the
 *       DistributedCache</li>
 *   <li>Sets the number of reduce tasks to match the current number of
 *       regions</li>
 *   <li>Sets the output key/value class to match HFileOutputFormat's
 *       requirements</li>
 *   <li>Sets the reducer up to perform the appropriate sorting (either
 *       KeyValueSortReducer or PutSortReducer)</li>
 * </ul>
 * The user should be sure to set the map output value class to either
 * KeyValue or Put before running this function.
 */
public static void configureIncrementalLoad(Job job, HTable table,
        Class<? extends HFileOutputFormatBase> hfileOutputFormatBase) throws IOException {
    Configuration conf = job.getConfiguration();
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(KeyValue.class);
    job.setOutputFormatClass(hfileOutputFormatBase);

    // Based on the configured map output class, set the correct reducer to
    // properly sort the incoming values.
    // TODO it would be nice to pick one or the other of these formats.
    if (KeyValue.class.equals(job.getMapOutputValueClass())) {
        job.setReducerClass(KeyValueSortReducer.class);
    } else if (Put.class.equals(job.getMapOutputValueClass())) {
        job.setReducerClass(PutSortReducer.class);
    } else if (Text.class.equals(job.getMapOutputValueClass())) {
        job.setReducerClass(TextSortReducer.class);
    } else {
        LOG.warn("Unknown map output value type:" + job.getMapOutputValueClass());
    }

    conf.setStrings("io.serializations", conf.get("io.serializations"),
        MutationSerialization.class.getName(), ResultSerialization.class.getName(),
        KeyValueSerialization.class.getName());

    // Use table's region boundaries for TOP split points.
    LOG.info("Looking up current regions for table " + Bytes.toString(table.getTableName()));
    List<ImmutableBytesWritable> startKeys = getRegionStartKeys(table);
    LOG.info("Configuring " + startKeys.size() + " reduce partitions "
        + "to match current region count");
    job.setNumReduceTasks(startKeys.size());

    configurePartitioner(job, startKeys);
    // Set compression algorithms based on column families
    configureCompression(table, conf);
    configureBloomType(table, conf);
    configureBlockSize(table, conf);

    // TableMapReduceUtil.addDependencyJars(job);
    TableMapReduceUtil.initCredentials(job);
    LOG.info("Incremental table " + Bytes.toString(table.getTableName()) + " output configured.");
}
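Unlike the previous entry, this variant parameterizes the output format class, so a caller plugs in its own subclass. A hedged sketch (the subclass is made up, and whether HFileOutputFormatBase requires overriding abstract methods is an assumption):

import com.citic.zxyjs.zwlscx.mapreduce.lib.input.HFileOutputFormatBase;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;

public class CustomHFileDriver {
    // Hypothetical subclass standing in for a project-specific variant.
    public static class MyHFileOutputFormat extends HFileOutputFormatBase {
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "custom-hfile-load");
        // Text map output values make the helper pick TextSortReducer.
        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(Text.class);
        HTable table = new HTable(conf, "my_table"); // hypothetical table name
        HFileOutputFormatBase.configureIncrementalLoad(job, table, MyHFileOutputFormat.class);
    }
}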
From source file:com.citic.zxyjs.zwlscx.mapreduce.lib.input.HFileOutputFormatBase.java
License:Apache License
/**
 * Configure <code>job</code> with a TotalOrderPartitioner, partitioning
 * against <code>splitPoints</code>. Cleans up the partitions file after the job
 * exits.
 */
static void configurePartitioner(Job job, List<ImmutableBytesWritable> splitPoints) throws IOException {
    // create the partitions file
    FileSystem fs = FileSystem.get(job.getConfiguration());
    Path partitionsPath = new Path("/tmp", "partitions_" + UUID.randomUUID());
    fs.makeQualified(partitionsPath);
    fs.deleteOnExit(partitionsPath);
    writePartitions(job.getConfiguration(), partitionsPath, splitPoints);

    // configure job to use it
    job.setPartitionerClass(TotalOrderPartitioner.class);
    TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionsPath);
}
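For drivers that cannot reach this package-private helper, the same wiring can be done directly against Hadoop's TotalOrderPartitioner; a minimal sketch, assuming the partitions file already exists and contains split keys in the format the partitioner expects (the path is illustrative):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

public class ManualPartitionerSetup {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "total-order");
        // Point the partitioner at a pre-written SequenceFile of split keys.
        Path partitionsPath = new Path("/tmp/partitions_example"); // illustrative path
        FileSystem fs = FileSystem.get(job.getConfiguration());
        fs.deleteOnExit(partitionsPath); // clean up after the job exits
        job.setPartitionerClass(TotalOrderPartitioner.class);
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionsPath);
    }
}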
From source file:com.cloudera.accumulo.upgrade.compatibility.DataCompatibilityLoad.java
License:Open Source License
@Override
public int run(String[] args) throws Exception {
    final String jobName = this.getClass().getName();
    options.parseArgs(jobName, args);
    final Job job = new Job(getConf(), jobName);
    if (-1 == options.test.numRows) {
        options.test.numRows = job.getConfiguration().getInt("mapred.map.tasks",
            DataCompatibilityTestCli.DEFAULT_NUM_ROWS);
    }
    job.setJarByClass(this.getClass());
    job.setInputFormatClass(DataLoadInputFormat.class);

    DataLoadInputFormat.setTabletServers(job,
        options.connection.getConnector().instanceOperations().getTabletServers());
    DataLoadInputFormat.setNumRows(job, options.test.numRows);
    DataLoadInputFormat.setNumQualifiersPerFamily(job, options.test.qualifiers);

    job.getConfiguration().set(VISIBILITY,
        new String(options.visibility.visibility.getExpression(), "UTF-8"));

    final TableOperations ops = options.connection.getConnector().tableOperations();
    final List<String> names = options.test.getTableNamesAndConfigureThem(ops);
    for (String name : names) {
        final int numSplits = ops.getSplits(name, options.test.numRows).size();
        if (options.test.numRows > numSplits) {
            log.info("adding splits to table '" + name + "', to bring it from " + numSplits
                + " to " + options.test.numRows + ".");
            final SortedSet<Text> splits = new TreeSet<Text>();
            // for cases where we're adding way more splits than there are currently possible
            // servers to handle them, do a pre-pre-split
            // N.B. If we've just created this table, there will be 0 splits because we'll just
            // have the initial tablet.
            if (0 == numSplits || options.test.numRows / numSplits > 10) {
                log.info("splitting in two waves due to the number of splits we need to add.");
                // TODO turtles all the way down.
                final int prepre = options.test.numRows / (0 == numSplits ? 10 : numSplits * 10);
                for (int i = 0; i < prepre; i++) {
                    splits.add(new Text(new StringBuilder(Long.toString(i)).reverse().toString()));
                }
                ops.addSplits(name, splits);
                log.debug("delay 30s for splits to get assigned off host.");
                try {
                    Thread.sleep(30 * 1000);
                } catch (InterruptedException exception) {
                    log.warn("interrupted from sleep early.");
                }
                splits.clear();
            }
            for (int i = 0; i < options.test.numRows; i++) {
                splits.add(new Text(new StringBuilder(Long.toString(i)).reverse().toString()));
            }
            ops.addSplits(name, splits);
        }
    }
    log.debug("delay 30s for splits to get assigned off host.");
    try {
        Thread.sleep(30 * 1000);
    } catch (InterruptedException exception) {
        log.warn("interrupted from sleep early.");
    }

    job.getConfiguration().setStrings(OUTPUT_TABLE_NAMES, names.toArray(new String[0]));

    job.setMapperClass(DataLoadMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Mutation.class);
    job.setNumReduceTasks(0);

    log.info("launching map-only job to insert " + options.test.numRows + " rows of "
        + (FAMILIES.length * options.test.qualifiers) + " cells each into each of the tables "
        + names);
    options.output.useAccumuloOutputFormat(job);

    job.waitForCompletion(true);
    return job.isSuccessful() ? 0 : 1;
}
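One detail worth isolating: the split points are row numbers rendered in decimal and then digit-reversed, which spreads otherwise sequential keys across tablets. A standalone sketch of just that key generation, lifted from the loop above:

import java.util.SortedSet;
import java.util.TreeSet;
import org.apache.hadoop.io.Text;

public class ReversedSplitKeys {
    // Builds split points the same way as the loader above: row numbers
    // rendered in decimal, then reversed so inserts spread across tablets.
    public static SortedSet<Text> splits(int numRows) {
        SortedSet<Text> splits = new TreeSet<Text>();
        for (int i = 0; i < numRows; i++) {
            splits.add(new Text(new StringBuilder(Long.toString(i)).reverse().toString()));
        }
        return splits;
    }

    public static void main(String[] args) {
        // Row 10 becomes "01", row 12 becomes "21": neighbors land far apart.
        System.out.println(splits(13));
    }
}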
From source file:com.cloudera.accumulo.upgrade.util.MapreduceInputCli.java
License:Open Source License
/**
 * Iff you use offline mode, you have to call close() when you're done or manually delete the
 * cloned table yourself.
 */
public void useAccumuloInputFormat(Job job, String table, boolean offline)
        throws IOException, AccumuloException, AccumuloSecurityException, TableExistsException,
        TableNotFoundException {
    job.setInputFormatClass(AccumuloInputFormat.class);
    /* XXX Need to use a method that exists in 1.4 and 1.5 :( */
    Configuration configuration = job.getConfiguration();
    AccumuloInputFormat.setZooKeeperInstance(job, connection.instance, connection.zookeepers);
    final TableOperations ops = connection.getConnector().tableOperations();
    String scan = table;
    if (offline) {
        Random random = new Random();
        scan = table + "_" + String.format("%016x", Math.abs(random.nextLong()));
        ops.clone(table, scan, true, Collections.<String, String>emptyMap(),
            Collections.<String>emptySet());
        try {
            ops.offline(scan);
        } finally {
            clones.add(scan);
        }
        AccumuloInputFormat.setOfflineTableScan(job, true);
    }
    PasswordToken token = new PasswordToken(connection.password);
    AccumuloInputFormat.setConnectorInfo(job, connection.principal, token);
    AccumuloInputFormat.setInputTableName(job, scan);
    AccumuloInputFormat.setScanAuthorizations(job, connection.auths);
    if (0 < maxMaps) {
        // set up ranges
        try {
            Set<Range> ranges = ops.splitRangeByTablets(table, new Range(), maxMaps);
            AccumuloInputFormat.setRanges(job, ranges);
            AccumuloInputFormat.setAutoAdjustRanges(job, false);
        } catch (Exception e) {
            throw new IOException(e);
        }
    }
}
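A hedged sketch of a caller honoring the close() contract from the javadoc; how MapreduceInputCli is constructed and gets its connection details is an assumption, as is the existence of a no-arg constructor:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class OfflineScanDriver {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "offline-scan");
        MapreduceInputCli input = new MapreduceInputCli(); // assumes connection fields are populated elsewhere
        try {
            // offline = true clones the table and scans the frozen clone.
            input.useAccumuloInputFormat(job, "my_table", true); // hypothetical table name
            job.waitForCompletion(true);
        } finally {
            input.close(); // drops the cloned table, per the javadoc above
        }
    }
}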
From source file:com.cloudera.bigdata.analysis.dataload.io.SplitableInputFormat.java
License:Apache License
public static void setReaderClass(Job job, String className) {
    job.getConfiguration().set(RECORD_READER, className);
}
From source file:com.cloudera.bigdata.analysis.dataload.io.SplitableInputFormat.java
License:Apache License
public static void setEncoding(Job job, String encoding) {
    job.getConfiguration().set(ENCODING, encoding);
}
From source file:com.cloudera.bigdata.analysis.dataload.io.SplitableInputFormat.java
License:Apache License
public static void setDelimiter(Job job, String delimiter) {
    job.getConfiguration().set(DELIMITER, delimiter);
}
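Taken together, the three setters above configure a job like so; a sketch, assuming SplitableInputFormat extends InputFormat (the reader class name is hypothetical):

import com.cloudera.bigdata.analysis.dataload.io.SplitableInputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class SplitableInputDriver {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "splitable-input");
        job.setInputFormatClass(SplitableInputFormat.class);
        // Each setter just writes one key into job.getConfiguration().
        SplitableInputFormat.setReaderClass(job, "com.example.MyRecordReader"); // hypothetical
        SplitableInputFormat.setEncoding(job, "UTF-8");
        SplitableInputFormat.setDelimiter(job, "\n");
    }
}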
From source file:com.cloudera.crunch.impl.mr.collect.PGroupedTableImpl.java
License:Open Source License
public void configureShuffle(Job job) {
    ptype.configureShuffle(job, groupingOptions);
    if (groupingOptions != null && groupingOptions.getNumReducers() <= 0) {
        int bytesPerTask = job.getConfiguration().getInt("crunch.bytes.per.reduce.task",
            (1000 * 1000 * 1000));
        int numReduceTasks = 1 + (int) (getSize() / bytesPerTask);
        job.setNumReduceTasks(numReduceTasks);
        LOG.info(String.format("Setting num reduce tasks to %d", numReduceTasks));
    }
}
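The heuristic here is one reducer per roughly 1 GB of data, overridable through the crunch.bytes.per.reduce.task key. A standalone sketch of the same arithmetic (the input size is an illustrative value):

import org.apache.hadoop.conf.Configuration;

public class ReducerCountHeuristic {
    // Mirrors the arithmetic above: one reducer per bytesPerTask of data,
    // with a minimum of one reducer.
    public static int numReducers(Configuration conf, long inputSizeBytes) {
        int bytesPerTask = conf.getInt("crunch.bytes.per.reduce.task", 1000 * 1000 * 1000);
        return 1 + (int) (inputSizeBytes / bytesPerTask);
    }

    public static void main(String[] args) {
        Configuration conf = new Configuration();
        conf.setInt("crunch.bytes.per.reduce.task", 500 * 1000 * 1000); // override: ~500 MB per reducer
        System.out.println(numReducers(conf, 3000000000L)); // prints 7
    }
}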