Example usage for org.apache.hadoop.mapreduce Job setPartitionerClass

Introduction

On this page you can find example usage of org.apache.hadoop.mapreduce Job#setPartitionerClass.

Prototype

public void setPartitionerClass(Class<? extends Partitioner> cls) throws IllegalStateException 

Document

Set the Partitioner for the job.
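
Before the project examples below, here is a minimal, self-contained sketch of the call. The WordLengthPartitioner and the job wiring are hypothetical, written for illustration rather than taken from any of the sources listed under Usage:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Partitioner;

public class PartitionerExample {

    // Hypothetical partitioner: routes each word to a reducer by its length,
    // so all keys of the same length land in the same reduce task.
    public static class WordLengthPartitioner extends Partitioner<Text, IntWritable> {
        @Override
        public int getPartition(Text key, IntWritable value, int numPartitions) {
            // Mask off the sign bit before taking the modulus, as HashPartitioner does.
            return (key.getLength() & Integer.MAX_VALUE) % numPartitions;
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "partitioner example");
        job.setJarByClass(PartitionerExample.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Must be set before the job is submitted; calling it on a running
        // job throws IllegalStateException, per the prototype above.
        job.setPartitionerClass(WordLengthPartitioner.class);
        job.setNumReduceTasks(4);
        // Mapper, reducer, and input/output paths would be configured here
        // before calling job.waitForCompletion(true).
    }
}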

Usage

From source file: org.apache.accumulo.examples.mapreduce.bulk.BulkIngestExample.java

License: Apache License

@Override
public int run(String[] args) {
    Opts opts = new Opts();
    opts.parseArgs(BulkIngestExample.class.getName(), args);

    Configuration conf = getConf();
    PrintStream out = null;
    try {
        Job job = Job.getInstance(conf);
        job.setJobName("bulk ingest example");
        job.setJarByClass(this.getClass());

        job.setInputFormatClass(TextInputFormat.class);

        job.setMapperClass(MapClass.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(ReduceClass.class);
        job.setOutputFormatClass(AccumuloFileOutputFormat.class);
        opts.setAccumuloConfigs(job);

        Connector connector = opts.getConnector();

        TextInputFormat.setInputPaths(job, new Path(opts.inputDir));
        AccumuloFileOutputFormat.setOutputPath(job, new Path(opts.workDir + "/files"));

        FileSystem fs = FileSystem.get(conf);
        out = new PrintStream(new BufferedOutputStream(fs.create(new Path(opts.workDir + "/splits.txt"))));

        Collection<Text> splits = connector.tableOperations().listSplits(opts.getTableName(), 100);
        for (Text split : splits)
            out.println(Base64.getEncoder().encodeToString(TextUtil.getBytes(split)));

        // N split points define N + 1 ranges, so run one reducer per range
        job.setNumReduceTasks(splits.size() + 1);
        out.close();

        // Partition reduce output by the table's split points so that each
        // reducer writes rfiles aligned with a single tablet's range
        job.setPartitionerClass(RangePartitioner.class);
        RangePartitioner.setSplitFile(job, opts.workDir + "/splits.txt");

        job.waitForCompletion(true);
        Path failures = new Path(opts.workDir, "failures");
        fs.delete(failures, true);
        fs.mkdirs(new Path(opts.workDir, "failures"));
        // With HDFS permissions on, we need to make sure the Accumulo user can read/move the rfiles
        FsShell fsShell = new FsShell(conf);
        fsShell.run(new String[] { "-chmod", "-R", "777", opts.workDir });
        connector.tableOperations().importDirectory(opts.getTableName(), opts.workDir + "/files",
                opts.workDir + "/failures", false);

    } catch (Exception e) {
        throw new RuntimeException(e);
    } finally {
        if (out != null)
            out.close();
    }

    return 0;
}

From source file: org.apache.accumulo.examples.simple.mapreduce.bulk.BulkIngestExample.java

License: Apache License

public int run(String[] args) {
    if (args.length != 7) {
        System.out.println("ERROR: Wrong number of parameters: " + args.length + " instead of 7.");
        return printUsage();
    }

    Configuration conf = getConf();
    PrintStream out = null;
    try {
        Job job = new Job(conf, "bulk ingest example");
        job.setJarByClass(this.getClass());

        job.setInputFormatClass(TextInputFormat.class);

        job.setMapperClass(MapClass.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(ReduceClass.class);
        job.setOutputFormatClass(AccumuloFileOutputFormat.class);

        Instance instance = new ZooKeeperInstance(args[0], args[1]);
        String user = args[2];
        byte[] pass = args[3].getBytes();
        String tableName = args[4];
        String inputDir = args[5];
        String workDir = args[6];

        Connector connector = instance.getConnector(user, pass);

        TextInputFormat.setInputPaths(job, new Path(inputDir));
        AccumuloFileOutputFormat.setOutputPath(job, new Path(workDir + "/files"));

        FileSystem fs = FileSystem.get(conf);
        out = new PrintStream(new BufferedOutputStream(fs.create(new Path(workDir + "/splits.txt"))));

        Collection<Text> splits = connector.tableOperations().getSplits(tableName, 100);
        for (Text split : splits)
            out.println(new String(Base64.encodeBase64(TextUtil.getBytes(split))));

        job.setNumReduceTasks(splits.size() + 1);
        out.close();

        job.setPartitionerClass(RangePartitioner.class);
        RangePartitioner.setSplitFile(job, workDir + "/splits.txt");

        job.waitForCompletion(true);
        Path failures = new Path(workDir, "failures");
        fs.delete(failures, true);
        fs.mkdirs(new Path(workDir, "failures"));
        connector.tableOperations().importDirectory(tableName, workDir + "/files", workDir + "/failures",
                false);

    } catch (Exception e) {
        throw new RuntimeException(e);
    } finally {
        if (out != null)
            out.close();
    }

    return 0;
}

From source file: org.apache.accumulo.examples.simple.mapreduce.bulk.BulkIngestExample.java

License: Apache License

@Override
public int run(String[] args) {
    Opts opts = new Opts();
    opts.parseArgs(BulkIngestExample.class.getName(), args);

    Configuration conf = getConf();
    PrintStream out = null;
    try {
        Job job = JobUtil.getJob(conf);
        job.setJobName("bulk ingest example");
        job.setJarByClass(this.getClass());

        job.setInputFormatClass(TextInputFormat.class);

        job.setMapperClass(MapClass.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(ReduceClass.class);
        job.setOutputFormatClass(AccumuloFileOutputFormat.class);
        opts.setAccumuloConfigs(job);

        Connector connector = opts.getConnector();

        TextInputFormat.setInputPaths(job, new Path(opts.inputDir));
        AccumuloFileOutputFormat.setOutputPath(job, new Path(opts.workDir + "/files"));

        FileSystem fs = FileSystem.get(conf);
        out = new PrintStream(new BufferedOutputStream(fs.create(new Path(opts.workDir + "/splits.txt"))));

        Collection<Text> splits = connector.tableOperations().listSplits(opts.getTableName(), 100);
        for (Text split : splits)
            out.println(Base64.encodeBase64String(TextUtil.getBytes(split)));

        job.setNumReduceTasks(splits.size() + 1);
        out.close();

        job.setPartitionerClass(RangePartitioner.class);
        RangePartitioner.setSplitFile(job, opts.workDir + "/splits.txt");

        job.waitForCompletion(true);
        Path failures = new Path(opts.workDir, "failures");
        fs.delete(failures, true);
        fs.mkdirs(new Path(opts.workDir, "failures"));
        connector.tableOperations().importDirectory(opts.getTableName(), opts.workDir + "/files",
                opts.workDir + "/failures", false);

    } catch (Exception e) {
        throw new RuntimeException(e);
    } finally {
        if (out != null)
            out.close();
    }

    return 0;
}

From source file: org.apache.accumulo.server.test.randomwalk.shard.SortTool.java

License: Apache License

public int run(String[] args) throws Exception {
    Job job = new Job(getConf(), this.getClass().getSimpleName());
    job.setJarByClass(this.getClass());

    if (job.getJar() == null) {
        log.error("M/R requires a jar file!  Run mvn package.");
        return 1;
    }

    job.setInputFormatClass(SequenceFileInputFormat.class);
    SequenceFileInputFormat.setInputPaths(job, seqFile);

    job.setPartitionerClass(KeyRangePartitioner.class);
    KeyRangePartitioner.setSplitFile(job, splitFile);

    job.setMapOutputKeyClass(Key.class);
    job.setMapOutputValueClass(Value.class);

    job.setNumReduceTasks(splits.size() + 1);

    job.setOutputFormatClass(AccumuloFileOutputFormat.class);
    AccumuloFileOutputFormat.setOutputPath(job, new Path(outputDir));

    job.waitForCompletion(true);
    return job.isSuccessful() ? 0 : 1;
}

From source file: org.apache.accumulo.test.randomwalk.shard.SortTool.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf(), this.getClass().getSimpleName());
    job.setJarByClass(this.getClass());

    if (job.getJar() == null) {
        log.error("M/R requires a jar file!  Run mvn package.");
        return 1;
    }

    job.setInputFormatClass(SequenceFileInputFormat.class);
    SequenceFileInputFormat.setInputPaths(job, seqFile);

    job.setPartitionerClass(KeyRangePartitioner.class);
    KeyRangePartitioner.setSplitFile(job, splitFile);

    job.setMapOutputKeyClass(Key.class);
    job.setMapOutputValueClass(Value.class);

    job.setNumReduceTasks(splits.size() + 1);

    job.setOutputFormatClass(AccumuloFileOutputFormat.class);
    AccumuloFileOutputFormat.setOutputPath(job, new Path(outputDir));

    job.waitForCompletion(true);
    return job.isSuccessful() ? 0 : 1;
}

From source file: org.apache.blur.mapreduce.lib.update.Driver.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    int c = 0;
    if (args.length < 5) {
        System.err.println(
                "Usage Driver <table> <mr inc working path> <output path> <zk connection> <reducer multiplier> <extra config files...>");
        return 1;
    }
    String table = args[c++];
    String mrIncWorkingPathStr = args[c++];
    String outputPathStr = args[c++];
    String blurZkConnection = args[c++];
    int reducerMultiplier = Integer.parseInt(args[c++]);
    for (; c < args.length; c++) {
        String externalConfigFileToAdd = args[c];
        getConf().addResource(new Path(externalConfigFileToAdd));
    }

    Path outputPath = new Path(outputPathStr);
    Path mrIncWorkingPath = new Path(mrIncWorkingPathStr);
    FileSystem fileSystem = mrIncWorkingPath.getFileSystem(getConf());

    Path newData = new Path(mrIncWorkingPath, NEW);
    Path inprogressData = new Path(mrIncWorkingPath, INPROGRESS);
    Path completeData = new Path(mrIncWorkingPath, COMPLETE);
    Path fileCache = new Path(mrIncWorkingPath, CACHE);

    fileSystem.mkdirs(newData);
    fileSystem.mkdirs(inprogressData);
    fileSystem.mkdirs(completeData);
    fileSystem.mkdirs(fileCache);

    List<Path> srcPathList = new ArrayList<Path>();
    for (FileStatus fileStatus : fileSystem.listStatus(newData)) {
        srcPathList.add(fileStatus.getPath());
    }
    if (srcPathList.isEmpty()) {
        return 0;
    }

    List<Path> inprogressPathList = new ArrayList<Path>();
    boolean success = false;
    Iface client = null;
    try {
        inprogressPathList = movePathList(fileSystem, inprogressData, srcPathList);

        Job job = Job.getInstance(getConf(), "Blur Row Updater for table [" + table + "]");
        client = BlurClient.getClientFromZooKeeperConnectionStr(blurZkConnection);
        waitForOtherSnapshotsToBeRemoved(client, table, MRUPDATE_SNAPSHOT);
        client.createSnapshot(table, MRUPDATE_SNAPSHOT);
        TableDescriptor descriptor = client.describe(table);
        Path tablePath = new Path(descriptor.getTableUri());

        BlurInputFormat.setLocalCachePath(job, fileCache);
        BlurInputFormat.addTable(job, descriptor, MRUPDATE_SNAPSHOT);
        MultipleInputs.addInputPath(job, tablePath, BlurInputFormat.class, MapperForExistingData.class);
        for (Path p : inprogressPathList) {
            FileInputFormat.addInputPath(job, p);
            MultipleInputs.addInputPath(job, p, SequenceFileInputFormat.class, MapperForNewData.class);
        }

        BlurOutputFormat.setOutputPath(job, outputPath);
        BlurOutputFormat.setupJob(job, descriptor);

        job.setReducerClass(UpdateReducer.class);
        job.setMapOutputKeyClass(IndexKey.class);
        job.setMapOutputValueClass(IndexValue.class);
        job.setPartitionerClass(IndexKeyPartitioner.class);
        job.setGroupingComparatorClass(IndexKeyWritableComparator.class);

        BlurOutputFormat.setReducerMultiplier(job, reducerMultiplier);

        success = job.waitForCompletion(true);
        Counters counters = job.getCounters();
        LOG.info("Counters [" + counters + "]");

    } finally {
        if (success) {
            LOG.info("Indexing job succeeded!");
            movePathList(fileSystem, completeData, inprogressPathList);
        } else {
            LOG.error("Indexing job failed!");
            movePathList(fileSystem, newData, inprogressPathList);
        }
        if (client != null) {
            client.removeSnapshot(table, MRUPDATE_SNAPSHOT);
        }
    }

    if (success) {
        return 0;
    } else {
        return 1;
    }

}

From source file: org.apache.crunch.GroupingOptions.java

License: Apache License

public void configure(Job job) {
    if (partitionerClass != null) {
        job.setPartitionerClass(partitionerClass);
    }
    if (groupingComparatorClass != null) {
        job.setGroupingComparatorClass(groupingComparatorClass);
    }
    if (sortComparatorClass != null) {
        job.setSortComparatorClass(sortComparatorClass);
    }
    if (numReducers > 0) {
        job.setNumReduceTasks(numReducers);
    }
    for (Map.Entry<String, String> e : extraConf.entrySet()) {
        job.getConfiguration().set(e.getKey(), e.getValue());
    }
}
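
These options are typically assembled with GroupingOptions.builder() and then applied to a Job via configure(). A hedged sketch, assuming builder setters that mirror the fields configure() reads:

    // Sketch only: TotalOrderPartitioner is a stock Hadoop partitioner,
    // substituted here for whatever partitioner the pipeline actually needs.
    GroupingOptions options = GroupingOptions.builder()
            .partitionerClass(org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner.class)
            .numReducers(8)
            .build();
    options.configure(job);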

From source file: org.apache.druid.indexer.SortableBytes.java

License: Apache License

public static void useSortableBytesAsMapOutputKey(Job job, Class<? extends Partitioner> partitionerClass) {
    job.setMapOutputKeyClass(BytesWritable.class);
    job.setGroupingComparatorClass(SortableBytesGroupingComparator.class);
    job.setSortComparatorClass(SortableBytesSortingComparator.class);
    job.setPartitionerClass(partitionerClass);
}
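
A caller supplies whatever partitioner fits the job; for example, wiring in Hadoop's stock HashPartitioner would look like this (a sketch, not code from the Druid source):

    // Sketch: pair SortableBytes map output keys with the default-style hash partitioning.
    SortableBytes.useSortableBytesAsMapOutputKey(job,
            org.apache.hadoop.mapreduce.lib.partition.HashPartitioner.class);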

From source file: org.apache.gora.mapreduce.GoraMapper.java

License: Apache License

/**
 * Initializes the Mapper, and sets input parameters for the job. All of
 * the records in the dataStore are used as the input. If you want to
 * include only a specific subset, use one of the overloaded methods
 * which take a query parameter.
 * @param job the job to set the properties for
 * @param dataStoreClass the datastore class
 * @param inKeyClass Map input key class
 * @param inValueClass Map input value class
 * @param outKeyClass Map output key class
 * @param outValueClass Map output value class
 * @param mapperClass the mapper class extending GoraMapper
 * @param partitionerClass optional partitioner class
 * @param reuseObjects whether to reuse objects in serialization
 */
@SuppressWarnings("rawtypes")
public static <K1, V1 extends Persistent, K2, V2> void initMapperJob(Job job,
        Class<? extends DataStore<K1, V1>> dataStoreClass, Class<K1> inKeyClass, Class<V1> inValueClass,
        Class<K2> outKeyClass, Class<V2> outValueClass, Class<? extends GoraMapper> mapperClass,
        Class<? extends Partitioner> partitionerClass, boolean reuseObjects) throws IOException {

    //set the input via GoraInputFormat
    GoraInputFormat.setInput(job, dataStoreClass, inKeyClass, inValueClass, reuseObjects);

    job.setMapperClass(mapperClass);
    job.setMapOutputKeyClass(outKeyClass);
    job.setMapOutputValueClass(outValueClass);

    if (partitionerClass != null) {
        job.setPartitionerClass(partitionerClass);
    }
}
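
A hypothetical invocation of this overload, using Gora's HBase-backed store and the WebPage persistent class as stand-ins for a real project's types:

    // All concrete types here are illustrative assumptions, including MyGoraMapper.
    GoraMapper.initMapperJob(job,
            HBaseStore.class,     // dataStoreClass
            String.class,         // map input key
            WebPage.class,        // map input value
            Text.class,           // map output key
            LongWritable.class,   // map output value
            MyGoraMapper.class,   // mapper extending GoraMapper
            null,                 // null keeps Hadoop's default partitioner
            true);                // reuse objects during serialization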

From source file: org.apache.gora.mapreduce.GoraMapper.java

License: Apache License

/**
 * Initializes the Mapper, and sets input parameters for the job
 * @param job the job to set the properties for
 * @param query the query to get the inputs from
 * @param dataStore the datastore as the input
 * @param outKeyClass Map output key class
 * @param outValueClass Map output value class
 * @param mapperClass the mapper class extending GoraMapper
 * @param partitionerClass optional partitioner class
 * @param reuseObjects whether to reuse objects in serialization
 */
@SuppressWarnings("rawtypes")
public static <K1, V1 extends Persistent, K2, V2> void initMapperJob(Job job, Query<K1, V1> query,
        DataStore<K1, V1> dataStore, Class<K2> outKeyClass, Class<V2> outValueClass,
        Class<? extends GoraMapper> mapperClass, Class<? extends Partitioner> partitionerClass,
        boolean reuseObjects) throws IOException {
    //set the input via GoraInputFormat
    GoraInputFormat.setInput(job, query, dataStore, reuseObjects);

    job.setMapperClass(mapperClass);
    job.setMapOutputKeyClass(outKeyClass);
    job.setMapOutputValueClass(outValueClass);

    if (partitionerClass != null) {
        job.setPartitionerClass(partitionerClass);
    }
}