Example usage for org.apache.hadoop.mapreduce Job setPartitionerClass

Introduction

On this page you can find example usage of org.apache.hadoop.mapreduce Job#setPartitionerClass.

Prototype

public void setPartitionerClass(Class<? extends Partitioner> cls) throws IllegalStateException 

Document

Set the Partitioner for the job.
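
Before the project examples below, here is a minimal, self-contained sketch of the call. The WordLengthPartitioner and the job wiring are hypothetical, written for illustration rather than taken from any of the sources listed under Usage:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Partitioner;

public class PartitionerExample {

    // Hypothetical partitioner: routes each word to a reducer by its length,
    // so all keys of the same length land in the same reduce task.
    public static class WordLengthPartitioner extends Partitioner<Text, IntWritable> {
        @Override
        public int getPartition(Text key, IntWritable value, int numPartitions) {
            // Mask off the sign bit before taking the modulus, as HashPartitioner does.
            return (key.getLength() & Integer.MAX_VALUE) % numPartitions;
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "partitioner example");
        job.setJarByClass(PartitionerExample.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Must be set before the job is submitted; calling it on a running
        // job throws IllegalStateException, per the prototype above.
        job.setPartitionerClass(WordLengthPartitioner.class);
        job.setNumReduceTasks(4);
        // Mapper, reducer, and input/output paths would be configured here
        // before calling job.waitForCompletion(true).
    }
}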

Usage

From source file: org.apache.accumulo.examples.mapreduce.bulk.BulkIngestExample.java

License: Apache License

@Override
public int run(String[] args) {
    Opts opts = new Opts();
    opts.parseArgs(BulkIngestExample.class.getName(), args);

    Configuration conf = getConf();
    PrintStream out = null;
    try {
        Job job = Job.getInstance(conf);
        job.setJobName("bulk ingest example");
        job.setJarByClass(this.getClass());

        job.setInputFormatClass(TextInputFormat.class);

        job.setMapperClass(MapClass.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(ReduceClass.class);
        job.setOutputFormatClass(AccumuloFileOutputFormat.class);
        opts.setAccumuloConfigs(job);

        Connector connector = opts.getConnector();

        TextInputFormat.setInputPaths(job, new Path(opts.inputDir));
        AccumuloFileOutputFormat.setOutputPath(job, new Path(opts.workDir + "/files"));

        FileSystem fs = FileSystem.get(conf);
        out = new PrintStream(new BufferedOutputStream(fs.create(new Path(opts.workDir + "/splits.txt"))));

        Collection<Text> splits = connector.tableOperations().listSplits(opts.getTableName(), 100);
        for (Text split : splits)
            out.println(Base64.getEncoder().encodeToString(TextUtil.getBytes(split)));

        // N split points define N + 1 ranges, so run one reducer per range
        job.setNumReduceTasks(splits.size() + 1);
        out.close();

        // Partition reduce output by the table's split points so that each
        // reducer writes rfiles aligned with a single tablet's range
        job.setPartitionerClass(RangePartitioner.class);
        RangePartitioner.setSplitFile(job, opts.workDir + "/splits.txt");

        job.waitForCompletion(true);
        Path failures = new Path(opts.workDir, "failures");
        fs.delete(failures, true);
        fs.mkdirs(new Path(opts.workDir, "failures"));
        // With HDFS permissions on, we need to make sure the Accumulo user can read/move the rfiles
        FsShell fsShell = new FsShell(conf);
        fsShell.run(new String[] { "-chmod", "-R", "777", opts.workDir });
        connector.tableOperations().importDirectory(opts.getTableName(), opts.workDir + "/files",
                opts.workDir + "/failures", false);

    } catch (Exception e) {
        throw new RuntimeException(e);
    } finally {
        if (out != null)
            out.close();
    }

    return 0;
}

From source file: org.apache.accumulo.examples.simple.mapreduce.bulk.BulkIngestExample.java

License: Apache License

public int run(String[] args) {
    if (args.length != 7) {
        System.out.println("ERROR: Wrong number of parameters: " + args.length + " instead of 7.");
        return printUsage();
    }

    Configuration conf = getConf();
    PrintStream out = null;
    try {
        Job job = new Job(conf, "bulk ingest example");
        job.setJarByClass(this.getClass());

        job.setInputFormatClass(TextInputFormat.class);

        job.setMapperClass(MapClass.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(ReduceClass.class);
        job.setOutputFormatClass(AccumuloFileOutputFormat.class);

        Instance instance = new ZooKeeperInstance(args[0], args[1]);
        String user = args[2];
        byte[] pass = args[3].getBytes();
        String tableName = args[4];
        String inputDir = args[5];
        String workDir = args[6];

        Connector connector = instance.getConnector(user, pass);

        TextInputFormat.setInputPaths(job, new Path(inputDir));
        AccumuloFileOutputFormat.setOutputPath(job, new Path(workDir + "/files"));

        FileSystem fs = FileSystem.get(conf);
        out = new PrintStream(new BufferedOutputStream(fs.create(new Path(workDir + "/splits.txt"))));

        Collection<Text> splits = connector.tableOperations().getSplits(tableName, 100);
        for (Text split : splits)
            out.println(new String(Base64.encodeBase64(TextUtil.getBytes(split))));

        job.setNumReduceTasks(splits.size() + 1);
        out.close();

        job.setPartitionerClass(RangePartitioner.class);
        RangePartitioner.setSplitFile(job, workDir + "/splits.txt");

        job.waitForCompletion(true);
        Path failures = new Path(workDir, "failures");
        fs.delete(failures, true);
        fs.mkdirs(new Path(workDir, "failures"));
        connector.tableOperations().importDirectory(tableName, workDir + "/files", workDir + "/failures",
                false);

    } catch (Exception e) {
        throw new RuntimeException(e);
    } finally {
        if (out != null)
            out.close();
    }

    return 0;
}

From source file: org.apache.accumulo.examples.simple.mapreduce.bulk.BulkIngestExample.java

License: Apache License

@Override
public int run(String[] args) {
    Opts opts = new Opts();
    opts.parseArgs(BulkIngestExample.class.getName(), args);

    Configuration conf = getConf();
    PrintStream out = null;
    try {
        Job job = JobUtil.getJob(conf);
        job.setJobName("bulk ingest example");
        job.setJarByClass(this.getClass());

        job.setInputFormatClass(TextInputFormat.class);

        job.setMapperClass(MapClass.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(ReduceClass.class);
        job.setOutputFormatClass(AccumuloFileOutputFormat.class);
        opts.setAccumuloConfigs(job);

        Connector connector = opts.getConnector();

        TextInputFormat.setInputPaths(job, new Path(opts.inputDir));
        AccumuloFileOutputFormat.setOutputPath(job, new Path(opts.workDir + "/files"));

        FileSystem fs = FileSystem.get(conf);
        out = new PrintStream(new BufferedOutputStream(fs.create(new Path(opts.workDir + "/splits.txt"))));

        Collection<Text> splits = connector.tableOperations().listSplits(opts.getTableName(), 100);
        for (Text split : splits)
            out.println(Base64.encodeBase64String(TextUtil.getBytes(split)));

        job.setNumReduceTasks(splits.size() + 1);
        out.close();

        job.setPartitionerClass(RangePartitioner.class);
        RangePartitioner.setSplitFile(job, opts.workDir + "/splits.txt");

        job.waitForCompletion(true);
        Path failures = new Path(opts.workDir, "failures");
        fs.delete(failures, true);
        fs.mkdirs(new Path(opts.workDir, "failures"));
        connector.tableOperations().importDirectory(opts.getTableName(), opts.workDir + "/files",
                opts.workDir + "/failures", false);

    } catch (Exception e) {
        throw new RuntimeException(e);
    } finally {
        if (out != null)
            out.close();
    }

    return 0;
}

From source file: org.apache.accumulo.server.test.randomwalk.shard.SortTool.java

License: Apache License

public int run(String[] args) throws Exception {
    Job job = new Job(getConf(), this.getClass().getSimpleName());
    job.setJarByClass(this.getClass());

    if (job.getJar() == null) {
        log.error("M/R requires a jar file!  Run mvn package.");
        return 1;
    }

    job.setInputFormatClass(SequenceFileInputFormat.class);
    SequenceFileInputFormat.setInputPaths(job, seqFile);

    job.setPartitionerClass(KeyRangePartitioner.class);
    KeyRangePartitioner.setSplitFile(job, splitFile);

    job.setMapOutputKeyClass(Key.class);
    job.setMapOutputValueClass(Value.class);

    job.setNumReduceTasks(splits.size() + 1);

    job.setOutputFormatClass(AccumuloFileOutputFormat.class);
    AccumuloFileOutputFormat.setOutputPath(job, new Path(outputDir));

    job.waitForCompletion(true);
    return job.isSuccessful() ? 0 : 1;
}

From source file: org.apache.accumulo.test.randomwalk.shard.SortTool.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf(), this.getClass().getSimpleName());
    job.setJarByClass(this.getClass());

    if (job.getJar() == null) {
        log.error("M/R requires a jar file!  Run mvn package.");
        return 1;
    }

    job.setInputFormatClass(SequenceFileInputFormat.class);
    SequenceFileInputFormat.setInputPaths(job, seqFile);

    job.setPartitionerClass(KeyRangePartitioner.class);
    KeyRangePartitioner.setSplitFile(job, splitFile);

    job.setMapOutputKeyClass(Key.class);
    job.setMapOutputValueClass(Value.class);

    job.setNumReduceTasks(splits.size() + 1);

    job.setOutputFormatClass(AccumuloFileOutputFormat.class);
    AccumuloFileOutputFormat.setOutputPath(job, new Path(outputDir));

    job.waitForCompletion(true);
    return job.isSuccessful() ? 0 : 1;
}

From source file: org.apache.blur.mapreduce.lib.update.Driver.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    int c = 0;
    if (args.length < 5) {
        System.err.println(
                "Usage Driver <table> <mr inc working path> <output path> <zk connection> <reducer multiplier> <extra config files...>");
        return 1;
    }
    String table = args[c++];
    String mrIncWorkingPathStr = args[c++];
    String outputPathStr = args[c++];
    String blurZkConnection = args[c++];
    int reducerMultiplier = Integer.parseInt(args[c++]);
    for (; c < args.length; c++) {
        String externalConfigFileToAdd = args[c];
        getConf().addResource(new Path(externalConfigFileToAdd));
    }

    Path outputPath = new Path(outputPathStr);
    Path mrIncWorkingPath = new Path(mrIncWorkingPathStr);
    FileSystem fileSystem = mrIncWorkingPath.getFileSystem(getConf());

    Path newData = new Path(mrIncWorkingPath, NEW);
    Path inprogressData = new Path(mrIncWorkingPath, INPROGRESS);
    Path completeData = new Path(mrIncWorkingPath, COMPLETE);
    Path fileCache = new Path(mrIncWorkingPath, CACHE);

    fileSystem.mkdirs(newData);
    fileSystem.mkdirs(inprogressData);
    fileSystem.mkdirs(completeData);
    fileSystem.mkdirs(fileCache);

    List<Path> srcPathList = new ArrayList<Path>();
    for (FileStatus fileStatus : fileSystem.listStatus(newData)) {
        srcPathList.add(fileStatus.getPath());
    }
    if (srcPathList.isEmpty()) {
        return 0;
    }

    List<Path> inprogressPathList = new ArrayList<Path>();
    boolean success = false;
    Iface client = null;
    try {
        inprogressPathList = movePathList(fileSystem, inprogressData, srcPathList);

        Job job = Job.getInstance(getConf(), "Blur Row Updater for table [" + table + "]");
        client = BlurClient.getClientFromZooKeeperConnectionStr(blurZkConnection);
        waitForOtherSnapshotsToBeRemoved(client, table, MRUPDATE_SNAPSHOT);
        client.createSnapshot(table, MRUPDATE_SNAPSHOT);
        TableDescriptor descriptor = client.describe(table);
        Path tablePath = new Path(descriptor.getTableUri());

        BlurInputFormat.setLocalCachePath(job, fileCache);
        BlurInputFormat.addTable(job, descriptor, MRUPDATE_SNAPSHOT);
        MultipleInputs.addInputPath(job, tablePath, BlurInputFormat.class, MapperForExistingData.class);
        for (Path p : inprogressPathList) {
            FileInputFormat.addInputPath(job, p);
            MultipleInputs.addInputPath(job, p, SequenceFileInputFormat.class, MapperForNewData.class);
        }

        BlurOutputFormat.setOutputPath(job, outputPath);
        BlurOutputFormat.setupJob(job, descriptor);

        job.setReducerClass(UpdateReducer.class);
        job.setMapOutputKeyClass(IndexKey.class);
        job.setMapOutputValueClass(IndexValue.class);
        job.setPartitionerClass(IndexKeyPartitioner.class);
        job.setGroupingComparatorClass(IndexKeyWritableComparator.class);

        BlurOutputFormat.setReducerMultiplier(job, reducerMultiplier);

        success = job.waitForCompletion(true);
        Counters counters = job.getCounters();
        LOG.info("Counters [" + counters + "]");

    } finally {
        if (success) {
            LOG.info("Indexing job succeeded!");
            movePathList(fileSystem, completeData, inprogressPathList);
        } else {
            LOG.error("Indexing job failed!");
            movePathList(fileSystem, newData, inprogressPathList);
        }
        if (client != null) {
            client.removeSnapshot(table, MRUPDATE_SNAPSHOT);
        }
    }

    if (success) {
        return 0;
    } else {
        return 1;
    }

}

From source file: org.apache.crunch.GroupingOptions.java

License: Apache License

public void configure(Job job) {
    if (partitionerClass != null) {
        job.setPartitionerClass(partitionerClass);
    }
    if (groupingComparatorClass != null) {
        job.setGroupingComparatorClass(groupingComparatorClass);
    }
    if (sortComparatorClass != null) {
        job.setSortComparatorClass(sortComparatorClass);
    }
    if (numReducers > 0) {
        job.setNumReduceTasks(numReducers);
    }
    for (Map.Entry<String, String> e : extraConf.entrySet()) {
        job.getConfiguration().set(e.getKey(), e.getValue());
    }
}
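
These options are typically assembled with GroupingOptions.builder() and then applied to a Job via configure(). A hedged sketch, assuming builder setters that mirror the fields configure() reads:

    // Sketch only: TotalOrderPartitioner is a stock Hadoop partitioner,
    // substituted here for whatever partitioner the pipeline actually needs.
    GroupingOptions options = GroupingOptions.builder()
            .partitionerClass(org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner.class)
            .numReducers(8)
            .build();
    options.configure(job);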

From source file: org.apache.druid.indexer.SortableBytes.java

License: Apache License

public static void useSortableBytesAsMapOutputKey(Job job, Class<? extends Partitioner> partitionerClass) {
    job.setMapOutputKeyClass(BytesWritable.class);
    job.setGroupingComparatorClass(SortableBytesGroupingComparator.class);
    job.setSortComparatorClass(SortableBytesSortingComparator.class);
    job.setPartitionerClass(partitionerClass);
}
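
A caller supplies whatever partitioner fits the job; for example, wiring in Hadoop's stock HashPartitioner would look like this (a sketch, not code from the Druid source):

    // Sketch: pair SortableBytes map output keys with the default-style hash partitioning.
    SortableBytes.useSortableBytesAsMapOutputKey(job,
            org.apache.hadoop.mapreduce.lib.partition.HashPartitioner.class);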

From source file: org.apache.gora.mapreduce.GoraMapper.java

License: Apache License

/**
 * Initializes the Mapper, and sets input parameters for the job. All of
 * the records in the dataStore are used as the input. If you want to
 * include only a specific subset, use one of the overloaded methods
 * which take a query parameter.
 * @param job the job to set the properties for
 * @param dataStoreClass the datastore class
 * @param inKeyClass Map input key class
 * @param inValueClass Map input value class
 * @param outKeyClass Map output key class
 * @param outValueClass Map output value class
 * @param mapperClass the mapper class extending GoraMapper
 * @param partitionerClass optional partitioner class
 * @param reuseObjects whether to reuse objects in serialization
 */
@SuppressWarnings("rawtypes")
public static <K1, V1 extends Persistent, K2, V2> void initMapperJob(Job job,
        Class<? extends DataStore<K1, V1>> dataStoreClass, Class<K1> inKeyClass, Class<V1> inValueClass,
        Class<K2> outKeyClass, Class<V2> outValueClass, Class<? extends GoraMapper> mapperClass,
        Class<? extends Partitioner> partitionerClass, boolean reuseObjects) throws IOException {

    //set the input via GoraInputFormat
    GoraInputFormat.setInput(job, dataStoreClass, inKeyClass, inValueClass, reuseObjects);

    job.setMapperClass(mapperClass);
    job.setMapOutputKeyClass(outKeyClass);
    job.setMapOutputValueClass(outValueClass);

    if (partitionerClass != null) {
        job.setPartitionerClass(partitionerClass);
    }
}
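
A hypothetical invocation of this overload, using Gora's HBase-backed store and the WebPage persistent class as stand-ins for a real project's types:

    // All concrete types here are illustrative assumptions, including MyGoraMapper.
    GoraMapper.initMapperJob(job,
            HBaseStore.class,     // dataStoreClass
            String.class,         // map input key
            WebPage.class,        // map input value
            Text.class,           // map output key
            LongWritable.class,   // map output value
            MyGoraMapper.class,   // mapper extending GoraMapper
            null,                 // null keeps Hadoop's default partitioner
            true);                // reuse objects during serialization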

From source file: org.apache.gora.mapreduce.GoraMapper.java

License: Apache License

/**
 * Initializes the Mapper, and sets input parameters for the job
 * @param job the job to set the properties for
 * @param query the query to get the inputs from
 * @param dataStore the datastore as the input
 * @param outKeyClass Map output key class
 * @param outValueClass Map output value class
 * @param mapperClass the mapper class extending GoraMapper
 * @param partitionerClass optional partitioner class
 * @param reuseObjects whether to reuse objects in serialization
 */
@SuppressWarnings("rawtypes")
public static <K1, V1 extends Persistent, K2, V2> void initMapperJob(Job job, Query<K1, V1> query,
        DataStore<K1, V1> dataStore, Class<K2> outKeyClass, Class<V2> outValueClass,
        Class<? extends GoraMapper> mapperClass, Class<? extends Partitioner> partitionerClass,
        boolean reuseObjects) throws IOException {
    //set the input via GoraInputFormat
    GoraInputFormat.setInput(job, query, dataStore, reuseObjects);

    job.setMapperClass(mapperClass);
    job.setMapOutputKeyClass(outKeyClass);
    job.setMapOutputValueClass(outValueClass);

    if (partitionerClass != null) {
        job.setPartitionerClass(partitionerClass);
    }
}