List of usage examples for org.apache.hadoop.mapreduce.Job.setPartitionerClass
public void setPartitionerClass(Class<? extends Partitioner> cls) throws IllegalStateException
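setPartitionerClass tells the job which Partitioner implementation routes each map output record to a reduce task; if it is never called, Hadoop falls back to HashPartitioner. The IllegalStateException is thrown if the job has already been submitted. As a minimal illustrative sketch (the class name and type parameters below are examples, not taken from the projects that follow), a custom partitioner extends org.apache.hadoop.mapreduce.Partitioner and overrides getPartition:

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class TextHashPartitioner extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        // Mask the sign bit so the modulo result is never negative.
        return (key.hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}

It is wired into a job with:

    job.setPartitionerClass(TextHashPartitioner.class);

The examples below show the call in real projects, most of them using range- or region-aware partitioners to align reducer output with Accumulo tablets or HBase regions.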
From source file: flink.applications.model.fraud.prepare.Projection.java
License: Apache License
@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "Projection and grouping MR";
    job.setJobName(jobName);
    job.setJarByClass(Projection.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    Utility.setConfiguration(job.getConfiguration());
    String operation = job.getConfiguration().get("projection.operation", "project");

    if (operation.startsWith("grouping")) {
        // group by
        job.setMapperClass(Projection.ProjectionMapper.class);
        job.setReducerClass(Projection.ProjectionReducer.class);
        job.setMapOutputKeyClass(Tuple.class);
        job.setMapOutputValueClass(Text.class);
        job.setNumReduceTasks(job.getConfiguration().getInt("num.reducer", 1));

        // order by
        boolean doOrderBy = job.getConfiguration().getInt("orderBy.field", -1) >= 0;
        if (doOrderBy) {
            job.setGroupingComparatorClass(SecondarySort.TuplePairGroupComprator.class);
            job.setPartitionerClass(SecondarySort.TupleTextPartitioner.class);
        }
    } else {
        // simple projection
        job.setMapperClass(Projection.SimpleProjectionMapper.class);
    }

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);
    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
}
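The order-by branch above is the standard secondary-sort wiring: the partitioner and the grouping comparator must both consider only the natural (grouping) part of the composite Tuple key, so that all records of one group reach the same reducer while the sort comparator orders them by the secondary field. The source of SecondarySort.TupleTextPartitioner is not shown here; a sketch of the idea, assuming a hypothetical getBaseKey() accessor that returns the grouping prefix of the Tuple, might look like:

public class NaturalKeyPartitioner extends Partitioner<Tuple, Text> {
    @Override
    public int getPartition(Tuple key, Text value, int numPartitions) {
        // Hash only the grouping prefix, never the ordering field, so all
        // members of a group land on the same reduce task.
        return (key.getBaseKey().hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}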
From source file: gaffer.accumulo.bulkimport.BulkImportDriver.java
License: Apache License
public int run(String[] args) throws Exception {
    // Usage
    if (args.length < 3) {
        System.err.println("Usage: " + BulkImportDriver.class.getName()
                + " <inputpath> <output_path> <accumulo_properties_file>");
        return 1;
    }

    // Get paths
    Path inputPath = new Path(args[0]);
    Path outputPath = new Path(args[1] + "/data_for_accumulo/");
    Path splitsFilePath = new Path(args[1] + "/splits_file");
    String accumuloPropertiesFile = args[2];

    // Hadoop configuration
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    // Connect to Accumulo
    AccumuloConfig accConf = new AccumuloConfig(accumuloPropertiesFile);
    Connector conn = Accumulo.connect(accConf);
    String tableName = accConf.getTable();

    // Check if the table exists
    if (!conn.tableOperations().exists(tableName)) {
        System.err.println("Table " + tableName + " does not exist - create the table before running this");
        return 1;
    }

    // Get the current splits from the table.
    // (This assumes that we have already created the table using InitialiseTable.)
    Collection<Text> splits = conn.tableOperations().getSplits(tableName);
    int numSplits = splits.size();
    System.out.println("Number of splits in table is " + numSplits);

    // Write current splits to a file (this is needed so that the following MapReduce
    // job can move them to the DistributedCache).
    IngestUtils.createSplitsFile(conn, tableName, fs, splitsFilePath);

    // Run MapReduce to output data suitable for bulk import to Accumulo
    // Conf and job
    conf.setBoolean("mapred.compress.map.output", true);
    conf.setClass("mapred.map.output.compression.codec", SnappyCodec.class, CompressionCodec.class);
    Job job = new Job(conf);
    job.setJarByClass(getClass());
    job.setJobName("Convert data to Accumulo format: input = " + inputPath + ", output = " + outputPath);

    // Input
    job.setInputFormatClass(SequenceFileInputFormat.class);
    SequenceFileInputFormat.addInputPath(job, inputPath);

    // Mapper
    job.setMapperClass(BulkImportMapper.class);
    job.setMapOutputKeyClass(Key.class);
    job.setMapOutputValueClass(Value.class);

    // Partitioner
    job.setPartitionerClass(KeyRangePartitioner.class);
    KeyRangePartitioner.setSplitFile(job, splitsFilePath.toString());

    // Reducer
    job.setReducerClass(BulkImportReducer.class);
    job.setOutputKeyClass(Key.class);
    job.setOutputValueClass(Value.class);
    job.setNumReduceTasks(numSplits + 1);

    // Output
    job.setOutputFormatClass(AccumuloFileOutputFormat.class);
    AccumuloFileOutputFormat.setOutputPath(job, outputPath);

    // Run job
    job.waitForCompletion(true);

    // Successful?
    if (!job.isSuccessful()) {
        System.err.println("Error running job");
        return 1;
    }
    return 0;
}
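In this driver the reducer count is tied directly to the partitioner: KeyRangePartitioner routes each Key to the tablet key range it falls in, and a table with numSplits split points defines numSplits + 1 ranges, hence job.setNumReduceTasks(numSplits + 1). For example, a table with 9 split points gives 10 ranges and therefore 10 reducers, so each reducer's output file is aligned to a single tablet for the bulk import.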
From source file: gaffer.accumulostore.operation.hdfs.handler.job.AccumuloAddElementsFromHdfsJobFactory.java
License: Apache License
private void setupPartioner(final Job job, final AddElementsFromHdfs operation, final AccumuloStore store)
        throws IOException {
    String splitsFilePath = operation.getOption(AccumuloStoreConstants.OPERATION_HDFS_SPLITS_FILE);
    int numReduceTasks;
    if (null == splitsFilePath || splitsFilePath.equals("")) {
        splitsFilePath = store.getProperties().getSplitsFilePath();
        try {
            numReduceTasks = IngestUtils.createSplitsFile(store.getConnection(),
                    store.getProperties().getTable(), FileSystem.get(job.getConfiguration()),
                    new Path(splitsFilePath));
        } catch (final StoreException e) {
            throw new RuntimeException(e.getMessage(), e);
        }
    } else {
        numReduceTasks = IngestUtils.getNumSplits(FileSystem.get(job.getConfiguration()),
                new Path(splitsFilePath));
    }
    job.setNumReduceTasks(numReduceTasks + 1);
    job.setPartitionerClass(KeyRangePartitioner.class);
    KeyRangePartitioner.setSplitFile(job, splitsFilePath);
}
From source file: gaffer.accumulostore.operation.hdfs.handler.job.factory.AccumuloAddElementsFromHdfsJobFactory.java
License: Apache License
private void setUpPartitionerGenerateSplitsFile(final Job job, final AddElementsFromHdfs operation,
        final AccumuloStore store) throws IOException {
    final String splitsFilePath = store.getProperties().getSplitsFilePath();
    LOGGER.info("Creating splits file in location {} from table {}", splitsFilePath,
            store.getProperties().getTable());
    final int maxReducers = intOptionIsValid(operation, AccumuloStoreConstants.OPERATION_BULK_IMPORT_MAX_REDUCERS);
    final int minReducers = intOptionIsValid(operation, AccumuloStoreConstants.OPERATION_BULK_IMPORT_MIN_REDUCERS);
    if (maxReducers != -1 && minReducers != -1) {
        if (minReducers > maxReducers) {
            LOGGER.error("Minimum number of reducers must be less than the maximum number of reducers: "
                    + "minimum was {} maximum was {}", minReducers, maxReducers);
            throw new IOException("Minimum number of reducers must be less than the maximum number of reducers");
        }
    }
    int numSplits;
    try {
        if (maxReducers == -1) {
            numSplits = IngestUtils.createSplitsFile(store.getConnection(), store.getProperties().getTable(),
                    FileSystem.get(job.getConfiguration()), new Path(splitsFilePath));
        } else {
            numSplits = IngestUtils.createSplitsFile(store.getConnection(), store.getProperties().getTable(),
                    FileSystem.get(job.getConfiguration()), new Path(splitsFilePath), maxReducers - 1);
        }
    } catch (final StoreException e) {
        throw new RuntimeException(e.getMessage(), e);
    }
    int numReducers = numSplits + 1;
    LOGGER.info("Number of splits is {}; number of reducers is {}", numSplits, numReducers);
    // If neither min nor max is specified then there is nothing to do; if max is specified and min is not,
    // it has already been taken care of. If min is specified and the number of reducers is not greater than
    // that, set the appropriate number of subbins.
    if (minReducers != -1) {
        if (numReducers < minReducers) {
            LOGGER.info("Number of reducers is {} which is less than the specified minimum number of {}",
                    numReducers, minReducers);
            int factor = (minReducers / numReducers) + 1;
            LOGGER.info("Setting number of subbins on KeyRangePartitioner to {}", factor);
            KeyRangePartitioner.setNumSubBins(job, factor);
            numReducers = numReducers * factor;
            LOGGER.info("Number of reducers is {}", numReducers);
        }
    }
    job.setNumReduceTasks(numReducers);
    job.setPartitionerClass(KeyRangePartitioner.class);
    KeyRangePartitioner.setSplitFile(job, splitsFilePath);
}
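A worked example of the sub-bin arithmetic above: if the table has 3 split points, numReducers starts at 4. With minReducers = 10, the check 4 < 10 fires, factor = (10 / 4) + 1 = 3, KeyRangePartitioner is told to split each key range into 3 sub-bins, and the job runs 4 * 3 = 12 reducers, which satisfies the requested minimum.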
From source file: gaffer.accumulostore.operation.hdfs.handler.job.factory.AccumuloAddElementsFromHdfsJobFactory.java
License: Apache License
private void setUpPartitionerFromUserProvidedSplitsFile(final Job job, final AddElementsFromHdfs operation)
        throws IOException {
    final String splitsFilePath = operation.getOption(AccumuloStoreConstants.OPERATION_HDFS_SPLITS_FILE);
    if (intOptionIsValid(operation, AccumuloStoreConstants.OPERATION_BULK_IMPORT_MAX_REDUCERS) != -1
            || intOptionIsValid(operation, AccumuloStoreConstants.OPERATION_BULK_IMPORT_MIN_REDUCERS) != -1) {
        LOGGER.info("Using splits file provided by user {}, ignoring options {} and {}", splitsFilePath,
                AccumuloStoreConstants.OPERATION_BULK_IMPORT_MAX_REDUCERS,
                AccumuloStoreConstants.OPERATION_BULK_IMPORT_MIN_REDUCERS);
    } else {
        LOGGER.info("Using splits file provided by user {}", splitsFilePath);
    }
    final int numSplits = IngestUtils.getNumSplits(FileSystem.get(job.getConfiguration()),
            new Path(splitsFilePath));
    job.setNumReduceTasks(numSplits + 1);
    job.setPartitionerClass(KeyRangePartitioner.class);
    KeyRangePartitioner.setSplitFile(job, splitsFilePath);
}
From source file: gr.ntua.ece.cslab.modissense.queries.clients.GeneralHotIntQueryClient.java
@Override
public void executeQuery() {
    try {
        if (this.createIfNotExist()) { // table exists
            Configuration conf = HBaseConfiguration.create();
            Job job = new Job(conf, "Non personalized hotness interest");
            job.setJarByClass(GeneralHotIntQueryClient.class);

            Scan scan = new Scan();
            scan.setCaching(10000);
            scan.setFilter(new ColumnRangeFilter(Bytes.toBytes(startTimestamp), true,
                    Bytes.toBytes(endTimestamp), true));

            TableMapReduceUtil.initTableMapperJob(this.srcTable, // table name in bytes
                    scan, // scanner to use
                    GeneralHotIntQueryMapper.class, // mapper class
                    LongWritable.class, // key class
                    HotnessInterestWritable.class, // value class
                    job); // job object
            TableMapReduceUtil.initTableReducerJob(this.targetTable, GeneralHotIntQueryReducer.class, job);

            job.setPartitionerClass(HashPartitioner.class);
            job.setCombinerClass(GeneralHotIntQueryCombiner.class);
            job.setNumReduceTasks(4);
            job.setOutputFormatClass(TableOutputFormat.class);
            job.waitForCompletion(true);
        }
        this.openConnection(targetTable);
    } catch (IOException | InterruptedException | ClassNotFoundException ex) {
        Logger.getLogger(GeneralHotIntQueryClient.class.getName()).log(Level.SEVERE, null, ex);
    }
}
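Note that HashPartitioner is already Hadoop's default partitioner, so the explicit setPartitionerClass call here mainly documents intent: the hotness/interest aggregates are spread over the four reducers purely by key hash, with no region-aware routing of the kind HRegionPartitioner would provide.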
From source file: gr.ntua.h2rdf.byteImport.HexastoreBulkImport.java
License: Open Source License
public Job createSubmittableJob(String[] args) {
    TABLE_NAME = args[1];
    Job job = null;
    try {
        Configuration conf = new Configuration();
        conf.addResource("hbase-default.xml");
        conf.addResource("hbase-site.xml");
        job = new Job(conf, NAME);
        job.setJarByClass(HexastoreBulkImport.class);
        job.setMapperClass(TotalOrderPrep.Map.class);
        job.setReducerClass(Reduce.class); // sampler.HamaReducer.class);
        job.setCombinerClass(Combiner.class);
        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(ImmutableBytesWritable.class);
        job.setPartitionerClass(TotalOrderPartitioner.class);
        //TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), new Path("/user/npapa/"+regions+"partitions/part-r-00000"));
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), new Path("partitions/part-r-00000"));
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(HFileOutputFormat.class);
        Path out = new Path("out");
        FileOutputFormat.setOutputPath(job, out);
        FileSystem fs;
        try {
            fs = FileSystem.get(conf);
            if (fs.exists(out)) {
                fs.delete(out, true);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        // c.addResource(new Path("/0/arcomemDB/hadoop-0.20.2-cdh3u3/conf/hbase-site.xml"));
        HBaseAdmin hadmin = new HBaseAdmin(conf);
        HTableDescriptor desc = new HTableDescriptor(TABLE_NAME + "_stats");
        HColumnDescriptor family = new HColumnDescriptor("size");
        desc.addFamily(family);
        conf.setInt("zookeeper.session.timeout", 600000);
        if (hadmin.tableExists(TABLE_NAME + "_stats")) {
            //hadmin.disableTable(TABLE_NAME+"_stats");
            //hadmin.deleteTable(TABLE_NAME+"_stats");
        } else {
            hadmin.createTable(desc);
        }
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        //job.getConfiguration().setInt("mapred.map.tasks", 18);
        job.getConfiguration().set("h2rdf.tableName", TABLE_NAME);
        job.getConfiguration().setInt("mapred.reduce.tasks", (int) TotalOrderPrep.regions);
        job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
        job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
        job.getConfiguration().setInt("io.sort.mb", 100);
        job.getConfiguration().setInt("io.file.buffer.size", 131072);
        job.getConfiguration().setInt("mapred.job.reuse.jvm.num.tasks", -1);
        //job.getConfiguration().setInt("hbase.hregion.max.filesize", 67108864);
        job.getConfiguration().setInt("hbase.hregion.max.filesize", 33554432);
        //job.getConfiguration().setInt("io.sort.mb", 100);
    } catch (IOException e2) {
        e2.printStackTrace();
    }
    return job;
}
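TotalOrderPartitioner produces globally sorted output across all reducers by routing each key according to split points read from a precomputed partition file; here that file (partitions/part-r-00000) comes from the separate TotalOrderPrep sampling job. For comparison, when a job's input keys already match its map output keys, Hadoop's own InputSampler can generate the partition file in-process. A minimal sketch under that assumption (the sampling parameters and SequenceFile input are illustrative):

// Assumes a SequenceFileInputFormat whose keys equal the map output keys.
InputSampler.Sampler<ImmutableBytesWritable, ImmutableBytesWritable> sampler =
        new InputSampler.RandomSampler<>(0.01, 1000, 10); // freq, numSamples, maxSplitsSampled
TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), new Path("partitions/part-r-00000"));
InputSampler.writePartitionFile(job, sampler); // writes split points for the configured reducer count
job.setPartitionerClass(TotalOrderPartitioner.class);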
From source file: gr.ntua.h2rdf.inputFormat.TableMapReduceUtil.java
License: Open Source License
/**
 * Use this before submitting a TableReduce job. It will
 * appropriately set up the JobConf.
 *
 * @param table The output table.
 * @param reducer The reducer class to use.
 * @param job The current job to adjust.
 * @param partitioner Partitioner to use. Pass <code>null</code> to use
 *   default partitioner.
 * @throws IOException When determining the region count fails.
 */
public static void initTableReducerJob(String table, Class<? extends TableReducer> reducer, Job job,
        Class partitioner) throws IOException {
    job.setOutputFormatClass(TableOutputFormat.class);
    if (reducer != null)
        job.setReducerClass(reducer);
    job.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, table);
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(Writable.class);
    if (partitioner == HRegionPartitioner.class) {
        job.setPartitionerClass(HRegionPartitioner.class);
        HTable outputTable = new HTable(new HBaseConfiguration(job.getConfiguration()), table);
        int regions = outputTable.getRegionsInfo().size();
        if (job.getNumReduceTasks() > regions) {
            job.setNumReduceTasks(outputTable.getRegionsInfo().size());
        }
    } else if (partitioner != null) {
        job.setPartitionerClass(partitioner);
    }
}
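A hypothetical call site for this four-argument overload (table and reducer names are illustrative): passing HRegionPartitioner both sets the partitioner and caps the reducer count at the table's region count, so no reducer targets a non-existent region.

TableMapReduceUtil.initTableReducerJob("results", MyTableReducer.class, job, HRegionPartitioner.class);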
From source file: gr.ntua.h2rdf.inputFormat2.TableMapReduceUtil.java
License: Open Source License
/**
 * Use this before submitting a TableReduce job. It will
 * appropriately set up the JobConf.
 *
 * @param table The output table.
 * @param reducer The reducer class to use.
 * @param job The current job to adjust. Make sure the passed job is
 *   carrying all necessary HBase configuration.
 * @param partitioner Partitioner to use. Pass <code>null</code> to use
 *   default partitioner.
 * @param quorumAddress Distant cluster to write to; default is null for
 *   output to the cluster that is designated in <code>hbase-site.xml</code>.
 *   Set this String to the zookeeper ensemble of an alternate remote cluster
 *   when you would have the reduce write a cluster that is other than the
 *   default; e.g. copying tables between clusters, the source would be
 *   designated by <code>hbase-site.xml</code> and this param would have the
 *   ensemble address of the remote cluster. The format to pass is particular.
 *   Pass <code><hbase.zookeeper.quorum>:<hbase.zookeeper.client.port>:<zookeeper.znode.parent></code>
 *   such as <code>server,server2,server3:2181:/hbase</code>.
 * @param serverClass redefined hbase.regionserver.class
 * @param serverImpl redefined hbase.regionserver.impl
 * @param addDependencyJars upload HBase jars and jars for any of the configured
 *   job classes via the distributed cache (tmpjars).
 * @throws IOException When determining the region count fails.
 */
public static void initTableReducerJob(String table, Class<? extends TableReducer> reducer, Job job,
        Class partitioner, String quorumAddress, String serverClass, String serverImpl,
        boolean addDependencyJars) throws IOException {
    Configuration conf = job.getConfiguration();
    HBaseConfiguration.merge(conf, HBaseConfiguration.create(conf));
    job.setOutputFormatClass(TableOutputFormat.class);
    if (reducer != null)
        job.setReducerClass(reducer);
    conf.set(TableOutputFormat.OUTPUT_TABLE, table);
    // If passed a quorum/ensemble address, pass it on to TableOutputFormat.
    if (quorumAddress != null) {
        // Calling this will validate the format
        ZKUtil.transformClusterKey(quorumAddress);
        conf.set(TableOutputFormat.QUORUM_ADDRESS, quorumAddress);
    }
    if (serverClass != null && serverImpl != null) {
        conf.set(TableOutputFormat.REGION_SERVER_CLASS, serverClass);
        conf.set(TableOutputFormat.REGION_SERVER_IMPL, serverImpl);
    }
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(Writable.class);
    if (partitioner == HRegionPartitioner.class) {
        job.setPartitionerClass(HRegionPartitioner.class);
        HTable outputTable = new HTable(conf, table);
        int regions = outputTable.getRegionsInfo().size();
        if (job.getNumReduceTasks() > regions) {
            job.setNumReduceTasks(outputTable.getRegionsInfo().size());
        }
    } else if (partitioner != null) {
        job.setPartitionerClass(partitioner);
    }
    if (addDependencyJars) {
        addDependencyJars(job);
    }
    initCredentials(job);
}
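A hypothetical use of this extended overload, writing to a remote cluster (table, reducer, and addresses are illustrative; the cluster key format follows the javadoc above):

TableMapReduceUtil.initTableReducerJob("backup_table", CopyReducer.class, job,
        null,                      // keep the default partitioner
        "zk1,zk2,zk3:2181:/hbase", // remote ensemble:clientPort:znodeParent
        null, null,                // no regionserver class/impl override
        true);                     // ship dependency jars via the distributed cache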
From source file: gr.ntua.h2rdf.LoadTriples.DistinctIds.java
License: Open Source License
public Job createSubmittableJob(String[] args) throws IOException, ClassNotFoundException {
    //io.compression.codecs
    Job job = new Job();
    job.setInputFormatClass(TextInputFormat.class);
    Configuration conf = new Configuration();
    Path blockProjection = new Path("blockIds/");
    Path translations = new Path("translations/");
    Path sample = new Path("sample/");
    Path temp = new Path("temp/");
    Path uniqueIds = new Path("uniqueIds/");
    FileSystem fs;
    try {
        fs = FileSystem.get(conf);
        if (fs.exists(uniqueIds)) {
            fs.delete(uniqueIds, true);
        }
        if (fs.exists(translations)) {
            fs.delete(translations, true);
        }
        if (fs.exists(blockProjection)) {
            fs.delete(blockProjection, true);
        }
        if (fs.exists(sample)) {
            fs.delete(sample, true);
        }
        if (fs.exists(temp)) {
            fs.delete(temp, true);
        }
        FileOutputFormat.setOutputPath(job, uniqueIds);
        Path inp = new Path(args[0]);
        FileInputFormat.setInputPaths(job, inp);

        double type = 1;
        double datasetSize = 0;
        if (fs.isFile(inp)) {
            datasetSize = fs.getFileStatus(inp).getLen();
        } else if (fs.isDirectory(inp)) {
            FileStatus[] s = fs.listStatus(inp);
            for (int i = 0; i < s.length; i++) {
                if (s[i].getPath().getName().toString().endsWith(".gz"))
                    type = 27;
                if (s[i].getPath().getName().toString().endsWith(".snappy"))
                    type = 10;
                datasetSize += s[i].getLen();
            }
        } else {
            FileStatus[] s = fs.globStatus(inp);
            for (int i = 0; i < s.length; i++) {
                if (s[i].getPath().getName().toString().endsWith(".gz"))
                    type = 27;
                if (s[i].getPath().getName().toString().endsWith(".snappy"))
                    type = 10;
                datasetSize += s[i].getLen();
            }
        }
        datasetSize = datasetSize * type;
        System.out.println("type: " + type);
        System.out.println("datasetSize: " + datasetSize);
        samplingRate = (double) sampleChunk / (double) datasetSize;
        if (samplingRate >= 0.1) {
            samplingRate = 0.1;
        }
        if (samplingRate <= 0.001) {
            samplingRate = 0.001;
        }
        numReducers = (int) (datasetSize / ReducerChunk);
        if (numReducers == 0)
            numReducers = 1;
        numReducers++;
    } catch (IOException e) {
        e.printStackTrace();
    }

    HBaseAdmin hadmin = new HBaseAdmin(conf);
    HTableDescriptor desc = new HTableDescriptor(TABLE_NAME);
    HColumnDescriptor family = new HColumnDescriptor("counter");
    desc.addFamily(family);
    if (!hadmin.tableExists(TABLE_NAME)) {
        hadmin.createTable(desc);
    }

    job.setNumReduceTasks(numReducers);
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(ImmutableBytesWritable.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setJarByClass(DistinctIds.class);
    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);
    job.setPartitionerClass(SamplingPartitioner.class);

    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    job.getConfiguration().set("mapred.compress.map.output", "true");
    job.getConfiguration().set("mapred.map.output.compression.codec",
            "org.apache.hadoop.io.compress.SnappyCodec");
    //job.setCombinerClass(Combiner.class);
    job.setJobName("Distinct Id Wordcount");
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
    job.getConfiguration().setInt("io.sort.mb", 100);
    job.getConfiguration().setInt("io.file.buffer.size", 131072);
    job.getConfiguration().setInt("mapred.job.reuse.jvm.num.tasks", -1);
    return job;
}
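Here both the reducer count and the sampling rate are derived from an estimated uncompressed dataset size: compressed inputs are scaled by a fixed expansion factor (27 for .gz, 10 for .snappy). The fields sampleChunk and ReducerChunk are not shown in this snippet; assuming, purely for illustration, sampleChunk = 64 MB and ReducerChunk = 256 MB, a 1 GB gzipped input is estimated at 27 GB, giving samplingRate = 64 MB / 27 GB ≈ 0.0023 (inside the [0.001, 0.1] clamp) and numReducers = (int) (27 GB / 256 MB) + 1 = 109. The project-specific SamplingPartitioner (source not shown) then assigns keys to those reducers.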