Example usage for org.apache.hadoop.mapreduce Job setPartitionerClass

List of usage examples for org.apache.hadoop.mapreduce Job setPartitionerClass

Introduction

On this page you can find example usage of org.apache.hadoop.mapreduce Job setPartitionerClass.

Prototype

public void setPartitionerClass(Class<? extends Partitioner> cls) throws IllegalStateException 

Document

Set the Partitioner for the job.
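
The partitioner decides which reduce task receives each map output key, and setPartitionerClass must be called before the job is submitted, otherwise it throws IllegalStateException. As a minimal, hypothetical sketch (PrefixPartitioner and the "prefix:rest" key layout are illustrative and not taken from the examples that follow), a custom partitioner is a subclass of org.apache.hadoop.mapreduce.Partitioner registered on the Job:

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Hypothetical partitioner: routes each key by the text before the first ':',
// so all keys sharing a prefix are processed by the same reducer.
public class PrefixPartitioner extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        String prefix = key.toString().split(":", 2)[0];
        // Mask the sign bit so a negative hashCode still yields a valid partition.
        return (prefix.hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}

In a driver this would be wired up with job.setPartitionerClass(PrefixPartitioner.class), alongside matching map output key and value classes, as the examples below illustrate.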

Usage

From source file:flink.applications.model.fraud.prepare.Projection.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "Projection  and grouping  MR";
    job.setJobName(jobName);

    job.setJarByClass(Projection.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    Utility.setConfiguration(job.getConfiguration());
    String operation = job.getConfiguration().get("projection.operation", "project");

    if (operation.startsWith("grouping")) {
        //group by
        job.setMapperClass(Projection.ProjectionMapper.class);
        job.setReducerClass(Projection.ProjectionReducer.class);

        job.setMapOutputKeyClass(Tuple.class);
        job.setMapOutputValueClass(Text.class);

        job.setNumReduceTasks(job.getConfiguration().getInt("num.reducer", 1));

        //order by
        boolean doOrderBy = job.getConfiguration().getInt("orderBy.field", -1) >= 0;
        if (doOrderBy) {
            job.setGroupingComparatorClass(SecondarySort.TuplePairGroupComprator.class);
            job.setPartitionerClass(SecondarySort.TupleTextPartitioner.class);
        }

    } else {
        //simple projection
        job.setMapperClass(Projection.SimpleProjectionMapper.class);
    }

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
}
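
The SecondarySort.TupleTextPartitioner used for the order-by branch above is project-specific and not reproduced on this page. As a rough sketch of the secondary-sort pattern it supports (the Text key type and the "group|order" key layout are assumptions, not the project's actual format), the partitioner hashes only the grouping portion of the composite key, so every record of a group reaches the same reducer while the sort order and grouping comparator handle the ordering:

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Hypothetical secondary-sort partitioner: partition on the natural (grouping)
// part of a composite "group|order" key only, ignoring the order-by part.
public class GroupFieldPartitioner extends Partitioner<Text, Text> {
    @Override
    public int getPartition(Text key, Text value, int numPartitions) {
        String groupField = key.toString().split("\\|", 2)[0];
        return (groupField.hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}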

From source file:gaffer.accumulo.bulkimport.BulkImportDriver.java

License:Apache License

public int run(String[] args) throws Exception {
    // Usage
    if (args.length < 3) {
        System.err.println("Usage: " + BulkImportDriver.class.getName()
                + " <inputpath> <output_path> <accumulo_properties_file>");
        return 1;
    }

    // Gets paths
    Path inputPath = new Path(args[0]);
    Path outputPath = new Path(args[1] + "/data_for_accumulo/");
    Path splitsFilePath = new Path(args[1] + "/splits_file");
    String accumuloPropertiesFile = args[2];

    // Hadoop configuration
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    // Connect to Accumulo
    AccumuloConfig accConf = new AccumuloConfig(accumuloPropertiesFile);
    Connector conn = Accumulo.connect(accConf);
    String tableName = accConf.getTable();

    // Check if the table exists
    if (!conn.tableOperations().exists(tableName)) {
        System.err.println("Table " + tableName + " does not exist - create the table before running this");
        return 1;
    }

    // Get the current splits from the table.
    // (This assumes that we have already created the table using <code>InitialiseTable</code>.)
    Collection<Text> splits = conn.tableOperations().getSplits(tableName);
    int numSplits = splits.size();
    System.out.println("Number of splits in table is " + numSplits);

    // Write current splits to a file (this is needed so that the following MapReduce
    // job can move them to the DistributedCache).
    IngestUtils.createSplitsFile(conn, tableName, fs, splitsFilePath);

    // Run MapReduce to output data suitable for bulk import to Accumulo
    // Conf and job
    conf.setBoolean("mapred.compress.map.output", true);
    conf.setClass("mapred.map.output.compression.codec", SnappyCodec.class, CompressionCodec.class);
    Job job = new Job(conf);
    job.setJarByClass(getClass());
    job.setJobName("Convert data to Accumulo format: input = " + inputPath + ", output = " + outputPath);

    // Input
    job.setInputFormatClass(SequenceFileInputFormat.class);
    SequenceFileInputFormat.addInputPath(job, inputPath);

    // Mapper
    job.setMapperClass(BulkImportMapper.class);
    job.setMapOutputKeyClass(Key.class);
    job.setMapOutputValueClass(Value.class);

    // Partitioner
    job.setPartitionerClass(KeyRangePartitioner.class);
    KeyRangePartitioner.setSplitFile(job, splitsFilePath.toString());

    // Reducer
    job.setReducerClass(BulkImportReducer.class);
    job.setOutputKeyClass(Key.class);
    job.setOutputValueClass(Value.class);
    job.setNumReduceTasks(numSplits + 1);

    // Output
    job.setOutputFormatClass(AccumuloFileOutputFormat.class);
    AccumuloFileOutputFormat.setOutputPath(job, outputPath);

    // Run job
    job.waitForCompletion(true);

    // Successful?
    if (!job.isSuccessful()) {
        System.err.println("Error running job");
        return 1;
    }

    return 0;
}

From source file:gaffer.accumulostore.operation.hdfs.handler.job.AccumuloAddElementsFromHdfsJobFactory.java

License:Apache License

private void setupPartioner(final Job job, final AddElementsFromHdfs operation, final AccumuloStore store)
        throws IOException {
    String splitsFilePath = operation.getOption(AccumuloStoreConstants.OPERATION_HDFS_SPLITS_FILE);
    int numReduceTasks;
    if (null == splitsFilePath || splitsFilePath.equals("")) {
        splitsFilePath = store.getProperties().getSplitsFilePath();
        try {
            numReduceTasks = IngestUtils.createSplitsFile(store.getConnection(),
                    store.getProperties().getTable(), FileSystem.get(job.getConfiguration()),
                    new Path(splitsFilePath));
        } catch (final StoreException e) {
            throw new RuntimeException(e.getMessage(), e);
        }
    } else {
        numReduceTasks = IngestUtils.getNumSplits(FileSystem.get(job.getConfiguration()),
                new Path(splitsFilePath));
    }
    job.setNumReduceTasks(numReduceTasks + 1);
    job.setPartitionerClass(KeyRangePartitioner.class);
    KeyRangePartitioner.setSplitFile(job, splitsFilePath);
}

From source file:gaffer.accumulostore.operation.hdfs.handler.job.factory.AccumuloAddElementsFromHdfsJobFactory.java

License:Apache License

private void setUpPartitionerGenerateSplitsFile(final Job job, final AddElementsFromHdfs operation,
        final AccumuloStore store) throws IOException {
    final String splitsFilePath = store.getProperties().getSplitsFilePath();
    LOGGER.info("Creating splits file in location {} from table {}", splitsFilePath,
            store.getProperties().getTable());
    final int maxReducers = intOptionIsValid(operation,
            AccumuloStoreConstants.OPERATION_BULK_IMPORT_MAX_REDUCERS);
    final int minReducers = intOptionIsValid(operation,
            AccumuloStoreConstants.OPERATION_BULK_IMPORT_MIN_REDUCERS);
    if (maxReducers != -1 && minReducers != -1) {
        if (minReducers > maxReducers) {
            LOGGER.error(
                    "Minimum number of reducers must be less than the maximum number of reducers: minimum was {} "
                            + "maximum was {}",
                    minReducers, maxReducers);
            throw new IOException(
                    "Minimum number of reducers must be less than the maximum number of reducers");
        }
    }
    int numSplits;
    try {
        if (maxReducers == -1) {
            numSplits = IngestUtils.createSplitsFile(store.getConnection(), store.getProperties().getTable(),
                    FileSystem.get(job.getConfiguration()), new Path(splitsFilePath));
        } else {
            numSplits = IngestUtils.createSplitsFile(store.getConnection(), store.getProperties().getTable(),
                    FileSystem.get(job.getConfiguration()), new Path(splitsFilePath), maxReducers - 1);
        }
    } catch (final StoreException e) {
        throw new RuntimeException(e.getMessage(), e);
    }
    int numReducers = numSplits + 1;
    LOGGER.info("Number of splits is {}; number of reducers is {}", numSplits, numReducers);
    // If neither min or max are specified then nothing to do; if max specified and min not then already taken care of.
    // If min is specified and the number of reducers is not greater than that then set the appropriate number of
    // subbins.
    if (minReducers != -1) {
        if (numReducers < minReducers) {
            LOGGER.info("Number of reducers is {} which is less than the specified minimum number of {}",
                    numReducers, minReducers);
            int factor = (minReducers / numReducers) + 1;
            LOGGER.info("Setting number of subbins on KeyRangePartitioner to {}", factor);
            KeyRangePartitioner.setNumSubBins(job, factor);
            numReducers = numReducers * factor;
            LOGGER.info("Number of reducers is {}", numReducers);
        }
    }
    job.setNumReduceTasks(numReducers);
    job.setPartitionerClass(KeyRangePartitioner.class);
    KeyRangePartitioner.setSplitFile(job, splitsFilePath);
}

From source file:gaffer.accumulostore.operation.hdfs.handler.job.factory.AccumuloAddElementsFromHdfsJobFactory.java

License:Apache License

private void setUpPartitionerFromUserProvidedSplitsFile(final Job job, final AddElementsFromHdfs operation)
        throws IOException {
    final String splitsFilePath = operation.getOption(AccumuloStoreConstants.OPERATION_HDFS_SPLITS_FILE);
    if (intOptionIsValid(operation, AccumuloStoreConstants.OPERATION_BULK_IMPORT_MAX_REDUCERS) != -1
            || intOptionIsValid(operation, AccumuloStoreConstants.OPERATION_BULK_IMPORT_MIN_REDUCERS) != -1) {
        LOGGER.info("Using splits file provided by user {}, ignoring options {} and {}", splitsFilePath,
                AccumuloStoreConstants.OPERATION_BULK_IMPORT_MAX_REDUCERS,
                AccumuloStoreConstants.OPERATION_BULK_IMPORT_MIN_REDUCERS);
    } else {
        LOGGER.info("Using splits file provided by user {}", splitsFilePath);
    }
    final int numSplits = IngestUtils.getNumSplits(FileSystem.get(job.getConfiguration()),
            new Path(splitsFilePath));
    job.setNumReduceTasks(numSplits + 1);
    job.setPartitionerClass(KeyRangePartitioner.class);
    KeyRangePartitioner.setSplitFile(job, splitsFilePath);
}

From source file:gr.ntua.ece.cslab.modissense.queries.clients.GeneralHotIntQueryClient.java

@Override
public void executeQuery() {
    try {
        if (this.createIfNotExist()) { //table exists            
            Configuration conf = HBaseConfiguration.create();
            Job job = new Job(conf, "Non personalized hotness interest");
            job.setJarByClass(GeneralHotIntQueryClient.class);
            Scan scan = new Scan();
            scan.setCaching(10000);

            scan.setFilter(new ColumnRangeFilter(Bytes.toBytes(startTimestamp), true,
                    Bytes.toBytes(endTimestamp), true));
            TableMapReduceUtil.initTableMapperJob(this.srcTable, // table name in bytes
                    scan, // scanner to use
                    GeneralHotIntQueryMapper.class, // mapper class
                    LongWritable.class, // key class
                    HotnessInterestWritable.class, // value class
                    job); // job object

            TableMapReduceUtil.initTableReducerJob(this.targetTable, GeneralHotIntQueryReducer.class, job);
            job.setPartitionerClass(HashPartitioner.class);
            job.setCombinerClass(GeneralHotIntQueryCombiner.class);
            job.setNumReduceTasks(4);
            job.setOutputFormatClass(TableOutputFormat.class);

            job.waitForCompletion(true);
        }
        this.openConnection(targetTable);
    } catch (IOException | InterruptedException | ClassNotFoundException ex) {
        Logger.getLogger(GeneralHotIntQueryClient.class.getName()).log(Level.SEVERE, null, ex);
    }
}
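
This example sets HashPartitioner explicitly even though it is already the MapReduce default, so the call is effectively a no-op that documents intent. The stock HashPartitioner behaves essentially as follows (a reference sketch only, not the example's own code):

import org.apache.hadoop.mapreduce.Partitioner;

// Sketch of what the default HashPartitioner does: spread keys across reducers
// by hash code, masking the sign bit to avoid negative partition numbers.
public class HashLikePartitioner<K, V> extends Partitioner<K, V> {
    @Override
    public int getPartition(K key, V value, int numReduceTasks) {
        return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
    }
}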

From source file:gr.ntua.h2rdf.byteImport.HexastoreBulkImport.java

License:Open Source License

public Job createSubmittableJob(String[] args) {
    TABLE_NAME = args[1];
    Job job = null;
    try {
        Configuration conf = new Configuration();
        conf.addResource("hbase-default.xml");
        conf.addResource("hbase-site.xml");
        job = new Job(conf, NAME);
        job.setJarByClass(HexastoreBulkImport.class);
        job.setMapperClass(TotalOrderPrep.Map.class);
        job.setReducerClass(Reduce.class);//sampler.HamaReducer.class);
        job.setCombinerClass(Combiner.class);
        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(ImmutableBytesWritable.class);
        job.setPartitionerClass(TotalOrderPartitioner.class);
        //TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), new Path("/user/npapa/"+regions+"partitions/part-r-00000"));
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), new Path("partitions/part-r-00000"));
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(HFileOutputFormat.class);
        Path out = new Path("out");
        FileOutputFormat.setOutputPath(job, out);
        FileSystem fs;
        try {
            fs = FileSystem.get(conf);
            if (fs.exists(out)) {
                fs.delete(out, true);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }

        // c.addResource(new Path("/0/arcomemDB/hadoop-0.20.2-cdh3u3/conf/hbase-site.xml"));
        HBaseAdmin hadmin = new HBaseAdmin(conf);
        HTableDescriptor desc = new HTableDescriptor(TABLE_NAME + "_stats");
        HColumnDescriptor family = new HColumnDescriptor("size");
        desc.addFamily(family);
        conf.setInt("zookeeper.session.timeout", 600000);
        if (hadmin.tableExists(TABLE_NAME + "_stats")) {
            //hadmin.disableTable(TABLE_NAME+"_stats");
            //hadmin.deleteTable(TABLE_NAME+"_stats");
        } else {
            hadmin.createTable(desc);
        }

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        //job.getConfiguration().setInt("mapred.map.tasks", 18);
        job.getConfiguration().set("h2rdf.tableName", TABLE_NAME);
        job.getConfiguration().setInt("mapred.reduce.tasks", (int) TotalOrderPrep.regions);
        job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
        job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
        job.getConfiguration().setInt("io.sort.mb", 100);
        job.getConfiguration().setInt("io.file.buffer.size", 131072);
        job.getConfiguration().setInt("mapred.job.reuse.jvm.num.tasks", -1);
        //job.getConfiguration().setInt("hbase.hregion.max.filesize", 67108864);
        job.getConfiguration().setInt("hbase.hregion.max.filesize", 33554432);
        //job.getConfiguration().setInt("io.sort.mb", 100);

    } catch (IOException e2) {
        e2.printStackTrace();
    }

    return job;
}
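
Here the partition file read by TotalOrderPartitioner ("partitions/part-r-00000") is produced by a separate sampling job (TotalOrderPrep) before this one runs. When the mapper emits the same key type it reads (an identity-style map), Hadoop's stock InputSampler can produce that file instead; the following is a sketch under that assumption, with the partition file path supplied by the caller:

import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

public class TotalOrderSetup {
    // Sketch: write split points for TotalOrderPartitioner by sampling the
    // job's configured input. Only valid when map output keys have the same
    // type and distribution as the sampled input keys, which is why the
    // example above runs its own TotalOrderPrep sampling job instead.
    static <K, V> void configureTotalOrder(Job job, Path partitionFile)
            throws IOException, ClassNotFoundException, InterruptedException {
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);
        job.setPartitionerClass(TotalOrderPartitioner.class);
        // Sample roughly 1% of records, at most 10000, from up to 10 input splits.
        InputSampler.writePartitionFile(job,
                new InputSampler.RandomSampler<K, V>(0.01, 10000, 10));
    }
}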

From source file:gr.ntua.h2rdf.inputFormat.TableMapReduceUtil.java

License:Open Source License

/**
 * Use this before submitting a TableReduce job. It will
 * appropriately set up the JobConf.
 * 
 * @param table  The output table.
 * @param reducer  The reducer class to use.
 * @param job  The current job to adjust.
 * @param partitioner  Partitioner to use. Pass <code>null</code> to use 
 * default partitioner.
 * @throws IOException When determining the region count fails. 
 */
public static void initTableReducerJob(String table, Class<? extends TableReducer> reducer, Job job,
        Class partitioner) throws IOException {
    job.setOutputFormatClass(TableOutputFormat.class);
    if (reducer != null)
        job.setReducerClass(reducer);
    job.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, table);
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(Writable.class);
    if (partitioner == HRegionPartitioner.class) {
        job.setPartitionerClass(HRegionPartitioner.class);
        HTable outputTable = new HTable(new HBaseConfiguration(job.getConfiguration()), table);
        int regions = outputTable.getRegionsInfo().size();
        if (job.getNumReduceTasks() > regions) {
            job.setNumReduceTasks(outputTable.getRegionsInfo().size());
        }
    } else if (partitioner != null) {
        job.setPartitionerClass(partitioner);
    }
}
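
A call to this helper might look like the following driver fragment, where MyTableReducer and the table name "my_table" are placeholders and TableMapReduceUtil refers to the class in this listing. Passing HRegionPartitioner aligns reduce partitions with the output table's regions and caps the reducer count at the region count; passing null keeps whatever partitioner the job already has:

import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.mapreduce.HRegionPartitioner;
import org.apache.hadoop.mapreduce.Job;

// Hypothetical driver fragment; MyTableReducer and "my_table" are placeholders.
public class TableReduceDriver {
    static Job buildJob() throws Exception {
        Job job = new Job(HBaseConfiguration.create(), "write to my_table");
        TableMapReduceUtil.initTableReducerJob("my_table", MyTableReducer.class, job,
                HRegionPartitioner.class);
        return job;
    }
}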

From source file:gr.ntua.h2rdf.inputFormat2.TableMapReduceUtil.java

License:Open Source License

/**
 * Use this before submitting a TableReduce job. It will
 * appropriately set up the JobConf.
 *
 * @param table  The output table.
 * @param reducer  The reducer class to use.
 * @param job  The current job to adjust.  Make sure the passed job is
 * carrying all necessary HBase configuration.
 * @param partitioner  Partitioner to use. Pass <code>null</code> to use
 * default partitioner.
 * @param quorumAddress Distant cluster to write to; default is null for
 * output to the cluster that is designated in <code>hbase-site.xml</code>.
 * Set this String to the zookeeper ensemble of an alternate remote cluster
 * when you would have the reduce write a cluster that is other than the
 * default; e.g. copying tables between clusters, the source would be
 * designated by <code>hbase-site.xml</code> and this param would have the
 * ensemble address of the remote cluster.  The format to pass is particular.
 * Pass <code> &lt;hbase.zookeeper.quorum>:&lt;hbase.zookeeper.client.port>:&lt;zookeeper.znode.parent>
 * </code> such as <code>server,server2,server3:2181:/hbase</code>.
 * @param serverClass redefined hbase.regionserver.class
 * @param serverImpl redefined hbase.regionserver.impl
 * @param addDependencyJars upload HBase jars and jars for any of the configured
 *           job classes via the distributed cache (tmpjars).
 * @throws IOException When determining the region count fails.
 */
public static void initTableReducerJob(String table, Class<? extends TableReducer> reducer, Job job,
        Class partitioner, String quorumAddress, String serverClass, String serverImpl,
        boolean addDependencyJars) throws IOException {

    Configuration conf = job.getConfiguration();
    HBaseConfiguration.merge(conf, HBaseConfiguration.create(conf));
    job.setOutputFormatClass(TableOutputFormat.class);
    if (reducer != null)
        job.setReducerClass(reducer);
    conf.set(TableOutputFormat.OUTPUT_TABLE, table);
    // If passed a quorum/ensemble address, pass it on to TableOutputFormat.
    if (quorumAddress != null) {
        // Calling this will validate the format
        ZKUtil.transformClusterKey(quorumAddress);
        conf.set(TableOutputFormat.QUORUM_ADDRESS, quorumAddress);
    }
    if (serverClass != null && serverImpl != null) {
        conf.set(TableOutputFormat.REGION_SERVER_CLASS, serverClass);
        conf.set(TableOutputFormat.REGION_SERVER_IMPL, serverImpl);
    }
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(Writable.class);
    if (partitioner == HRegionPartitioner.class) {
        job.setPartitionerClass(HRegionPartitioner.class);
        HTable outputTable = new HTable(conf, table);
        int regions = outputTable.getRegionsInfo().size();
        if (job.getNumReduceTasks() > regions) {
            job.setNumReduceTasks(outputTable.getRegionsInfo().size());
        }
    } else if (partitioner != null) {
        job.setPartitionerClass(partitioner);
    }

    if (addDependencyJars) {
        addDependencyJars(job);
    }

    initCredentials(job);
}
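
The extra parameters mostly matter when the reduce output goes to a different cluster than the one named in the job's own hbase-site.xml. A hypothetical call (table name, reducer class, and ZooKeeper hosts are all placeholders) that ships dependency jars and targets a remote cluster could look like this:

// Hypothetical call; "my_table", MyTableReducer and the ZooKeeper hosts are
// placeholders. The cluster key format is
// <hbase.zookeeper.quorum>:<hbase.zookeeper.client.port>:<zookeeper.znode.parent>.
TableMapReduceUtil.initTableReducerJob(
        "my_table",                    // output table on the remote cluster
        MyTableReducer.class,          // reducer emitting table mutations
        job,
        HRegionPartitioner.class,      // partition by target table region
        "zk1,zk2,zk3:2181:/hbase",     // quorumAddress of the remote cluster
        null, null,                    // keep default region server class/impl
        true);                         // add HBase jars to the distributed cache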

From source file:gr.ntua.h2rdf.LoadTriples.DistinctIds.java

License:Open Source License

public Job createSubmittableJob(String[] args) throws IOException, ClassNotFoundException {
    //io.compression.codecs
    Job job = new Job();

    job.setInputFormatClass(TextInputFormat.class);
    Configuration conf = new Configuration();
    Path blockProjection = new Path("blockIds/");
    Path translations = new Path("translations/");
    Path sample = new Path("sample/");
    Path temp = new Path("temp/");
    Path uniqueIds = new Path("uniqueIds/");
    FileSystem fs;
    try {
        fs = FileSystem.get(conf);
        if (fs.exists(uniqueIds)) {
            fs.delete(uniqueIds, true);
        }
        if (fs.exists(translations)) {
            fs.delete(translations, true);
        }
        if (fs.exists(blockProjection)) {
            fs.delete(blockProjection, true);
        }
        if (fs.exists(sample)) {
            fs.delete(sample, true);
        }
        if (fs.exists(temp)) {
            fs.delete(temp, true);
        }

        FileOutputFormat.setOutputPath(job, uniqueIds);
        Path inp = new Path(args[0]);
        FileInputFormat.setInputPaths(job, inp);

        double type = 1;
        double datasetSize = 0;
        if (fs.isFile(inp)) {
            datasetSize = fs.getFileStatus(inp).getLen();
        } else if (fs.isDirectory(inp)) {
            FileStatus[] s = fs.listStatus(inp);
            for (int i = 0; i < s.length; i++) {
                if (s[i].getPath().getName().toString().endsWith(".gz"))
                    type = 27;
                if (s[i].getPath().getName().toString().endsWith(".snappy"))
                    type = 10;
                datasetSize += s[i].getLen();
            }
        } else {
            FileStatus[] s = fs.globStatus(inp);
            for (int i = 0; i < s.length; i++) {
                if (s[i].getPath().getName().toString().endsWith(".gz"))
                    type = 27;
                if (s[i].getPath().getName().toString().endsWith(".snappy"))
                    type = 10;
                datasetSize += s[i].getLen();
            }
        }
        datasetSize = datasetSize * type;
        System.out.println("type: " + type);
        System.out.println("datasetSize: " + datasetSize);
        samplingRate = (double) sampleChunk / (double) datasetSize;
        if (samplingRate >= 0.1) {
            samplingRate = 0.1;
        }
        if (samplingRate <= 0.001) {
            samplingRate = 0.001;
        }
        numReducers = (int) (datasetSize / ReducerChunk);
        if (numReducers == 0)
            numReducers = 1;
        numReducers++;
    } catch (IOException e) {
        e.printStackTrace();
    }

    HBaseAdmin hadmin = new HBaseAdmin(conf);
    HTableDescriptor desc = new HTableDescriptor(TABLE_NAME);

    HColumnDescriptor family = new HColumnDescriptor("counter");
    desc.addFamily(family);
    if (!hadmin.tableExists(TABLE_NAME)) {
        hadmin.createTable(desc);
    }

    job.setNumReduceTasks(numReducers);
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(ImmutableBytesWritable.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setJarByClass(DistinctIds.class);
    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);

    job.setPartitionerClass(SamplingPartitioner.class);

    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    job.getConfiguration().set("mapred.compress.map.output", "true");
    job.getConfiguration().set("mapred.map.output.compression.codec",
            "org.apache.hadoop.io.compress.SnappyCodec");

    //job.setCombinerClass(Combiner.class);
    job.setJobName("Distinct Id Wordcount");
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
    job.getConfiguration().setInt("io.sort.mb", 100);
    job.getConfiguration().setInt("io.file.buffer.size", 131072);
    job.getConfiguration().setInt("mapred.job.reuse.jvm.num.tasks", -1);

    return job;

}