List of usage examples for org.apache.hadoop.mapreduce.Job.setPartitionerClass
public void setPartitionerClass(Class<? extends Partitioner> cls) throws IllegalStateException
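setPartitionerClass tells the job which Partitioner implementation routes each map output record to a reduce task; if it is never called, Hadoop falls back to HashPartitioner. The IllegalStateException is thrown if the job has already been submitted. As a minimal illustrative sketch (the class name and type parameters below are examples, not taken from the projects that follow), a custom partitioner extends org.apache.hadoop.mapreduce.Partitioner and overrides getPartition:

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class TextHashPartitioner extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        // Mask the sign bit so the modulo result is never negative.
        return (key.hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}

It is wired into a job with:

    job.setPartitionerClass(TextHashPartitioner.class);

The examples below show the call in real projects, most of them using range- or region-aware partitioners to align reducer output with Accumulo tablets or HBase regions.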
From source file: flink.applications.model.fraud.prepare.Projection.java
License: Apache License
@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "Projection and grouping MR";
    job.setJobName(jobName);
    job.setJarByClass(Projection.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    Utility.setConfiguration(job.getConfiguration());
    String operation = job.getConfiguration().get("projection.operation", "project");

    if (operation.startsWith("grouping")) {
        // group by
        job.setMapperClass(Projection.ProjectionMapper.class);
        job.setReducerClass(Projection.ProjectionReducer.class);
        job.setMapOutputKeyClass(Tuple.class);
        job.setMapOutputValueClass(Text.class);
        job.setNumReduceTasks(job.getConfiguration().getInt("num.reducer", 1));

        // order by
        boolean doOrderBy = job.getConfiguration().getInt("orderBy.field", -1) >= 0;
        if (doOrderBy) {
            job.setGroupingComparatorClass(SecondarySort.TuplePairGroupComprator.class);
            job.setPartitionerClass(SecondarySort.TupleTextPartitioner.class);
        }
    } else {
        // simple projection
        job.setMapperClass(Projection.SimpleProjectionMapper.class);
    }

    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);
    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
}
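The order-by branch above is the standard secondary-sort wiring: the partitioner and the grouping comparator must both consider only the natural (grouping) part of the composite Tuple key, so that all records of one group reach the same reducer while the sort comparator orders them by the secondary field. The source of SecondarySort.TupleTextPartitioner is not shown here; a sketch of the idea, assuming a hypothetical getBaseKey() accessor that returns the grouping prefix of the Tuple, might look like:

public class NaturalKeyPartitioner extends Partitioner<Tuple, Text> {
    @Override
    public int getPartition(Tuple key, Text value, int numPartitions) {
        // Hash only the grouping prefix, never the ordering field, so all
        // members of a group land on the same reduce task.
        return (key.getBaseKey().hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}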
From source file: gaffer.accumulo.bulkimport.BulkImportDriver.java
License: Apache License
public int run(String[] args) throws Exception {
    // Usage
    if (args.length < 3) {
        System.err.println("Usage: " + BulkImportDriver.class.getName()
                + " <inputpath> <output_path> <accumulo_properties_file>");
        return 1;
    }

    // Get paths
    Path inputPath = new Path(args[0]);
    Path outputPath = new Path(args[1] + "/data_for_accumulo/");
    Path splitsFilePath = new Path(args[1] + "/splits_file");
    String accumuloPropertiesFile = args[2];

    // Hadoop configuration
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    // Connect to Accumulo
    AccumuloConfig accConf = new AccumuloConfig(accumuloPropertiesFile);
    Connector conn = Accumulo.connect(accConf);
    String tableName = accConf.getTable();

    // Check if the table exists
    if (!conn.tableOperations().exists(tableName)) {
        System.err.println("Table " + tableName + " does not exist - create the table before running this");
        return 1;
    }

    // Get the current splits from the table.
    // (This assumes that we have already created the table using InitialiseTable.)
    Collection<Text> splits = conn.tableOperations().getSplits(tableName);
    int numSplits = splits.size();
    System.out.println("Number of splits in table is " + numSplits);

    // Write current splits to a file (this is needed so that the following MapReduce
    // job can move them to the DistributedCache).
    IngestUtils.createSplitsFile(conn, tableName, fs, splitsFilePath);

    // Run MapReduce to output data suitable for bulk import to Accumulo
    // Conf and job
    conf.setBoolean("mapred.compress.map.output", true);
    conf.setClass("mapred.map.output.compression.codec", SnappyCodec.class, CompressionCodec.class);
    Job job = new Job(conf);
    job.setJarByClass(getClass());
    job.setJobName("Convert data to Accumulo format: input = " + inputPath + ", output = " + outputPath);

    // Input
    job.setInputFormatClass(SequenceFileInputFormat.class);
    SequenceFileInputFormat.addInputPath(job, inputPath);

    // Mapper
    job.setMapperClass(BulkImportMapper.class);
    job.setMapOutputKeyClass(Key.class);
    job.setMapOutputValueClass(Value.class);

    // Partitioner
    job.setPartitionerClass(KeyRangePartitioner.class);
    KeyRangePartitioner.setSplitFile(job, splitsFilePath.toString());

    // Reducer
    job.setReducerClass(BulkImportReducer.class);
    job.setOutputKeyClass(Key.class);
    job.setOutputValueClass(Value.class);
    job.setNumReduceTasks(numSplits + 1);

    // Output
    job.setOutputFormatClass(AccumuloFileOutputFormat.class);
    AccumuloFileOutputFormat.setOutputPath(job, outputPath);

    // Run job
    job.waitForCompletion(true);

    // Successful?
    if (!job.isSuccessful()) {
        System.err.println("Error running job");
        return 1;
    }
    return 0;
}
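In this driver the reducer count is tied directly to the partitioner: KeyRangePartitioner routes each Key to the tablet key range it falls in, and a table with numSplits split points defines numSplits + 1 ranges, hence job.setNumReduceTasks(numSplits + 1). For example, a table with 9 split points gives 10 ranges and therefore 10 reducers, so each reducer's output file is aligned to a single tablet for the bulk import.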
From source file: gaffer.accumulostore.operation.hdfs.handler.job.AccumuloAddElementsFromHdfsJobFactory.java
License: Apache License
private void setupPartioner(final Job job, final AddElementsFromHdfs operation, final AccumuloStore store)
        throws IOException {
    String splitsFilePath = operation.getOption(AccumuloStoreConstants.OPERATION_HDFS_SPLITS_FILE);
    int numReduceTasks;
    if (null == splitsFilePath || splitsFilePath.equals("")) {
        splitsFilePath = store.getProperties().getSplitsFilePath();
        try {
            numReduceTasks = IngestUtils.createSplitsFile(store.getConnection(),
                    store.getProperties().getTable(), FileSystem.get(job.getConfiguration()),
                    new Path(splitsFilePath));
        } catch (final StoreException e) {
            throw new RuntimeException(e.getMessage(), e);
        }
    } else {
        numReduceTasks = IngestUtils.getNumSplits(FileSystem.get(job.getConfiguration()),
                new Path(splitsFilePath));
    }
    job.setNumReduceTasks(numReduceTasks + 1);
    job.setPartitionerClass(KeyRangePartitioner.class);
    KeyRangePartitioner.setSplitFile(job, splitsFilePath);
}
From source file: gaffer.accumulostore.operation.hdfs.handler.job.factory.AccumuloAddElementsFromHdfsJobFactory.java
License: Apache License
private void setUpPartitionerGenerateSplitsFile(final Job job, final AddElementsFromHdfs operation,
        final AccumuloStore store) throws IOException {
    final String splitsFilePath = store.getProperties().getSplitsFilePath();
    LOGGER.info("Creating splits file in location {} from table {}", splitsFilePath,
            store.getProperties().getTable());
    final int maxReducers = intOptionIsValid(operation, AccumuloStoreConstants.OPERATION_BULK_IMPORT_MAX_REDUCERS);
    final int minReducers = intOptionIsValid(operation, AccumuloStoreConstants.OPERATION_BULK_IMPORT_MIN_REDUCERS);
    if (maxReducers != -1 && minReducers != -1) {
        if (minReducers > maxReducers) {
            LOGGER.error("Minimum number of reducers must be less than the maximum number of reducers: "
                    + "minimum was {} maximum was {}", minReducers, maxReducers);
            throw new IOException("Minimum number of reducers must be less than the maximum number of reducers");
        }
    }
    int numSplits;
    try {
        if (maxReducers == -1) {
            numSplits = IngestUtils.createSplitsFile(store.getConnection(), store.getProperties().getTable(),
                    FileSystem.get(job.getConfiguration()), new Path(splitsFilePath));
        } else {
            numSplits = IngestUtils.createSplitsFile(store.getConnection(), store.getProperties().getTable(),
                    FileSystem.get(job.getConfiguration()), new Path(splitsFilePath), maxReducers - 1);
        }
    } catch (final StoreException e) {
        throw new RuntimeException(e.getMessage(), e);
    }
    int numReducers = numSplits + 1;
    LOGGER.info("Number of splits is {}; number of reducers is {}", numSplits, numReducers);
    // If neither min nor max is specified then there is nothing to do; if max is specified and min is not,
    // it has already been taken care of. If min is specified and the number of reducers is not greater than
    // that, set the appropriate number of subbins.
    if (minReducers != -1) {
        if (numReducers < minReducers) {
            LOGGER.info("Number of reducers is {} which is less than the specified minimum number of {}",
                    numReducers, minReducers);
            int factor = (minReducers / numReducers) + 1;
            LOGGER.info("Setting number of subbins on KeyRangePartitioner to {}", factor);
            KeyRangePartitioner.setNumSubBins(job, factor);
            numReducers = numReducers * factor;
            LOGGER.info("Number of reducers is {}", numReducers);
        }
    }
    job.setNumReduceTasks(numReducers);
    job.setPartitionerClass(KeyRangePartitioner.class);
    KeyRangePartitioner.setSplitFile(job, splitsFilePath);
}
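A worked example of the sub-bin arithmetic above: if the table has 3 split points, numReducers starts at 4. With minReducers = 10, the check 4 < 10 fires, factor = (10 / 4) + 1 = 3, KeyRangePartitioner is told to split each key range into 3 sub-bins, and the job runs 4 * 3 = 12 reducers, which satisfies the requested minimum.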
From source file: gaffer.accumulostore.operation.hdfs.handler.job.factory.AccumuloAddElementsFromHdfsJobFactory.java
License: Apache License
private void setUpPartitionerFromUserProvidedSplitsFile(final Job job, final AddElementsFromHdfs operation)
        throws IOException {
    final String splitsFilePath = operation.getOption(AccumuloStoreConstants.OPERATION_HDFS_SPLITS_FILE);
    if (intOptionIsValid(operation, AccumuloStoreConstants.OPERATION_BULK_IMPORT_MAX_REDUCERS) != -1
            || intOptionIsValid(operation, AccumuloStoreConstants.OPERATION_BULK_IMPORT_MIN_REDUCERS) != -1) {
        LOGGER.info("Using splits file provided by user {}, ignoring options {} and {}", splitsFilePath,
                AccumuloStoreConstants.OPERATION_BULK_IMPORT_MAX_REDUCERS,
                AccumuloStoreConstants.OPERATION_BULK_IMPORT_MIN_REDUCERS);
    } else {
        LOGGER.info("Using splits file provided by user {}", splitsFilePath);
    }
    final int numSplits = IngestUtils.getNumSplits(FileSystem.get(job.getConfiguration()),
            new Path(splitsFilePath));
    job.setNumReduceTasks(numSplits + 1);
    job.setPartitionerClass(KeyRangePartitioner.class);
    KeyRangePartitioner.setSplitFile(job, splitsFilePath);
}
From source file: gr.ntua.ece.cslab.modissense.queries.clients.GeneralHotIntQueryClient.java
@Override
public void executeQuery() {
    try {
        if (this.createIfNotExist()) { // table exists
            Configuration conf = HBaseConfiguration.create();
            Job job = new Job(conf, "Non personalized hotness interest");
            job.setJarByClass(GeneralHotIntQueryClient.class);

            Scan scan = new Scan();
            scan.setCaching(10000);
            scan.setFilter(new ColumnRangeFilter(Bytes.toBytes(startTimestamp), true,
                    Bytes.toBytes(endTimestamp), true));

            TableMapReduceUtil.initTableMapperJob(this.srcTable, // table name in bytes
                    scan, // scanner to use
                    GeneralHotIntQueryMapper.class, // mapper class
                    LongWritable.class, // key class
                    HotnessInterestWritable.class, // value class
                    job); // job object
            TableMapReduceUtil.initTableReducerJob(this.targetTable, GeneralHotIntQueryReducer.class, job);

            job.setPartitionerClass(HashPartitioner.class);
            job.setCombinerClass(GeneralHotIntQueryCombiner.class);
            job.setNumReduceTasks(4);
            job.setOutputFormatClass(TableOutputFormat.class);
            job.waitForCompletion(true);
        }
        this.openConnection(targetTable);
    } catch (IOException | InterruptedException | ClassNotFoundException ex) {
        Logger.getLogger(GeneralHotIntQueryClient.class.getName()).log(Level.SEVERE, null, ex);
    }
}
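Note that HashPartitioner is already Hadoop's default partitioner, so the explicit setPartitionerClass call here mainly documents intent: the hotness/interest aggregates are spread over the four reducers purely by key hash, with no region-aware routing of the kind HRegionPartitioner would provide.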
From source file: gr.ntua.h2rdf.byteImport.HexastoreBulkImport.java
License: Open Source License
public Job createSubmittableJob(String[] args) {
    TABLE_NAME = args[1];
    Job job = null;
    try {
        Configuration conf = new Configuration();
        conf.addResource("hbase-default.xml");
        conf.addResource("hbase-site.xml");
        job = new Job(conf, NAME);
        job.setJarByClass(HexastoreBulkImport.class);
        job.setMapperClass(TotalOrderPrep.Map.class);
        job.setReducerClass(Reduce.class); // sampler.HamaReducer.class);
        job.setCombinerClass(Combiner.class);
        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(ImmutableBytesWritable.class);
        job.setPartitionerClass(TotalOrderPartitioner.class);
        //TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), new Path("/user/npapa/"+regions+"partitions/part-r-00000"));
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), new Path("partitions/part-r-00000"));
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(HFileOutputFormat.class);
        Path out = new Path("out");
        FileOutputFormat.setOutputPath(job, out);
        FileSystem fs;
        try {
            fs = FileSystem.get(conf);
            if (fs.exists(out)) {
                fs.delete(out, true);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        // c.addResource(new Path("/0/arcomemDB/hadoop-0.20.2-cdh3u3/conf/hbase-site.xml"));
        HBaseAdmin hadmin = new HBaseAdmin(conf);
        HTableDescriptor desc = new HTableDescriptor(TABLE_NAME + "_stats");
        HColumnDescriptor family = new HColumnDescriptor("size");
        desc.addFamily(family);
        conf.setInt("zookeeper.session.timeout", 600000);
        if (hadmin.tableExists(TABLE_NAME + "_stats")) {
            //hadmin.disableTable(TABLE_NAME+"_stats");
            //hadmin.deleteTable(TABLE_NAME+"_stats");
        } else {
            hadmin.createTable(desc);
        }
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        //job.getConfiguration().setInt("mapred.map.tasks", 18);
        job.getConfiguration().set("h2rdf.tableName", TABLE_NAME);
        job.getConfiguration().setInt("mapred.reduce.tasks", (int) TotalOrderPrep.regions);
        job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
        job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
        job.getConfiguration().setInt("io.sort.mb", 100);
        job.getConfiguration().setInt("io.file.buffer.size", 131072);
        job.getConfiguration().setInt("mapred.job.reuse.jvm.num.tasks", -1);
        //job.getConfiguration().setInt("hbase.hregion.max.filesize", 67108864);
        job.getConfiguration().setInt("hbase.hregion.max.filesize", 33554432);
        //job.getConfiguration().setInt("io.sort.mb", 100);
    } catch (IOException e2) {
        e2.printStackTrace();
    }
    return job;
}
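TotalOrderPartitioner produces globally sorted output across all reducers by routing each key according to split points read from a precomputed partition file; here that file (partitions/part-r-00000) comes from the separate TotalOrderPrep sampling job. For comparison, when a job's input keys already match its map output keys, Hadoop's own InputSampler can generate the partition file in-process. A minimal sketch under that assumption (the sampling parameters and SequenceFile input are illustrative):

// Assumes a SequenceFileInputFormat whose keys equal the map output keys.
InputSampler.Sampler<ImmutableBytesWritable, ImmutableBytesWritable> sampler =
        new InputSampler.RandomSampler<>(0.01, 1000, 10); // freq, numSamples, maxSplitsSampled
TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), new Path("partitions/part-r-00000"));
InputSampler.writePartitionFile(job, sampler); // writes split points for the configured reducer count
job.setPartitionerClass(TotalOrderPartitioner.class);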
From source file: gr.ntua.h2rdf.inputFormat.TableMapReduceUtil.java
License: Open Source License
/**
 * Use this before submitting a TableReduce job. It will
 * appropriately set up the JobConf.
 *
 * @param table The output table.
 * @param reducer The reducer class to use.
 * @param job The current job to adjust.
 * @param partitioner Partitioner to use. Pass <code>null</code> to use
 *   default partitioner.
 * @throws IOException When determining the region count fails.
 */
public static void initTableReducerJob(String table, Class<? extends TableReducer> reducer, Job job,
        Class partitioner) throws IOException {
    job.setOutputFormatClass(TableOutputFormat.class);
    if (reducer != null)
        job.setReducerClass(reducer);
    job.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, table);
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(Writable.class);
    if (partitioner == HRegionPartitioner.class) {
        job.setPartitionerClass(HRegionPartitioner.class);
        HTable outputTable = new HTable(new HBaseConfiguration(job.getConfiguration()), table);
        int regions = outputTable.getRegionsInfo().size();
        if (job.getNumReduceTasks() > regions) {
            job.setNumReduceTasks(outputTable.getRegionsInfo().size());
        }
    } else if (partitioner != null) {
        job.setPartitionerClass(partitioner);
    }
}
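A hypothetical call site for this four-argument overload (table and reducer names are illustrative): passing HRegionPartitioner both sets the partitioner and caps the reducer count at the table's region count, so no reducer targets a non-existent region.

TableMapReduceUtil.initTableReducerJob("results", MyTableReducer.class, job, HRegionPartitioner.class);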
From source file: gr.ntua.h2rdf.inputFormat2.TableMapReduceUtil.java
License: Open Source License
/**
 * Use this before submitting a TableReduce job. It will
 * appropriately set up the JobConf.
 *
 * @param table The output table.
 * @param reducer The reducer class to use.
 * @param job The current job to adjust. Make sure the passed job is
 *   carrying all necessary HBase configuration.
 * @param partitioner Partitioner to use. Pass <code>null</code> to use
 *   default partitioner.
 * @param quorumAddress Distant cluster to write to; default is null for
 *   output to the cluster that is designated in <code>hbase-site.xml</code>.
 *   Set this String to the zookeeper ensemble of an alternate remote cluster
 *   when you would have the reduce write a cluster that is other than the
 *   default; e.g. copying tables between clusters, the source would be
 *   designated by <code>hbase-site.xml</code> and this param would have the
 *   ensemble address of the remote cluster. The format to pass is particular.
 *   Pass <code><hbase.zookeeper.quorum>:<hbase.zookeeper.client.port>:<zookeeper.znode.parent></code>
 *   such as <code>server,server2,server3:2181:/hbase</code>.
 * @param serverClass redefined hbase.regionserver.class
 * @param serverImpl redefined hbase.regionserver.impl
 * @param addDependencyJars upload HBase jars and jars for any of the configured
 *   job classes via the distributed cache (tmpjars).
 * @throws IOException When determining the region count fails.
 */
public static void initTableReducerJob(String table, Class<? extends TableReducer> reducer, Job job,
        Class partitioner, String quorumAddress, String serverClass, String serverImpl,
        boolean addDependencyJars) throws IOException {
    Configuration conf = job.getConfiguration();
    HBaseConfiguration.merge(conf, HBaseConfiguration.create(conf));
    job.setOutputFormatClass(TableOutputFormat.class);
    if (reducer != null)
        job.setReducerClass(reducer);
    conf.set(TableOutputFormat.OUTPUT_TABLE, table);
    // If passed a quorum/ensemble address, pass it on to TableOutputFormat.
    if (quorumAddress != null) {
        // Calling this will validate the format
        ZKUtil.transformClusterKey(quorumAddress);
        conf.set(TableOutputFormat.QUORUM_ADDRESS, quorumAddress);
    }
    if (serverClass != null && serverImpl != null) {
        conf.set(TableOutputFormat.REGION_SERVER_CLASS, serverClass);
        conf.set(TableOutputFormat.REGION_SERVER_IMPL, serverImpl);
    }
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(Writable.class);
    if (partitioner == HRegionPartitioner.class) {
        job.setPartitionerClass(HRegionPartitioner.class);
        HTable outputTable = new HTable(conf, table);
        int regions = outputTable.getRegionsInfo().size();
        if (job.getNumReduceTasks() > regions) {
            job.setNumReduceTasks(outputTable.getRegionsInfo().size());
        }
    } else if (partitioner != null) {
        job.setPartitionerClass(partitioner);
    }
    if (addDependencyJars) {
        addDependencyJars(job);
    }
    initCredentials(job);
}
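A hypothetical use of this extended overload, writing to a remote cluster (table, reducer, and addresses are illustrative; the cluster key format follows the javadoc above):

TableMapReduceUtil.initTableReducerJob("backup_table", CopyReducer.class, job,
        null,                      // keep the default partitioner
        "zk1,zk2,zk3:2181:/hbase", // remote ensemble:clientPort:znodeParent
        null, null,                // no regionserver class/impl override
        true);                     // ship dependency jars via the distributed cache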
From source file: gr.ntua.h2rdf.LoadTriples.DistinctIds.java
License: Open Source License
public Job createSubmittableJob(String[] args) throws IOException, ClassNotFoundException {
    //io.compression.codecs
    Job job = new Job();
    job.setInputFormatClass(TextInputFormat.class);
    Configuration conf = new Configuration();
    Path blockProjection = new Path("blockIds/");
    Path translations = new Path("translations/");
    Path sample = new Path("sample/");
    Path temp = new Path("temp/");
    Path uniqueIds = new Path("uniqueIds/");
    FileSystem fs;
    try {
        fs = FileSystem.get(conf);
        if (fs.exists(uniqueIds)) {
            fs.delete(uniqueIds, true);
        }
        if (fs.exists(translations)) {
            fs.delete(translations, true);
        }
        if (fs.exists(blockProjection)) {
            fs.delete(blockProjection, true);
        }
        if (fs.exists(sample)) {
            fs.delete(sample, true);
        }
        if (fs.exists(temp)) {
            fs.delete(temp, true);
        }
        FileOutputFormat.setOutputPath(job, uniqueIds);
        Path inp = new Path(args[0]);
        FileInputFormat.setInputPaths(job, inp);

        double type = 1;
        double datasetSize = 0;
        if (fs.isFile(inp)) {
            datasetSize = fs.getFileStatus(inp).getLen();
        } else if (fs.isDirectory(inp)) {
            FileStatus[] s = fs.listStatus(inp);
            for (int i = 0; i < s.length; i++) {
                if (s[i].getPath().getName().toString().endsWith(".gz"))
                    type = 27;
                if (s[i].getPath().getName().toString().endsWith(".snappy"))
                    type = 10;
                datasetSize += s[i].getLen();
            }
        } else {
            FileStatus[] s = fs.globStatus(inp);
            for (int i = 0; i < s.length; i++) {
                if (s[i].getPath().getName().toString().endsWith(".gz"))
                    type = 27;
                if (s[i].getPath().getName().toString().endsWith(".snappy"))
                    type = 10;
                datasetSize += s[i].getLen();
            }
        }
        datasetSize = datasetSize * type;
        System.out.println("type: " + type);
        System.out.println("datasetSize: " + datasetSize);
        samplingRate = (double) sampleChunk / (double) datasetSize;
        if (samplingRate >= 0.1) {
            samplingRate = 0.1;
        }
        if (samplingRate <= 0.001) {
            samplingRate = 0.001;
        }
        numReducers = (int) (datasetSize / ReducerChunk);
        if (numReducers == 0)
            numReducers = 1;
        numReducers++;
    } catch (IOException e) {
        e.printStackTrace();
    }

    HBaseAdmin hadmin = new HBaseAdmin(conf);
    HTableDescriptor desc = new HTableDescriptor(TABLE_NAME);
    HColumnDescriptor family = new HColumnDescriptor("counter");
    desc.addFamily(family);
    if (!hadmin.tableExists(TABLE_NAME)) {
        hadmin.createTable(desc);
    }

    job.setNumReduceTasks(numReducers);
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(ImmutableBytesWritable.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setJarByClass(DistinctIds.class);
    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);
    job.setPartitionerClass(SamplingPartitioner.class);

    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    job.getConfiguration().set("mapred.compress.map.output", "true");
    job.getConfiguration().set("mapred.map.output.compression.codec",
            "org.apache.hadoop.io.compress.SnappyCodec");
    //job.setCombinerClass(Combiner.class);
    job.setJobName("Distinct Id Wordcount");
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
    job.getConfiguration().setInt("io.sort.mb", 100);
    job.getConfiguration().setInt("io.file.buffer.size", 131072);
    job.getConfiguration().setInt("mapred.job.reuse.jvm.num.tasks", -1);
    return job;
}
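Here both the reducer count and the sampling rate are derived from an estimated uncompressed dataset size: compressed inputs are scaled by a fixed expansion factor (27 for .gz, 10 for .snappy). The fields sampleChunk and ReducerChunk are not shown in this snippet; assuming, purely for illustration, sampleChunk = 64 MB and ReducerChunk = 256 MB, a 1 GB gzipped input is estimated at 27 GB, giving samplingRate = 64 MB / 27 GB ≈ 0.0023 (inside the [0.001, 0.1] clamp) and numReducers = (int) (27 GB / 256 MB) + 1 = 109. The project-specific SamplingPartitioner (source not shown) then assigns keys to those reducers.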