List of usage examples for org.apache.hadoop.mapreduce Job setPartitionerClass
public void setPartitionerClass(Class<? extends Partitioner> cls) throws IllegalStateException
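Before the per-project examples, a minimal sketch of the usual pattern: subclass org.apache.hadoop.mapreduce.Partitioner and register it on the Job before submission. The FirstCharPartitioner class, its key/value types, and the job name below are hypothetical, chosen only to illustrate the call; they do not come from any of the sources listed further down.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Partitioner;

// Hypothetical partitioner: routes keys by their first character so that
// keys sharing a leading character land on the same reducer.
public class FirstCharPartitioner extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        if (key.getLength() == 0)
            return 0;
        // Mask the sign bit before taking the modulus, as HashPartitioner does.
        return (key.charAt(0) & Integer.MAX_VALUE) % numPartitions;
    }
}

// Registration must happen before the job is submitted; afterwards
// setPartitionerClass throws IllegalStateException.
//   Job job = Job.getInstance(new Configuration(), "example");
//   job.setPartitionerClass(FirstCharPartitioner.class);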
From source file:org.apache.hadoop.examples.SecondarySort.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: secondarysort <in> <out>");
        System.exit(2);
    }
    Job job = Job.getInstance(conf, "secondary sort");
    job.setJarByClass(SecondarySort.class);
    job.setMapperClass(MapClass.class);
    job.setReducerClass(Reduce.class);

    // group and partition by the first int in the pair
    job.setPartitionerClass(FirstPartitioner.class);
    job.setGroupingComparatorClass(FirstGroupingComparator.class);

    // the map output is IntPair, IntWritable
    job.setMapOutputKeyClass(IntPair.class);
    job.setMapOutputValueClass(IntWritable.class);

    // the reduce output is Text, IntWritable
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
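The FirstPartitioner registered above is defined elsewhere in the same SecondarySort class and is not reproduced in this listing. A sketch of what such a partitioner looks like, assuming IntPair exposes a getFirst() accessor for the leading int of the composite key, follows; it dispatches on the first element only, so that the grouping comparator can then group by that element within each reducer.

// Sketch of a first-field partitioner for the secondary-sort pattern
// (getFirst() is assumed to return the leading int of the composite key).
public static class FirstPartitioner extends Partitioner<IntPair, IntWritable> {
    @Override
    public int getPartition(IntPair key, IntWritable value, int numPartitions) {
        // Partition on the first int only; values sharing it reach the same reducer.
        return Math.abs(key.getFirst() * 127) % numPartitions;
    }
}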
From source file:org.apache.hadoop.examples.terasort.TeraSort.java
License:Apache License
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        usage();
        return 2;
    }
    LOG.info("starting");
    Job job = Job.getInstance(getConf());
    Path inputDir = new Path(args[0]);
    Path outputDir = new Path(args[1]);
    boolean useSimplePartitioner = getUseSimplePartitioner(job);
    TeraInputFormat.setInputPaths(job, inputDir);
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setJobName("TeraSort");
    job.setJarByClass(TeraSort.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormatClass(TeraInputFormat.class);
    job.setOutputFormatClass(TeraOutputFormat.class);
    if (useSimplePartitioner) {
        job.setPartitionerClass(SimplePartitioner.class);
    } else {
        long start = System.currentTimeMillis();
        Path partitionFile = new Path(outputDir, TeraInputFormat.PARTITION_FILENAME);
        URI partitionUri = new URI(partitionFile.toString() + "#" + TeraInputFormat.PARTITION_FILENAME);
        try {
            TeraInputFormat.writePartitionFile(job, partitionFile);
        } catch (Throwable e) {
            LOG.error(e.getMessage());
            return -1;
        }
        job.addCacheFile(partitionUri);
        long end = System.currentTimeMillis();
        System.out.println("Spent " + (end - start) + "ms computing partitions.");
        job.setPartitionerClass(TotalOrderPartitioner.class);
    }
    job.getConfiguration().setInt("dfs.replication", getOutputReplication(job));
    int ret = job.waitForCompletion(true) ? 0 : 1;
    LOG.info("done");
    return ret;
}
From source file:org.apache.ignite.internal.processors.hadoop.GridHadoopMapReduceEmbeddedSelfTest.java
License:Apache License
/**
 * Tests whole job execution with all phases in old and new versions of API with definition of custom
 * Serialization, Partitioner and IO formats.
 *
 * @throws Exception If fails.
 */
public void testMultiReducerWholeMapReduceExecution() throws Exception {
    IgfsPath inDir = new IgfsPath(PATH_INPUT);
    igfs.mkdirs(inDir);

    IgfsPath inFile = new IgfsPath(inDir, GridHadoopWordCount2.class.getSimpleName() + "-input");

    generateTestFile(inFile.toString(), "key1", 10000, "key2", 20000, "key3", 15000, "key4", 7000,
        "key5", 12000, "key6", 18000);

    for (int i = 0; i < 2; i++) {
        boolean useNewAPI = i == 1;

        igfs.delete(new IgfsPath(PATH_OUTPUT), true);

        flags.put("serializationWasConfigured", false);
        flags.put("partitionerWasConfigured", false);
        flags.put("inputFormatWasConfigured", false);
        flags.put("outputFormatWasConfigured", false);

        JobConf jobConf = new JobConf();

        jobConf.set(CommonConfigurationKeys.IO_SERIALIZATIONS_KEY, CustomSerialization.class.getName());

        // To split into about 6-7 items for v2
        jobConf.setInt(FileInputFormat.SPLIT_MAXSIZE, 65000);

        // For v1
        jobConf.setInt("fs.local.block.size", 65000);

        // File system coordinates.
        setupFileSystems(jobConf);

        GridHadoopWordCount1.setTasksClasses(jobConf, !useNewAPI, !useNewAPI, !useNewAPI);

        if (!useNewAPI) {
            jobConf.setPartitionerClass(CustomV1Partitioner.class);
            jobConf.setInputFormat(CustomV1InputFormat.class);
            jobConf.setOutputFormat(CustomV1OutputFormat.class);
        }

        Job job = Job.getInstance(jobConf);

        GridHadoopWordCount2.setTasksClasses(job, useNewAPI, useNewAPI, useNewAPI);

        if (useNewAPI) {
            job.setPartitionerClass(CustomV2Partitioner.class);
            job.setInputFormatClass(CustomV2InputFormat.class);
            job.setOutputFormatClass(CustomV2OutputFormat.class);
        }

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(igfsScheme() + inFile.toString()));
        FileOutputFormat.setOutputPath(job, new Path(igfsScheme() + PATH_OUTPUT));

        job.setNumReduceTasks(3);

        job.setJarByClass(GridHadoopWordCount2.class);

        IgniteInternalFuture<?> fut = grid(0).hadoop().submit(new GridHadoopJobId(UUID.randomUUID(), 1),
            createJobInfo(job.getConfiguration()));

        fut.get();

        assertTrue("Serialization was configured (new API is " + useNewAPI + ")",
            flags.get("serializationWasConfigured"));
        assertTrue("Partitioner was configured (new API is = " + useNewAPI + ")",
            flags.get("partitionerWasConfigured"));
        assertTrue("Input format was configured (new API is = " + useNewAPI + ")",
            flags.get("inputFormatWasConfigured"));
        assertTrue("Output format was configured (new API is = " + useNewAPI + ")",
            flags.get("outputFormatWasConfigured"));

        assertEquals("Use new API = " + useNewAPI,
            "key3\t15000\n" + "key6\t18000\n",
            readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00000"));
        assertEquals("Use new API = " + useNewAPI,
            "key1\t10000\n" + "key4\t7000\n",
            readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00001"));
        assertEquals("Use new API = " + useNewAPI,
            "key2\t20000\n" + "key5\t12000\n",
            readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00002"));
    }
}
From source file:org.apache.ignite.internal.processors.hadoop.HadoopMapReduceEmbeddedSelfTest.java
License:Apache License
/**
 * Tests whole job execution with all phases in old and new versions of API with definition of custom
 * Serialization, Partitioner and IO formats.
 *
 * @throws Exception If fails.
 */
public void testMultiReducerWholeMapReduceExecution() throws Exception {
    IgfsPath inDir = new IgfsPath(PATH_INPUT);
    igfs.mkdirs(inDir);

    IgfsPath inFile = new IgfsPath(inDir, HadoopWordCount2.class.getSimpleName() + "-input");

    generateTestFile(inFile.toString(), "key1", 10000, "key2", 20000, "key3", 15000, "key4", 7000,
        "key5", 12000, "key6", 18000);

    for (int i = 0; i < 2; i++) {
        boolean useNewAPI = i == 1;

        igfs.delete(new IgfsPath(PATH_OUTPUT), true);

        flags.put("serializationWasConfigured", false);
        flags.put("partitionerWasConfigured", false);
        flags.put("inputFormatWasConfigured", false);
        flags.put("outputFormatWasConfigured", false);

        JobConf jobConf = new JobConf();

        jobConf.set(CommonConfigurationKeys.IO_SERIALIZATIONS_KEY, CustomSerialization.class.getName());

        // To split into about 6-7 items for v2
        jobConf.setInt(FileInputFormat.SPLIT_MAXSIZE, 65000);

        // For v1
        jobConf.setInt("fs.local.block.size", 65000);

        // File system coordinates.
        setupFileSystems(jobConf);

        HadoopWordCount1.setTasksClasses(jobConf, !useNewAPI, !useNewAPI, !useNewAPI);

        if (!useNewAPI) {
            jobConf.setPartitionerClass(CustomV1Partitioner.class);
            jobConf.setInputFormat(CustomV1InputFormat.class);
            jobConf.setOutputFormat(CustomV1OutputFormat.class);
        }

        Job job = Job.getInstance(jobConf);

        HadoopWordCount2.setTasksClasses(job, useNewAPI, useNewAPI, useNewAPI);

        if (useNewAPI) {
            job.setPartitionerClass(CustomV2Partitioner.class);
            job.setInputFormatClass(CustomV2InputFormat.class);
            job.setOutputFormatClass(CustomV2OutputFormat.class);
        }

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(igfsScheme() + inFile.toString()));
        FileOutputFormat.setOutputPath(job, new Path(igfsScheme() + PATH_OUTPUT));

        job.setNumReduceTasks(3);

        job.setJarByClass(HadoopWordCount2.class);

        IgniteInternalFuture<?> fut = grid(0).hadoop().submit(new HadoopJobId(UUID.randomUUID(), 1),
            createJobInfo(job.getConfiguration()));

        fut.get();

        assertTrue("Serialization was configured (new API is " + useNewAPI + ")",
            flags.get("serializationWasConfigured"));
        assertTrue("Partitioner was configured (new API is = " + useNewAPI + ")",
            flags.get("partitionerWasConfigured"));
        assertTrue("Input format was configured (new API is = " + useNewAPI + ")",
            flags.get("inputFormatWasConfigured"));
        assertTrue("Output format was configured (new API is = " + useNewAPI + ")",
            flags.get("outputFormatWasConfigured"));

        assertEquals("Use new API = " + useNewAPI,
            "key3\t15000\n" + "key6\t18000\n",
            readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00000"));
        assertEquals("Use new API = " + useNewAPI,
            "key1\t10000\n" + "key4\t7000\n",
            readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00001"));
        assertEquals("Use new API = " + useNewAPI,
            "key2\t20000\n" + "key5\t12000\n",
            readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00002"));
    }
}
From source file:org.apache.ignite.internal.processors.hadoop.impl.HadoopMapReduceEmbeddedSelfTest.java
License:Apache License
/**
 * Tests whole job execution with all phases in old and new versions of API with definition of custom
 * Serialization, Partitioner and IO formats.
 *
 * @param striped Whether output should be striped or not.
 * @throws Exception If fails.
 */
public void checkMultiReducerWholeMapReduceExecution(boolean striped) throws Exception {
    IgfsPath inDir = new IgfsPath(PATH_INPUT);
    igfs.mkdirs(inDir);

    IgfsPath inFile = new IgfsPath(inDir, HadoopWordCount2.class.getSimpleName() + "-input");

    generateTestFile(inFile.toString(), "key1", 10000, "key2", 20000, "key3", 15000, "key4", 7000,
        "key5", 12000, "key6", 18000);

    for (int i = 0; i < 2; i++) {
        boolean useNewAPI = i == 1;

        igfs.delete(new IgfsPath(PATH_OUTPUT), true);

        flags.put("serializationWasConfigured", false);
        flags.put("partitionerWasConfigured", false);
        flags.put("inputFormatWasConfigured", false);
        flags.put("outputFormatWasConfigured", false);

        JobConf jobConf = new JobConf();

        if (striped)
            jobConf.set(HadoopJobProperty.SHUFFLE_MAPPER_STRIPED_OUTPUT.propertyName(), "true");
        else
            jobConf.set(HadoopJobProperty.SHUFFLE_MAPPER_STRIPED_OUTPUT.propertyName(), "false");

        jobConf.set(CommonConfigurationKeys.IO_SERIALIZATIONS_KEY, CustomSerialization.class.getName());

        // To split into about 6-7 items for v2
        jobConf.setInt(FileInputFormat.SPLIT_MAXSIZE, 65000);

        // For v1
        jobConf.setInt("fs.local.block.size", 65000);

        // File system coordinates.
        setupFileSystems(jobConf);

        HadoopWordCount1.setTasksClasses(jobConf, !useNewAPI, !useNewAPI, !useNewAPI);

        if (!useNewAPI) {
            jobConf.setPartitionerClass(CustomV1Partitioner.class);
            jobConf.setInputFormat(CustomV1InputFormat.class);
            jobConf.setOutputFormat(CustomV1OutputFormat.class);
        }

        Job job = Job.getInstance(jobConf);

        HadoopWordCount2.setTasksClasses(job, useNewAPI, useNewAPI, useNewAPI, false);

        if (useNewAPI) {
            job.setPartitionerClass(CustomV2Partitioner.class);
            job.setInputFormatClass(CustomV2InputFormat.class);
            job.setOutputFormatClass(CustomV2OutputFormat.class);
        }

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(igfsScheme() + inFile.toString()));
        FileOutputFormat.setOutputPath(job, new Path(igfsScheme() + PATH_OUTPUT));

        job.setNumReduceTasks(3);

        job.setJarByClass(HadoopWordCount2.class);

        IgniteInternalFuture<?> fut = grid(0).hadoop().submit(new HadoopJobId(UUID.randomUUID(), 1),
            createJobInfo(job.getConfiguration()));

        fut.get();

        assertTrue("Serialization was configured (new API is " + useNewAPI + ")",
            flags.get("serializationWasConfigured"));
        assertTrue("Partitioner was configured (new API is = " + useNewAPI + ")",
            flags.get("partitionerWasConfigured"));
        assertTrue("Input format was configured (new API is = " + useNewAPI + ")",
            flags.get("inputFormatWasConfigured"));
        assertTrue("Output format was configured (new API is = " + useNewAPI + ")",
            flags.get("outputFormatWasConfigured"));

        assertEquals("Use new API = " + useNewAPI,
            "key3\t15000\n" + "key6\t18000\n",
            readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00000"));
        assertEquals("Use new API = " + useNewAPI,
            "key1\t10000\n" + "key4\t7000\n",
            readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00001"));
        assertEquals("Use new API = " + useNewAPI,
            "key2\t20000\n" + "key5\t12000\n",
            readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00002"));
    }
}
From source file:org.apache.ignite.internal.processors.hadoop.impl.HadoopTeraSortTest.java
License:Apache License
/**
 * Creates Job instance and sets up necessary properties for it.
 *
 * @param conf The Job config.
 * @return The job.
 * @throws Exception On error.
 */
private Job setupConfig(JobConf conf) throws Exception {
    Job job = Job.getInstance(conf);

    Path inputDir = new Path(generateOutDir);
    Path outputDir = new Path(sortOutDir);

    boolean useSimplePartitioner = TeraSort.getUseSimplePartitioner(job);

    TeraInputFormat.setInputPaths(job, inputDir);
    FileOutputFormat.setOutputPath(job, outputDir);

    job.setJobName("TeraSort");

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setInputFormatClass(TeraInputFormat.class);
    job.setOutputFormatClass(TeraOutputFormat.class);

    if (useSimplePartitioner)
        job.setPartitionerClass(TeraSort.SimplePartitioner.class);
    else {
        long start = System.currentTimeMillis();

        Path partFile = new Path(outputDir, PARTITION_FILENAME);

        URI partUri = new URI(partFile.toString() + "#" + PARTITION_FILENAME);

        try {
            TeraInputFormat.writePartitionFile(job, partFile);
        } catch (Throwable e) {
            throw new RuntimeException(e);
        }

        job.addCacheFile(partUri);

        long end = System.currentTimeMillis();

        System.out.println("Spent " + (end - start) + "ms computing partitions. "
            + "Partition file added to distributed cache: " + partUri);

        job.setPartitionerClass(getTeraSortTotalOrderPartitioner() /*TeraSort.TotalOrderPartitioner.class*/);
    }

    job.getConfiguration().setInt("dfs.replication", TeraSort.getOutputReplication(job));

    /* TeraOutputFormat.setFinalSync(job, true); */
    Method m = TeraOutputFormat.class.getDeclaredMethod("setFinalSync", JobContext.class, boolean.class);
    m.setAccessible(true);
    m.invoke(null, job, true);

    return job;
}
From source file:org.apache.jena.tdbloader4.Utils.java
License:Apache License
public static void setReducers(Job job, Configuration configuration, Logger log) {
    boolean runLocal = configuration.getBoolean(Constants.OPTION_RUN_LOCAL, Constants.OPTION_RUN_LOCAL_DEFAULT);

    int num_reducers = configuration.getInt(Constants.OPTION_NUM_REDUCERS, Constants.OPTION_NUM_REDUCERS_DEFAULT);

    // TODO: should we comment this out and let Hadoop decide the number of reducers?
    if (runLocal) {
        if (log != null)
            log.debug("Setting number of reducers to {}", 1);
        job.setNumReduceTasks(1);
    } else {
        if (Constants.NAME_FOURTH.equals(job.getJobName())) {
            job.setPartitionerClass(TotalOrderPartitioner.class);
            num_reducers = 9 * num_reducers;
        }
        job.setNumReduceTasks(num_reducers);
        if (log != null)
            log.debug("Setting number of reducers to {}", num_reducers);
    }
}
From source file:org.apache.kylin.storage.hbase.steps.HFileOutputFormat3.java
License:Apache License
/**
 * Configure <code>job</code> with a TotalOrderPartitioner, partitioning against
 * <code>splitPoints</code>. Cleans up the partitions file after the job exits.
 */
static void configurePartitioner(Job job, List<ImmutableBytesWritable> splitPoints) throws IOException {
    Configuration conf = job.getConfiguration();

    // create the partitions file
    FileSystem fs = FileSystem.get(conf);
    Path partitionsPath = new Path(conf.get("hbase.fs.tmp.dir"), "partitions_" + RandomUtil.randomUUID());
    fs.makeQualified(partitionsPath);
    writePartitions(conf, partitionsPath, splitPoints);
    fs.deleteOnExit(partitionsPath);

    // configure job to use it
    job.setPartitionerClass(TotalOrderPartitioner.class);
    TotalOrderPartitioner.setPartitionFile(conf, partitionsPath);
}
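One constraint worth noting here: TotalOrderPartitioner expects the partition file to contain exactly numReduceTasks - 1 split points, so callers of a helper like this normally size the reducer count to match. A hedged sketch of such a call site follows; the splitPoints list is assumed to hold the sorted region boundaries computed elsewhere.

// Assumed call site: one reducer per key range, i.e. splitPoints.size() + 1 reducers,
// so the reducer count lines up with the ranges the TotalOrderPartitioner will read.
job.setNumReduceTasks(splitPoints.size() + 1);
configurePartitioner(job, splitPoints);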
From source file:org.apache.mahout.cf.taste.hadoop.als.PredictionJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    addOption("pairs", "p", "path containing the test ratings, each line must be: userID,itemID", true);
    addOption("userFeatures", "u", "path to the user feature matrix", true);
    addOption("itemFeatures", "i", "path to the item feature matrix", true);
    addOutputOption();

    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    Path pairs = new Path(parsedArgs.get("--pairs"));
    Path userFeatures = new Path(parsedArgs.get("--userFeatures"));
    Path itemFeatures = new Path(parsedArgs.get("--itemFeatures"));

    Path tempDirPath = new Path(parsedArgs.get("--tempDir"));

    Path convertedPairs = new Path(tempDirPath, "convertedPairs");
    Path convertedUserFeatures = new Path(tempDirPath, "convertedUserFeatures");
    Path convertedItemFeatures = new Path(tempDirPath, "convertedItemFeatures");

    Path pairsJoinedWithItemFeatures = new Path(tempDirPath, "pairsJoinedWithItemFeatures");

    Job convertPairs = prepareJob(pairs, convertedPairs, TextInputFormat.class, PairsMapper.class,
        TaggedVarIntWritable.class, VectorWithIndexWritable.class, Reducer.class, TaggedVarIntWritable.class,
        VectorWithIndexWritable.class, SequenceFileOutputFormat.class);
    convertPairs.waitForCompletion(true);

    Job convertUserFeatures = prepareJob(userFeatures, convertedUserFeatures, SequenceFileInputFormat.class,
        FeaturesMapper.class, TaggedVarIntWritable.class, VectorWithIndexWritable.class, Reducer.class,
        TaggedVarIntWritable.class, VectorWithIndexWritable.class, SequenceFileOutputFormat.class);
    convertUserFeatures.waitForCompletion(true);

    Job convertItemFeatures = prepareJob(itemFeatures, convertedItemFeatures, SequenceFileInputFormat.class,
        FeaturesMapper.class, TaggedVarIntWritable.class, VectorWithIndexWritable.class, Reducer.class,
        TaggedVarIntWritable.class, VectorWithIndexWritable.class, SequenceFileOutputFormat.class);
    convertItemFeatures.waitForCompletion(true);

    Job joinPairsWithItemFeatures = prepareJob(new Path(convertedPairs + "," + convertedItemFeatures),
        pairsJoinedWithItemFeatures, SequenceFileInputFormat.class, Mapper.class, TaggedVarIntWritable.class,
        VectorWithIndexWritable.class, JoinProbesWithItemFeaturesReducer.class, TaggedVarIntWritable.class,
        VectorWithIndexWritable.class, SequenceFileOutputFormat.class);
    joinPairsWithItemFeatures.setPartitionerClass(HashPartitioner.class);
    joinPairsWithItemFeatures.setGroupingComparatorClass(TaggedVarIntWritable.GroupingComparator.class);
    joinPairsWithItemFeatures.waitForCompletion(true);

    Job predictRatings = prepareJob(new Path(pairsJoinedWithItemFeatures + "," + convertedUserFeatures),
        getOutputPath(), SequenceFileInputFormat.class, Mapper.class, TaggedVarIntWritable.class,
        VectorWithIndexWritable.class, PredictRatingReducer.class, Text.class, NullWritable.class,
        TextOutputFormat.class);
    predictRatings.setPartitionerClass(HashPartitioner.class);
    predictRatings.setGroupingComparatorClass(TaggedVarIntWritable.GroupingComparator.class);
    predictRatings.waitForCompletion(true);

    return 0;
}
From source file:org.apache.mahout.classifier.svm.mapreduce.MapReduceUtil.java
License:Apache License
public static void setJobPartitioner(Job job, Class<? extends Partitioner> partitioner) {
    job.setPartitionerClass(partitioner);
}
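A call site for this helper might look like the following; the configuration and the choice of HashPartitioner are illustrative only and are not taken from the Mahout sources.

// Illustrative usage of the helper above.
Job job = Job.getInstance(new Configuration(), "svm-train");
MapReduceUtil.setJobPartitioner(job, org.apache.hadoop.mapreduce.lib.partition.HashPartitioner.class);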