List of usage examples for org.apache.hadoop.mapreduce Job setNumReduceTasks
public void setNumReduceTasks(int tasks) throws IllegalStateException
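Before the per-project sources below, here is a minimal, self-contained driver sketch (not taken from any of the files listed on this page) showing where setNumReduceTasks typically fits in a plain MapReduce driver; the class name and input/output paths are illustrative only. The call has to happen before the job is submitted, since Job's state check throws IllegalStateException once the job is running.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class NumReduceTasksExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "setNumReduceTasks-example");
        job.setJarByClass(NumReduceTasksExample.class);

        // No mapper or reducer class is set, so the identity Mapper/Reducer defaults apply;
        // the point of the sketch is only where setNumReduceTasks is called.
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Must be called before submission; once the job is RUNNING this throws IllegalStateException.
        // 0 makes the job map-only (mapper output goes straight to the output format);
        // 1 funnels all groups through a single reducer, yielding one part-r-00000 file.
        job.setNumReduceTasks(1);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}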
From source file:com.datasalt.pangool.tuplemr.mapred.TestRollup.java
License:Apache License
@Test
public void test1() throws IOException, InterruptedException, ClassNotFoundException,
        InstantiationException, IllegalAccessException, TupleMRException {

    String input = TEST_OUT + "/input";
    String output = TEST_OUT + "/output";
    String[] inputElements = new String[] { "ES 20 listo 250", "US 14 beber 202", "US 14 perro 180",
            "US 14 perro 170", "US 15 jauja 160", "US 16 listo 160", "XE 20 listo 230" };
    Schema schema = new Schema("schema", Fields.parse("country:string, age:int, name:string, height:int"));
    ITuple[] tuples = new ITuple[inputElements.length];
    int i = 0;
    for (String inputElement : inputElements) {
        withInput(input, writable(inputElement));
        tuples[i++] = createTuple(inputElement, schema);
    }
    Path outputPath = new Path(output);

    TupleMRBuilder builder = new TupleMRBuilder(getConf());
    builder.addIntermediateSchema(schema);
    builder.setGroupByFields("country", "age", "name");
    builder.setOrderBy(new OrderBy().add("country", Order.ASC).add("age", Order.ASC).add("name", Order.ASC)
            .add("height", Order.DESC));
    builder.setRollupFrom("country");
    builder.setTupleReducer(new IdentityRed());
    builder.setOutput(outputPath, new HadoopOutputFormat(SequenceFileOutputFormat.class), Text.class, Text.class);
    builder.addInput(new Path(input), new HadoopInputFormat(SequenceFileInputFormat.class), new Map());

    Job job = builder.createJob();
    try {
        job.setNumReduceTasks(1);
        assertRun(job);
    } finally {
        builder.cleanUpInstanceFiles();
    }

    FileSystem fs = FileSystem.get(getConf());
    Path outputFile = new Path(output + "/part-r-00000");
    checkRollupOutput(outputFile, 0, 2);
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, outputFile, getConf());

    assertOutput(reader, "OPEN 0", tuples[0]);
    assertOutput(reader, "OPEN 1", tuples[0]);
    assertOutput(reader, "OPEN 2", tuples[0]);
    assertOutput(reader, "ELEMENT", tuples[0]);
    assertOutput(reader, "CLOSE 2", tuples[0]);
    assertOutput(reader, "CLOSE 1", tuples[0]);
    assertOutput(reader, "CLOSE 0", tuples[0]);

    assertOutput(reader, "OPEN 0", tuples[1]);
    assertOutput(reader, "OPEN 1", tuples[1]);
    assertOutput(reader, "OPEN 2", tuples[1]);
    assertOutput(reader, "ELEMENT", tuples[1]);
    assertOutput(reader, "CLOSE 2", tuples[1]);

    assertOutput(reader, "OPEN 2", tuples[2]);
    assertOutput(reader, "ELEMENT", tuples[2]);

    assertOutput(reader, "ELEMENT", tuples[3]);
    assertOutput(reader, "CLOSE 2", tuples[3]);
    assertOutput(reader, "CLOSE 1", tuples[3]);

    assertOutput(reader, "OPEN 1", tuples[4]);
    assertOutput(reader, "OPEN 2", tuples[4]);
    assertOutput(reader, "ELEMENT", tuples[4]);
    assertOutput(reader, "CLOSE 2", tuples[4]);
    assertOutput(reader, "CLOSE 1", tuples[4]);

    assertOutput(reader, "OPEN 1", tuples[5]);
    assertOutput(reader, "OPEN 2", tuples[5]);
    assertOutput(reader, "ELEMENT", tuples[5]);
    assertOutput(reader, "CLOSE 2", tuples[5]);
    assertOutput(reader, "CLOSE 1", tuples[5]);
    assertOutput(reader, "CLOSE 0", tuples[5]);

    assertOutput(reader, "OPEN 0", tuples[6]);
    assertOutput(reader, "OPEN 1", tuples[6]);
    assertOutput(reader, "OPEN 2", tuples[6]);
    assertOutput(reader, "ELEMENT", tuples[6]);
    assertOutput(reader, "CLOSE 2", tuples[6]);
    assertOutput(reader, "CLOSE 1", tuples[6]);
    assertOutput(reader, "CLOSE 0", tuples[6]);

    reader.close();
    cleanUp();
    trash(TEST_OUT);
}
From source file:com.datasalt.pangool.tuplemr.mapred.TestRollup.java
License:Apache License
@Test
public void test2() throws IOException, InterruptedException, ClassNotFoundException,
        InstantiationException, IllegalAccessException, TupleMRException {

    String input = TEST_OUT + "/input";
    String output = TEST_OUT + "/output";
    String[] inputElements = new String[] { "ES 20 listo 250", "US 14 beber 202", "US 14 perro 180",
            "US 14 perro 170", "US 15 jauja 160", "US 16 listo 160", "XE 16 listo 230" };
    Schema schema = new Schema("schema", Fields.parse("country:string, age:int, name:string, height:int"));
    ITuple[] tuples = new ITuple[inputElements.length];
    int i = 0;
    for (String inputElement : inputElements) {
        withInput(input, writable(inputElement));
        tuples[i++] = createTuple(inputElement, schema);
    }
    Path outputPath = new Path(output);

    TupleMRBuilder builder = new TupleMRBuilder(getConf());
    builder.addIntermediateSchema(schema);
    builder.setGroupByFields("age", "name", "country");
    builder.setOrderBy(new OrderBy().add("country", Order.ASC).add("age", Order.ASC).add("name", Order.ASC)
            .add("height", Order.DESC));
    builder.setRollupFrom("age");
    builder.setTupleReducer(new IdentityRed());
    builder.setOutput(outputPath, new HadoopOutputFormat(SequenceFileOutputFormat.class), Text.class, Text.class);
    builder.addInput(new Path(input), new HadoopInputFormat(SequenceFileInputFormat.class), new Map());

    Job job = builder.createJob();
    try {
        job.setNumReduceTasks(1);
        assertRun(job);
    } finally {
        builder.cleanUpInstanceFiles();
    }

    FileSystem fs = FileSystem.get(getConf());
    Path outputFile = new Path(output + "/part-r-00000");
    checkRollupOutput(outputFile, 1, 2);
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, outputFile, getConf());

    assertOutput(reader, "OPEN 1", tuples[0]);
    assertOutput(reader, "OPEN 2", tuples[0]);
    assertOutput(reader, "ELEMENT", tuples[0]);
    assertOutput(reader, "CLOSE 2", tuples[0]);
    assertOutput(reader, "CLOSE 1", tuples[0]);

    assertOutput(reader, "OPEN 1", tuples[1]);
    assertOutput(reader, "OPEN 2", tuples[1]);
    assertOutput(reader, "ELEMENT", tuples[1]);
    assertOutput(reader, "CLOSE 2", tuples[1]);

    assertOutput(reader, "OPEN 2", tuples[2]);
    assertOutput(reader, "ELEMENT", tuples[2]);

    assertOutput(reader, "ELEMENT", tuples[3]);
    assertOutput(reader, "CLOSE 2", tuples[3]);
    assertOutput(reader, "CLOSE 1", tuples[3]);

    assertOutput(reader, "OPEN 1", tuples[4]);
    assertOutput(reader, "OPEN 2", tuples[4]);
    assertOutput(reader, "ELEMENT", tuples[4]);
    assertOutput(reader, "CLOSE 2", tuples[4]);
    assertOutput(reader, "CLOSE 1", tuples[4]);

    assertOutput(reader, "OPEN 1", tuples[5]);
    assertOutput(reader, "OPEN 2", tuples[5]);
    assertOutput(reader, "ELEMENT", tuples[5]);
    assertOutput(reader, "CLOSE 2", tuples[5]);
    assertOutput(reader, "CLOSE 1", tuples[5]);

    assertOutput(reader, "OPEN 1", tuples[6]);
    assertOutput(reader, "OPEN 2", tuples[6]);
    assertOutput(reader, "ELEMENT", tuples[6]);
    assertOutput(reader, "CLOSE 2", tuples[6]);
    assertOutput(reader, "CLOSE 1", tuples[6]);

    reader.close();
    cleanUp();
    trash(TEST_OUT);
}
From source file:com.datasalt.pangool.tuplemr.mapred.TestRollup.java
License:Apache License
/**
 * Tests the case in which the reducer receives no data.
 */
@Test
public void testNoDataReducer() throws IOException, InterruptedException, ClassNotFoundException,
        InstantiationException, IllegalAccessException, TupleMRException {

    String input = TEST_OUT + "/input";
    String output = TEST_OUT + "/output";
    withInput(input, writable("ES 20 listo 250"));
    Schema schema = new Schema("schema", Fields.parse("country:string, age:int, name:string, height:int"));
    Path outputPath = new Path(output);

    TupleMRBuilder builder = new TupleMRBuilder(getConf());
    builder.addIntermediateSchema(schema);
    builder.setGroupByFields("age", "name", "country");
    builder.setOrderBy(new OrderBy().add("country", Order.ASC).add("age", Order.ASC).add("name", Order.ASC));
    builder.setRollupFrom("age");
    builder.setTupleReducer(new IdentityRed());
    builder.setOutput(outputPath, new HadoopOutputFormat(SequenceFileOutputFormat.class), Text.class, Text.class);
    builder.addInput(new Path(input), new HadoopInputFormat(SequenceFileInputFormat.class), new DoNothingMap());

    Job job = builder.createJob();
    try {
        job.setNumReduceTasks(1);
        assertRun(job);
    } finally {
        builder.cleanUpInstanceFiles();
    }

    cleanUp();
    trash(TEST_OUT);
}
From source file:com.datasalt.pangool.tuplemr.mapred.TestTupleMRJob.java
License:Apache License
@Test
public void testFillingTuplesJob() throws IOException, ClassNotFoundException, InterruptedException,
        TupleMRException {
    int NUM_ROWS_TO_GENERATE = 100;

    Configuration conf = getConf();
    String input = TestTupleMRJob.class + "-input";
    String output = TestTupleMRJob.class + "-output";

    ITuple tuple = new Tuple(SCHEMA);
    for (int i = 0; i < NUM_ROWS_TO_GENERATE; i++) {
        withTupleInput(input, fillTuple(true, tuple));
    }

    TupleMRBuilder builder = new TupleMRBuilder(getConf(), "test");
    builder.addTupleInput(new Path(input), new TupleMapper<ITuple, NullWritable>() {
        @Override
        public void map(ITuple iTuple, NullWritable nullWritable, TupleMRContext context, Collector collector)
                throws IOException, InterruptedException {
            collector.write(fillTuple(true, iTuple));
        }
    });
    builder.setTupleReducer(new TupleReducer<ITuple, NullWritable>() {
        @Override
        public void reduce(ITuple group, Iterable<ITuple> tuples, TupleMRContext context, Collector collector)
                throws IOException, InterruptedException, TupleMRException {
            for (ITuple tuple : tuples) {
                collector.write(fillTuple(true, tuple), NullWritable.get());
            }
        }
    });
    builder.addIntermediateSchema(SCHEMA);
    builder.setGroupByFields(SCHEMA.getField(0).getName());
    builder.setTupleOutput(new Path(output), SCHEMA);

    Job job = builder.createJob();
    job.setNumReduceTasks(1);
    try {
        assertRun(job);
    } finally {
        builder.cleanUpInstanceFiles();
    }

    final AtomicInteger count = new AtomicInteger();
    readTuples(new Path(output + "/part-r-00000"), getConf(), new TupleVisitor() {
        @Override
        public void onTuple(ITuple tuple) {
            count.incrementAndGet();
        }
    });
    assertEquals(NUM_ROWS_TO_GENERATE, count.get());

    trash(input);
    trash(output);
}
From source file:com.datasalt.pangool.tuplemr.mapred.TestTupleMRJob.java
License:Apache License
@Test
public void testJobWithNulls() throws IOException, TupleMRException, ClassNotFoundException,
        InterruptedException {
    Configuration conf = getConf();
    String input1 = TestTupleMRJob.class.getCanonicalName() + "-input1";
    String input2 = TestTupleMRJob.class.getCanonicalName() + "-input2";
    String output = TestTupleMRJob.class.getCanonicalName() + "-output";

    final Schema schemaNoNulls = new Schema("NoNulls", Fields.parse("f1:int,f2:string"));
    final Schema schemaNulls = new Schema("Nulls", Fields.parse("f1:int?,f2:string?"));
    Tuple t1 = new Tuple(schemaNoNulls);
    Tuple t2 = new Tuple(schemaNulls);

    t1.set(0, 0);
    t1.set(1, "nn");
    withTupleInput(input1, t1);

    Object tuples[][] = new Object[][] { new Object[] { 0, null }, new Object[] { 0, "n1" },
            new Object[] { null, "n2" } };
    for (Object[] tuple : tuples) {
        t2.set(0, tuple[0]);
        t2.set(1, tuple[1]);
        withTupleInput(input2, t2);
    }

    TupleMRBuilder builder = new TupleMRBuilder(getConf(), "test");
    builder.addTupleInput(new Path(input1), new IdentityTupleMapper());
    builder.addTupleInput(new Path(input2), new IdentityTupleMapper());
    builder.setTupleReducer(new TupleReducer<ITuple, NullWritable>() {
        @Override
        public void reduce(ITuple group, Iterable<ITuple> tuples, TupleMRContext context, Collector collector)
                throws IOException, InterruptedException, TupleMRException {
            int count = 0;
            for (ITuple tuple : tuples) {
                Tuple t = new Tuple(schemaNulls);
                t.set(0, tuple.get(0));
                t.set(1, tuple.get(1));
                collector.write(t, NullWritable.get());
                count++;
            }
            if (group.get(0) == null) {
                assertEquals(1, count);
            } else if (((Integer) group.get(0)) == 0) {
                assertEquals(3, count);
            }
        }
    });
    builder.addIntermediateSchema(schemaNoNulls);
    builder.addIntermediateSchema(schemaNulls);
    builder.setGroupByFields("f1");
    builder.setOrderBy(OrderBy.parse("f1:desc|null_smallest").addSchemaOrder(Criteria.Order.ASC));
    builder.setSpecificOrderBy("NoNulls", OrderBy.parse("f2:asc|null_biggest"));
    builder.setSpecificOrderBy("Nulls", OrderBy.parse("f2:asc|null_biggest"));
    builder.setTupleOutput(new Path(output), schemaNulls);

    Job job = builder.createJob();
    job.setNumReduceTasks(1);
    try {
        assertRun(job);
    } finally {
        builder.cleanUpInstanceFiles();
    }

    final Object expectedOutput[][] = new Object[][] { new Object[] { 0, "nn" }, new Object[] { 0, "n1" },
            new Object[] { 0, null }, new Object[] { null, "n2" } };

    boolean debug = false;
    if (debug) {
        readTuples(new Path(output + "/part-r-00000"), getConf(), new TupleVisitor() {
            @Override
            public void onTuple(ITuple t) {
                System.out.println(t);
            }
        });
    }

    readTuples(new Path(output + "/part-r-00000"), getConf(), new TupleVisitor() {
        int i = 0;

        @Override
        public void onTuple(ITuple t) {
            assertEqualsNull(expectedOutput[i][0], t.get(0));
            Object f2 = t.get(1);
            f2 = (f2 != null) ? f2.toString() : f2;
            assertEqualsNull(expectedOutput[i][1], f2);
            i++;
        }
    });

    trash(input1);
    trash(input2);
    trash(output);
}
From source file:com.declum.squzer.example.hbase.table2file.Export.java
License:Apache License
/**
 * Sets up the actual job.
 *
 * @param conf
 *            The current configuration.
 * @param args
 *            The command line parameters.
 * @return The newly created job.
 * @throws IOException
 *             When setting up the job fails.
 */
public static Job createSubmittableJob(Configuration conf, String[] args) throws IOException {
    String tableName = args[0];
    Path outputDir = new Path(args[1]);

    Job job = Job.getInstance(conf);
    job.setJobName(tableName);
    job.setJobName(NAME + "_" + tableName);
    job.setJarByClass(Exporter.class);

    // TODO: Allow passing filter and subset of rows/columns.
    Scan s = new Scan();
    // Optional arguments.
    int versions = args.length > 2 ? Integer.parseInt(args[2]) : 1;
    s.setMaxVersions(versions);
    long startTime = args.length > 3 ? Long.parseLong(args[3]) : 0L;
    long endTime = args.length > 4 ? Long.parseLong(args[4]) : Long.MAX_VALUE;
    s.setTimeRange(startTime, endTime);
    s.setCacheBlocks(false);
    if (conf.get(TableInputFormat.SCAN_COLUMN_FAMILY) != null) {
        s.addFamily(Bytes.toBytes(conf.get(TableInputFormat.SCAN_COLUMN_FAMILY)));
    }
    LOG.info("versions=" + versions + ", starttime=" + startTime + ", endtime=" + endTime);

    TableMapReduceUtil.initTableMapperJob(tableName, s, Exporter.class, null, null, job);
    // No reducers. Just write straight to output files.
    job.setNumReduceTasks(0);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(Result.class);
    FileOutputFormat.setOutputPath(job, outputDir);
    return job;
}
From source file:com.digitalpebble.behemoth.mahout.BehemothDocumentProcessor.java
License:Apache License
/**
 * Converts the input documents into a token array using {@link StringTuple}.
 * The input documents have to be in {@link org.apache.hadoop.io.SequenceFile} format.
 *
 * @param input
 *            input directory of the documents in
 *            {@link org.apache.hadoop.io.SequenceFile} format
 * @param output
 *            output directory where the {@link StringTuple} token array of
 *            each document has to be created
 * @param type
 *            The annotation type representing the tokens
 * @param feature
 *            The name of the feature holding the token value
 * @throws IOException
 * @throws ClassNotFoundException
 * @throws InterruptedException
 */
public static void tokenizeDocuments(Path input, String type, String feature, Path output)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration();
    // this conf parameter needs to be set to enable serialisation of conf values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.set(TOKEN_TYPE, type);
    conf.set(FEATURE_NAME, feature);

    Job job = new Job(conf);
    job.setJobName("DocumentProcessor::BehemothTokenizer: input-folder: " + input);
    job.setJarByClass(BehemothDocumentProcessor.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(StringTuple.class);
    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(BehemothTokenizerMapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setNumReduceTasks(0);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    HadoopUtil.delete(conf, output);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded)
        throw new IllegalStateException("Job failed!");
}
From source file:com.digitalpebble.behemoth.mahout.BehemothDocumentProcessor.java
License:Apache License
/**
 * Converts the input documents into a token array using {@link StringTuple}.
 * The input documents have to be in {@link org.apache.hadoop.io.SequenceFile} format.
 *
 * @param input
 *            input directory of the documents in
 *            {@link org.apache.hadoop.io.SequenceFile} format
 * @param output
 *            output directory where the {@link StringTuple} token array of
 *            each document has to be created
 * @param analyzerClass
 *            The Lucene {@link Analyzer} for tokenizing the UTF-8 text
 */
public static void tokenizeDocuments(Path input, Class<? extends Analyzer> analyzerClass, Path output,
        Configuration baseConf) throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration(baseConf);
    // this conf parameter needs to be set to enable serialisation of conf values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.set(ANALYZER_CLASS, analyzerClass.getName());

    Job job = new Job(conf);
    job.setJobName("DocumentProcessor::LuceneTokenizer: input-folder: " + input);
    job.setJarByClass(BehemothDocumentProcessor.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(StringTuple.class);
    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(LuceneTokenizerMapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setNumReduceTasks(0);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    HadoopUtil.delete(conf, output);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded)
        throw new IllegalStateException("Job failed!");
}
From source file:com.digitalpebble.behemoth.mahout.BehemothDocumentProcessor.java
License:Apache License
public static void dumpLabels(Path input, Path output, Configuration baseConf)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration(baseConf);
    // this conf parameter needs to be set to enable serialisation of conf values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");

    Job job = new Job(conf);
    job.setJobName("DocumentProcessor::LabelDumper: input-folder: " + input);
    job.setJarByClass(BehemothDocumentProcessor.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(BehemothLabelMapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setNumReduceTasks(0);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    HadoopUtil.delete(conf, output);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded)
        throw new IllegalStateException("Job failed!");
}
From source file:com.digitalpebble.behemoth.mahout.DocumentProcessor.java
License:Apache License
/**
 * Converts the input documents into a token array using {@link StringTuple}.
 * The input documents have to be in {@link org.apache.hadoop.io.SequenceFile} format.
 *
 * @param input
 *            input directory of the documents in
 *            {@link org.apache.hadoop.io.SequenceFile} format
 * @param output
 *            output directory where the {@link StringTuple} token array of
 *            each document has to be created
 * @param type
 *            The annotation type representing the tokens
 * @param feature
 *            The name of the feature holding the token value
 * @throws IOException
 * @throws ClassNotFoundException
 * @throws InterruptedException
 */
public static void tokenizeDocuments(Path input, String type, String feature, Path output)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = new Configuration();
    // this conf parameter needs to be set to enable serialisation of conf values
    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    conf.set(TOKEN_TYPE, type);
    conf.set(FEATURE_NAME, feature);

    Job job = new Job(conf);
    job.setJobName("DocumentProcessor::DocumentTokenizer: input-folder: " + input);
    job.setJarByClass(DocumentProcessor.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(StringTuple.class);
    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(SequenceFileTokenizerMapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setNumReduceTasks(0);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    HadoopUtil.delete(conf, output);

    job.waitForCompletion(true);
}