Usage examples for org.apache.hadoop.mapreduce.Job#getMapOutputKeyClass()
public Class<?> getMapOutputKeyClass()
From source file:ComRoughSetApproInputSampler.java
License:Apache License
/** * Write a partition file for the given job, using the Sampler provided. * Queries the sampler for a sample keyset, sorts by the output key * comparator, selects the keys for each rank, and writes to the destination * returned from {@link TotalOrderPartitioner#getPartitionFile}. *//*w w w .ja v a 2 s . c o m*/ @SuppressWarnings("unchecked") // getInputFormat, getOutputKeyComparator public static <K, V> void writePartitionFile(Job job, Sampler<K, V> sampler) throws IOException, ClassNotFoundException, InterruptedException { Configuration conf = job.getConfiguration(); final InputFormat inf = ReflectionUtils.newInstance(job.getInputFormatClass(), conf); int numPartitions = job.getNumReduceTasks(); K[] samples = (K[]) sampler.getSample(inf, job); LOG.info("Using " + samples.length + " samples"); RawComparator<K> comparator = (RawComparator<K>) job.getSortComparator(); Arrays.sort(samples, comparator); Path dst = new Path(TotalOrderPartitioner.getPartitionFile(conf)); FileSystem fs = dst.getFileSystem(conf); if (fs.exists(dst)) { fs.delete(dst, false); } SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, dst, job.getMapOutputKeyClass(), NullWritable.class); NullWritable nullValue = NullWritable.get(); float stepSize = samples.length / (float) numPartitions; int last = -1; for (int i = 1; i < numPartitions; ++i) { int k = Math.round(stepSize * i); while (last >= k && comparator.compare(samples[last], samples[k]) == 0) { ++k; } writer.append(samples[k], nullValue); last = k; } writer.close(); }
From source file:com.asakusafw.runtime.mapreduce.simple.SimpleJobRunner.java
License:Apache License
/**
 * Runs the given job: sets up the output committer, executes the map phase
 * (plus the reduce phase when the job has reduce tasks) and commits the job.
 * If anything fails before the commit completes, the job is aborted and any
 * I/O error raised by the abort itself is only logged.
 *
 * @param job the job to run; its job ID must already be assigned
 */
private void runJob(Job job) throws ClassNotFoundException, IOException, InterruptedException {
  assert job.getJobID() != null;
  TaskID mapTaskId = newMapTaskId(job.getJobID(), 0);
  Configuration configuration = job.getConfiguration();
  OutputFormat<?, ?> outputFormat =
      ReflectionUtils.newInstance(job.getOutputFormatClass(), configuration);
  OutputCommitter committer = outputFormat.getOutputCommitter(
      newTaskAttemptContext(configuration, newTaskAttemptId(mapTaskId, 0)));

  boolean committed = false;
  committer.setupJob(job);
  try {
    if (job.getNumReduceTasks() != 0) {
      // Map output is handed to the reduce phase through a shared sorter.
      try (KeyValueSorter<?, ?> sorter =
          createSorter(job, job.getMapOutputKeyClass(), job.getMapOutputValueClass())) {
        runMap(job, sorter);
        runReduce(job, sorter);
      }
    } else {
      // Map-only job: no sorter is passed.
      runMap(job, null);
    }
    committer.commitJob(job);
    committed = true;
  } finally {
    if (!committed) {
      try {
        committer.abortJob(job, State.FAILED);
      } catch (IOException e) {
        LOG.error(MessageFormat.format("error occurred while aborting job: {0} ({1})",
            job.getJobID(), job.getJobName()), e);
      }
    }
  }
}
From source file:com.baynote.kafka.hadoop.KafkaJobBuilderTest.java
License:Apache License
@Test public void testConfigureWholeJob() throws Exception { // base configuration builder.setZkConnect("localhost:2181"); builder.addQueueInput("queue_name", "group_name", MockMapper.class); builder.setTextFileOutputFormat("/a/hdfs/path"); // extended configuration builder.setJobName("job_name"); builder.setMapOutputKeyClass(Text.class); builder.setMapOutputValueClass(BytesWritable.class); builder.setReducerClass(MockReducer.class); builder.setTaskMemorySettings("-Xmx2048m"); builder.setNumReduceTasks(100);// ww w.j a v a2 s .c o m builder.setParitioner(MockPartitioner.class); builder.setKafkaFetchSizeBytes(1024); Job job = builder.configureJob(conf); assertEquals("job_name", job.getJobName()); assertEquals(Text.class, job.getMapOutputKeyClass()); assertEquals(BytesWritable.class, job.getMapOutputValueClass()); assertEquals(MockReducer.class, job.getReducerClass()); assertEquals(MockMapper.class, job.getMapperClass()); assertEquals("-Xmx2048m", job.getConfiguration().get("mapred.child.java.opts")); assertEquals(100, job.getNumReduceTasks()); assertEquals(MockPartitioner.class, job.getPartitionerClass()); assertEquals(1024, KafkaInputFormat.getKafkaFetchSizeBytes(job.getConfiguration())); assertEquals(TextOutputFormat.class, job.getOutputFormatClass()); assertEquals(KafkaInputFormat.class, job.getInputFormatClass()); assertEquals("file:/a/hdfs/path", TextOutputFormat.getOutputPath(job).toString()); builder.setJobName(null); builder.setSequenceFileOutputFormat(); builder.setUseLazyOutput(); builder.addQueueInput("queue_name_2", "group_name_2", MockMapper.class); job = builder.configureJob(conf); assertEquals(LazyOutputFormat.class, job.getOutputFormatClass()); assertEquals(MultipleKafkaInputFormat.class, job.getInputFormatClass()); assertEquals(DelegatingMapper.class, job.getMapperClass()); assertEquals(BytesWritable.class, job.getOutputKeyClass()); assertEquals(BytesWritable.class, job.getOutputValueClass()); 
assertNotNull(SequenceFileOutputFormat.getOutputPath(job)); assertNotNull(job.getJobName()); // use s3 builder.useS3("my_aws_key", "s3cr3t", "my-bucket"); builder.setTextFileOutputFormat("/a/hdfs/path"); job = builder.configureJob(conf); assertEquals("my_aws_key", job.getConfiguration().get("fs.s3n.awsAccessKeyId")); assertEquals("s3cr3t", job.getConfiguration().get("fs.s3n.awsSecretAccessKey")); assertEquals("my_aws_key", job.getConfiguration().get("fs.s3.awsAccessKeyId")); assertEquals("s3cr3t", job.getConfiguration().get("fs.s3.awsSecretAccessKey")); }
From source file:com.cloudera.castagna.logparser.Utils.java
License:Apache License
/**
 * Logs, at debug level, a one-line summary of the job's pipeline (input
 * format, mapper, map output types, reducer and reduce task count, output
 * types, output format), followed by the first input path and the output path.
 *
 * @param job the job to describe
 * @param log the logger that receives the debug messages
 * @throws ClassNotFoundException if one of the job's configured classes
 *     cannot be loaded
 */
public static void log(Job job, Logger log) throws ClassNotFoundException {
  log.debug("{} -> {} ({}, {}) -> {}#{} ({}, {}) -> {}",
      new Object[] {
          job.getInputFormatClass().getSimpleName(),
          job.getMapperClass().getSimpleName(),
          job.getMapOutputKeyClass().getSimpleName(),
          job.getMapOutputValueClass().getSimpleName(),
          job.getReducerClass().getSimpleName(),
          job.getNumReduceTasks(),
          job.getOutputKeyClass().getSimpleName(),
          job.getOutputValueClass().getSimpleName(),
          job.getOutputFormatClass().getSimpleName() });

  Path[] inputs = FileInputFormat.getInputPaths(job);
  Path output = FileOutputFormat.getOutputPath(job);
  // A job without configured input paths must not make a logging helper
  // throw; log null in that case instead of indexing an empty array.
  Path firstInput = (inputs != null && inputs.length > 0) ? inputs[0] : null;
  log.debug("input: {}", firstInput);
  log.debug("output: {}", output);
}
From source file:com.cloudera.spark.bulkload.TotalOrderPartitioner.java
License:Apache License
/** * Read in the partition file and build indexing data structures. * If the keytype is {@link BinaryComparable} and * <tt>total.order.partitioner.natural.order</tt> is not false, a trie * of the first <tt>total.order.partitioner.max.trie.depth</tt>(2) + 1 bytes * will be built. Otherwise, keys will be located using a binary search of * the partition keyset using the {@link RawComparator} * defined for this job. The input file must be sorted with the same * comparator and contain {@link Job#getNumReduceTasks()} - 1 keys. */// w w w. j a v a 2 s . co m @SuppressWarnings("unchecked") // keytype from conf not static public void setConf(Configuration conf) { try { this.conf = conf; String parts = getPartitionFile(conf); final Path partFile = new Path(parts); final FileSystem fs = (DEFAULT_PATH.equals(parts)) ? FileSystem.getLocal(conf) // assume in DistributedCache : partFile.getFileSystem(conf); Job job = new Job(conf); Class<K> keyClass = (Class<K>) job.getMapOutputKeyClass(); K[] splitPoints = readPartitions(fs, partFile, keyClass, conf); if (splitPoints.length != job.getNumReduceTasks() - 1) { throw new IOException("Wrong number of partitions in keyset"); } RawComparator<K> comparator = (RawComparator<K>) job.getSortComparator(); for (int i = 0; i < splitPoints.length - 1; ++i) { if (comparator.compare(splitPoints[i], splitPoints[i + 1]) >= 0) { throw new IOException("Split points are out of order"); } } boolean natOrder = conf.getBoolean(NATURAL_ORDER, true); if (natOrder && BinaryComparable.class.isAssignableFrom(keyClass)) { partitions = buildTrie((BinaryComparable[]) splitPoints, 0, splitPoints.length, new byte[0], // Now that blocks of identical splitless trie nodes are // represented reentrantly, and we develop a leaf for any trie // node with only one split point, the only reason for a depth // limit is to refute stack overflow or bloat in the pathological // case where the split points are long and mostly look like bytes // iii...iixii...iii . 
Therefore, we make the default depth // limit large but not huge. conf.getInt(MAX_TRIE_DEPTH, 200)); } else { partitions = new BinarySearchNode(splitPoints, comparator); } } catch (IOException e) { throw new IllegalArgumentException("Can't read partitions file", e); } }
From source file:com.example.Driver.java
License:Open Source License
public int run(String[] args) throws Exception { Configuration conf = new Configuration(); Job job = Job.getInstance(conf, "Your job name"); job.setJarByClass(Driver.class); logger.info("job " + job.getJobName() + " [" + job.getJar() + "] started with the following arguments: " + Arrays.toString(args)); if (args.length < 2) { logger.warn("to run this jar are necessary at 2 parameters \"" + job.getJar() + " input_files output_directory"); return 1; }// ww w.j a v a 2s . com job.setMapperClass(WordcountMapper.class); logger.info("mapper class is " + job.getMapperClass()); //job.setMapOutputKeyClass(Text.class); //job.setMapOutputValueClass(IntWritable.class); logger.info("mapper output key class is " + job.getMapOutputKeyClass()); logger.info("mapper output value class is " + job.getMapOutputValueClass()); job.setReducerClass(WordcountReducer.class); logger.info("reducer class is " + job.getReducerClass()); job.setCombinerClass(WordcountReducer.class); logger.info("combiner class is " + job.getCombinerClass()); //When you are not runnign any Reducer //OR job.setNumReduceTasks(0); // logger.info("number of reduce task is " + job.getNumReduceTasks()); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); logger.info("output key class is " + job.getOutputKeyClass()); logger.info("output value class is " + job.getOutputValueClass()); job.setInputFormatClass(TextInputFormat.class); logger.info("input format class is " + job.getInputFormatClass()); job.setOutputFormatClass(TextOutputFormat.class); logger.info("output format class is " + job.getOutputFormatClass()); Path filePath = new Path(args[0]); logger.info("input path " + filePath); FileInputFormat.setInputPaths(job, filePath); Path outputPath = new Path(args[1]); logger.info("output path " + outputPath); FileOutputFormat.setOutputPath(job, outputPath); job.waitForCompletion(true); return 0; }
From source file:com.moz.fiji.mapreduce.framework.MapReduceJobBuilder.java
License:Apache License
/**
 * Copies any Avro reader or writer schemas declared by the mapper into the job
 * configuration.
 *
 * <p>Reader schemas for the map input key and value are applied when the
 * mapper supplies them. Writer schemas for the map output key and value are
 * applied when supplied, but only after checking that the job's map output
 * key class is an {@code AvroKey} (respectively that the map output value
 * class is an {@code AvroValue}).</p>
 *
 * @param job The job to configure.
 * @param mapper The Fiji mapper the job is configured to run.
 * @throws IOException If the Avro schemas cannot be configured.
 */
protected void configureAvro(Job job, FijiMapper<?, ?, ?, ?> mapper) throws IOException {
  // Reader schemas for the map input records, if the mapper declares any.
  final Schema keyReaderSchema = AvroMapReduce.getAvroKeyReaderSchema(mapper);
  if (keyReaderSchema != null) {
    LOG.info("Setting reader schema for the map input key to: " + keyReaderSchema);
    AvroJob.setInputKeySchema(job, keyReaderSchema);
  }

  final Schema valueReaderSchema = AvroMapReduce.getAvroValueReaderSchema(mapper);
  if (valueReaderSchema != null) {
    LOG.info("Setting reader schema for the map input value to: " + valueReaderSchema);
    AvroJob.setInputValueSchema(job, valueReaderSchema);
  }

  // Writer schemas for the map output records, if the mapper declares any.
  final Schema keyWriterSchema = AvroMapReduce.getAvroKeyWriterSchema(mapper);
  if (keyWriterSchema != null) {
    final boolean keyIsAvro = AvroKey.class.isAssignableFrom(job.getMapOutputKeyClass());
    if (!keyIsAvro) {
      throw new JobConfigurationException(
          mapper.getClass().getName() + ".getAvroKeyWriterSchema() returned a non-null Schema"
              + " but the output key class was not AvroKey.");
    }
    LOG.info("Setting avro serialization for map output key schema: " + keyWriterSchema);
    AvroJob.setMapOutputKeySchema(job, keyWriterSchema);
  }

  final Schema valueWriterSchema = AvroMapReduce.getAvroValueWriterSchema(mapper);
  if (valueWriterSchema != null) {
    final boolean valueIsAvro = AvroValue.class.isAssignableFrom(job.getMapOutputValueClass());
    if (!valueIsAvro) {
      throw new JobConfigurationException(
          mapper.getClass().getName() + ".getAvroValueWriterSchema() returned a non-null Schema"
              + " but the output value class was not AvroValue.");
    }
    LOG.info("Setting avro serialization for map output value schema: " + valueWriterSchema);
    AvroJob.setMapOutputValueSchema(job, valueWriterSchema);
  }
}
From source file:com.moz.fiji.mapreduce.framework.MapReduceJobBuilder.java
License:Apache License
/** * Configures the MapReduce reducer for the job. * * @param job The Hadoop MR job.//from w w w . j a v a2 s . c o m * @throws IOException If there is an error. */ protected void configureReducer(Job job) throws IOException { final FijiReducer<?, ?, ?, ?> reducer = getReducer(); if (null == reducer) { LOG.info("No reducer provided. This will be a map-only job"); job.setNumReduceTasks(0); // Set the job output key/value classes based on what the map output key/value classes were // since this a map-only job. job.setOutputKeyClass(job.getMapOutputKeyClass()); Schema mapOutputKeySchema = AvroJob.getMapOutputKeySchema(job.getConfiguration()); if (null != mapOutputKeySchema) { AvroJob.setOutputKeySchema(job, mapOutputKeySchema); } job.setOutputValueClass(job.getMapOutputValueClass()); Schema mapOutputValueSchema = AvroJob.getMapOutputValueSchema(job.getConfiguration()); if (null != mapOutputValueSchema) { AvroJob.setOutputValueSchema(job, mapOutputValueSchema); } return; } if (reducer instanceof Configurable) { ((Configurable) reducer).setConf(job.getConfiguration()); } job.setReducerClass(reducer.getClass()); // Set output key class. Class<?> outputKeyClass = reducer.getOutputKeyClass(); job.setOutputKeyClass(outputKeyClass); Schema outputKeyWriterSchema = AvroMapReduce.getAvroKeyWriterSchema(reducer); if (AvroKey.class.isAssignableFrom(outputKeyClass)) { if (null == outputKeyWriterSchema) { throw new JobConfigurationException("Using AvroKey output, but a writer schema was not provided. " + "Did you forget to implement AvroKeyWriter in your FijiReducer?"); } AvroJob.setOutputKeySchema(job, outputKeyWriterSchema); } else if (null != outputKeyWriterSchema) { throw new JobConfigurationException( reducer.getClass().getName() + ".getAvroKeyWriterSchema() returned a non-null Schema" + " but the output key class was not AvroKey."); } // Set output value class. 
Class<?> outputValueClass = reducer.getOutputValueClass(); job.setOutputValueClass(outputValueClass); Schema outputValueWriterSchema = AvroMapReduce.getAvroValueWriterSchema(reducer); if (AvroValue.class.isAssignableFrom(outputValueClass)) { if (null == outputValueWriterSchema) { throw new JobConfigurationException("Using AvroValue output, but a writer schema was not provided. " + "Did you forget to implement AvroValueWriter in your FijiReducer?"); } AvroJob.setOutputValueSchema(job, outputValueWriterSchema); } else if (null != outputValueWriterSchema) { throw new JobConfigurationException( reducer.getClass().getName() + ".getAvroValueWriterSchema() returned a non-null Schema" + " but the output value class was not AvroValue."); } }
From source file:com.savy3.nonequijoin.MapOutputSampler.java
License:Apache License
/**
 * Driver for the InputSampler MapReduce job.
 *
 * <p>Runs the given job's mapper over the sample input file with
 * {@code SampleKeyReducer} as the reducer, then replaces the sample input's
 * parent directory with the single reducer output file
 * ({@code part-r-00000}), staging it through {@code /_tmp}.
 *
 * @param job the job whose mapper, map output types and reduce task count are
 *     used to configure the sampling job
 * @param sampleInputPath the sequence file holding the sampled input
 * @throws IOException if the sampling job fails or its output cannot be moved
 *     into place
 * @throws ClassNotFoundException if a class configured on {@code job} cannot
 *     be loaded
 * @throws InterruptedException if waiting for the sampling job is interrupted
 */
public static void runMap(Job job, Path sampleInputPath)
    throws IOException, IllegalStateException, ClassNotFoundException, InterruptedException {
  LOG.info("Running a MapReduce Job on Sample Input File" + sampleInputPath.toString());

  Configuration conf = new Configuration();
  conf.setBoolean("mapreduce.job.ubertask.enable", true);
  conf.set("numSamples", "" + (job.getNumReduceTasks() - 1));

  Job sampleJob = new Job(conf);
  sampleJob.setMapperClass(job.getMapperClass());
  sampleJob.setReducerClass(SampleKeyReducer.class);
  sampleJob.setJarByClass(job.getMapperClass());
  sampleJob.setMapOutputKeyClass(job.getMapOutputKeyClass());
  sampleJob.setMapOutputValueClass(job.getMapOutputValueClass());
  sampleJob.setOutputKeyClass(job.getMapOutputKeyClass());
  sampleJob.setOutputValueClass(NullWritable.class);
  sampleJob.setInputFormatClass(SequenceFileInputFormat.class);
  sampleJob.setOutputFormatClass(SequenceFileOutputFormat.class);
  SequenceFileInputFormat.addInputPath(sampleJob, sampleInputPath);

  FileSystem fs = FileSystem.get(conf);
  Path sampleDir = sampleInputPath.getParent();
  Path out = new Path(sampleDir, "mapOut");
  fs.delete(out, true);
  SequenceFileOutputFormat.setOutputPath(sampleJob, out);

  // Stop here on failure: the steps below recursively delete the sample
  // directory and must not run without a valid reducer output to replace it.
  if (!sampleJob.waitForCompletion(true)) {
    throw new IOException("Sample MapReduce job failed for input " + sampleInputPath);
  }
  LOG.info("Sample MapReduce Job Output File" + out.toString());

  Path partFile = new Path(out, "part-r-00000");
  Path tmpFile = new Path("/_tmp");
  fs.delete(tmpFile, true);
  // Stage the reducer output outside the sample directory before that
  // directory is removed; abort if the staging move did not happen.
  if (!fs.rename(partFile, tmpFile)) {
    throw new IOException("Could not move " + partFile + " to " + tmpFile);
  }
  fs.delete(sampleDir, true);
  if (!fs.rename(tmpFile, sampleDir)) {
    throw new IOException("Could not move " + tmpFile + " to " + sampleDir);
  }
  LOG.info("Sample partitioning file cpied to location " + sampleInputPath.getParent().toString());
}
From source file:com.splicemachine.mrio.api.SpliceTableMapReduceUtil.java
License:Apache License
/**
 * Registers the jars containing the fixed library dependencies (ZooKeeper,
 * protobuf, Guava) and the classes configured on the job, by delegating to
 * the {@code addDependencyJars(Configuration, Class...)} overload with the
 * job's configuration.
 *
 * @param job the job whose configured classes should be shipped
 * @throws IOException if the jars cannot be added, or wrapping the
 *     {@link ClassNotFoundException} raised when a configured job class
 *     cannot be loaded
 */
public static void addDependencyJars(Job job) throws IOException {
  try {
    addDependencyJars(
        job.getConfiguration(),
        // Fixed library dependencies.
        org.apache.zookeeper.ZooKeeper.class,
        com.google.protobuf.Message.class,
        com.google.common.collect.ImmutableSet.class,
        // Classes configured on the job itself.
        job.getMapOutputKeyClass(),
        job.getMapOutputValueClass(),
        job.getInputFormatClass(),
        job.getOutputKeyClass(),
        job.getOutputValueClass(),
        job.getOutputFormatClass(),
        job.getPartitionerClass(),
        job.getCombinerClass());
  } catch (ClassNotFoundException missingClass) {
    throw new IOException(missingClass);
  }
}