Example usage for org.apache.hadoop.mapreduce Job getMapOutputKeyClass

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce Job getMapOutputKeyClass.

Prototype

public Class<?> getMapOutputKeyClass() 

Document

Get the key class for the map output data.
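
For orientation, here is a minimal sketch of the getter next to its setter. If setMapOutputKeyClass is never called, Hadoop falls back to the job's output key class, which is why frameworks read the value back through this getter when wiring up the shuffle. The Text/IntWritable types below are just placeholders.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;

public class MapOutputKeyClassSketch {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "sketch");

        // Declare the intermediate (map output) key/value types explicitly.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Read them back, e.g. to configure a serializer, sorter or partitioner.
        Class<?> mapKeyClass = job.getMapOutputKeyClass();     // Text.class
        Class<?> mapValueClass = job.getMapOutputValueClass(); // IntWritable.class
        System.out.println(mapKeyClass + " / " + mapValueClass);
    }
}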

Usage

From source file:ComRoughSetApproInputSampler.java

License:Apache License

/**
 * Write a partition file for the given job, using the Sampler provided.
 * Queries the sampler for a sample keyset, sorts by the output key
 * comparator, selects the keys for each rank, and writes to the destination
 * returned from {@link TotalOrderPartitioner#getPartitionFile}.
 */
@SuppressWarnings("unchecked") // getInputFormat, getOutputKeyComparator
public static <K, V> void writePartitionFile(Job job, Sampler<K, V> sampler)
        throws IOException, ClassNotFoundException, InterruptedException {
    Configuration conf = job.getConfiguration();
    final InputFormat inf = ReflectionUtils.newInstance(job.getInputFormatClass(), conf);
    int numPartitions = job.getNumReduceTasks();
    K[] samples = (K[]) sampler.getSample(inf, job);
    LOG.info("Using " + samples.length + " samples");
    RawComparator<K> comparator = (RawComparator<K>) job.getSortComparator();
    Arrays.sort(samples, comparator);
    Path dst = new Path(TotalOrderPartitioner.getPartitionFile(conf));
    FileSystem fs = dst.getFileSystem(conf);
    if (fs.exists(dst)) {
        fs.delete(dst, false);
    }
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, dst, job.getMapOutputKeyClass(),
            NullWritable.class);
    NullWritable nullValue = NullWritable.get();
    float stepSize = samples.length / (float) numPartitions;
    int last = -1;
    for (int i = 1; i < numPartitions; ++i) {
        int k = Math.round(stepSize * i);
        while (last >= k && comparator.compare(samples[last], samples[k]) == 0) {
            ++k;
        }
        writer.append(samples[k], nullValue);
        last = k;
    }
    writer.close();
}
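
The example above mirrors Hadoop's stock InputSampler. As a hedged sketch of how such a partition file is typically consumed (the path and sampler parameters here are illustrative): the file is keyed by job.getMapOutputKeyClass(), and TotalOrderPartitioner later reads it back to route map output keys to reducers.

    // Assumed context: 'job' already has its mapper, reducer and map output
    // key class (e.g. Text) configured.
    Configuration conf = job.getConfiguration();
    TotalOrderPartitioner.setPartitionFile(conf, new Path("/tmp/partitions.seq"));

    // Sample roughly 10% of the input, capped at 10000 keys, to pick split points.
    InputSampler.Sampler<Text, Text> sampler =
            new InputSampler.RandomSampler<>(0.1, 10000);
    InputSampler.writePartitionFile(job, sampler);

    // The partitioner compares incoming map output keys (instances of the class
    // returned by job.getMapOutputKeyClass()) against the sampled split points.
    job.setPartitionerClass(TotalOrderPartitioner.class);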

From source file:com.asakusafw.runtime.mapreduce.simple.SimpleJobRunner.java

License:Apache License

private void runJob(Job job) throws ClassNotFoundException, IOException, InterruptedException {
    assert job.getJobID() != null;
    TaskID taskId = newMapTaskId(job.getJobID(), 0);
    Configuration conf = job.getConfiguration();
    OutputFormat<?, ?> output = ReflectionUtils.newInstance(job.getOutputFormatClass(), conf);
    OutputCommitter committer = output
            .getOutputCommitter(newTaskAttemptContext(conf, newTaskAttemptId(taskId, 0)));
    boolean succeed = false;
    committer.setupJob(job);
    try {
        if (job.getNumReduceTasks() == 0) {
            runMap(job, null);
        } else {
            try (KeyValueSorter<?, ?> sorter = createSorter(job, job.getMapOutputKeyClass(),
                    job.getMapOutputValueClass())) {
                runMap(job, sorter);
                runReduce(job, sorter);
            }
        }
        committer.commitJob(job);
        succeed = true;
    } finally {
        if (!succeed) {
            try {
                committer.abortJob(job, State.FAILED);
            } catch (IOException e) {
                LOG.error(MessageFormat.format("error occurred while aborting job: {0} ({1})", job.getJobID(),
                        job.getJobName()), e);
            }
        }
    }
}

From source file:com.baynote.kafka.hadoop.KafkaJobBuilderTest.java

License:Apache License

@Test
public void testConfigureWholeJob() throws Exception {
    // base configuration
    builder.setZkConnect("localhost:2181");
    builder.addQueueInput("queue_name", "group_name", MockMapper.class);
    builder.setTextFileOutputFormat("/a/hdfs/path");

    // extended configuration
    builder.setJobName("job_name");
    builder.setMapOutputKeyClass(Text.class);
    builder.setMapOutputValueClass(BytesWritable.class);
    builder.setReducerClass(MockReducer.class);
    builder.setTaskMemorySettings("-Xmx2048m");
    builder.setNumReduceTasks(100);
    builder.setParitioner(MockPartitioner.class);
    builder.setKafkaFetchSizeBytes(1024);

    Job job = builder.configureJob(conf);

    assertEquals("job_name", job.getJobName());
    assertEquals(Text.class, job.getMapOutputKeyClass());
    assertEquals(BytesWritable.class, job.getMapOutputValueClass());
    assertEquals(MockReducer.class, job.getReducerClass());
    assertEquals(MockMapper.class, job.getMapperClass());
    assertEquals("-Xmx2048m", job.getConfiguration().get("mapred.child.java.opts"));
    assertEquals(100, job.getNumReduceTasks());
    assertEquals(MockPartitioner.class, job.getPartitionerClass());
    assertEquals(1024, KafkaInputFormat.getKafkaFetchSizeBytes(job.getConfiguration()));
    assertEquals(TextOutputFormat.class, job.getOutputFormatClass());
    assertEquals(KafkaInputFormat.class, job.getInputFormatClass());
    assertEquals("file:/a/hdfs/path", TextOutputFormat.getOutputPath(job).toString());

    builder.setJobName(null);
    builder.setSequenceFileOutputFormat();
    builder.setUseLazyOutput();
    builder.addQueueInput("queue_name_2", "group_name_2", MockMapper.class);

    job = builder.configureJob(conf);
    assertEquals(LazyOutputFormat.class, job.getOutputFormatClass());
    assertEquals(MultipleKafkaInputFormat.class, job.getInputFormatClass());
    assertEquals(DelegatingMapper.class, job.getMapperClass());
    assertEquals(BytesWritable.class, job.getOutputKeyClass());
    assertEquals(BytesWritable.class, job.getOutputValueClass());
    assertNotNull(SequenceFileOutputFormat.getOutputPath(job));
    assertNotNull(job.getJobName());

    // use s3
    builder.useS3("my_aws_key", "s3cr3t", "my-bucket");
    builder.setTextFileOutputFormat("/a/hdfs/path");
    job = builder.configureJob(conf);

    assertEquals("my_aws_key", job.getConfiguration().get("fs.s3n.awsAccessKeyId"));
    assertEquals("s3cr3t", job.getConfiguration().get("fs.s3n.awsSecretAccessKey"));
    assertEquals("my_aws_key", job.getConfiguration().get("fs.s3.awsAccessKeyId"));
    assertEquals("s3cr3t", job.getConfiguration().get("fs.s3.awsSecretAccessKey"));
}

From source file:com.cloudera.castagna.logparser.Utils.java

License:Apache License

public static void log(Job job, Logger log) throws ClassNotFoundException {
    log.debug("{} -> {} ({}, {}) -> {}#{} ({}, {}) -> {}",
            new Object[] { job.getInputFormatClass().getSimpleName(), job.getMapperClass().getSimpleName(),
                    job.getMapOutputKeyClass().getSimpleName(), job.getMapOutputValueClass().getSimpleName(),
                    job.getReducerClass().getSimpleName(), job.getNumReduceTasks(),
                    job.getOutputKeyClass().getSimpleName(), job.getOutputValueClass().getSimpleName(),
                    job.getOutputFormatClass().getSimpleName() });
    Path[] inputs = FileInputFormat.getInputPaths(job);
    Path output = FileOutputFormat.getOutputPath(job);
    log.debug("input: {}", inputs[0]);
    log.debug("output: {}", output);
}

From source file:com.cloudera.spark.bulkload.TotalOrderPartitioner.java

License:Apache License

/**
   * Read in the partition file and build indexing data structures.
   * If the keytype is {@link BinaryComparable} and
   * <tt>total.order.partitioner.natural.order</tt> is not false, a trie
   * of the first <tt>total.order.partitioner.max.trie.depth</tt>(2) + 1 bytes
   * will be built. Otherwise, keys will be located using a binary search of
   * the partition keyset using the {@link RawComparator}
   * defined for this job. The input file must be sorted with the same
   * comparator and contain {@link Job#getNumReduceTasks()} - 1 keys.
   */
  @SuppressWarnings("unchecked") // keytype from conf not static
  public void setConf(Configuration conf) {
      try {
          this.conf = conf;
          String parts = getPartitionFile(conf);
          final Path partFile = new Path(parts);
          final FileSystem fs = (DEFAULT_PATH.equals(parts)) ? FileSystem.getLocal(conf) // assume in DistributedCache
                  : partFile.getFileSystem(conf);

          Job job = new Job(conf);
          Class<K> keyClass = (Class<K>) job.getMapOutputKeyClass();
          K[] splitPoints = readPartitions(fs, partFile, keyClass, conf);
          if (splitPoints.length != job.getNumReduceTasks() - 1) {
              throw new IOException("Wrong number of partitions in keyset");
          }
          RawComparator<K> comparator = (RawComparator<K>) job.getSortComparator();
          for (int i = 0; i < splitPoints.length - 1; ++i) {
              if (comparator.compare(splitPoints[i], splitPoints[i + 1]) >= 0) {
                  throw new IOException("Split points are out of order");
              }
          }
          boolean natOrder = conf.getBoolean(NATURAL_ORDER, true);
          if (natOrder && BinaryComparable.class.isAssignableFrom(keyClass)) {
              partitions = buildTrie((BinaryComparable[]) splitPoints, 0, splitPoints.length, new byte[0],
                      // Now that blocks of identical splitless trie nodes are 
                      // represented reentrantly, and we develop a leaf for any trie
                      // node with only one split point, the only reason for a depth
                      // limit is to refute stack overflow or bloat in the pathological
                      // case where the split points are long and mostly look like bytes 
                      // iii...iixii...iii   .  Therefore, we make the default depth
                      // limit large but not huge.
                      conf.getInt(MAX_TRIE_DEPTH, 200));
          } else {
              partitions = new BinarySearchNode(splitPoints, comparator);
          }
      } catch (IOException e) {
          throw new IllegalArgumentException("Can't read partitions file", e);
      }
  }

From source file:com.example.Driver.java

License:Open Source License

public int run(String[] args) throws Exception {

    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "Your job name");

    job.setJarByClass(Driver.class);

    logger.info("job " + job.getJobName() + " [" + job.getJar() + "] started with the following arguments: "
            + Arrays.toString(args));

    if (args.length < 2) {
        logger.warn("to run this jar are necessary at 2 parameters \"" + job.getJar()
                + " input_files output_directory");
        return 1;
    }

    job.setMapperClass(WordcountMapper.class);
    logger.info("mapper class is " + job.getMapperClass());

    //job.setMapOutputKeyClass(Text.class);
    //job.setMapOutputValueClass(IntWritable.class);
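    // Not set above, so the getters below fall back to the job's output
    // key/value classes (Hadoop's default when no map output classes are set).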
    logger.info("mapper output key class is " + job.getMapOutputKeyClass());
    logger.info("mapper output value class is " + job.getMapOutputValueClass());

    job.setReducerClass(WordcountReducer.class);
    logger.info("reducer class is " + job.getReducerClass());
    job.setCombinerClass(WordcountReducer.class);
    logger.info("combiner class is " + job.getCombinerClass());
    // When you are not running any Reducer:
    //     job.setNumReduceTasks(0);
    //     logger.info("number of reduce tasks is " + job.getNumReduceTasks());

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    logger.info("output key class is " + job.getOutputKeyClass());
    logger.info("output value class is " + job.getOutputValueClass());

    job.setInputFormatClass(TextInputFormat.class);
    logger.info("input format class is " + job.getInputFormatClass());

    job.setOutputFormatClass(TextOutputFormat.class);
    logger.info("output format class is " + job.getOutputFormatClass());

    Path filePath = new Path(args[0]);
    logger.info("input path " + filePath);
    FileInputFormat.setInputPaths(job, filePath);

    Path outputPath = new Path(args[1]);
    logger.info("output path " + outputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.waitForCompletion(true);
    return 0;
}

From source file:com.moz.fiji.mapreduce.framework.MapReduceJobBuilder.java

License:Apache License

/**
 * Configures the job with any Avro reader or writer schemas specified by the mapper class.
 *
 * <p>If the job's mapper class uses AvroKey as the job's input key class, it should
 * have implemented the AvroKeyReader interface to specify the reader schema for the
 * input key.  Likewise, if it uses AvroValue as the job's input value class, it should
 * have implemented the AvroValueReader interface.</p>
 *
 * <p>If the job's mapper class uses AvroKey as the output key class, it should
 * have implemented the AvroKeyWriter interface to specify the writer schema for the
 * output key.  Likewise, if it uses AvroValue as the output value class, it should have
 * implemented the AvroValueWriter interface.</p>
 *
 * <p>This method makes sure those interfaces were implemented correctly, uses them to
 * fetch the reader/writer schemas as necessary, and sets them in the Job configuration
 * so the Avro input format and serialization framework can access them.</p>
 *
 * @param job The job to configure.
 * @param mapper The Fiji mapper the job is configured to run.
 * @throws IOException If the Avro schemas cannot be configured.
 */
protected void configureAvro(Job job, FijiMapper<?, ?, ?, ?> mapper) throws IOException {
    // If the user has specified particular reader schemas for the records of the input,
    // put it in the job configuration.
    Schema inputKeyReaderSchema = AvroMapReduce.getAvroKeyReaderSchema(mapper);
    if (null != inputKeyReaderSchema) {
        LOG.info("Setting reader schema for the map input key to: " + inputKeyReaderSchema);
        AvroJob.setInputKeySchema(job, inputKeyReaderSchema);
    }
    Schema inputValueReaderSchema = AvroMapReduce.getAvroValueReaderSchema(mapper);
    if (null != inputValueReaderSchema) {
        LOG.info("Setting reader schema for the map input value to: " + inputValueReaderSchema);
        AvroJob.setInputValueSchema(job, inputValueReaderSchema);
    }

    // Set the output writer schemas in the job configuration (if specified).
    Schema outputKeyWriterSchema = AvroMapReduce.getAvroKeyWriterSchema(mapper);
    if (null != outputKeyWriterSchema) {
        if (!AvroKey.class.isAssignableFrom(job.getMapOutputKeyClass())) {
            throw new JobConfigurationException(
                    mapper.getClass().getName() + ".getAvroKeyWriterSchema() returned a non-null Schema"
                            + " but the output key class was not AvroKey.");
        }
        LOG.info("Setting avro serialization for map output key schema: " + outputKeyWriterSchema);
        AvroJob.setMapOutputKeySchema(job, outputKeyWriterSchema);
    }
    Schema outputValueWriterSchema = AvroMapReduce.getAvroValueWriterSchema(mapper);
    if (null != outputValueWriterSchema) {
        if (!AvroValue.class.isAssignableFrom(job.getMapOutputValueClass())) {
            throw new JobConfigurationException(
                    mapper.getClass().getName() + ".getAvroValueWriterSchema() returned a non-null Schema"
                            + " but the output value class was not AvroValue.");
        }
        LOG.info("Setting avro serialization for map output value schema: " + outputValueWriterSchema);
        AvroJob.setMapOutputValueSchema(job, outputValueWriterSchema);
    }
}
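
For reference, a minimal sketch of the configuration this check expects, written against the standard avro-mapred AvroJob API rather than the Fiji builder (the string schema is only a placeholder):

    // Declare AvroKey as the intermediate key class...
    job.setMapOutputKeyClass(AvroKey.class);
    job.setMapOutputValueClass(LongWritable.class);

    // ...and register the writer schema so Avro serialization can handle the
    // shuffle; configureAvro above enforces exactly this pairing.
    Schema keySchema = Schema.create(Schema.Type.STRING);
    AvroJob.setMapOutputKeySchema(job, keySchema);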

From source file:com.moz.fiji.mapreduce.framework.MapReduceJobBuilder.java

License:Apache License

/**
 * Configures the MapReduce reducer for the job.
 *
 * @param job The Hadoop MR job.
 * @throws IOException If there is an error.
 */
protected void configureReducer(Job job) throws IOException {
    final FijiReducer<?, ?, ?, ?> reducer = getReducer();
    if (null == reducer) {
        LOG.info("No reducer provided. This will be a map-only job");
        job.setNumReduceTasks(0);

        // Set the job output key/value classes based on what the map output key/value classes were
        // since this a map-only job.
        job.setOutputKeyClass(job.getMapOutputKeyClass());
        Schema mapOutputKeySchema = AvroJob.getMapOutputKeySchema(job.getConfiguration());
        if (null != mapOutputKeySchema) {
            AvroJob.setOutputKeySchema(job, mapOutputKeySchema);
        }
        job.setOutputValueClass(job.getMapOutputValueClass());
        Schema mapOutputValueSchema = AvroJob.getMapOutputValueSchema(job.getConfiguration());
        if (null != mapOutputValueSchema) {
            AvroJob.setOutputValueSchema(job, mapOutputValueSchema);
        }
        return;
    }
    if (reducer instanceof Configurable) {
        ((Configurable) reducer).setConf(job.getConfiguration());
    }
    job.setReducerClass(reducer.getClass());

    // Set output key class.
    Class<?> outputKeyClass = reducer.getOutputKeyClass();
    job.setOutputKeyClass(outputKeyClass);
    Schema outputKeyWriterSchema = AvroMapReduce.getAvroKeyWriterSchema(reducer);
    if (AvroKey.class.isAssignableFrom(outputKeyClass)) {
        if (null == outputKeyWriterSchema) {
            throw new JobConfigurationException("Using AvroKey output, but a writer schema was not provided. "
                    + "Did you forget to implement AvroKeyWriter in your FijiReducer?");
        }
        AvroJob.setOutputKeySchema(job, outputKeyWriterSchema);
    } else if (null != outputKeyWriterSchema) {
        throw new JobConfigurationException(
                reducer.getClass().getName() + ".getAvroKeyWriterSchema() returned a non-null Schema"
                        + " but the output key class was not AvroKey.");
    }

    // Set output value class.
    Class<?> outputValueClass = reducer.getOutputValueClass();
    job.setOutputValueClass(outputValueClass);
    Schema outputValueWriterSchema = AvroMapReduce.getAvroValueWriterSchema(reducer);
    if (AvroValue.class.isAssignableFrom(outputValueClass)) {
        if (null == outputValueWriterSchema) {
            throw new JobConfigurationException("Using AvroValue output, but a writer schema was not provided. "
                    + "Did you forget to implement AvroValueWriter in your FijiReducer?");
        }
        AvroJob.setOutputValueSchema(job, outputValueWriterSchema);
    } else if (null != outputValueWriterSchema) {
        throw new JobConfigurationException(
                reducer.getClass().getName() + ".getAvroValueWriterSchema() returned a non-null Schema"
                        + " but the output value class was not AvroValue.");
    }
}

From source file:com.savy3.nonequijoin.MapOutputSampler.java

License:Apache License

/**
 * Driver for InputSampler MapReduce Job
 */
public static void runMap(Job job, Path sampleInputPath)
        throws IOException, IllegalStateException, ClassNotFoundException, InterruptedException {
    LOG.info("Running a MapReduce Job on Sample Input File" + sampleInputPath.toString());

    Configuration conf = new Configuration();
    conf.setBoolean("mapreduce.job.ubertask.enable", true);
    conf.set("numSamples", "" + (job.getNumReduceTasks() - 1));
    Job sampleJob = new Job(conf);
    sampleJob.setMapperClass(job.getMapperClass());
    sampleJob.setReducerClass(SampleKeyReducer.class);
    sampleJob.setJarByClass(job.getMapperClass());
    sampleJob.setMapOutputKeyClass(job.getMapOutputKeyClass());
    sampleJob.setMapOutputValueClass(job.getMapOutputValueClass());
    sampleJob.setOutputKeyClass(job.getMapOutputKeyClass());
    sampleJob.setOutputValueClass(NullWritable.class);
    sampleJob.setInputFormatClass(SequenceFileInputFormat.class);
    sampleJob.setOutputFormatClass(SequenceFileOutputFormat.class);

    SequenceFileInputFormat.addInputPath(sampleJob, sampleInputPath);
    FileSystem fs = FileSystem.get(conf);

    Path out = new Path(sampleInputPath.getParent(), "mapOut");
    fs.delete(out, true);

    SequenceFileOutputFormat.setOutputPath(sampleJob, out);

    sampleJob.waitForCompletion(true);

    LOG.info("Sample MapReduce Job Output File" + out.toString());

    Path partFile = new Path(out, "part-r-00000");
    Path tmpFile = new Path("/_tmp");
    fs.delete(tmpFile, true);
    fs.rename(partFile, tmpFile);
    fs.delete(sampleInputPath.getParent(), true);
    fs.rename(new Path("/_tmp"), sampleInputPath.getParent());

    LOG.info("Sample partitioning file cpied to location " + sampleInputPath.getParent().toString());
}

From source file:com.splicemachine.mrio.api.SpliceTableMapReduceUtil.java

License:Apache License

/**
 * Add the dependency jars as well as jars for any of the configured
 * job classes to the job configuration, so that JobClient will ship them
 * to the cluster and add them to the DistributedCache.
 */
public static void addDependencyJars(Job job) throws IOException {
    try {
        addDependencyJars(job.getConfiguration(), org.apache.zookeeper.ZooKeeper.class,
                com.google.protobuf.Message.class, com.google.common.collect.ImmutableSet.class,
                job.getMapOutputKeyClass(), job.getMapOutputValueClass(), job.getInputFormatClass(),
                job.getOutputKeyClass(), job.getOutputValueClass(), job.getOutputFormatClass(),
                job.getPartitionerClass(), job.getCombinerClass());
    } catch (ClassNotFoundException e) {
        throw new IOException(e);
    }
}
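
A hypothetical call site, sketched under the assumption that the helper tolerates unset classes such as the combiner (as the analogous HBase TableMapReduceUtil does); MyMapper and the output path are placeholders:

    Job job = Job.getInstance(conf, "splice-example");
    job.setJarByClass(MyMapper.class);     // hypothetical mapper class
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    FileOutputFormat.setOutputPath(job, new Path("/tmp/splice-example-out"));

    // Ship ZooKeeper, protobuf, Guava and every class the job is configured
    // with to the cluster via the DistributedCache, then submit.
    SpliceTableMapReduceUtil.addDependencyJars(job);
    job.waitForCompletion(true);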