List of usage examples for org.apache.hadoop.mapred JobConf setMapOutputKeyClass
public void setMapOutputKeyClass(Class<?> theClass)
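Before the project examples, a minimal sketch of the typical call pattern may help: setMapOutputKeyClass (together with setMapOutputValueClass) declares the intermediate key/value types emitted by the mapper whenever they differ from the final output types configured via setOutputKeyClass/setOutputValueClass. The driver below is illustrative only; WordTokenizerMapper and WordSumReducer are hypothetical placeholder classes, not taken from any project cited on this page.

// Minimal configuration sketch (hypothetical classes, not from the examples below).
// WordTokenizerMapper is assumed to emit <Text, IntWritable> from map();
// WordSumReducer is assumed to write <Text, LongWritable> from reduce().
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;

public class WordCountDriver {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(WordCountDriver.class);
        conf.setJobName("word-count");

        conf.setMapperClass(WordTokenizerMapper.class); // hypothetical: emits <Text, IntWritable>
        conf.setReducerClass(WordSumReducer.class);     // hypothetical: writes <Text, LongWritable>

        // The map output value type (IntWritable) differs from the job's final
        // output value type (LongWritable), so the intermediate types must be
        // declared explicitly; otherwise the framework assumes the values set
        // via setOutputKeyClass/setOutputValueClass.
        conf.setMapOutputKeyClass(Text.class);
        conf.setMapOutputValueClass(IntWritable.class);

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(LongWritable.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
    }
}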
From source file:org.apache.ignite.internal.processors.hadoop.GridHadoopV2JobSelfTest.java
License:Apache License
/**
 * Tests that {@link GridHadoopJob} provides wrapped serializer if it's set in configuration.
 *
 * @throws IgniteCheckedException If fails.
 */
public void testCustomSerializationApplying() throws IgniteCheckedException {
    JobConf cfg = new JobConf();

    cfg.setMapOutputKeyClass(IntWritable.class);
    cfg.setMapOutputValueClass(Text.class);

    cfg.set(CommonConfigurationKeys.IO_SERIALIZATIONS_KEY, CustomSerialization.class.getName());

    GridHadoopJob job = new GridHadoopV2Job(new GridHadoopJobId(UUID.randomUUID(), 1), createJobInfo(cfg), log);

    GridHadoopTaskContext taskCtx = job
            .getTaskContext(new GridHadoopTaskInfo(GridHadoopTaskType.MAP, null, 0, 0, null));

    GridHadoopSerialization ser = taskCtx.keySerialization();

    assertEquals(GridHadoopSerializationWrapper.class.getName(), ser.getClass().getName());

    DataInput in = new DataInputStream(new ByteArrayInputStream(new byte[0]));

    assertEquals(TEST_SERIALIZED_VALUE, ser.read(in, null).toString());

    ser = taskCtx.valueSerialization();

    assertEquals(GridHadoopSerializationWrapper.class.getName(), ser.getClass().getName());

    assertEquals(TEST_SERIALIZED_VALUE, ser.read(in, null).toString());
}
From source file:org.apache.ignite.internal.processors.hadoop.HadoopV2JobSelfTest.java
License:Apache License
/**
 * Tests that {@link HadoopJob} provides wrapped serializer if it's set in configuration.
 *
 * @throws IgniteCheckedException If fails.
 */
public void testCustomSerializationApplying() throws IgniteCheckedException {
    JobConf cfg = new JobConf();

    cfg.setMapOutputKeyClass(IntWritable.class);
    cfg.setMapOutputValueClass(Text.class);

    cfg.set(CommonConfigurationKeys.IO_SERIALIZATIONS_KEY, CustomSerialization.class.getName());

    HadoopJob job = new HadoopV2Job(new HadoopJobId(UUID.randomUUID(), 1), createJobInfo(cfg), log);

    HadoopTaskContext taskCtx = job.getTaskContext(new HadoopTaskInfo(HadoopTaskType.MAP, null, 0, 0, null));

    HadoopSerialization ser = taskCtx.keySerialization();

    assertEquals(HadoopSerializationWrapper.class.getName(), ser.getClass().getName());

    DataInput in = new DataInputStream(new ByteArrayInputStream(new byte[0]));

    assertEquals(TEST_SERIALIZED_VALUE, ser.read(in, null).toString());

    ser = taskCtx.valueSerialization();

    assertEquals(HadoopSerializationWrapper.class.getName(), ser.getClass().getName());

    assertEquals(TEST_SERIALIZED_VALUE, ser.read(in, null).toString());
}
From source file:org.apache.ignite.internal.processors.hadoop.impl.HadoopV2JobSelfTest.java
License:Apache License
/**
 * Tests that {@link HadoopJobEx} provides wrapped serializer if it's set in configuration.
 *
 * @throws IgniteCheckedException If fails.
 */
public void testCustomSerializationApplying() throws IgniteCheckedException {
    JobConf cfg = new JobConf();

    cfg.setMapOutputKeyClass(IntWritable.class);
    cfg.setMapOutputValueClass(Text.class);

    cfg.set(CommonConfigurationKeys.IO_SERIALIZATIONS_KEY, CustomSerialization.class.getName());

    HadoopDefaultJobInfo info = createJobInfo(cfg);

    final UUID uuid = UUID.randomUUID();

    HadoopJobId id = new HadoopJobId(uuid, 1);

    HadoopJobEx job = info.createJob(HadoopV2Job.class, id, log, null, new HadoopHelperImpl());

    HadoopTaskContext taskCtx = job.getTaskContext(new HadoopTaskInfo(HadoopTaskType.MAP, null, 0, 0, null));

    HadoopSerialization ser = taskCtx.keySerialization();

    assertEquals(HadoopSerializationWrapper.class.getName(), ser.getClass().getName());

    DataInput in = new DataInputStream(new ByteArrayInputStream(new byte[0]));

    assertEquals(TEST_SERIALIZED_VALUE, ser.read(in, null).toString());

    ser = taskCtx.valueSerialization();

    assertEquals(HadoopSerializationWrapper.class.getName(), ser.getClass().getName());

    assertEquals(TEST_SERIALIZED_VALUE, ser.read(in, null).toString());
}
From source file:org.apache.mahout.math.hadoop.MatrixMultiplicationJob.java
License:Apache License
public static Configuration createMatrixMultiplyJobConf(Configuration initialConf, Path aPath, Path bPath,
        Path outPath, int outCardinality) {
    JobConf conf = new JobConf(initialConf, MatrixMultiplicationJob.class);

    conf.setInputFormat(CompositeInputFormat.class);
    conf.set("mapred.join.expr",
            CompositeInputFormat.compose("inner", SequenceFileInputFormat.class, aPath, bPath));
    conf.setInt(OUT_CARD, outCardinality);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(conf, outPath);
    conf.setMapperClass(MatrixMultiplyMapper.class);
    conf.setCombinerClass(MatrixMultiplicationReducer.class);
    conf.setReducerClass(MatrixMultiplicationReducer.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(VectorWritable.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(VectorWritable.class);

    return conf;
}
From source file:org.apache.nutch.indexer.DeleteDuplicates.java
License:Apache License
public void dedup(Path[] indexDirs) throws IOException {
    if (LOG.isInfoEnabled()) {
        LOG.info("Dedup: starting");
    }

    Path outDir1 = new Path("dedup-urls-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf job = new NutchJob(getConf());

    for (int i = 0; i < indexDirs.length; i++) {
        if (LOG.isInfoEnabled()) {
            LOG.info("Dedup: adding indexes in: " + indexDirs[i]);
        }
        job.addInputPath(indexDirs[i]);
    }
    job.setJobName("dedup 1: urls by time");

    job.setInputFormat(InputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IndexDoc.class);

    job.setReducerClass(UrlsReducer.class);
    job.setOutputPath(outDir1);

    job.setOutputKeyClass(MD5Hash.class);
    job.setOutputValueClass(IndexDoc.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    JobClient.runJob(job);

    Path outDir2 = new Path("dedup-hash-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
    job = new NutchJob(getConf());
    job.setJobName("dedup 2: content by hash");

    job.addInputPath(outDir1);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapOutputKeyClass(MD5Hash.class);
    job.setMapOutputValueClass(IndexDoc.class);
    job.setPartitionerClass(HashPartitioner.class);
    job.setSpeculativeExecution(false);

    job.setReducerClass(HashReducer.class);
    job.setOutputPath(outDir2);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IndexDoc.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    JobClient.runJob(job);

    // remove outDir1 - no longer needed
    fs.delete(outDir1);

    job = new NutchJob(getConf());
    job.setJobName("dedup 3: delete from index(es)");

    job.addInputPath(outDir2);
    job.setInputFormat(SequenceFileInputFormat.class);
    //job.setInputKeyClass(Text.class);
    //job.setInputValueClass(IndexDoc.class);
    job.setInt("io.file.buffer.size", 4096);

    job.setMapperClass(DeleteDuplicates.class);
    job.setReducerClass(DeleteDuplicates.class);

    job.setOutputFormat(DeleteDuplicates.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    JobClient.runJob(job);

    fs.delete(outDir2);

    if (LOG.isInfoEnabled()) {
        LOG.info("Dedup: done");
    }
}
From source file:org.apache.nutch.indexer.field.AnchorFields.java
License:Apache License
/**
 * Runs the Extractor job. Get outlinks to be converted while ignoring empty
 * and null anchors.
 *
 * @param webGraphDb The WebGraphDb to pull from.
 * @param output The extractor output.
 *
 * @throws IOException If an error occurs while running the extractor.
 */
private void runExtractor(Path webGraphDb, Path output) throws IOException {
    JobConf extractor = new NutchJob(getConf());
    extractor.setJobName("AnchorFields Extractor");
    FileInputFormat.addInputPath(extractor, new Path(webGraphDb, WebGraph.OUTLINK_DIR));
    FileInputFormat.addInputPath(extractor, new Path(webGraphDb, WebGraph.NODE_DIR));
    FileOutputFormat.setOutputPath(extractor, output);
    extractor.setInputFormat(SequenceFileInputFormat.class);
    extractor.setMapperClass(Extractor.class);
    extractor.setReducerClass(Extractor.class);
    extractor.setMapOutputKeyClass(Text.class);
    extractor.setMapOutputValueClass(ObjectWritable.class);
    extractor.setOutputKeyClass(Text.class);
    extractor.setOutputValueClass(LinkDatum.class);
    extractor.setOutputFormat(SequenceFileOutputFormat.class);

    LOG.info("Starting extractor job");
    try {
        JobClient.runJob(extractor);
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }
    LOG.info("Finished extractor job.");
}
From source file:org.apache.nutch.indexer.field.AnchorFields.java
License:Apache License
/**
 * Runs the collector job. Aggregates extracted inlinks, sorts and converts
 * the highest scoring into FieldWritable objects. Only inlinks for which
 * basic fields exist will be collected to avoid orphan fields.
 *
 * @param basicFields The BasicFields which must be present to collect anchors
 * to avoid orphan fields.
 * @param links The outlinks path.
 * @param output The collector output.
 *
 * @throws IOException If an error occurs while running the collector.
 */
private void runCollector(Path basicFields, Path links, Path output) throws IOException {
    JobConf collector = new NutchJob(getConf());
    collector.setJobName("AnchorFields Collector");
    FileInputFormat.addInputPath(collector, links);
    FileInputFormat.addInputPath(collector, basicFields);
    FileOutputFormat.setOutputPath(collector, output);
    collector.setInputFormat(SequenceFileInputFormat.class);
    collector.setMapOutputKeyClass(Text.class);
    collector.setMapOutputValueClass(ObjectWritable.class);
    collector.setMapperClass(Collector.class);
    collector.setReducerClass(Collector.class);
    collector.setOutputKeyClass(Text.class);
    collector.setOutputValueClass(FieldWritable.class);
    collector.setOutputFormat(SequenceFileOutputFormat.class);

    LOG.info("Starting collector job");
    try {
        JobClient.runJob(collector);
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }
    LOG.info("Finished collector job.");
}
From source file:org.apache.nutch.indexer.field.BasicFields.java
License:Apache License
/**
 * Runs the Extractor job. Extracts basic fields from segments.
 *
 * @param nodeDb The node database
 * @param segment A single segment to process.
 * @param outputDir The extractor output.
 *
 * @throws IOException If an error occurs while processing the segment.
 */
private void runExtractor(Path nodeDb, Path segment, Path outputDir) throws IOException {
    LOG.info("BasicFields: starting extractor");
    JobConf job = new NutchJob(getConf());
    job.setJobName("BasicFields " + outputDir);

    LOG.info("BasicFields: extractor adding segment: " + segment);
    FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.FETCH_DIR_NAME));
    FileInputFormat.addInputPath(job, new Path(segment, ParseData.DIR_NAME));
    FileInputFormat.addInputPath(job, new Path(segment, ParseText.DIR_NAME));
    FileInputFormat.addInputPath(job, nodeDb);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(Extractor.class);
    job.setReducerClass(Extractor.class);
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(ObjectWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(FieldsWritable.class);

    JobClient.runJob(job);
    if (LOG.isInfoEnabled()) {
        LOG.info("BasicFields: finished extractor");
    }
}
From source file:org.apache.nutch.indexer.field.BasicFields.java
License:Apache License
/**
 * Runs the Flipper job. Flipper is the first of a two part job to implement
 * redirect logic.
 *
 * @param basicFields The basic fields temporary output.
 * @param nodeDb The node database.
 * @param outputDir The flipper output.
 *
 * @throws IOException If an error occurs while processing.
 */
private void runFlipper(Path basicFields, Path nodeDb, Path outputDir) throws IOException {
    LOG.info("BasicFields: starting flipper");
    JobConf job = new NutchJob(getConf());
    job.setJobName("BasicFields " + outputDir);
    FileInputFormat.addInputPath(job, nodeDb);
    FileInputFormat.addInputPath(job, basicFields);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(Flipper.class);
    job.setReducerClass(Flipper.class);
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(ObjectWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LinkDatum.class);
    JobClient.runJob(job);
    if (LOG.isInfoEnabled()) {
        LOG.info("BasicFields: finished flipper");
    }
}
From source file:org.apache.nutch.indexer.field.BasicFields.java
License:Apache License
/**
 * Runs the Scorer job. Scorer is the second of a two part job to implement
 * redirect logic.
 *
 * @param basicFields The basic fields temporary output.
 * @param links The temporary output holding urls and any redirects.
 * @param outputDir The scorer output.
 *
 * @throws IOException If an error occurs while processing.
 */
private void runScorer(Path basicFields, Path links, Path outputDir) throws IOException {
    LOG.info("BasicFields: starting scorer");
    JobConf job = new NutchJob(getConf());
    job.setJobName("BasicFields " + outputDir);
    FileInputFormat.addInputPath(job, links);
    FileInputFormat.addInputPath(job, basicFields);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(Scorer.class);
    job.setReducerClass(Scorer.class);
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(ObjectWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(FieldsWritable.class);
    JobClient.runJob(job);
    if (LOG.isInfoEnabled()) {
        LOG.info("BasicFields: finished scorer");
    }
}