Example usage for org.apache.hadoop.mapred JobConf setMapOutputKeyClass

List of usage examples for org.apache.hadoop.mapred JobConf setMapOutputKeyClass

Introduction

On this page you can find example usages of org.apache.hadoop.mapred JobConf setMapOutputKeyClass.

Prototype

public void setMapOutputKeyClass(Class<?> theClass) 

Source Link

Document

Set the key class for the map output data.
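
The map output key class defaults to the job's output key class (the one set via setOutputKeyClass), so an explicit call is only needed when the intermediate map output type differs from the final reduce output type. Below is a minimal, hypothetical sketch (MyJob, MyMapper and MyReducer are placeholder classes, not taken from the examples on this page) illustrating that case:

    JobConf conf = new JobConf(MyJob.class);

    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(MyReducer.class);

    // Intermediate (map output) types: Text keys with LongWritable values.
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(LongWritable.class);

    // Final (reduce output) types; without the two calls above, these would
    // also be assumed for the map output.
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    JobClient.runJob(conf);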

Usage

From source file:org.apache.ignite.internal.processors.hadoop.GridHadoopV2JobSelfTest.java

License:Apache License

/**
 * Tests that {@link GridHadoopJob} provides wrapped serializer if it's set in configuration.
 *
 * @throws IgniteCheckedException If fails.
 */
public void testCustomSerializationApplying() throws IgniteCheckedException {
    JobConf cfg = new JobConf();

    cfg.setMapOutputKeyClass(IntWritable.class);
    cfg.setMapOutputValueClass(Text.class);
    cfg.set(CommonConfigurationKeys.IO_SERIALIZATIONS_KEY, CustomSerialization.class.getName());

    GridHadoopJob job = new GridHadoopV2Job(new GridHadoopJobId(UUID.randomUUID(), 1), createJobInfo(cfg), log);

    GridHadoopTaskContext taskCtx = job
            .getTaskContext(new GridHadoopTaskInfo(GridHadoopTaskType.MAP, null, 0, 0, null));

    GridHadoopSerialization ser = taskCtx.keySerialization();

    assertEquals(GridHadoopSerializationWrapper.class.getName(), ser.getClass().getName());

    DataInput in = new DataInputStream(new ByteArrayInputStream(new byte[0]));

    assertEquals(TEST_SERIALIZED_VALUE, ser.read(in, null).toString());

    ser = taskCtx.valueSerialization();

    assertEquals(GridHadoopSerializationWrapper.class.getName(), ser.getClass().getName());

    assertEquals(TEST_SERIALIZED_VALUE, ser.read(in, null).toString());
}

From source file:org.apache.ignite.internal.processors.hadoop.HadoopV2JobSelfTest.java

License:Apache License

/**
 * Tests that {@link HadoopJob} provides wrapped serializer if it's set in configuration.
 *
 * @throws IgniteCheckedException If fails.
 */
public void testCustomSerializationApplying() throws IgniteCheckedException {
    JobConf cfg = new JobConf();

    cfg.setMapOutputKeyClass(IntWritable.class);
    cfg.setMapOutputValueClass(Text.class);
    cfg.set(CommonConfigurationKeys.IO_SERIALIZATIONS_KEY, CustomSerialization.class.getName());

    HadoopJob job = new HadoopV2Job(new HadoopJobId(UUID.randomUUID(), 1), createJobInfo(cfg), log);

    HadoopTaskContext taskCtx = job.getTaskContext(new HadoopTaskInfo(HadoopTaskType.MAP, null, 0, 0, null));

    HadoopSerialization ser = taskCtx.keySerialization();

    assertEquals(HadoopSerializationWrapper.class.getName(), ser.getClass().getName());

    DataInput in = new DataInputStream(new ByteArrayInputStream(new byte[0]));

    assertEquals(TEST_SERIALIZED_VALUE, ser.read(in, null).toString());

    ser = taskCtx.valueSerialization();

    assertEquals(HadoopSerializationWrapper.class.getName(), ser.getClass().getName());

    assertEquals(TEST_SERIALIZED_VALUE, ser.read(in, null).toString());
}

From source file:org.apache.ignite.internal.processors.hadoop.impl.HadoopV2JobSelfTest.java

License:Apache License

/**
 * Tests that {@link HadoopJobEx} provides wrapped serializer if it's set in configuration.
 *
 * @throws IgniteCheckedException If fails.
 */
public void testCustomSerializationApplying() throws IgniteCheckedException {
    JobConf cfg = new JobConf();

    cfg.setMapOutputKeyClass(IntWritable.class);
    cfg.setMapOutputValueClass(Text.class);
    cfg.set(CommonConfigurationKeys.IO_SERIALIZATIONS_KEY, CustomSerialization.class.getName());

    HadoopDefaultJobInfo info = createJobInfo(cfg);

    final UUID uuid = UUID.randomUUID();

    HadoopJobId id = new HadoopJobId(uuid, 1);

    HadoopJobEx job = info.createJob(HadoopV2Job.class, id, log, null, new HadoopHelperImpl());

    HadoopTaskContext taskCtx = job.getTaskContext(new HadoopTaskInfo(HadoopTaskType.MAP, null, 0, 0, null));

    HadoopSerialization ser = taskCtx.keySerialization();

    assertEquals(HadoopSerializationWrapper.class.getName(), ser.getClass().getName());

    DataInput in = new DataInputStream(new ByteArrayInputStream(new byte[0]));

    assertEquals(TEST_SERIALIZED_VALUE, ser.read(in, null).toString());

    ser = taskCtx.valueSerialization();

    assertEquals(HadoopSerializationWrapper.class.getName(), ser.getClass().getName());

    assertEquals(TEST_SERIALIZED_VALUE, ser.read(in, null).toString());
}

From source file:org.apache.mahout.math.hadoop.MatrixMultiplicationJob.java

License:Apache License

public static Configuration createMatrixMultiplyJobConf(Configuration initialConf, Path aPath, Path bPath,
        Path outPath, int outCardinality) {
    JobConf conf = new JobConf(initialConf, MatrixMultiplicationJob.class);
    conf.setInputFormat(CompositeInputFormat.class);
    conf.set("mapred.join.expr",
            CompositeInputFormat.compose("inner", SequenceFileInputFormat.class, aPath, bPath));
    conf.setInt(OUT_CARD, outCardinality);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(conf, outPath);
    conf.setMapperClass(MatrixMultiplyMapper.class);
    conf.setCombinerClass(MatrixMultiplicationReducer.class);
    conf.setReducerClass(MatrixMultiplicationReducer.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(VectorWritable.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(VectorWritable.class);
    return conf;
}

From source file:org.apache.nutch.indexer.DeleteDuplicates.java

License:Apache License

public void dedup(Path[] indexDirs) throws IOException {

    if (LOG.isInfoEnabled()) {
        LOG.info("Dedup: starting");
    }

    Path outDir1 = new Path("dedup-urls-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf job = new NutchJob(getConf());

    for (int i = 0; i < indexDirs.length; i++) {
        if (LOG.isInfoEnabled()) {
            LOG.info("Dedup: adding indexes in: " + indexDirs[i]);
        }
        job.addInputPath(indexDirs[i]);
    }
    job.setJobName("dedup 1: urls by time");

    job.setInputFormat(InputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IndexDoc.class);

    job.setReducerClass(UrlsReducer.class);
    job.setOutputPath(outDir1);

    job.setOutputKeyClass(MD5Hash.class);
    job.setOutputValueClass(IndexDoc.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    JobClient.runJob(job);

    Path outDir2 = new Path("dedup-hash-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
    job = new NutchJob(getConf());
    job.setJobName("dedup 2: content by hash");

    job.addInputPath(outDir1);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapOutputKeyClass(MD5Hash.class);
    job.setMapOutputValueClass(IndexDoc.class);
    job.setPartitionerClass(HashPartitioner.class);
    job.setSpeculativeExecution(false);

    job.setReducerClass(HashReducer.class);
    job.setOutputPath(outDir2);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IndexDoc.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    JobClient.runJob(job);

    // remove outDir1 - no longer needed
    fs.delete(outDir1);

    job = new NutchJob(getConf());
    job.setJobName("dedup 3: delete from index(es)");

    job.addInputPath(outDir2);
    job.setInputFormat(SequenceFileInputFormat.class);
    //job.setInputKeyClass(Text.class);
    //job.setInputValueClass(IndexDoc.class);

    job.setInt("io.file.buffer.size", 4096);
    job.setMapperClass(DeleteDuplicates.class);
    job.setReducerClass(DeleteDuplicates.class);

    job.setOutputFormat(DeleteDuplicates.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    JobClient.runJob(job);

    fs.delete(outDir2);

    if (LOG.isInfoEnabled()) {
        LOG.info("Dedup: done");
    }
}

From source file:org.apache.nutch.indexer.field.AnchorFields.java

License:Apache License

/**
 * Runs the Extractor job.  Get outlinks to be converted while ignoring empty
 * and null anchors.
 * 
 * @param webGraphDb The WebGraphDb to pull from.
 * @param output The extractor output.
 * 
 * @throws IOException If an error occurs while running the extractor.
 */
private void runExtractor(Path webGraphDb, Path output) throws IOException {

    JobConf extractor = new NutchJob(getConf());
    extractor.setJobName("AnchorFields Extractor");
    FileInputFormat.addInputPath(extractor, new Path(webGraphDb, WebGraph.OUTLINK_DIR));
    FileInputFormat.addInputPath(extractor, new Path(webGraphDb, WebGraph.NODE_DIR));
    FileOutputFormat.setOutputPath(extractor, output);
    extractor.setInputFormat(SequenceFileInputFormat.class);
    extractor.setMapperClass(Extractor.class);
    extractor.setReducerClass(Extractor.class);
    extractor.setMapOutputKeyClass(Text.class);
    extractor.setMapOutputValueClass(ObjectWritable.class);
    extractor.setOutputKeyClass(Text.class);
    extractor.setOutputValueClass(LinkDatum.class);
    extractor.setOutputFormat(SequenceFileOutputFormat.class);

    LOG.info("Starting extractor job");
    try {
        JobClient.runJob(extractor);
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }
    LOG.info("Finished extractor job.");
}

From source file:org.apache.nutch.indexer.field.AnchorFields.java

License:Apache License

/**
 * Runs the collector job.  Aggregates extracted inlinks, sorts and converts
 * the highest scoring into FieldWritable objects.  Only inlinks for which
 * basic fields exist will be collected to avoid orphan fields.
 *
 * @param basicFields The BasicFields which must be present to collect anchors
 * to avoid orphan fields.
 * @param links The outlinks path.
 * @param output The collector output.
 * 
 * @throws IOException If an error occurs while running the collector.
 */
private void runCollector(Path basicFields, Path links, Path output) throws IOException {

    JobConf collector = new NutchJob(getConf());
    collector.setJobName("AnchorFields Collector");
    FileInputFormat.addInputPath(collector, links);
    FileInputFormat.addInputPath(collector, basicFields);
    FileOutputFormat.setOutputPath(collector, output);
    collector.setInputFormat(SequenceFileInputFormat.class);
    collector.setMapOutputKeyClass(Text.class);
    collector.setMapOutputValueClass(ObjectWritable.class);
    collector.setMapperClass(Collector.class);
    collector.setReducerClass(Collector.class);
    collector.setOutputKeyClass(Text.class);
    collector.setOutputValueClass(FieldWritable.class);
    collector.setOutputFormat(SequenceFileOutputFormat.class);

    LOG.info("Starting collector job");
    try {
        JobClient.runJob(collector);
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }
    LOG.info("Finished collector job.");
}

From source file:org.apache.nutch.indexer.field.BasicFields.java

License:Apache License

/**
 * Runs the Extractor job. Extracts basic fields from segments.
 *
 * @param nodeDb The node database
 * @param segment A single segment to process.
 * @param outputDir The extractor output.
 * 
 * @throws IOException If an error occurs while processing the segment.
 */
private void runExtractor(Path nodeDb, Path segment, Path outputDir) throws IOException {

    LOG.info("BasicFields: starting extractor");
    JobConf job = new NutchJob(getConf());
    job.setJobName("BasicFields " + outputDir);

    LOG.info("BasicFields: extractor adding segment: " + segment);
    FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.FETCH_DIR_NAME));
    FileInputFormat.addInputPath(job, new Path(segment, ParseData.DIR_NAME));
    FileInputFormat.addInputPath(job, new Path(segment, ParseText.DIR_NAME));
    FileInputFormat.addInputPath(job, nodeDb);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(Extractor.class);
    job.setReducerClass(Extractor.class);
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(ObjectWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(FieldsWritable.class);

    JobClient.runJob(job);
    if (LOG.isInfoEnabled()) {
        LOG.info("BasicFields: finished extractor");
    }
}

From source file:org.apache.nutch.indexer.field.BasicFields.java

License:Apache License

/**
 * Runs the Flipper job. Flipper is the first of a two part job to implement
 * redirect logic.
 * 
 * @param basicFields The basic fields temporary output.
 * @param nodeDb The node database.
 * @param outputDir The flipper output.
 * 
 * @throws IOException If an error occurs while processing.
 */
private void runFlipper(Path basicFields, Path nodeDb, Path outputDir) throws IOException {

    LOG.info("BasicFields: starting flipper");
    JobConf job = new NutchJob(getConf());
    job.setJobName("BasicFields " + outputDir);
    FileInputFormat.addInputPath(job, nodeDb);
    FileInputFormat.addInputPath(job, basicFields);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(Flipper.class);
    job.setReducerClass(Flipper.class);
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(ObjectWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LinkDatum.class);

    JobClient.runJob(job);
    if (LOG.isInfoEnabled()) {
        LOG.info("BasicFields: finished flipper");
    }
}

From source file:org.apache.nutch.indexer.field.BasicFields.java

License:Apache License

/**
 * Runs the Scorer job. Scorer is the second of a two part job to implement
 * redirect logic.
 * 
 * @param basicFields The basic fields temporary output.
 * @param links The temporary output holding urls and any redirects.
 * @param outputDir The scorer output.
 * 
 * @throws IOException If an error occurs while processing.
 */
private void runScorer(Path basicFields, Path links, Path outputDir) throws IOException {

    LOG.info("BasicFields: starting scorer");
    JobConf job = new NutchJob(getConf());
    job.setJobName("BasicFields " + outputDir);
    FileInputFormat.addInputPath(job, links);
    FileInputFormat.addInputPath(job, basicFields);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(Scorer.class);
    job.setReducerClass(Scorer.class);
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(ObjectWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(FieldsWritable.class);

    JobClient.runJob(job);
    if (LOG.isInfoEnabled()) {
        LOG.info("BasicFields: finished scorer");
    }
}