Example usage for org.apache.hadoop.mapreduce Job setMapOutputKeyClass

Introduction

On this page you can find example usages for org.apache.hadoop.mapreduce Job setMapOutputKeyClass.

Prototype

public void setMapOutputKeyClass(Class<?> theClass) throws IllegalStateException 

Document

Set the key class for the map output data.
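
Before the per-project examples below, here is a minimal sketch of the call in context. MyMapper and MySumReducer are hypothetical placeholder classes, not taken from any project on this page; everything else is the stock Hadoop API. The call matters whenever the map output key type differs from the final output key type: if setMapOutputKeyClass is never called, Hadoop assumes the map output key class is the same as the one given to setOutputKeyClass.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class SetMapOutputKeyClassDemo {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "setMapOutputKeyClass demo");
        job.setJarByClass(SetMapOutputKeyClassDemo.class);

        // Hypothetical classes: assume Mapper<LongWritable, Text, Text, IntWritable>
        // and Reducer<Text, IntWritable, Text, Text>.
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MySumReducer.class);

        // The map output types differ from the final output types, so they must
        // be declared explicitly; if they are not set, the framework assumes the
        // map output classes equal the job output classes, and the shuffle fails
        // with a type mismatch at runtime.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}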

Usage

From source file:com.twitter.elephanttwin.lucene.indexing.HadoopSplitIndexingJob.java

License:Apache License

/**
 * Override and extend this in implementations to add custom settings to the Job and Conf to
 * create lucene-based indexes that will point you at what splits contain values you are looking for.
 * You are on your own for filtering the splits appropriately before creating an MR job, but
 * check out how this was done over MapFile-based indexes in
 * com.twitter.elephanttwin.indexing.AbstractIndexesFileInputFormat
 */
@Override
protected void setupJob(Job job) {
    Configuration conf = job.getConfiguration();
    conf.set("mapred.child.java.opts", "-Xmx4g");
    List<String> fieldNames = Lists.newArrayList();
    for (IndexedField field : getIndexedFields()) {
        fieldNames.add(field.getFieldName());
        conf.set(HadoopSplitIndexingMapper.FIELD_VALUE_EXTRACTOR_PREFIX + field.getFieldName(),
                getExtractorClassName(field.getFieldName()));
    }
    conf.setStrings(HadoopSplitIndexingMapper.FIELDS_TO_INDEX_KEY, fieldNames.toArray(new String[] {}));
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(HadoopSplitDocument.class);
    job.setOutputFormatClass(NullOutputFormat.class);
    job.setInputFormatClass(getInputFormatClass());

    job.setMapperClass(HadoopSplitIndexingMapper.class);
    job.setReducerClass(HadoopSplitIndexingReducer.class);
}
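
A side note on this example: "mapred.child.java.opts" is the old MRv1 property name for the task JVM options. On Hadoop 2.x and later the per-task settings are mapreduce.map.java.opts and mapreduce.reduce.java.opts, though the old key is still accepted as a deprecated fallback.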

From source file:com.twitter.elephanttwin.lucene.indexing.TextIndexingJob.java

License:Apache License

@Override
protected void setupJob(Job job) {
    job.setInputFormatClass(TextInputFormat.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputFormatClass(NullOutputFormat.class);
    job.setMapperClass(TextIndexingMapper.class);
    job.setReducerClass(TextIndexingReducer.class);
}

From source file:com.twitter.elephanttwin.retrieval.ScanUsingIndexJob.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    params = new IndexConfig();

    LOG.info(" - input: " + Joiner.on(" ").join(params.getInput()));
    LOG.info(" - output: " + IndexConfig.output.get());

    Configuration conf = getConf();

    Path outputDir = new Path(params.getOutput());
    FileSystem fs = outputDir.getFileSystem(conf);
    fs.delete(outputDir, true);

    int totalInputFiles = 0;
    List<FileStatus> stats = Lists.newArrayList();
    for (String s : params.getInput()) {
        Path spath = new Path(IndexConfig.index.get() + s);
        HdfsUtils.addInputPathRecursively(stats, fs, spath, hiddenDirectoryFilter, indexDataFilter);
    }

    totalInputFiles = stats.size();
    LOG.info(totalInputFiles + " total index files to be scanned");

    conf.set(IndexScanMapper.searchColumnName, params.getColumnName());
    Job job = new Job(new Configuration(conf));
    job.setJarByClass(getClass());
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, new Path(params.getOutput()));

    for (FileStatus file : stats)
        FileInputFormat.addInputPath(job, file.getPath());

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    job.setNumReduceTasks(1);

    job.setMapperClass(IndexScanMapper.class);
    job.setCombinerClass(LongSumReducer.class);
    job.setReducerClass(LongSumReducer.class);

    job.setJobName("ScanUsingIndexJob:" + IndexConfig.input.get());
    BlockIndexedFileInputFormat.setSearchOptions(job, params.getinputFormat(), params.getValueClass(),
            params.getIndex(), (String) null);
    job.waitForCompletion(true);
    return 0;
}
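
Note the type pairing here: LongSumReducer serves as both the combiner and the reducer. That only works because the map output classes (Text, LongWritable) match the combiner's input and output types exactly; a combiner must consume and emit the map output key/value classes, since it may run zero or more times between map and reduce.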

From source file:com.veera.secondarysort.demo2.SsJob.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    Job job = new Job(conf, "secondary sort");

    job.setJarByClass(SsJob.class);
    job.setPartitionerClass(NaturalKeyPartitioner.class);
    job.setGroupingComparatorClass(NaturalKeyGroupingComparator.class);
    job.setSortComparatorClass(CompositeKeyComparator.class);

    job.setMapOutputKeyClass(StockKey.class);
    job.setMapOutputValueClass(DoubleWritable.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapperClass(SsMapper.class);
    job.setReducerClass(SsReducer.class);

    job.waitForCompletion(true);

    return 0;
}
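
This secondary-sort driver is the canonical case for setMapOutputKeyClass: the composite StockKey exists only between map and reduce, where the three custom classes partition, group, and sort by it, while the reducer emits plain Text. The map output classes therefore have to be declared separately from the final output classes.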

From source file:com.wibidata.wibidota.DotaGatherExampleValues.java

License:Apache License

public final int run(final String[] args) throws Exception {
    Job job = new Job(super.getConf(), "Dota Gatherer Example Values");
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setMapperClass(EnumGatherMap.class);
    job.setCombinerClass(AppendText.class);
    job.setReducerClass(EnumGatherReducer.class);

    job.setJarByClass(DotaGatherExampleValues.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    if (job.waitForCompletion(true)) {
        return 0;
    } else {
        return -1;
    }
}

From source file:com.wibidata.wibidota.dotaloader.DotaValuesCounter.java

License:Apache License

public final int run(final String[] args) throws Exception {
    Job job = new Job(super.getConf(), "Dota Value Counter");
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);

    job.setMapperClass(Map.class);
    job.setCombinerClass(Add.class);
    job.setReducerClass(Add.class);

    job.setJarByClass(DotaValuesCounter.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    if (job.waitForCompletion(true)) {
        return 0;
    } else {
        return -1;
    }
}

From source file:com.wibidata.wibidota.DotaMaxAccountId.java

License:Apache License

public final int run(final String[] args) throws Exception {
    Job job = new Job(super.getConf(), "Dota Max Builder");
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);

    job.setMapperClass(DotaMaxAccountId.Map.class);
    job.setCombinerClass(DotaMaxAccountId.TakeMax.class);
    job.setReducerClass(DotaMaxAccountId.TakeMax.class);

    job.setJarByClass(DotaMaxAccountId.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    if (job.waitForCompletion(true)) {
        return 0;
    } else {
        return -1;
    }
}

From source file:com.wipro.ats.bdre.dq.DQDriver.java

License:Apache License

@Override
public int run(String[] arg) throws Exception {
    String processId = arg[0];
    String sPath = arg[1];
    String destDir = arg[2];

    Properties props = new GetProperties().getProperties(processId, "dq");
    LOGGER.debug("props=" + props);
    Configuration conf = getConf();

    conf.set("dq.process.id", processId);
    Job job = Job.getInstance(conf);
    job.setJobName("Data Quality " + processId);
    job.setJarByClass(DQDriver.class);
    job.setMapperClass(DQMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    //Reducer is not required
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);
    Path inputFilePath = new Path(sPath);
    FileInputFormat.addInputPath(job, inputFilePath);
    FileOutputFormat.setOutputPath(job, removeIfExistAndSetOutputPath(conf, destDir));
    MultipleOutputs.addNamedOutput(job, DQConstants.GOOD_RECORDS_FILE, TextOutputFormat.class, Text.class,
            NullWritable.class);
    MultipleOutputs.addNamedOutput(job, DQConstants.BAD_RECORDS_FILE, TextOutputFormat.class, Text.class,
            NullWritable.class);
    MultipleOutputs.addNamedOutput(job, DQConstants.FILE_REPORT_FILE, TextOutputFormat.class, Text.class,
            NullWritable.class);

    if (!job.waitForCompletion(true)) {
        return 1;
    }

    Path outputDir = new Path(destDir);
    FileSystem srcFs = outputDir.getFileSystem(getConf());
    FileSystem destFs = outputDir.getFileSystem(getConf());

    //Valid Records
    Path goodFilesSrcDir = new Path(destDir + "/" + DQConstants.INTERMEDIATE_GOOD_RECORD_OUTPUT_DIR);
    //The input file and the quality-filtered file should have the same name (but different paths)
    Path goodDestFile = new Path(destDir + "/" + inputFilePath.getName());
    if (srcFs.exists(goodFilesSrcDir)) {
        FileUtil.copyMerge(srcFs, goodFilesSrcDir, destFs, goodDestFile, true, conf, "");
    }
    // Invalid Records
    Path badFilesSrcDir = new Path(destDir + "/" + DQConstants.INTERMEDIATE_BAD_RECORD_OUTPUT_DIR);
    Path badDestFile = new Path(destDir + "/" + DQConstants.BAD_RECORDS_FILE);
    if (srcFs.exists(badFilesSrcDir)) {
        FileUtil.copyMerge(srcFs, badFilesSrcDir, destFs, badDestFile, true, conf, "");
    }

    // Preparing report aggregation job
    Job fileReportAggregationJob = Job.getInstance(conf);
    fileReportAggregationJob.setJobName("File Report Computing " + processId);
    fileReportAggregationJob.setJarByClass(DQMain.class);

    fileReportAggregationJob.setMapperClass(DQFileReportMapper.class);
    fileReportAggregationJob.setMapOutputKeyClass(Text.class);
    fileReportAggregationJob.setMapOutputValueClass(IntWritable.class);

    fileReportAggregationJob.setReducerClass(DQFileReportReducer.class);
    fileReportAggregationJob.setOutputKeyClass(Text.class);
    fileReportAggregationJob.setOutputValueClass(Text.class);

    fileReportAggregationJob.setNumReduceTasks(1);

    Path fileReportDir = new Path(destDir + "/" + DQConstants.INTERMEDIATE_REPORT_OUTPUT_DIR);
    Path fileReportOutputDir = new Path(destDir + "/" + DQConstants.AGGREGATED_REPORT_PLACEHOLDER_FOLDER);

    FileInputFormat.addInputPath(fileReportAggregationJob, fileReportDir);
    FileOutputFormat.setOutputPath(fileReportAggregationJob, fileReportOutputDir);

    if (!fileReportAggregationJob.waitForCompletion(true)) {
        return 1;
    }

    // Merge the per-file report records produced by the aggregation job
    Path reportsSrcDir = new Path(destDir + "/" + DQConstants.AGGREGATED_REPORT_PLACEHOLDER_FOLDER);
    Path reportsDestFile = new Path(destDir + "/" + DQConstants.FILE_REPORT_FILE);
    FileUtil.copyMerge(srcFs, reportsSrcDir, destFs, reportsDestFile, true, conf, "");

    Path reportDestFile = new Path(outputDir.toString() + "/" + DQConstants.FILE_REPORT_FILE);
    //Read the report file from HDFS and report the percentage
    DQStats dqStats = getQualityStats(getConf(), reportDestFile);
    LOGGER.info("Percentage of good records :" + dqStats.getGoodPercent());
    props = new GetProperties().getProperties(processId, "dq");
    String strThreshold = props.getProperty("min.pass.threshold.percent");
    float threshold = Float.parseFloat(strThreshold);
    dqStats.setThreshold(threshold);
    //Update the result in metadata
    logResult(dqStats, processId, 0L);
    if (dqStats.getGoodPercent() < threshold) {
        LOGGER.error("DQ check did not pass");
        throw new DQValidationException(dqStats);
    }
    LOGGER.info(dqStats);
    FileChecksum hdfsChecksum = destFs.getFileChecksum(goodDestFile);
    String fileHash = hdfsChecksum == null ? "0" : hdfsChecksum.toString();
    //Return file info as Oozie params
    RegisterFileInfo registerFileInfo = new RegisterFileInfo();
    registerFileInfo.setBatchId(null);
    registerFileInfo.setCreationTs(new Timestamp(new Date().getTime()));
    registerFileInfo.setFileHash(fileHash);
    registerFileInfo.setFileSize(destFs.getFileStatus(goodDestFile).getLen());
    registerFileInfo.setPath(goodDestFile.toString());
    registerFileInfo.setSubProcessId(Integer.parseInt(processId));
    OozieUtil oozieUtil = new OozieUtil();
    oozieUtil.persistBeanData(registerFileInfo, false);

    return 0;
}
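
Worth noting about the first job above: with setNumReduceTasks(0) it is map-only, so mapper output goes straight to the OutputFormat and is governed by setOutputKeyClass/setOutputValueClass. The setMapOutputKeyClass call is harmless there, but it only takes effect when a shuffle actually runs, as in the report-aggregation job that follows.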

From source file:com.xiaomi.linden.hadoop.indexing.job.LindenJob.java

License:Apache License

@Override
public int run(String[] strings) throws Exception {
    Configuration conf = getConf();
    String dir = conf.get(LindenJobConfig.INPUT_DIR, null);
    logger.info("input dir:" + dir);
    Path inputPath = new Path(StringUtils.unEscapeString(dir));
    Path outputPath = new Path(conf.get(LindenJobConfig.OUTPUT_DIR));
    String indexPath = conf.get(LindenJobConfig.INDEX_PATH);

    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(outputPath)) {
        fs.delete(outputPath, true);
    }
    if (fs.exists(new Path(indexPath))) {
        fs.delete(new Path(indexPath), true);
    }

    int numShards = conf.getInt(LindenJobConfig.NUM_SHARDS, 1);
    Shard[] shards = createShards(indexPath, numShards);

    Shard.setIndexShards(conf, shards);

    //empty the trash
    (new Trash(conf)).expunge();

    Job job = Job.getInstance(conf, "linden-hadoop-indexing");
    job.setJarByClass(LindenJob.class);
    job.setMapperClass(LindenMapper.class);
    job.setCombinerClass(LindenCombiner.class);
    job.setReducerClass(LindenReducer.class);
    job.setMapOutputKeyClass(Shard.class);
    job.setMapOutputValueClass(IntermediateForm.class);
    job.setOutputKeyClass(Shard.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(IndexUpdateOutputFormat.class);
    job.setReduceSpeculativeExecution(false);
    job.setNumReduceTasks(numShards);

    String lindenSchemaFile = conf.get(LindenJobConfig.SCHEMA_FILE_URL);
    if (lindenSchemaFile == null) {
        throw new IOException("no schema file is found");
    }
    logger.info("Adding schema file: " + lindenSchemaFile);
    job.addCacheFile(new URI(lindenSchemaFile + "#lindenSchema"));
    String lindenPropertiesFile = conf.get(LindenJobConfig.LINDEN_PROPERTIES_FILE_URL);
    if (lindenPropertiesFile == null) {
        throw new IOException("no linden properties file is found");
    }
    logger.info("Adding linden properties file: " + lindenPropertiesFile);
    job.addCacheFile(new URI(lindenPropertiesFile + "#lindenProperties"));

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    Path[] inputs = FileInputFormat.getInputPaths(job);
    StringBuilder buffer = new StringBuilder(inputs[0].toString());
    for (int i = 1; i < inputs.length; i++) {
        buffer.append(",");
        buffer.append(inputs[i].toString());
    }
    logger.info("mapreduce.input.dir = " + buffer.toString());
    logger.info("mapreduce.output.dir = " + FileOutputFormat.getOutputPath(job).toString());
    logger.info("mapreduce.job.num.reduce.tasks = " + job.getNumReduceTasks());
    logger.info(shards.length + " shards = " + conf.get(LindenJobConfig.INDEX_SHARDS));
    logger.info("mapreduce.input.format.class = " + job.getInputFormatClass());
    logger.info("mapreduce.output.format.class = " + job.getOutputFormatClass());
    logger.info("mapreduce.cluster.temp.dir = " + conf.get(MRJobConfig.TEMP_DIR));

    job.waitForCompletion(true);
    if (!job.isSuccessful()) {
        throw new RuntimeException("Job failed");
    }
    return 0;
}
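
This example shows that the map output key need not be a stock Hadoop type: Shard is a custom key and IntermediateForm a custom value. With the default Writable serialization, any class passed to setMapOutputKeyClass must implement WritableComparable, because map output keys are serialized and sorted during the shuffle; map output values only need to implement Writable.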

From source file:com.xyz.reccommendation.driver.SKU2SKUCount.java

License:Apache License

public static void main(String[] args) throws Exception {

    final Configuration conf = new Configuration();

    String envt = null;

    if (args.length > 0) {
        envt = args[0];
    } else {
        envt = "dev";
    }

    Properties prop = new Properties();

    try {
        // load a properties file from class path, inside static method
        prop.load(SKU2SKUCount.class.getClassLoader().getResourceAsStream("config-" + envt + ".properties"));

    } catch (IOException ex) {
        ex.printStackTrace();
        System.exit(1);
    }

    MongoConfigUtil.setOutputURI(conf, "mongodb://" + prop.getProperty("mongodb.ip") + "/"
            + prop.getProperty("mongodb.dbname") + ".out_stat_custom");

    log.debug("MongoDB URL : mongodb://" + prop.getProperty("mongodb.ip") + "/"
            + prop.getProperty("mongodb.dbname") + ".out_stat_custom");

    log.debug("Conf: " + conf);

    MongoConfigUtil.setCreateInputSplits(conf, false);
    args = new GenericOptionsParser(conf, args).getRemainingArgs();

    final Job job = new Job(conf,
            "Count the sku to sku mapping from pview data on hdfs in \"inputPview\" path.");

    job.setJarByClass(SKU2SKUCount.class);

    job.setMapperClass(TokenizerMapper.class);

    // job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BSONWritable.class);

    job.setInputFormatClass(KeyValueTextInputFormat.class);
    job.setOutputFormatClass(MongoOutputFormat.class);

    FileInputFormat.setInputPaths(job, new Path("inputPview"));

    System.exit(job.waitForCompletion(true) ? 0 : 1);

}