Example usage for org.apache.hadoop.mapreduce Job setMapOutputKeyClass

Introduction

On this page you can find example usages for org.apache.hadoop.mapreduce Job setMapOutputKeyClass.

Prototype

public void setMapOutputKeyClass(Class<?> theClass) throws IllegalStateException 

Document

Set the key class for the map output data.
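
Before the per-project examples below, here is a minimal sketch of the call in context. MyMapper and MySumReducer are hypothetical placeholder classes, not taken from any project on this page; everything else is the stock Hadoop API. The call matters whenever the map output key type differs from the final output key type: if setMapOutputKeyClass is never called, Hadoop assumes the map output key class is the same as the one given to setOutputKeyClass.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class SetMapOutputKeyClassDemo {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "setMapOutputKeyClass demo");
        job.setJarByClass(SetMapOutputKeyClassDemo.class);

        // Hypothetical classes: assume Mapper<LongWritable, Text, Text, IntWritable>
        // and Reducer<Text, IntWritable, Text, Text>.
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MySumReducer.class);

        // The map output types differ from the final output types, so they must
        // be declared explicitly; if they are not set, the framework assumes the
        // map output classes equal the job output classes, and the shuffle fails
        // with a type mismatch at runtime.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}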

Usage

From source file:com.twitter.elephanttwin.lucene.indexing.HadoopSplitIndexingJob.java

License:Apache License

/**
 * Override and extend this in implementations to add custom settings to the Job and Conf to
 * create lucene-based indexes that will point you at what splits contain values you are looking for.
 * You are on your own for filtering the splits appropriately before creating an MR job, but
 * check out how this was done over MapFile-based indexes in
 * com.twitter.elephanttwin.indexing.AbstractIndexesFileInputFormat
 */
@Override
protected void setupJob(Job job) {
    Configuration conf = job.getConfiguration();
    conf.set("mapred.child.java.opts", "-Xmx4g");
    List<String> fieldNames = Lists.newArrayList();
    for (IndexedField field : getIndexedFields()) {
        fieldNames.add(field.getFieldName());
        conf.set(HadoopSplitIndexingMapper.FIELD_VALUE_EXTRACTOR_PREFIX + field.getFieldName(),
                getExtractorClassName(field.getFieldName()));
    }
    conf.setStrings(HadoopSplitIndexingMapper.FIELDS_TO_INDEX_KEY, fieldNames.toArray(new String[] {}));
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(HadoopSplitDocument.class);
    job.setOutputFormatClass(NullOutputFormat.class);
    job.setInputFormatClass(getInputFormatClass());

    job.setMapperClass(HadoopSplitIndexingMapper.class);
    job.setReducerClass(HadoopSplitIndexingReducer.class);
}
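
A side note on this example: "mapred.child.java.opts" is the old MRv1 property name for the task JVM options. On Hadoop 2.x and later the per-task settings are mapreduce.map.java.opts and mapreduce.reduce.java.opts, though the old key is still accepted as a deprecated fallback.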

From source file:com.twitter.elephanttwin.lucene.indexing.TextIndexingJob.java

License:Apache License

@Override
protected void setupJob(Job job) {
    job.setInputFormatClass(TextInputFormat.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputFormatClass(NullOutputFormat.class);
    job.setMapperClass(TextIndexingMapper.class);
    job.setReducerClass(TextIndexingReducer.class);
}

From source file:com.twitter.elephanttwin.retrieval.ScanUsingIndexJob.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    params = new IndexConfig();

    LOG.info(" - input: " + Joiner.on(" ").join(params.getInput()));
    LOG.info(" - output: " + IndexConfig.output.get());

    Configuration conf = getConf();

    Path outputDir = new Path(params.getOutput());
    FileSystem fs = outputDir.getFileSystem(conf);
    fs.delete(outputDir, true);

    int totalInputFiles = 0;
    List<FileStatus> stats = Lists.newArrayList();
    for (String s : params.getInput()) {
        Path spath = new Path(IndexConfig.index.get() + s);
        HdfsUtils.addInputPathRecursively(stats, fs, spath, hiddenDirectoryFilter, indexDataFilter);
    }

    totalInputFiles = stats.size();
    LOG.info(totalInputFiles + " total index files to be scanned");

    conf.set(IndexScanMapper.searchColumnName, params.getColumnName());
    Job job = new Job(new Configuration(conf));
    job.setJarByClass(getClass());
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, new Path(params.getOutput()));

    for (FileStatus file : stats)
        FileInputFormat.addInputPath(job, file.getPath());

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    job.setNumReduceTasks(1);

    job.setMapperClass(IndexScanMapper.class);
    job.setCombinerClass(LongSumReducer.class);
    job.setReducerClass(LongSumReducer.class);

    job.setJobName("ScanUsingIndexJob:" + IndexConfig.input.get());
    BlockIndexedFileInputFormat.setSearchOptions(job, params.getinputFormat(), params.getValueClass(),
            params.getIndex(), (String) null);
    job.waitForCompletion(true);
    return 0;
}
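
Note the type pairing here: LongSumReducer serves as both the combiner and the reducer. That only works because the map output classes (Text, LongWritable) match the combiner's input and output types exactly; a combiner must consume and emit the map output key/value classes, since it may run zero or more times between map and reduce.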

From source file:com.veera.secondarysort.demo2.SsJob.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    Job job = new Job(conf, "secondary sort");

    job.setJarByClass(SsJob.class);
    job.setPartitionerClass(NaturalKeyPartitioner.class);
    job.setGroupingComparatorClass(NaturalKeyGroupingComparator.class);
    job.setSortComparatorClass(CompositeKeyComparator.class);

    job.setMapOutputKeyClass(StockKey.class);
    job.setMapOutputValueClass(DoubleWritable.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapperClass(SsMapper.class);
    job.setReducerClass(SsReducer.class);

    job.waitForCompletion(true);

    return 0;
}
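
This secondary-sort driver is the canonical case for setMapOutputKeyClass: the composite StockKey exists only between map and reduce, where the three custom classes partition, group, and sort by it, while the reducer emits plain Text. The map output classes therefore have to be declared separately from the final output classes.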

From source file:com.wibidata.wibidota.DotaGatherExampleValues.java

License:Apache License

public final int run(final String[] args) throws Exception {
    Job job = new Job(super.getConf(), "Dota Gatherer Example Values");
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setMapperClass(EnumGatherMap.class);
    job.setCombinerClass(AppendText.class);
    job.setReducerClass(EnumGatherReducer.class);

    job.setJarByClass(DotaGatherExampleValues.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    if (job.waitForCompletion(true)) {
        return 0;
    } else {
        return -1;
    }
}

From source file:com.wibidata.wibidota.dotaloader.DotaValuesCounter.java

License:Apache License

public final int run(final String[] args) throws Exception {
    Job job = new Job(super.getConf(), "Dota Value Counter");
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);

    job.setMapperClass(Map.class);
    job.setCombinerClass(Add.class);
    job.setReducerClass(Add.class);

    job.setJarByClass(DotaValuesCounter.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    if (job.waitForCompletion(true)) {
        return 0;
    } else {
        return -1;
    }
}

From source file:com.wibidata.wibidota.DotaMaxAccountId.java

License:Apache License

public final int run(final String[] args) throws Exception {
    Job job = new Job(super.getConf(), "Dota Max Builder");
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);

    job.setMapperClass(DotaMaxAccountId.Map.class);
    job.setCombinerClass(DotaMaxAccountId.TakeMax.class);
    job.setReducerClass(DotaMaxAccountId.TakeMax.class);

    job.setJarByClass(DotaMaxAccountId.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    if (job.waitForCompletion(true)) {
        return 0;
    } else {
        return -1;
    }
}

From source file:com.wipro.ats.bdre.dq.DQDriver.java

License:Apache License

@Override
public int run(String[] arg) throws Exception {
    String processId = arg[0];
    String sPath = arg[1];
    String destDir = arg[2];

    Properties props = new GetProperties().getProperties(processId, "dq");
    LOGGER.debug("props=" + props);
    Configuration conf = getConf();

    conf.set("dq.process.id", processId);
    Job job = Job.getInstance(conf);
    job.setJobName("Data Quality " + processId);
    job.setJarByClass(DQDriver.class);
    job.setMapperClass(DQMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    //Reducer is not required
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);
    Path inputFilePath = new Path(sPath);
    FileInputFormat.addInputPath(job, inputFilePath);
    FileOutputFormat.setOutputPath(job, removeIfExistAndSetOutputPath(conf, destDir));
    MultipleOutputs.addNamedOutput(job, DQConstants.GOOD_RECORDS_FILE, TextOutputFormat.class, Text.class,
            NullWritable.class);
    MultipleOutputs.addNamedOutput(job, DQConstants.BAD_RECORDS_FILE, TextOutputFormat.class, Text.class,
            NullWritable.class);
    MultipleOutputs.addNamedOutput(job, DQConstants.FILE_REPORT_FILE, TextOutputFormat.class, Text.class,
            NullWritable.class);

    if (!job.waitForCompletion(true)) {
        return 1;
    }

    Path outputDir = new Path(destDir);
    FileSystem srcFs = outputDir.getFileSystem(getConf());
    FileSystem destFs = outputDir.getFileSystem(getConf());

    //Valid Records
    Path goodFilesSrcDir = new Path(destDir + "/" + DQConstants.INTERMEDIATE_GOOD_RECORD_OUTPUT_DIR);
    //The input file and the quality-filtered file should have the same name (but different paths)
    Path goodDestFile = new Path(destDir + "/" + inputFilePath.getName());
    if (srcFs.exists(goodFilesSrcDir)) {
        FileUtil.copyMerge(srcFs, goodFilesSrcDir, destFs, goodDestFile, true, conf, "");
    }
    // Invalid Records
    Path badFilesSrcDir = new Path(destDir + "/" + DQConstants.INTERMEDIATE_BAD_RECORD_OUTPUT_DIR);
    Path badDestFile = new Path(destDir + "/" + DQConstants.BAD_RECORDS_FILE);
    if (srcFs.exists(badFilesSrcDir)) {
        FileUtil.copyMerge(srcFs, badFilesSrcDir, destFs, badDestFile, true, conf, "");
    }

    // Preparing report aggregation job
    Job fileReportAggregationJob = Job.getInstance(conf);
    fileReportAggregationJob.setJobName("File Report Computing " + processId);
    fileReportAggregationJob.setJarByClass(DQMain.class);

    fileReportAggregationJob.setMapperClass(DQFileReportMapper.class);
    fileReportAggregationJob.setMapOutputKeyClass(Text.class);
    fileReportAggregationJob.setMapOutputValueClass(IntWritable.class);

    fileReportAggregationJob.setReducerClass(DQFileReportReducer.class);
    fileReportAggregationJob.setOutputKeyClass(Text.class);
    fileReportAggregationJob.setOutputValueClass(Text.class);

    fileReportAggregationJob.setNumReduceTasks(1);

    Path fileReportDir = new Path(destDir + "/" + DQConstants.INTERMEDIATE_REPORT_OUTPUT_DIR);
    Path fileReportOutputDir = new Path(destDir + "/" + DQConstants.AGGREGATED_REPORT_PLACEHOLDER_FOLDER);

    FileInputFormat.addInputPath(fileReportAggregationJob, fileReportDir);
    FileOutputFormat.setOutputPath(fileReportAggregationJob, fileReportOutputDir);

    if (!fileReportAggregationJob.waitForCompletion(true)) {
        return 1;
    }

    // Merge the per-file report records produced by the aggregation job
    Path reportsSrcDir = new Path(destDir + "/" + DQConstants.AGGREGATED_REPORT_PLACEHOLDER_FOLDER);
    Path reportsDestFile = new Path(destDir + "/" + DQConstants.FILE_REPORT_FILE);
    FileUtil.copyMerge(srcFs, reportsSrcDir, destFs, reportsDestFile, true, conf, "");

    Path reportDestFile = new Path(outputDir.toString() + "/" + DQConstants.FILE_REPORT_FILE);
    //Read the report file from HDFS and report the percentage
    DQStats dqStats = getQualityStats(getConf(), reportDestFile);
    LOGGER.info("Percentage of good records :" + dqStats.getGoodPercent());
    props = new GetProperties().getProperties(processId, "dq");
    String strThreshold = props.getProperty("min.pass.threshold.percent");
    float threshold = Float.parseFloat(strThreshold);
    dqStats.setThreshold(threshold);
    //Update the result in metadata
    logResult(dqStats, processId, 0L);
    if (dqStats.getGoodPercent() < threshold) {
        LOGGER.error("DQ check did not pass");
        throw new DQValidationException(dqStats);
    }
    LOGGER.info(dqStats);
    FileChecksum hdfsChecksum = destFs.getFileChecksum(goodDestFile);
    String fileHash = hdfsChecksum == null ? "0" : hdfsChecksum.toString();
    //Return file info as Oozie params
    RegisterFileInfo registerFileInfo = new RegisterFileInfo();
    registerFileInfo.setBatchId(null);
    registerFileInfo.setCreationTs(new Timestamp(new Date().getTime()));
    registerFileInfo.setFileHash(fileHash);
    registerFileInfo.setFileSize(destFs.getFileStatus(goodDestFile).getLen());
    registerFileInfo.setPath(goodDestFile.toString());
    registerFileInfo.setSubProcessId(Integer.parseInt(processId));
    OozieUtil oozieUtil = new OozieUtil();
    oozieUtil.persistBeanData(registerFileInfo, false);

    return 0;
}
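
Worth noting about the first job above: with setNumReduceTasks(0) it is map-only, so mapper output goes straight to the OutputFormat and is governed by setOutputKeyClass/setOutputValueClass. The setMapOutputKeyClass call is harmless there, but it only takes effect when a shuffle actually runs, as in the report-aggregation job that follows.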

From source file:com.xiaomi.linden.hadoop.indexing.job.LindenJob.java

License:Apache License

@Override
public int run(String[] strings) throws Exception {
    Configuration conf = getConf();
    String dir = conf.get(LindenJobConfig.INPUT_DIR, null);
    logger.info("input dir:" + dir);
    Path inputPath = new Path(StringUtils.unEscapeString(dir));
    Path outputPath = new Path(conf.get(LindenJobConfig.OUTPUT_DIR));
    String indexPath = conf.get(LindenJobConfig.INDEX_PATH);

    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(outputPath)) {
        fs.delete(outputPath, true);
    }
    if (fs.exists(new Path(indexPath))) {
        fs.delete(new Path(indexPath), true);
    }

    int numShards = conf.getInt(LindenJobConfig.NUM_SHARDS, 1);
    Shard[] shards = createShards(indexPath, numShards);

    Shard.setIndexShards(conf, shards);

    //empty the trash
    (new Trash(conf)).expunge();

    Job job = Job.getInstance(conf, "linden-hadoop-indexing");
    job.setJarByClass(LindenJob.class);
    job.setMapperClass(LindenMapper.class);
    job.setCombinerClass(LindenCombiner.class);
    job.setReducerClass(LindenReducer.class);
    job.setMapOutputKeyClass(Shard.class);
    job.setMapOutputValueClass(IntermediateForm.class);
    job.setOutputKeyClass(Shard.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(IndexUpdateOutputFormat.class);
    job.setReduceSpeculativeExecution(false);
    job.setNumReduceTasks(numShards);

    String lindenSchemaFile = conf.get(LindenJobConfig.SCHEMA_FILE_URL);
    if (lindenSchemaFile == null) {
        throw new IOException("no schema file is found");
    }
    logger.info("Adding schema file: " + lindenSchemaFile);
    job.addCacheFile(new URI(lindenSchemaFile + "#lindenSchema"));
    String lindenPropertiesFile = conf.get(LindenJobConfig.LINDEN_PROPERTIES_FILE_URL);
    if (lindenPropertiesFile == null) {
        throw new IOException("no linden properties file is found");
    }
    logger.info("Adding linden properties file: " + lindenPropertiesFile);
    job.addCacheFile(new URI(lindenPropertiesFile + "#lindenProperties"));

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    Path[] inputs = FileInputFormat.getInputPaths(job);
    StringBuilder buffer = new StringBuilder(inputs[0].toString());
    for (int i = 1; i < inputs.length; i++) {
        buffer.append(",");
        buffer.append(inputs[i].toString());
    }
    logger.info("mapreduce.input.dir = " + buffer.toString());
    logger.info("mapreduce.output.dir = " + FileOutputFormat.getOutputPath(job).toString());
    logger.info("mapreduce.job.num.reduce.tasks = " + job.getNumReduceTasks());
    logger.info(shards.length + " shards = " + conf.get(LindenJobConfig.INDEX_SHARDS));
    logger.info("mapreduce.input.format.class = " + job.getInputFormatClass());
    logger.info("mapreduce.output.format.class = " + job.getOutputFormatClass());
    logger.info("mapreduce.cluster.temp.dir = " + conf.get(MRJobConfig.TEMP_DIR));

    job.waitForCompletion(true);
    if (!job.isSuccessful()) {
        throw new RuntimeException("Job failed");
    }
    return 0;
}
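
This example shows that the map output key need not be a stock Hadoop type: Shard is a custom key and IntermediateForm a custom value. With the default Writable serialization, any class passed to setMapOutputKeyClass must implement WritableComparable, because map output keys are serialized and sorted during the shuffle; map output values only need to implement Writable.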

From source file:com.xyz.reccommendation.driver.SKU2SKUCount.java

License:Apache License

public static void main(String[] args) throws Exception {

    final Configuration conf = new Configuration();

    String envt = null;

    if (args.length > 0) {
        envt = args[0];
    } else {
        envt = "dev";
    }

    Properties prop = new Properties();

    try {
        // load a properties file from class path, inside static method
        prop.load(SKU2SKUCount.class.getClassLoader().getResourceAsStream("config-" + envt + ".properties"));

    } catch (IOException ex) {
        ex.printStackTrace();
        System.exit(1);
    }

    MongoConfigUtil.setOutputURI(conf, "mongodb://" + prop.getProperty("mongodb.ip") + "/"
            + prop.getProperty("mongodb.dbname") + ".out_stat_custom");

    log.debug("MongoDB URL : mongodb://" + prop.getProperty("mongodb.ip") + "/"
            + prop.getProperty("mongodb.dbname") + ".out_stat_custom");

    log.debug("Conf: " + conf);

    MongoConfigUtil.setCreateInputSplits(conf, false);
    args = new GenericOptionsParser(conf, args).getRemainingArgs();

    final Job job = new Job(conf,
            "Count the sku to sku mapping from pview data on hdfs in \"inputPview\" path.");

    job.setJarByClass(SKU2SKUCount.class);

    job.setMapperClass(TokenizerMapper.class);

    // job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BSONWritable.class);

    job.setInputFormatClass(KeyValueTextInputFormat.class);
    job.setOutputFormatClass(MongoOutputFormat.class);

    FileInputFormat.setInputPaths(job, new Path("inputPview"));

    System.exit(job.waitForCompletion(true) ? 0 : 1);

}