Example usage for org.apache.hadoop.mapred JobConf setOutputKeyComparatorClass

List of usage examples for org.apache.hadoop.mapred JobConf setOutputKeyComparatorClass

Introduction

In this page you can find the example usage for org.apache.hadoop.mapred JobConf setOutputKeyComparatorClass.

Prototype

public void setOutputKeyComparatorClass(Class<? extends RawComparator> theClass) 

Source Link

Document

Set the RawComparator comparator used to compare keys.

Usage

From source file:org.terrier.applications.HadoopIndexing.java

License:Mozilla Public License

/** Starts the MapReduce indexing.
 * @param args//  w w w . ja va 2  s  .  co  m
 * @throws Exception
 */
public static void main(String[] args) throws Exception {
    long time = System.currentTimeMillis();

    boolean docPartitioned = false;
    int numberOfReducers = Integer
            .parseInt(ApplicationSetup.getProperty("terrier.hadoop.indexing.reducers", "26"));
    final HadoopPlugin.JobFactory jf = HadoopPlugin.getJobFactory("HOD-TerrierIndexing");
    if (args.length == 2 && args[0].equals("-p")) {
        logger.info("Document-partitioned Mode, " + numberOfReducers + " output indices.");
        numberOfReducers = Integer.parseInt(args[1]);
        docPartitioned = true;
    } else if (args.length == 1 && args[0].equals("--merge")) {
        if (numberOfReducers > 1)
            mergeLexiconInvertedFiles(ApplicationSetup.TERRIER_INDEX_PATH, numberOfReducers);
        else
            logger.error("No point merging 1 reduce task output");
        return;
    } else if (args.length == 0) {
        logger.info("Term-partitioned Mode, " + numberOfReducers + " reducers creating one inverted index.");
        docPartitioned = false;
        if (numberOfReducers > MAX_REDUCE) {
            logger.warn("Excessive reduce tasks (" + numberOfReducers + ") in use "
                    + "- SplitEmittedTerm.SETPartitionerLowercaseAlphaTerm can use " + MAX_REDUCE + " at most");
        }
    } else {
        logger.fatal(usage());
        return;
    }

    if (!(CompressionFactory.getCompressionConfiguration("inverted", new String[0],
            false) instanceof BitCompressionConfiguration)) {
        logger.error("Sorry, only default BitCompressionConfiguration is supported by HadoopIndexing"
                + " - you can recompress the inverted index later using IndexRecompressor");
        return;
    }

    if (jf == null)
        throw new Exception("Could not get JobFactory from HadoopPlugin");
    final JobConf conf = jf.newJob();
    conf.setJobName("terrierIndexing");
    if (Files.exists(ApplicationSetup.TERRIER_INDEX_PATH)
            && Index.existsIndex(ApplicationSetup.TERRIER_INDEX_PATH, ApplicationSetup.TERRIER_INDEX_PREFIX)) {
        logger.fatal("Cannot index while index exists at " + ApplicationSetup.TERRIER_INDEX_PATH + ","
                + ApplicationSetup.TERRIER_INDEX_PREFIX);
        return;
    }

    boolean blockIndexing = ApplicationSetup.BLOCK_INDEXING;
    if (blockIndexing) {
        conf.setMapperClass(Hadoop_BlockSinglePassIndexer.class);
        conf.setReducerClass(Hadoop_BlockSinglePassIndexer.class);
    } else {
        conf.setMapperClass(Hadoop_BasicSinglePassIndexer.class);
        conf.setReducerClass(Hadoop_BasicSinglePassIndexer.class);
    }
    FileOutputFormat.setOutputPath(conf, new Path(ApplicationSetup.TERRIER_INDEX_PATH));
    conf.set("indexing.hadoop.prefix", ApplicationSetup.TERRIER_INDEX_PREFIX);
    conf.setMapOutputKeyClass(SplitEmittedTerm.class);
    conf.setMapOutputValueClass(MapEmittedPostingList.class);
    conf.setBoolean("indexing.hadoop.multiple.indices", docPartitioned);

    if (!conf.get("mapred.job.tracker").equals("local")) {
        conf.setMapOutputCompressorClass(GzipCodec.class);
        conf.setCompressMapOutput(true);
    } else {
        conf.setCompressMapOutput(false);
    }

    conf.setInputFormat(MultiFileCollectionInputFormat.class);
    conf.setOutputFormat(NullOutputFormat.class);
    conf.setOutputKeyComparatorClass(SplitEmittedTerm.SETRawComparatorTermSplitFlush.class);
    conf.setOutputValueGroupingComparator(SplitEmittedTerm.SETRawComparatorTerm.class);
    conf.setReduceSpeculativeExecution(false);
    //parse the collection.spec
    BufferedReader specBR = Files.openFileReader(ApplicationSetup.COLLECTION_SPEC);
    String line = null;
    List<Path> paths = new ArrayList<Path>();
    while ((line = specBR.readLine()) != null) {
        if (line.startsWith("#"))
            continue;
        paths.add(new Path(line));
    }
    specBR.close();
    FileInputFormat.setInputPaths(conf, paths.toArray(new Path[paths.size()]));
    conf.setNumReduceTasks(numberOfReducers);
    if (numberOfReducers > 1) {
        if (docPartitioned)
            conf.setPartitionerClass(SplitEmittedTerm.SETPartitioner.class);
        else
            conf.setPartitionerClass(SplitEmittedTerm.SETPartitionerLowercaseAlphaTerm.class);
    } else {
        //for JUnit tests, we seem to need to restore the original partitioner class
        conf.setPartitionerClass(HashPartitioner.class);
    }

    JobID jobId = null;
    boolean ranOK = true;
    try {
        RunningJob rj = JobClient.runJob(conf);
        jobId = rj.getID();
        HadoopUtility.finishTerrierJob(conf);
    } catch (Exception e) {
        logger.error("Problem running job", e);
        ranOK = false;
    }
    if (jobId != null) {
        deleteTaskFiles(ApplicationSetup.TERRIER_INDEX_PATH, jobId);
    }
    if (ranOK) {
        if (!docPartitioned) {
            if (numberOfReducers > 1)
                mergeLexiconInvertedFiles(ApplicationSetup.TERRIER_INDEX_PATH, numberOfReducers);
        }

        Hadoop_BasicSinglePassIndexer.finish(ApplicationSetup.TERRIER_INDEX_PATH,
                docPartitioned ? numberOfReducers : 1, jf);
    }
    System.out.println("Time Taken = " + ((System.currentTimeMillis() - time) / 1000) + " seconds");
    jf.close();
}

From source file:ucsc.hadoop.mapreduce.apache.Grep.java

License:Apache License

public int run(String[] args) throws Exception {
    if (args.length < 3) {
        System.out.println("Grep <inDir> <outDir> <regex> [<group>]");
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }//from ww  w  .java2  s .c  o m

    Path tempDir = new Path("grep-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf grepJob = new JobConf(getConf(), Grep.class);

    try {

        grepJob.setJobName("grep-search");

        FileInputFormat.setInputPaths(grepJob, args[0]);

        grepJob.setMapperClass(RegexMapper.class);
        grepJob.set("mapred.mapper.regex", args[2]);
        if (args.length == 4) {
            grepJob.set("mapred.mapper.regex.group", args[3]);
        }

        grepJob.setCombinerClass(LongSumReducer.class);
        grepJob.setReducerClass(LongSumReducer.class);

        FileOutputFormat.setOutputPath(grepJob, tempDir);
        grepJob.setOutputFormat(SequenceFileOutputFormat.class);
        grepJob.setOutputKeyClass(Text.class);
        grepJob.setOutputValueClass(LongWritable.class);

        JobClient.runJob(grepJob);

        // second job
        JobConf sortJob = new JobConf(getConf(), Grep.class);
        sortJob.setJobName("grep-sort");

        FileInputFormat.setInputPaths(sortJob, tempDir);
        sortJob.setInputFormat(SequenceFileInputFormat.class);

        sortJob.setMapperClass(InverseMapper.class);

        sortJob.setNumReduceTasks(1); // write a single file
        FileOutputFormat.setOutputPath(sortJob, new Path(args[1]));
        // sort by decreasing freq
        sortJob.setOutputKeyComparatorClass(LongWritable.DecreasingComparator.class);

        JobClient.runJob(sortJob);
    } finally {
        FileSystem.get(grepJob).delete(tempDir, true);
    }
    return 0;
}

From source file:voldemort.store.readonly.mr.serialization.JsonMapper.java

License:Apache License

public void configure(JobConf conf) {
    setInputKeySerializer(getSchemaFromJob(conf, "mapper.input.key.schema"));
    setInputValueSerializer(getSchemaFromJob(conf, "mapper.input.value.schema"));
    setOutputKeySerializer(getSchemaFromJob(conf, "mapper.output.key.schema"));
    setOutputValueSerializer(getSchemaFromJob(conf, "mapper.output.value.schema"));

    // set comparator for input Key Schema
    if (conf.getBoolean("use.json.comparator", false)) {
        conf.setOutputKeyComparatorClass(JsonDeserializerComparator.class);
        conf.set("json.schema", conf.get("mapper.output.key.schema"));
    }//from ww  w  . j  ava  2s  .co  m
    setConfigured(true);
}

From source file:voldemort.store.readonly.mr.serialization.JsonReducer.java

License:Apache License

public void configure(JobConf conf) {
    setInputKeySerializer(getSchemaFromJob(conf, "mapper.output.key.schema"));
    setInputValueSerializer(getSchemaFromJob(conf, "mapper.output.value.schema"));
    setOutputKeySerializer(getSchemaFromJob(conf, "reducer.output.key.schema"));
    setOutputValueSerializer(getSchemaFromJob(conf, "reducer.output.value.schema"));

    // set comparator for input Key Schema
    if (conf.getBoolean("use.json.comparator", false)) {
        conf.setOutputKeyComparatorClass(JsonDeserializerComparator.class);
        conf.set("json.schema", conf.get("mapper.output.key.schema"));
    }/*from  w  w w. j a va 2  s  . c o m*/
    setConfigured(true);
}