Example usage for org.apache.hadoop.mapred JobConf setCompressMapOutput

List of usage examples for org.apache.hadoop.mapred JobConf setCompressMapOutput

Introduction

On this page you can find example usages of org.apache.hadoop.mapred.JobConf#setCompressMapOutput.

Prototype

public void setCompressMapOutput(boolean compress) 

Document

Should the map outputs be compressed before transfer?
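
Before the examples taken from real projects below, here is a minimal self-contained sketch of the typical call pattern: setCompressMapOutput(true) paired with setMapOutputCompressorClass. The class name and codec choice are illustrative only (Snappy assumes the native libraries are installed on the cluster), and no mapper or reducer is set, so the identity classes are used.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.SnappyCodec;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;

public class CompressedMapOutputExample {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(CompressedMapOutputExample.class);
        conf.setJobName("compressed map output example");

        // Compress the intermediate map output before it is shuffled to the reducers.
        conf.setCompressMapOutput(true);
        // Pick the codec; Snappy is a common choice when the native library is available.
        conf.setMapOutputCompressorClass(SnappyCodec.class);

        // No mapper/reducer set: the identity classes are used, so keys and values are
        // the LongWritable offsets and Text lines produced by the default TextInputFormat.
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
    }
}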

Usage

From source file:crunch.MaxTemperature.java

License:Apache License

public static void main(String[] args) throws IOException {
        if (args.length != 2) {
            System.err.println("Usage: MaxTemperatureWithMapOutputCompression " + "<input path> <output path>");
            System.exit(-1);
        }

        JobConf conf = new JobConf(MaxTemperatureWithCompression.class);
        conf.setJobName("Max temperature with map output compression");

        FileInputFormat.addInputPath(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);

        // vv OldMaxTemperatureWithMapOutputCompression
        conf.setCompressMapOutput(true);
        conf.setMapOutputCompressorClass(GzipCodec.class);
        // ^^ OldMaxTemperatureWithMapOutputCompression

        conf.setMapperClass(MaxTemperatureMapper.class);
        conf.setCombinerClass(MaxTemperatureReducer.class);
        conf.setReducerClass(MaxTemperatureReducer.class);

        JobClient.runJob(conf);
    }
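
For reference, the two calls in the marked block can also be expressed as raw configuration properties. A hedged equivalent using the old mapred API property names (newer Hadoop releases use mapreduce.map.output.compress and mapreduce.map.output.compress.codec):

    // Property-based equivalent of setCompressMapOutput(true) and
    // setMapOutputCompressorClass(GzipCodec.class); adjust names for the Hadoop version in use.
    conf.setBoolean("mapred.compress.map.output", true);
    conf.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");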

From source file:de.l3s.streamcorpus.mapreduce.TerrierIndexing.java

License:Mozilla Public License

/** Starts the MapReduce indexing.
 * @param args
 * @throws Exception
 */
public int run(String[] args) throws Exception {
    long time = System.currentTimeMillis();

    // For the moment: hard-code the Terrier home for a quick test
    System.setProperty("terrier.home", "/home/tuan.tran/executable/StreamCorpusIndexer");

    boolean docPartitioned = false;
    int numberOfReducers = Integer
            .parseInt(ApplicationSetup.getProperty("terrier.hadoop.indexing.reducers", "26"));
    final HadoopPlugin.JobFactory jf = HadoopPlugin.getJobFactory("HOD-TerrierIndexing");
    if (args.length == 2 && args[0].equals("-p")) {
        logger.debug("Document-partitioned Mode, " + numberOfReducers + " output indices.");
        numberOfReducers = Integer.parseInt(args[1]);
        docPartitioned = true;
    } else if (args.length == 1 && args[0].equals("--merge")) {
        if (numberOfReducers > 1)
            mergeLexiconInvertedFiles(ApplicationSetup.TERRIER_INDEX_PATH, numberOfReducers);
        else
            logger.error("No point merging 1 reduce task output");
        return 0;
    } else if (args.length == 0) {
        logger.debug("Term-partitioned Mode, " + numberOfReducers + " reducers creating one inverted index.");
        docPartitioned = false;
        if (numberOfReducers > MAX_REDUCE) {
            logger.warn("Excessive reduce tasks (" + numberOfReducers + ") in use "
                    + "- SplitEmittedTerm.SETPartitionerLowercaseAlphaTerm can use " + MAX_REDUCE + " at most");
        }
    }

    /*else
    {
       logger.fatal(usage());
       return 0;
    }*/

    if (!(CompressionFactory.getCompressionConfiguration("inverted", new String[0],
            false) instanceof BitCompressionConfiguration)) {
        logger.error("Sorry, only default BitCompressionConfiguration is supported by HadoopIndexing"
                + " - you can recompress the inverted index later using IndexRecompressor");
        return 0;
    }

    if (jf == null)
        throw new Exception("Could not get JobFactory from HadoopPlugin");
    final JobConf conf = jf.newJob();
    conf.setJarByClass(TerrierIndexing.class);
    conf.setJobName("StreamCorpusIndexer: Terrier Indexing");
    if (Files.exists(ApplicationSetup.TERRIER_INDEX_PATH)
            && Index.existsIndex(ApplicationSetup.TERRIER_INDEX_PATH, ApplicationSetup.TERRIER_INDEX_PREFIX)) {
        logger.fatal("Cannot index while index exists at " + ApplicationSetup.TERRIER_INDEX_PATH + ","
                + ApplicationSetup.TERRIER_INDEX_PREFIX);
        return 0;
    }

    // boolean blockIndexing = ApplicationSetup.BLOCK_INDEXING;
    boolean blockIndexing = true;
    if (blockIndexing) {
        conf.setMapperClass(Hadoop_BlockSinglePassIndexer.class);
        conf.setReducerClass(Hadoop_BlockSinglePassIndexer.class);
    } else {
        conf.setMapperClass(Hadoop_BasicSinglePassIndexer.class);
        conf.setReducerClass(Hadoop_BasicSinglePassIndexer.class);
    }
    FileOutputFormat.setOutputPath(conf, new Path(ApplicationSetup.TERRIER_INDEX_PATH));
    conf.set("indexing.hadoop.prefix", ApplicationSetup.TERRIER_INDEX_PREFIX);
    conf.setMapOutputKeyClass(SplitEmittedTerm.class);
    conf.setMapOutputValueClass(MapEmittedPostingList.class);
    conf.setBoolean("indexing.hadoop.multiple.indices", docPartitioned);

    if (!conf.get("mapred.job.tracker").equals("local")) {
        conf.setMapOutputCompressorClass(GzipCodec.class);
        conf.setCompressMapOutput(true);
    } else {
        conf.setCompressMapOutput(false);
    }

    conf.setInputFormat(MultiFileCollectionInputFormat.class);
    conf.setOutputFormat(NullOutputFormat.class);
    conf.setOutputKeyComparatorClass(SplitEmittedTerm.SETRawComparatorTermSplitFlush.class);
    conf.setOutputValueGroupingComparator(SplitEmittedTerm.SETRawComparatorTerm.class);
    conf.setReduceSpeculativeExecution(false);
    //parse the collection.spec
    BufferedReader specBR = Files.openFileReader(ApplicationSetup.COLLECTION_SPEC);
    String line = null;
    List<Path> paths = new ArrayList<Path>();
    while ((line = specBR.readLine()) != null) {
        if (line.startsWith("#"))
            continue;
        paths.add(new Path(line));
    }
    specBR.close();
    FileInputFormat.setInputPaths(conf, paths.toArray(new Path[paths.size()]));

    // not sure if this is effective in YARN
    conf.setNumMapTasks(2000);

    // increase the heap usage
    conf.set("mapreduce.map.memory.mb", "6100");
    conf.set("mapred.job.map.memory.mb", "6100");
    conf.set("mapreduce.reduce.memory.mb", "6144");
    conf.set("mapred.job.reduce.memory.mb", "6144");

    conf.set("mapreduce.map.java.opts", "-Xmx6100m");
    conf.set("mapred.map.child.java.opts", "-Xmx6100m");
    conf.set("mapreduce.reduce.java.opts", "-Xmx6144m");
    conf.set("mapred.reduce.child.opts", "-Xmx6144m");

    //conf.setBoolean("mapred.used.genericoptionsparser", true) ;

    // This is the nasty thing in MapReduce v2 and YARN: They always prefer their ancient jars first. Set this on to say you don't like it
    conf.set("mapreduce.job.user.classpath.first", "true");

    // increase the YARN NodeManager memory to 12 GB
    conf.set("yarn.nodemanager.resource.memory-mb", "12288");
    conf.set("yarn.nodemanager.resource.cpu-vcores", "16");
    conf.set("yarn.scheduler.minimum-allocation-mb", "4096");

    conf.setNumReduceTasks(numberOfReducers);
    if (numberOfReducers > 1) {
        if (docPartitioned)
            conf.setPartitionerClass(SplitEmittedTerm.SETPartitioner.class);
        else
            conf.setPartitionerClass(SplitEmittedTerm.SETPartitionerLowercaseAlphaTerm.class);
    } else {
        //for JUnit tests, we seem to need to restore the original partitioner class
        conf.setPartitionerClass(HashPartitioner.class);
    }

    /*JobID jobId = null;
    boolean ranOK = true;
    try{
       RunningJob rj = JobClient.runJob(conf);
       jobId = rj.getID();
       HadoopUtility.finishTerrierJob(conf);
    } catch (Exception e) { 
       logger.error("Problem running job", e);
       e.printStackTrace();
       ranOK = false;
    }
    if (jobId != null)
    {
       deleteTaskFiles(ApplicationSetup.TERRIER_INDEX_PATH, jobId);
    }  */

    //if (ranOK)
    //{
    System.out.println("Merging indices");
    if (!docPartitioned) {
        if (numberOfReducers > 1)
            mergeLexiconInvertedFiles(ApplicationSetup.TERRIER_INDEX_PATH, numberOfReducers);
    }

    Hadoop_BasicSinglePassIndexer.finish(ApplicationSetup.TERRIER_INDEX_PATH,
            docPartitioned ? numberOfReducers : 1, jf);
    //}
    System.out.println("Time Taken = " + ((System.currentTimeMillis() - time) / 1000) + " seconds");
    jf.close();
    return 0;
}
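
The local-mode test above reads "mapred.job.tracker" directly and will throw a NullPointerException if that property is unset, as can happen on YARN clusters. A null-safe sketch of the same decision, assuming "mapreduce.framework.name" as the YARN-era fallback property:

    // Null-safe local-mode detection; property names vary across Hadoop versions.
    boolean local = "local".equals(conf.get("mapred.job.tracker", "local"))
            || "local".equals(conf.get("mapreduce.framework.name"));
    conf.setCompressMapOutput(!local);
    if (!local) {
        conf.setMapOutputCompressorClass(GzipCodec.class);
    }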

From source file:de.l3s.streamcorpus.StreamCorpusIndexing.java

License:Mozilla Public License

/** Starts the MapReduce indexing.
 * @param args
 * @throws Exception
 */
public int run(String[] args) throws Exception {
    long time = System.currentTimeMillis();

    // For the moment: hard-code the Terrier home for a quick test
    System.setProperty("terrier.home", "/home/tuan.tran/executable/StreamCorpusIndexer");

    boolean docPartitioned = false;
    int numberOfReducers = Integer
            .parseInt(ApplicationSetup.getProperty("terrier.hadoop.indexing.reducers", "26"));
    final HadoopPlugin.JobFactory jf = HadoopPlugin.getJobFactory("HOD-TerrierIndexing");
    if (args.length == 2 && args[0].equals("-p")) {
        logger.debug("Document-partitioned Mode, " + numberOfReducers + " output indices.");
        numberOfReducers = Integer.parseInt(args[1]);
        docPartitioned = true;
    } else if (args.length == 1 && args[0].equals("--merge")) {
        if (numberOfReducers > 1)
            mergeLexiconInvertedFiles(ApplicationSetup.TERRIER_INDEX_PATH, numberOfReducers);
        else
            logger.error("No point merging 1 reduce task output");
        return 0;
    } else if (args.length == 0) {
        logger.debug("Term-partitioned Mode, " + numberOfReducers + " reducers creating one inverted index.");
        docPartitioned = false;
        if (numberOfReducers > MAX_REDUCE) {
            logger.warn("Excessive reduce tasks (" + numberOfReducers + ") in use "
                    + "- SplitEmittedTerm.SETPartitionerLowercaseAlphaTerm can use " + MAX_REDUCE + " at most");
        }
    }

    /*else
    {
       logger.fatal(usage());
       return 0;
    }*/

    if (!(CompressionFactory.getCompressionConfiguration("inverted", new String[0],
            false) instanceof BitCompressionConfiguration)) {
        logger.error("Sorry, only default BitCompressionConfiguration is supported by HadoopIndexing"
                + " - you can recompress the inverted index later using IndexRecompressor");
        return 0;
    }

    if (jf == null)
        throw new Exception("Could not get JobFactory from HadoopPlugin");
    final JobConf conf = jf.newJob();
    conf.setJarByClass(StreamCorpusIndexing.class);
    conf.setJobName("StreamCorpusIndexer: Terrier Indexing");
    if (Files.exists(ApplicationSetup.TERRIER_INDEX_PATH)
            && Index.existsIndex(ApplicationSetup.TERRIER_INDEX_PATH, ApplicationSetup.TERRIER_INDEX_PREFIX)) {
        logger.fatal("Cannot index while index exists at " + ApplicationSetup.TERRIER_INDEX_PATH + ","
                + ApplicationSetup.TERRIER_INDEX_PREFIX);
        return 0;
    }

    // boolean blockIndexing = ApplicationSetup.BLOCK_INDEXING;
    boolean blockIndexing = true;
    if (blockIndexing) {
        conf.setMapperClass(Hadoop_BlockSinglePassIndexer.class);
        conf.setReducerClass(Hadoop_BlockSinglePassIndexer.class);
    } else {
        conf.setMapperClass(Hadoop_BasicSinglePassIndexer.class);
        conf.setReducerClass(Hadoop_BasicSinglePassIndexer.class);
    }
    FileOutputFormat.setOutputPath(conf, new Path(ApplicationSetup.TERRIER_INDEX_PATH));
    conf.set("indexing.hadoop.prefix", ApplicationSetup.TERRIER_INDEX_PREFIX);
    conf.setMapOutputKeyClass(SplitEmittedTerm.class);
    conf.setMapOutputValueClass(MapEmittedPostingList.class);
    conf.setBoolean("indexing.hadoop.multiple.indices", docPartitioned);

    if (!conf.get("mapred.job.tracker").equals("local")) {
        conf.setMapOutputCompressorClass(GzipCodec.class);
        conf.setCompressMapOutput(true);
    } else {
        conf.setCompressMapOutput(false);
    }

    conf.setInputFormat(MultiFileCollectionInputFormat.class);
    conf.setOutputFormat(NullOutputFormat.class);
    conf.setOutputKeyComparatorClass(SplitEmittedTerm.SETRawComparatorTermSplitFlush.class);
    conf.setOutputValueGroupingComparator(SplitEmittedTerm.SETRawComparatorTerm.class);
    conf.setReduceSpeculativeExecution(false);
    //parse the collection.spec
    BufferedReader specBR = Files.openFileReader(ApplicationSetup.COLLECTION_SPEC);
    String line = null;
    List<Path> paths = new ArrayList<Path>();
    while ((line = specBR.readLine()) != null) {
        if (line.startsWith("#"))
            continue;
        paths.add(new Path(line));
    }
    specBR.close();
    FileInputFormat.setInputPaths(conf, paths.toArray(new Path[paths.size()]));

    // not sure if this is effective in YARN
    conf.setNumMapTasks(2000);

    // increase the heap usage
    conf.set("mapreduce.map.memory.mb", "6100");
    conf.set("mapred.job.map.memory.mb", "6100");
    conf.set("mapreduce.reduce.memory.mb", "6144");
    conf.set("mapred.job.reduce.memory.mb", "6144");

    conf.set("mapreduce.map.java.opts", "-Xmx6100m");
    conf.set("mapred.map.child.java.opts", "-Xmx6100m");
    conf.set("mapreduce.reduce.java.opts", "-Xmx6144m");
    conf.set("mapred.reduce.child.opts", "-Xmx6144m");

    //conf.setBoolean("mapred.used.genericoptionsparser", true) ;

    // This is the nasty thing in MapReduce v2 and YARN: They always prefer their ancient jars first. Set this on to say you don't like it
    conf.set("mapreduce.job.user.classpath.first", "true");

    // increase the YARN NodeManager memory to 12 GB
    conf.set("yarn.nodemanager.resource.memory-mb", "12288");
    conf.set("yarn.nodemanager.resource.cpu-vcores", "16");
    conf.set("yarn.scheduler.minimum-allocation-mb", "4096");

    conf.setNumReduceTasks(numberOfReducers);
    if (numberOfReducers > 1) {
        if (docPartitioned)
            conf.setPartitionerClass(SplitEmittedTerm.SETPartitioner.class);
        else
            conf.setPartitionerClass(SplitEmittedTerm.SETPartitionerLowercaseAlphaTerm.class);
    } else {
        //for JUnit tests, we seem to need to restore the original partitioner class
        conf.setPartitionerClass(HashPartitioner.class);
    }

    /*JobID jobId = null;
    boolean ranOK = true;
    try{
       RunningJob rj = JobClient.runJob(conf);
       jobId = rj.getID();
       HadoopUtility.finishTerrierJob(conf);
    } catch (Exception e) { 
       logger.error("Problem running job", e);
       e.printStackTrace();
       ranOK = false;
    }
    if (jobId != null)
    {
       deleteTaskFiles(ApplicationSetup.TERRIER_INDEX_PATH, jobId);
    }  */

    //if (ranOK)
    //{
    System.out.println("Merging indices");
    if (!docPartitioned) {
        if (numberOfReducers > 1)
            mergeLexiconInvertedFiles(ApplicationSetup.TERRIER_INDEX_PATH, numberOfReducers);
    }

    Hadoop_BasicSinglePassIndexer.finish(ApplicationSetup.TERRIER_INDEX_PATH,
            docPartitioned ? numberOfReducers : 1, jf);
    //}
    System.out.println("Time Taken = " + ((System.currentTimeMillis() - time) / 1000) + " seconds");
    jf.close();
    return 0;
}

From source file:de.tudarmstadt.lt.n2n.hadoop.FlipJoBims.java

License:Apache License

public static void main(String[] args) throws Exception {

    JobConf conf = new JobConf(FlipJoBims.class);

    /* begin necessary for UKP cluster */
    conf.setMemoryForMapTask(1000L); // 1 GB /* necessary for UKP cdh3 */
    conf.setMemoryForReduceTask(1000L); // 1 GB /* necessary for UKP cdh3 */
    FileOutputFormat.setCompressOutput(conf, true); // compress output
    FileOutputFormat.setOutputCompressorClass(conf,
            org.apache.hadoop.io.compress.BZip2Codec.class); /* use the bzip2 codec for compression */
    conf.setCompressMapOutput(true); // compress mapper output
    /* end necessary for UKP cluster */

    conf.setJobName(FlipJoBims.class.getSimpleName());
    args = new GenericOptionsParser(conf, args).getRemainingArgs();

    conf.setInputFormat(KeyValueTextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapperClass(FlipJoBims.Map.class);
    conf.setNumReduceTasks(0);
    // conf.setReducerClass(IdentityReducer.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setOutputKeyClass(Text.class);

    conf.setMapOutputValueClass(Text.class);
    conf.setOutputValueClass(Text.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);

}
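
This job enables map output compression without naming a codec, so the framework's default codec is used. A small hedged sketch of how the effective codec can be inspected through JobConf (DefaultCodec here is the framework fallback, not something this job configures):

    // Returns the configured map output codec, or DefaultCodec if none was set.
    Class<? extends org.apache.hadoop.io.compress.CompressionCodec> codecClass =
            conf.getMapOutputCompressorClass(org.apache.hadoop.io.compress.DefaultCodec.class);
    System.out.println("Map output codec: " + codecClass.getName());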

From source file:edu.brown.cs.mapreduce.benchmarks.Benchmark3.java

License:Open Source License

public int run(String[] args) throws Exception {
    BenchmarkBase base = new BenchmarkBase(this.getConf(), this.getClass(), args);

    Date startTime = new Date();
    System.out.println("Job started: " + startTime);

    // -------------------------------------------
    // Phase #1
    // -------------------------------------------
    JobConf p1_job = base.getJobConf();
    p1_job.setJobName(p1_job.getJobName() + ".Phase1");
    Path p1_output = new Path(base.getOutputPath().toString() + "/phase1");
    FileOutputFormat.setOutputPath(p1_job, p1_output);

    //
    // Make sure we have our properties
    //
    String required[] = { BenchmarkBase.PROPERTY_START_DATE, BenchmarkBase.PROPERTY_STOP_DATE };
    for (String req : required) {
        if (!base.getOptions().containsKey(req)) {
            System.err.println("ERROR: The property '" + req + "' is not set");
            System.exit(1);
        }
    } // FOR

    p1_job.setInputFormat(
            base.getSequenceFile() ? SequenceFileInputFormat.class : KeyValueTextInputFormat.class);
    if (base.getSequenceFile())
        p1_job.setOutputFormat(SequenceFileOutputFormat.class);
    p1_job.setOutputKeyClass(Text.class);
    p1_job.setOutputValueClass(Text.class);
    p1_job.setMapperClass(
            base.getTupleData() ? edu.brown.cs.mapreduce.benchmarks.benchmark3.phase1.TupleWritableMap.class
                    : edu.brown.cs.mapreduce.benchmarks.benchmark3.phase1.TextMap.class);
    p1_job.setReducerClass(
            base.getTupleData() ? edu.brown.cs.mapreduce.benchmarks.benchmark3.phase1.TupleWritableReduce.class
                    : edu.brown.cs.mapreduce.benchmarks.benchmark3.phase1.TextReduce.class);
    p1_job.setCompressMapOutput(base.getCompress());

    // -------------------------------------------
    // Phase #2
    // -------------------------------------------
    JobConf p2_job = base.getJobConf();
    p2_job.setJobName(p2_job.getJobName() + ".Phase2");
    p2_job.setInputFormat(
            base.getSequenceFile() ? SequenceFileInputFormat.class : KeyValueTextInputFormat.class);
    if (base.getSequenceFile())
        p2_job.setOutputFormat(SequenceFileOutputFormat.class);
    p2_job.setOutputKeyClass(Text.class);
    p2_job.setOutputValueClass(Text.class);
    p2_job.setMapperClass(IdentityMapper.class);
    p2_job.setReducerClass(
            base.getTupleData() ? edu.brown.cs.mapreduce.benchmarks.benchmark3.phase2.TupleWritableReduce.class
                    : edu.brown.cs.mapreduce.benchmarks.benchmark3.phase2.TextReduce.class);
    p2_job.setCompressMapOutput(base.getCompress());
    p2_job.setNumMapTasks(60);

    // -------------------------------------------
    // Phase #3
    // -------------------------------------------
    JobConf p3_job = base.getJobConf();
    p3_job.setJobName(p3_job.getJobName() + ".Phase3");
    p3_job.setNumReduceTasks(1);
    p3_job.setInputFormat(
            base.getSequenceFile() ? SequenceFileInputFormat.class : KeyValueTextInputFormat.class);
    p3_job.setOutputKeyClass(Text.class);
    p3_job.setOutputValueClass(Text.class);
    //p3_job.setMapperClass(Phase3Map.class);
    p3_job.setMapperClass(IdentityMapper.class);
    p3_job.setReducerClass(
            base.getTupleData() ? edu.brown.cs.mapreduce.benchmarks.benchmark3.phase3.TupleWritableReduce.class
                    : edu.brown.cs.mapreduce.benchmarks.benchmark3.phase3.TextReduce.class);

    //
    // Execute #1
    //
    base.runJob(p1_job);

    //
    // Execute #2
    //
    Path p2_output = new Path(base.getOutputPath().toString() + "/phase2");
    FileOutputFormat.setOutputPath(p2_job, p2_output);
    FileInputFormat.setInputPaths(p2_job, p1_output);
    base.runJob(p2_job);

    //
    // Execute #3
    //
    Path p3_output = new Path(base.getOutputPath().toString() + "/phase3");
    FileOutputFormat.setOutputPath(p3_job, p3_output);
    FileInputFormat.setInputPaths(p3_job, p2_output);
    base.runJob(p3_job);

    // There does need to be a combine:
    // if (base.getCombine()) base.runCombine();

    return 0;
}

From source file:edu.brown.cs.mapreduce.benchmarks.Benchmark4.java

License:Open Source License

public int run(String[] args) throws Exception {
    BenchmarkBase base = new BenchmarkBase(this.getConf(), this.getClass(), args);
    JobConf job = base.getJobConf();

    job.setInputFormat(TextInputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);
    job.setMapperClass(Benchmark4.Map.class);
    job.setCombinerClass(LongSumReducer.class);
    job.setReducerClass(LongSumReducer.class);

    try {
        job.setCompressMapOutput(base.getCompress());
        base.runJob(job);

        if (base.getCombine())
            base.runCombine();
    } catch (Exception ex) {
        ex.printStackTrace();
        System.exit(1);
    }
    return 0;
}

From source file:edu.ohsu.sonmezsysbio.cloudbreak.command.CommandNovoalignSingleEnds.java

public void runHadoopJob(Configuration configuration) throws IOException, URISyntaxException {
    JobConf conf = new JobConf(configuration);

    conf.setJobName("Single End Alignment");
    conf.setJarByClass(Cloudbreak.class);
    FileInputFormat.addInputPath(conf, new Path(hdfsDataDir));
    Path outputDir = new Path(hdfsAlignmentsDir);
    FileSystem.get(conf).delete(outputDir);
    FileOutputFormat.setOutputPath(conf, outputDir);

    addDistributedCacheFile(conf, reference, "novoalign.reference");

    addDistributedCacheFile(conf, pathToNovoalign, "novoalign.executable");
    if (pathToNovoalignLicense != null) {
        addDistributedCacheFile(conf, pathToNovoalignLicense, "novoalign.license");
    }

    DistributedCache.createSymlink(conf);
    conf.set("mapred.task.timeout", "3600000");
    conf.set("novoalign.threshold", threshold);
    conf.set("novoalign.quality.format", qualityFormat);

    conf.setInputFormat(SequenceFileInputFormat.class);

    conf.setMapperClass(NovoalignSingleEndMapper.class);
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setCompressMapOutput(true);

    conf.setReducerClass(SingleEndAlignmentsToPairsReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.set("mapred.output.compress", "true");
    conf.set("mapred.output.compression", "org.apache.hadoop.io.compress.SnappyCodec");

    JobClient.runJob(conf);

}
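
Besides compressing the intermediate map output, this job also compresses its final output through raw property settings. A hedged equivalent using the old-API FileOutputFormat helpers, with the codec choice purely illustrative:

    // Equivalent of the two "mapred.output.*" property calls above (old mapred API).
    FileOutputFormat.setCompressOutput(conf, true);
    FileOutputFormat.setOutputCompressorClass(conf, org.apache.hadoop.io.compress.SnappyCodec.class);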

From source file:kafka.etl.impl.DataGenerator.java

License:Apache License

protected void generateOffsets() throws Exception {
    JobConf conf = new JobConf();
    conf.set("hadoop.job.ugi", _props.getProperty("hadoop.job.ugi"));
    conf.setCompressMapOutput(false);
    Path outPath = new Path(_offsetsDir + Path.SEPARATOR + "1.dat");
    FileSystem fs = outPath.getFileSystem(conf);
    if (fs.exists(outPath))
        fs.delete(outPath);

    KafkaETLRequest request = new KafkaETLRequest(_topic, "tcp://" + _uri.getHost() + ":" + _uri.getPort(), 0);

    System.out.println("Dump " + request.toString() + " to " + outPath.toUri().toString());
    byte[] bytes = request.toString().getBytes("UTF-8");
    KafkaETLKey dummyKey = new KafkaETLKey();
    SequenceFile.setCompressionType(conf, SequenceFile.CompressionType.NONE);
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, outPath, KafkaETLKey.class,
            BytesWritable.class);
    writer.append(dummyKey, new BytesWritable(bytes));
    writer.close();
}

From source file:kafka.etl.tweet.producer.TweetProducer.java

License:Apache License

protected void generateOffsets() throws Exception {
    JobConf conf = new JobConf();
    java.util.Date date = new java.util.Date();
    conf.set("hadoop.job.ugi", _props.getProperty("hadoop.job.ugi"));
    conf.setCompressMapOutput(false);
    Calendar cal = Calendar.getInstance();
    Path outPath = new Path(_offsetsDir + Path.SEPARATOR + "1.dat");
    FileSystem fs = outPath.getFileSystem(conf);
    if (fs.exists(outPath))
        fs.delete(outPath);

    KafkaETLRequest request = new KafkaETLRequest(_topic, "tcp://" + _uri.getHost() + ":" + _uri.getPort(), 0);

    System.out.println("Dump " + request.toString() + " to " + outPath.toUri().toString());

    byte[] bytes = request.toString().getBytes("UTF-8");
    KafkaETLKey dummyKey = new KafkaETLKey();
    SequenceFile.setDefaultCompressionType(conf, SequenceFile.CompressionType.NONE);
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, outPath, KafkaETLKey.class,
            BytesWritable.class);
    writer.append(dummyKey, new BytesWritable(bytes));
    writer.close();
}

From source file:org.sf.xrime.algorithms.clique.maximal.AllMaximalCliquesGenerate.java

License:Apache License

@Override
public void execute() throws ProcessorExecutionException {
    JobConf conf = new JobConf(context, AllMaximalCliquesGenerate.class);
    conf.setJobName("AllMaximalCliquesGenerate");

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(SetOfVertexSets.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapperClass(MapClass.class);
    // Combiner is not permitted.
    conf.setReducerClass(ReduceClass.class);
    // makes the file format suitable for machine processing.
    conf.setInputFormat(SequenceFileInputFormat.class);
    // Enable compression.
    conf.setCompressMapOutput(true);
    conf.setMapOutputCompressorClass(GzipCodec.class);
    try {
        FileInputFormat.setInputPaths(conf, getSource().getPath());
        FileOutputFormat.setOutputPath(conf, getDestination().getPath());
    } catch (IllegalAccessException e1) {
        throw new ProcessorExecutionException(e1);
    }
    conf.setNumMapTasks(getMapperNum());
    conf.setNumReduceTasks(getReducerNum());

    try {
        this.runningJob = JobClient.runJob(conf);
    } catch (IOException e) {
        throw new ProcessorExecutionException(e);
    }
}