Example usage for org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat.setOutputCompressionType

Introduction

This page collects example usages of org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat.setOutputCompressionType.

Prototype

public static void setOutputCompressionType(Job job, CompressionType style) 

Document

Set the CompressionType for the output SequenceFile.
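
A minimal sketch of the typical call sequence, before the project-specific examples below. The helper name and the choice of BLOCK compression are illustrative assumptions, not taken from any example on this page; the calls themselves are the standard FileOutputFormat/SequenceFileOutputFormat API used throughout the examples.

// Hypothetical helper (a sketch, not from the examples below): configure a job to write
// block-compressed SequenceFile output. Use SequenceFile.CompressionType.RECORD to compress
// individual records instead, or setCompressOutput(job, false) to disable compression.
public static void configureCompressedOutput(Job job, Path outputPath) {
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(job, outputPath);
    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
}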

Usage

From source file: edu.umd.cloud9.collection.ExtractHTMLFieldCollection.java

License: Apache License

@SuppressWarnings({ "unchecked", "rawtypes" })
@Override
public int runTool() throws Exception {
    Configuration conf = getConf();
    Job job = new Job(conf);

    String inputPath = conf.get("Cloud9.InputPath");
    String inputFormat = conf.get("Cloud9.InputFormat");
    String outputPath = conf.get("Cloud9.OutputPath");
    String tag = conf.get("Cloud9.TargetTag");

    job.setJobName("ExtractFieldCollection");

    job.setJarByClass(ExtractHTMLFieldCollection.class);
    job.setMapperClass(MyMapper.class);
    job.setReducerClass(Reducer.class);
    job.setNumReduceTasks(200);

    job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(inputFormat));
    recursivelyAddInputPaths(job, inputPath);

    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(TextDocument.class);

    LOG.info("ExtractFieldCollection - " + tag);
    LOG.info(" - Input path: " + inputPath);
    LOG.info(" - Input format: " + inputFormat);
    LOG.info(" - Output path: " + outputPath);
    LOG.info(" - Target tag: " + tag);

    job.waitForCompletion(true);
    return 0;
}

From source file: edu.umd.cloud9.collection.trecweb.RepackTrecWebCollection.java

License: Apache License

/**
 * Runs this tool.
 */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) collection path")
            .create(COLLECTION_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("(required) output path")
            .create(OUTPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("type").hasArg()
            .withDescription("(required) compression type: 'block', 'record', or 'none'")
            .create(COMPRESSION_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(COLLECTION_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)
            || !cmdline.hasOption(COMPRESSION_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String collection = cmdline.getOptionValue(COLLECTION_OPTION);
    String output = cmdline.getOptionValue(OUTPUT_OPTION);
    String compressionType = cmdline.getOptionValue(COMPRESSION_OPTION);

    if (!compressionType.equals("block") && !compressionType.equals("record")
            && !compressionType.equals("none")) {
        System.err.println("Error: \"" + compressionType + "\" unknown compression type!");
        System.exit(-1);
    }

    // This is the default block size.
    int blocksize = 1000000;

    Job job = new Job(getConf(), RepackTrecWebCollection.class.getSimpleName() + ":" + collection);
    FileSystem fs = FileSystem.get(job.getConfiguration());

    job.setJarByClass(RepackTrecWebCollection.class);

    LOG.info("Tool name: " + RepackTrecWebCollection.class.getCanonicalName());
    LOG.info(" - collection path: " + collection);
    LOG.info(" - output path: " + output);
    LOG.info(" - compression type: " + compressionType);

    if (compressionType.equals("block")) {
        LOG.info(" - block size: " + blocksize);
    }

    Path collectionPath = new Path(collection);
    for (FileStatus status : fs.listStatus(collectionPath)) {
        if (status.isDirectory()) {
            for (FileStatus s : fs.listStatus(status.getPath())) {
                FileInputFormat.addInputPath(job, s.getPath());
            }
        } else {
            FileInputFormat.addInputPath(job, status.getPath());
        }
    }

    // Hack to figure out number of reducers.
    int numReducers = 100;
    if (collection.toLowerCase().contains("wt10g")) {
        numReducers = 50;
    } else if (collection.toLowerCase().contains("gov2")) {
        numReducers = 200;
    }
    LOG.info(" - number of reducers: " + numReducers);
    job.setNumReduceTasks(numReducers);

    FileOutputFormat.setOutputPath(job, new Path(output));

    if (compressionType.equals("none")) {
        SequenceFileOutputFormat.setCompressOutput(job, false);
    } else {
        SequenceFileOutputFormat.setCompressOutput(job, true);

        if (compressionType.equals("record")) {
            SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.RECORD);
        } else {
            SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
            job.getConfiguration().setInt("io.seqfile.compress.blocksize", blocksize);
        }
    }

    job.setInputFormatClass(TrecWebDocumentInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(TrecWebDocument.class);

    job.setMapperClass(MyMapper.class);

    // delete the output directory if it exists already
    fs.delete(new Path(output), true);

    try {
        job.waitForCompletion(true);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }

    return 0;
}

From source file: edu.umd.cloud9.collection.wikipedia.RepackWikipedia.java

License: Apache License

@SuppressWarnings("static-access")
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("XML dump file").create(INPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output location")
            .create(OUTPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("mapping file")
            .create(MAPPING_FILE_OPTION));
    options.addOption(OptionBuilder.withArgName("block|record|none").hasArg()
            .withDescription("compression type").create(COMPRESSION_TYPE_OPTION));
    options.addOption(OptionBuilder
            .withArgName("en|sv|nl|de|fr|ru|it|es|vi|pl|ja|pt|zh|uk|ca|fa|no|fi|id|ar|sr|ko|hi|zh_yue|cs|tr")
            .hasArg().withDescription("two-letter or six-letter language code").create(LANGUAGE_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)
            || !cmdline.hasOption(MAPPING_FILE_OPTION) || !cmdline.hasOption(COMPRESSION_TYPE_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT_OPTION);
    String outputPath = cmdline.getOptionValue(OUTPUT_OPTION);
    String mappingFile = cmdline.getOptionValue(MAPPING_FILE_OPTION);
    String compressionType = cmdline.getOptionValue(COMPRESSION_TYPE_OPTION);

    if (!"block".equals(compressionType) && !"record".equals(compressionType)
            && !"none".equals(compressionType)) {
        System.err.println("Error: \"" + compressionType + "\" unknown compression type!");
        return -1;
    }

    String language = null;
    if (cmdline.hasOption(LANGUAGE_OPTION)) {
        language = cmdline.getOptionValue(LANGUAGE_OPTION);
        if (!(language.length() == 2 || language.length() == 6)) {
            // Added length check for 6 to include languages like zh_yue
            System.err.println("Error: \"" + language + "\" unknown language!");
            return -1;
        }
    }

    // this is the default block size
    int blocksize = 1000000;

    Job job = Job.getInstance(getConf());
    job.setJarByClass(RepackWikipedia.class);
    job.setJobName(String.format("RepackWikipedia[%s: %s, %s: %s, %s: %s, %s: %s]", INPUT_OPTION, inputPath,
            OUTPUT_OPTION, outputPath, COMPRESSION_TYPE_OPTION, compressionType, LANGUAGE_OPTION, language));

    job.getConfiguration().set(DOCNO_MAPPING_FIELD, mappingFile);

    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - XML dump file: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - docno mapping data file: " + mappingFile);
    LOG.info(" - compression type: " + compressionType);
    LOG.info(" - language: " + language);

    if ("block".equals(compressionType)) {
        LOG.info(" - block size: " + blocksize);
    }

    job.setNumReduceTasks(0);

    SequenceFileInputFormat.addInputPath(job, new Path(inputPath));
    SequenceFileOutputFormat.setOutputPath(job, new Path(outputPath));

    if ("none".equals(compressionType)) {
        SequenceFileOutputFormat.setCompressOutput(job, false);
    } else {
        SequenceFileOutputFormat.setCompressOutput(job, true);

        if ("record".equals(compressionType)) {
            SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.RECORD);
        } else {
            SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
            job.getConfiguration().setInt("io.seqfile.compress.blocksize", blocksize);
        }
    }

    if (language != null) {
        job.getConfiguration().set("wiki.language", language);
    }

    job.setInputFormatClass(WikipediaPageInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(WikipediaPageFactory.getWikipediaPageClass(language));

    job.setMapperClass(MyMapper.class);

    // Delete the output directory if it exists already.
    FileSystem.get(getConf()).delete(new Path(outputPath), true);

    job.waitForCompletion(true);

    return 0;
}

From source file: edu.umd.cloud9.webgraph.driver.wt10g.GenericExtractLinks.java

License: Apache License

@Override
public int runTool() throws Exception {

    Configuration conf = getConf();
    Job job = new Job(conf);

    int numReducers = conf.getInt("Cloud9.Reducers", 200);

    String inputPath = conf.get("Cloud9.InputPath");
    String outputPath = conf.get("Cloud9.OutputPath");

    String mappingFile = conf.get("Cloud9.DocnoMappingFile");

    FileSystem fs = FileSystem.get(conf);
    if (!fs.exists(new Path(mappingFile))) {
        throw new RuntimeException("Error: Docno mapping data file " + mappingFile + " doesn't exist!");
    }

    DistributedCache.addCacheFile(new Path(mappingFile).toUri(), job.getConfiguration());

    job.setJobName("ExtractLinks");
    conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.setInt("mapred.task.timeout", 60000000);

    job.setNumReduceTasks(numReducers);

    job.setMapperClass(GenericExtractLinks.Map.class);
    job.setCombinerClass(GenericExtractLinks.Reduce.class);
    job.setReducerClass(GenericExtractLinks.Reduce.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(ArrayListWritable.class);

    configer.applyJobConfig(job);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    recursivelyAddInputPaths(job, inputPath);

    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    LOG.info("ExtractLinks");
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - mapping file: " + mappingFile);
    LOG.info(" - include internal links? " + conf.getBoolean("Cloud9.IncludeInternalLinks", false));

    job.waitForCompletion(true);
    return 0;
}

From source file: edu.umd.cloud9.webgraph.TrecExtractLinks.java

License: Apache License

@Override
public int runTool() throws Exception {

    Configuration conf = getConf();
    conf.set("mapred.child.java.opts", "-Xmx3072m");
    conf.setInt("mapred.task.timeout", 60000000);
    Job job = new Job(conf);

    int numReducers = conf.getInt("Cloud9.Reducers", 200);

    String inputPath = conf.get("Cloud9.InputPath");
    String outputPath = conf.get("Cloud9.OutputPath");

    String mappingFile = conf.get("Cloud9.DocnoMappingFile");

    FileSystem fs = FileSystem.get(conf);
    if (!fs.exists(new Path(mappingFile))) {
        throw new RuntimeException("Error: Docno mapping data file " + mappingFile + " doesn't exist!");
    }

    DistributedCache.addCacheFile(new Path(mappingFile).toUri(), job.getConfiguration());

    job.setJobName("ExtractLinks");
    job.setNumReduceTasks(numReducers);

    job.setJarByClass(TrecExtractLinks.class);
    job.setMapperClass(TrecExtractLinks.Map.class);
    job.setCombinerClass(TrecExtractLinks.Reduce.class);
    job.setReducerClass(TrecExtractLinks.Reduce.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(ArrayListWritable.class);

    configer.applyJobConfig(job);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    recursivelyAddInputPaths(job, inputPath);

    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    LOG.info("ExtractLinks");
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - mapping file: " + mappingFile);
    LOG.info(" - include internal links? " + conf.getBoolean("Cloud9.IncludeInternalLinks", false));

    job.waitForCompletion(true);
    return 0;
}

From source file: eu.scape_project.tb.lsdr.seqfileutility.hadoop.HadoopJob.java

License: Apache License

/**
 * Run hadoop job
 *
 * @param strings Command line arguments
 * @return Success indicator
 * @throws Exception
 */
@Override
public int run(String[] strings) throws Exception {
    try {
        String hdfsInputDir = null;
        FileSystem hdfs = FileSystem.get(conf);

        // hdfs input path is given as command parameter
        if (pc.getHdfsInputPath() != null) {
            hdfsInputDir = pc.getHdfsInputPath();
            // hdfs input file is created
        } else {
            hdfsInputDir = "input/" + System.currentTimeMillis() + "sfu/";

            String[] extensions = null;
            if (pc.getExtStr() != null) {
                StringTokenizer st = new StringTokenizer(pc.getExtStr(), ",");
                extensions = new String[st.countTokens()];
                int i = 0;
                while (st.hasMoreTokens()) {
                    extensions[i] = st.nextToken();
                    i++;
                }
            }

            hdfs.mkdirs(new Path(hdfsInputDir));

            String hdfsInputPathsFile = hdfsInputDir + "inputpaths.txt";
            Path path = new Path(hdfsInputPathsFile);

            FSDataOutputStream outputStream = hdfs.create(path);

            List<String> dirs = StringUtils.getStringListFromString(pc.getDirsStr(), ",");
            for (String dir : dirs) {
                File directory = new File(dir);
                if (directory.isDirectory()) {
                    // Alternatively, the java traverse method can be used
                    // for creating the file paths:
                    //traverse(directory, outputStream);
                    writeFilePaths(directory, outputStream);
                } else {
                    logger.warn("Parameter \"" + dir + "\" is not a directory " + "(skipped)");
                }
            }
            outputStream.close();
            if (hdfs.exists(path)) {
                logger.info(
                        "Input paths created in \"" + hdfs.getHomeDirectory() + "/" + path.toString() + "\"");
            } else {
                logger.error("Input paths have not been created in hdfs.");
                return 1;
            }
        }
        String hadoopJobName = "Hadoop_sequence_file_creation";
        if (pc.getHadoopJobName() != null && !pc.getHadoopJobName().equals(""))
            hadoopJobName = pc.getHadoopJobName();
        Job job = new Job(conf, hadoopJobName);

        job.setJarByClass(SequenceFileUtility.class);
        job.setMapperClass(SmallFilesSequenceFileMapper.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BytesWritable.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        TextInputFormat.addInputPath(job, new Path(hdfsInputDir));

        // todo: support absolute paths
        String hdfsOutputDir = pc.getOutputDirectory() != null ? pc.getOutputDirectory()
                : "output/" + System.currentTimeMillis() + "sfu/";

        SequenceFileOutputFormat.setOutputPath(job, new Path(hdfsOutputDir));
        SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.get(pc.getCompressionType()));

        int success = job.waitForCompletion(true) ? 0 : 1;
        boolean seqFileExists = hdfs.exists(new Path(hdfsOutputDir + "part-r-00000"));
        if (success == 0 && seqFileExists) {
            logger.info("Sequence file created: \""
                    //+ hdfs.getHomeDirectory() + "/"
                    + new Path(hdfsOutputDir).toString() + "/part-r-00000" + "\"");
            pc.setOutputDirectory(hdfsOutputDir);
            return 0;
        } else {
            logger.error("Sequence file not created in hdfs");
            return 1;
        }
    } catch (Exception e) {
        logger.error("Exception occurred", e);
    }
    return 0;
}

From source file: gaffer.accumulo.splitpoints.EstimateSplitPointsDriver.java

License: Apache License

@Override
public int run(String[] args) throws Exception {

    if (args.length < 5) {
        System.err.println("Usage: " + this.getClass().getName()
                + " <mapred_output_directory> <proportion_to_sample> <number_of_tablet_servers> <resulting_split_file> <input_path1>...");
        return 1;
    }

    // Parse arguments
    Path outputPath = new Path(args[0]);
    float proportionToSample = Float.parseFloat(args[1]);
    int numberTabletServers = Integer.parseInt(args[2]);
    Path resultingSplitsFile = new Path(args[3]);
    Path[] inputPaths = new Path[args.length - 4];
    for (int i = 0; i < inputPaths.length; i++) {
        inputPaths[i] = new Path(args[i + 4]);
    }

    // Conf and job
    Configuration conf = getConf();
    conf.setFloat("proportion_to_sample", proportionToSample);
    String jobName = "Estimate split points: input = ";
    for (int i = 0; i < inputPaths.length; i++) {
        jobName += inputPaths[i] + ", ";
    }
    jobName += "output = " + outputPath;
    Job job = Job.getInstance(conf, jobName);
    job.setJarByClass(getClass());

    // Input
    job.setInputFormatClass(SequenceFileInputFormat.class);
    for (int i = 0; i < inputPaths.length; i++) {
        SequenceFileInputFormat.addInputPath(job, inputPaths[i]);
    }

    // Mapper
    job.setMapperClass(EstimateSplitPointsMapper.class);
    job.setMapOutputKeyClass(Key.class);
    job.setMapOutputValueClass(Value.class);

    // Reducer
    job.setReducerClass(EstimateSplitPointsReducer.class);
    job.setOutputKeyClass(Key.class);
    job.setOutputValueClass(Value.class);
    job.setNumReduceTasks(1);

    // Output
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(job, outputPath);
    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    // Run job
    job.waitForCompletion(true);

    // Successful?
    if (!job.isSuccessful()) {
        System.err.println("Error running job");
        return 1;
    }

    // Number of records output
    // NB In the following line use mapred.Task.Counter.REDUCE_OUTPUT_RECORDS rather than
    // mapreduce.TaskCounter.REDUCE_OUTPUT_RECORDS as this is more compatible with earlier
    // versions of Hadoop.
    @SuppressWarnings("deprecation")
    Counter counter = job.getCounters()
            .findCounter(org.apache.hadoop.mapred.Task.Counter.REDUCE_OUTPUT_RECORDS);
    long recordsOutput = counter.getValue();
    System.out.println("Number of records output = " + recordsOutput);

    // Work out when to output a split point. The number of split points
    // needed is the number of tablet servers minus 1 (because you don't
    // have to output the start of the first tablet or the end of the
    // last tablet).
    long outputEveryNthRecord = recordsOutput / (numberTabletServers - 1);

    // Read through resulting file, pick out the split points and write to
    // file.
    FileSystem fs = FileSystem.get(conf);
    Path resultsFile = new Path(outputPath, "part-r-00000");
    @SuppressWarnings("deprecation")
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, resultsFile, conf);
    PrintStream splitsWriter = new PrintStream(new BufferedOutputStream(fs.create(resultingSplitsFile, true)));
    Key key = new Key();
    Value value = new Value();
    long count = 0;
    int numberSplitPointsOutput = 0;
    while (reader.next(key, value) && numberSplitPointsOutput < numberTabletServers - 1) {
        count++;
        if (count % outputEveryNthRecord == 0) {
            numberSplitPointsOutput++;
            splitsWriter.println(new String(Base64.encodeBase64(key.getRow().getBytes())));
            System.out.println("Written split point: " + key.getRow());
        }
    }
    reader.close();
    splitsWriter.close();
    System.out.println("Number of split points output = " + numberSplitPointsOutput);
    return 0;
}

From source file: gaffer.accumulostore.operation.hdfs.handler.job.factory.SampleDataForSplitPointsJobFactory.java

License: Apache License

private void setupOutput(final Job job, final SampleDataForSplitPoints operation, final Store store)
        throws IOException {
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(job, new Path(operation.getOutputPath()));
    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
}

From source file: gaffer.accumulostore.operation.hdfs.handler.job.SampleDataForSplitPointsJobFactory.java

License: Apache License

private void setupOutput(final Job job, final SampleDataForSplitPoints operation, final Store store)
        throws IOException {
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(job, operation.getOutputPath());
    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
}

From source file: hadoop.TrainingDriver.java

License: Open Source License

public int run(String[] args) throws Exception {
    Configuration conf = getConf();

    String input = conf.get("gc.TrainingDriver.input");
    String output = conf.get("gc.TrainingDriver.output");
    String dataset = conf.get("gc.TrainingDriver.dataset");
    String jobname = conf.get("gc.TrainingDriver.name");

    if (input == null || output == null || dataset == null || jobname == null) {
        System.out.println(" Incorrect parameters ");
        System.exit(0);
    }

    conf = addPathToDC(conf, conf.get("gc.TrainingDriver.dataset") + "*");

    Job job = new Job(conf);
    job.setJarByClass(TrainingDriver.class);
    job.setJobName(jobname);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(WeightParameter.class);

    job.setMapperClass(TrainingDriverMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(WeightParameter.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.RECORD);
    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, new Path(output));

    System.out.println(" Input dir = " + input);
    System.out.println(" Output dir = " + output);
    System.out.println(" Training Input = " + dataset);
    System.out.println(" Name = " + jobname);

    if (job.waitForCompletion(true) == false) {
        System.err.println(" Job " + jobname + " Failed (miserably)");
        System.exit(2);
    }
    return 0;
}