Example usage for org.apache.hadoop.mapreduce.lib.output FileOutputFormat setCompressOutput

List of usage examples for org.apache.hadoop.mapreduce.lib.output FileOutputFormat setCompressOutput

Introduction

This page collects usage examples for org.apache.hadoop.mapreduce.lib.output FileOutputFormat setCompressOutput.

Prototype

public static void setCompressOutput(Job job, boolean compress) 

Source Link

Document

Set whether the output of the job is compressed.

Usage

From source file:my.mahout.SequenceFilesFromDirectory.java

License:Apache License

/**
 * Configures and runs the MapReduce job that reads the directory tree under
 * {@code input} and writes sequence files to {@code output}, with output
 * compression enabled.
 *
 * @param input  root of the input directory tree
 * @param output job output directory
 * @return 0 if the job completed successfully, -1 otherwise
 * @throws IOException            propagated from file system access or job submission
 * @throws ClassNotFoundException if the configured file filter class cannot be loaded,
 *                                or propagated from job submission
 * @throws InterruptedException   if waiting for the job is interrupted
 */
private int runMapReduce(Path input, Path output)
        throws IOException, ClassNotFoundException, InterruptedException {

    int chunkSizeInMB = 64;
    if (hasOption(CHUNK_SIZE_OPTION[0])) {
        chunkSizeInMB = Integer.parseInt(getOption(CHUNK_SIZE_OPTION[0]));
    }

    String keyPrefix = null;
    if (hasOption(KEY_PREFIX_OPTION[0])) {
        keyPrefix = getOption(KEY_PREFIX_OPTION[0]);
    }

    String fileFilterClassName = null;
    if (hasOption(FILE_FILTER_CLASS_OPTION[0])) {
        fileFilterClassName = getOption(FILE_FILTER_CLASS_OPTION[0]);
    }

    PathFilter pathFilter = null;
    // Prefix addition is handled in the Mapper here, so (unlike the sequential
    // path) PrefixAdditionFilter does not need to be instantiated as a PathFilter.
    if (!StringUtils.isBlank(fileFilterClassName)
            && !PrefixAdditionFilter.class.getName().equals(fileFilterClassName)) {
        try {
            pathFilter = (PathFilter) Class.forName(fileFilterClassName).newInstance();
        } catch (InstantiationException e) {
            throw new IllegalStateException(e);
        } catch (IllegalAccessException e) {
            throw new IllegalStateException(e);
        }
    }

    // Prepare Job for submission.
    Job job = prepareJob(input, output, MultipleTextFileInputFormat.class,
            SequenceFilesFromDirectoryMapper.class, Text.class, Text.class, SequenceFileOutputFormat.class,
            "SequenceFilesFromDirectory");

    Configuration jobConfig = job.getConfiguration();
    // Configuration.set rejects null values, so only forward options that
    // actually carry a value.
    if (keyPrefix != null) {
        jobConfig.set(KEY_PREFIX_OPTION[0], keyPrefix);
    }
    if (fileFilterClassName != null) {
        jobConfig.set(FILE_FILTER_CLASS_OPTION[0], fileFilterClassName);
    }

    FileSystem fs = FileSystem.get(jobConfig);
    FileStatus fsFileStatus = fs.getFileStatus(input);

    String inputDirList;
    if (pathFilter != null) {
        inputDirList = HadoopUtil.buildDirList(fs, fsFileStatus, pathFilter);
    } else {
        inputDirList = HadoopUtil.buildDirList(fs, fsFileStatus);
    }

    jobConfig.set(BASE_INPUT_PATH, input.toString());

    // Multiply in long arithmetic: an int product overflows for chunk sizes
    // of 2048 MB and above.
    long chunkSizeInBytes = chunkSizeInMB * 1024L * 1024L;

    // set the max split locations, otherwise we get nasty debug stuff
    jobConfig.set("mapreduce.job.max.split.locations", String.valueOf(MAX_JOB_SPLIT_LOCATIONS));

    FileInputFormat.setInputPaths(job, inputDirList);
    // need to set this to a multiple of the block size, or no split happens
    FileInputFormat.setMaxInputSplitSize(job, chunkSizeInBytes);
    FileOutputFormat.setCompressOutput(job, true);

    boolean succeeded = job.waitForCompletion(true);
    return succeeded ? 0 : -1;
}

From source file:nl.naward04.hadoop.country.Country.java

License:Apache License

/**
 * Configures and runs the country lookup job: WARC input from {@code args[0]},
 * gzip-compressed sequence file output (BLOCK compression type) to {@code args[1]}.
 *
 * @param args {@code args[0]} is the input path, {@code args[1]} the output path
 * @return 0 if the job succeeded, 1 if it failed, 2 if too few arguments were given
 * @throws Exception propagated from job setup or execution
 */
@Override
public int run(String[] args) throws Exception {

    // Fail with a usage message instead of an ArrayIndexOutOfBoundsException.
    if (args == null || args.length < 2) {
        System.err.println("Usage: Country <input path> <output path>");
        return 2;
    }

    Configuration conf = this.getConf();

    // Set compress type to compress BLOCKs (not RECORDs)
    // https://hadoop.apache.org/docs/r2.4.0/hadoop-mapreduce-client/hadoop-mapreduce-client-core/mapred-default.xml
    // http://hadoop.apache.org/docs/r2.4.0/api/org/apache/hadoop/io/SequenceFile.html
    conf.set(FileOutputFormat.COMPRESS_TYPE, "BLOCK");

    Job job = Job.getInstance(conf, "Find the country based on domain name or IP address.");
    job.setJarByClass(Country.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(CountryLookup.class);
    job.setInputFormatClass(WarcInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // Enable compression
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    // Execute job and return status
    return job.waitForCompletion(true) ? 0 : 1;

}

From source file:nl.naward05.hadoop.MergeFiles.java

License:Apache License

/**
 * Configures and runs the merge job: sequence file input from {@code args[0]}
 * and {@code args[1]}, reduced by {@code MergeReducer}, with gzip-compressed
 * sequence file output (BLOCK compression type) written to {@code args[2]}.
 *
 * @param args {@code args[0]} and {@code args[1]} are the two input paths,
 *             {@code args[2]} is the output path
 * @return 0 if the job succeeded, 1 if it failed, 2 if too few arguments were given
 * @throws Exception propagated from job setup or execution
 */
@Override
public int run(String[] args) throws Exception {

    // Fail with a usage message instead of an ArrayIndexOutOfBoundsException.
    if (args == null || args.length < 3) {
        System.err.println("Usage: MergeFiles <input path 1> <input path 2> <output path>");
        return 2;
    }

    Configuration conf = this.getConf();

    // Set compress type to compress BLOCKs (not RECORDs)
    // https://hadoop.apache.org/docs/r2.4.0/hadoop-mapreduce-client/hadoop-mapreduce-client-core/mapred-default.xml
    // http://hadoop.apache.org/docs/r2.4.0/api/org/apache/hadoop/io/SequenceFile.html
    conf.set(FileOutputFormat.COMPRESS_TYPE, "BLOCK");

    Job job = Job.getInstance(conf, "Merge countries and songs");
    job.setJarByClass(MergeFiles.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileInputFormat.addInputPath(job, new Path(args[1]));
    FileOutputFormat.setOutputPath(job, new Path(args[2]));

    // No mapper class is set here; only the reducer is configured.
    job.setReducerClass(MergeReducer.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    // Enable compression
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    // Execute job and return status
    return job.waitForCompletion(true) ? 0 : 1;

}

From source file:nl.surfsara.warcexamples.hadoop.rr.RR.java

License:Apache License

/**
 * Configures and runs the "Record Recognizer" job: WARC input from
 * {@code args[0]}, mapped by {@code RRMapper}, with gzip-compressed sequence
 * file output (BLOCK compression type) written to {@code args[1]}.
 *
 * @param args {@code args[0]} is the input path, {@code args[1]} the output path
 * @return 0 if the job succeeded, 1 if it failed, 2 if too few arguments were given
 * @throws Exception propagated from job setup or execution
 */
@Override
public int run(String[] args) throws Exception {

    // Fail with a usage message instead of an ArrayIndexOutOfBoundsException.
    if (args == null || args.length < 2) {
        System.err.println("Usage: RR <input path> <output path>");
        return 2;
    }

    Configuration conf = this.getConf();

    // Compress sequence file output per BLOCK rather than per RECORD.
    conf.set(FileOutputFormat.COMPRESS_TYPE, "BLOCK");

    Job job = Job.getInstance(conf, "Record Recognizer");
    job.setJarByClass(RR.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(RRMapper.class);
    job.setInputFormatClass(WarcInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // Enable compression
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    // Execute job and return status
    return job.waitForCompletion(true) ? 0 : 1;

}

From source file:nl.utwente.bigdata.TemplateTool.java

License:Apache License

/**
 * Configures and runs the template MapReduce job with {@code MyMapper},
 * {@code MyCombiner} and {@code MyReducer}, reading from {@code inputPath}
 * and writing uncompressed output to {@code outPath}.
 *
 * <p>If {@code outPath} already exists the process prints a message to stderr
 * and exits with status 1, because the "already exists" policy is hard-coded
 * to {@code "exit"}.
 *
 * @param inputPath path of the job input
 * @param outPath   path of the job output directory
 * @throws Exception propagated from job setup or execution
 */
public void run(String inputPath, String outPath) throws Exception {
    Configuration conf = getConf();
    Job job = Job.getInstance(conf);
    job.setJarByClass(TemplateTool.class);
    job.setJobName(String.format("%s [%s, %s]", this.getClass().getName(), inputPath, outPath));

    // -- check if output directory already exists; and optionally delete
    String outputAlreadyExistsOption = "exit";
    Path outDir = new Path(outPath);
    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(outDir)) {
        if (outputAlreadyExistsOption.equalsIgnoreCase("delete")) {
            fs.delete(outDir, true);
        } else {
            System.err.println("Directory " + outPath + " already exists; exiting");
            System.exit(1);
        }
    }

    // ---- Input (Format) Options
    String inputFormat = "text";
    if (inputFormat.equalsIgnoreCase("text")) {
        job.setInputFormatClass(TextInputFormat.class);
    } else if (inputFormat.equalsIgnoreCase("sequence")) {
        // This branch previously re-tested "text" and could never be taken.
        job.setInputFormatClass(SequenceFileInputFormat.class);
    }
    // Utils.recursivelyAddInputPaths(job, new Path(inputPath));
    FileInputFormat.addInputPath(job, new Path(inputPath));
    // Add files that should be available locally at each mapper
    // Utils.addCacheFiles(job, new String[] { });

    // ---- Mapper
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(MyMapper.KOUT);
    job.setMapOutputValueClass(MyMapper.VOUT);

    // ---- Combiner
    job.setCombinerClass(MyCombiner.class);

    // ---- Partitioner
    // job.setPartitionerClass(MyPartitioner.class);

    // ---- Reducer
    // set the number of reducers to influence the number of output files
    // job.setNumReduceTasks(100);
    job.setReducerClass(MyReducer.class);
    job.setOutputKeyClass(MyReducer.KOUT);
    job.setOutputValueClass(MyReducer.VOUT);

    // ---- Output Options
    String outputFormat = "text";
    if (outputFormat.equalsIgnoreCase("sequence")) {
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
    } else if (outputFormat.equalsIgnoreCase("text")) {
        job.setOutputFormatClass(TextOutputFormat.class);
    } else if (outputFormat.equalsIgnoreCase("null")) {
        job.setOutputFormatClass(NullOutputFormat.class);
    }
    FileOutputFormat.setOutputPath(job, outDir);
    FileOutputFormat.setCompressOutput(job, false);

    // ---- Start job
    // NOTE(review): the completion status is discarded, so callers cannot
    // tell a failed job from a successful one through this method.
    job.waitForCompletion(true);
}

From source file:nl.utwente.mirex.AnchorExtract.java

License:Open Source License

/**
 * Runs the MapReduce job "anchor text extraction" and exits with status 0 if
 * the job succeeded or 1 if it failed or the arguments were invalid.
 * @param args 0: path to web collection on HDFS; 1: (non-existing) path that will contain anchor texts
 * @usage.
 * <code> hadoop jar mirex-0.2.jar nl.utwente.mirex.AnchorExtract /user/hadoop/ClueWeb09_English/&#x2a;/ /user/hadoop/ClueWeb09_Anchors </code> 
 */
public static void main(String[] args) throws Exception {
    // Validate arguments before building any job state.
    if (args.length != 2) {
        System.out.printf("Usage: %s inputFiles outputFile\n", AnchorExtract.class.getSimpleName());
        System.out.println("          inputFiles: path to data");
        System.out.println("          outputFile: directory where anchor text is stored");
        System.exit(1);
    }
    String inputFiles = args[0];
    String outputFile = args[1];

    // Set job configuration
    Configuration conf = new Configuration();
    conf.setLong("mapred.task.timeout", 1800 * 1000L); // 30 minutes timeout
    Job job = new Job(conf, "AnchorExtract");
    job.setJarByClass(AnchorExtract.class);

    job.setMapperClass(Map.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setCombinerClass(Combine.class);

    job.setReducerClass(Reduce.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setInputFormatClass(WarcFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileInputFormat.setInputPaths(job, new Path(inputFiles)); // '(conf, args[0])' to accept comma-separated list.
    FileOutputFormat.setOutputPath(job, new Path(outputFile));
    // Write gzip-compressed text output.
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    // Propagate the job result as the process exit status; previously the
    // result was dropped and a failed job still exited with 0.
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file:nl.utwente.trafficanalyzer.CarCount.java

License:Apache License

/**
 * Configures and runs the car-count MapReduce job with {@code MyMapper} and
 * {@code MyReducer} (no combiner), reading from {@code inputPath} and writing
 * uncompressed output to {@code outPath}.
 *
 * <p>If {@code outPath} already exists the process prints a message to stderr
 * and exits with status 1, because the "already exists" policy is hard-coded
 * to {@code "exit"}.
 *
 * @param inputPath path of the job input
 * @param outPath   path of the job output directory
 * @throws Exception propagated from job setup or execution
 */
public void run(String inputPath, String outPath) throws Exception {
    Configuration conf = getConf();
    Job job = Job.getInstance(conf);
    job.setJarByClass(CarCount.class);
    job.setJobName(String.format("%s [%s, %s]", this.getClass().getName(), inputPath, outPath));

    // -- check if output directory already exists; and optionally delete
    String outputAlreadyExistsOption = "exit";
    Path outDir = new Path(outPath);
    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(outDir)) {
        if (outputAlreadyExistsOption.equalsIgnoreCase("delete")) {
            fs.delete(outDir, true);
        } else {
            System.err.println("Directory " + outPath + " already exists; exiting");
            System.exit(1);
        }
    }

    // ---- Input (Format) Options
    String inputFormat = "text";
    if (inputFormat.equalsIgnoreCase("text")) {
        job.setInputFormatClass(TextInputFormat.class);
    } else if (inputFormat.equalsIgnoreCase("sequence")) {
        // This branch previously re-tested "text" and could never be taken.
        job.setInputFormatClass(SequenceFileInputFormat.class);
    }
    // Utils.recursivelyAddInputPaths(job, new Path(inputPath));
    FileInputFormat.addInputPath(job, new Path(inputPath));
    // Add files that should be available locally at each mapper
    // Utils.addCacheFiles(job, new String[] { });

    // ---- Mapper
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(MyMapper.KOUT);
    job.setMapOutputValueClass(MyMapper.VOUT);

    // ---- Combiner
    //job.setCombinerClass(MyCombiner.class);

    // ---- Partitioner
    // job.setPartitionerClass(MyPartitioner.class);

    // ---- Reducer
    // set the number of reducers to influence the number of output files
    // job.setNumReduceTasks(100);
    job.setReducerClass(MyReducer.class);
    job.setOutputKeyClass(MyReducer.KOUT);
    job.setOutputValueClass(MyReducer.VOUT);

    // ---- Output Options
    String outputFormat = "text";
    if (outputFormat.equalsIgnoreCase("sequence")) {
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
    } else if (outputFormat.equalsIgnoreCase("text")) {
        job.setOutputFormatClass(TextOutputFormat.class);
    } else if (outputFormat.equalsIgnoreCase("null")) {
        job.setOutputFormatClass(NullOutputFormat.class);
    }
    FileOutputFormat.setOutputPath(job, outDir);
    FileOutputFormat.setCompressOutput(job, false);

    // ---- Start job
    // NOTE(review): the completion status is discarded, so callers cannot
    // tell a failed job from a successful one through this method.
    job.waitForCompletion(true);
}

From source file:nl.utwente.trafficanalyzer.CarCountPerRoadPerDay.java

License:Apache License

/**
 * Configures and runs the car-count-per-road-per-day MapReduce job with
 * {@code MyMapper}, {@code MyCombiner} and a single {@code MyReducer} task,
 * reading from {@code inputPath} and writing uncompressed output to
 * {@code outPath}.
 *
 * <p>If {@code outPath} already exists the process prints a message to stderr
 * and exits with status 1, because the "already exists" policy is hard-coded
 * to {@code "exit"}.
 *
 * @param inputPath path of the job input
 * @param outPath   path of the job output directory
 * @throws Exception propagated from job setup or execution
 */
public void run(String inputPath, String outPath) throws Exception {
    Configuration conf = getConf();
    Job job = Job.getInstance(conf);
    job.setJarByClass(CarCountPerRoadPerDay.class);
    job.setJobName(String.format("%s [%s, %s]", this.getClass().getName(), inputPath, outPath));

    // -- check if output directory already exists; and optionally delete
    String outputAlreadyExistsOption = "exit";
    Path outDir = new Path(outPath);
    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(outDir)) {
        if (outputAlreadyExistsOption.equalsIgnoreCase("delete")) {
            fs.delete(outDir, true);
        } else {
            System.err.println("Directory " + outPath + " already exists; exiting");
            System.exit(1);
        }
    }

    // ---- Input (Format) Options
    String inputFormat = "text";
    if (inputFormat.equalsIgnoreCase("text")) {
        job.setInputFormatClass(TextInputFormat.class);
    } else if (inputFormat.equalsIgnoreCase("sequence")) {
        // This branch previously re-tested "text" and could never be taken.
        job.setInputFormatClass(SequenceFileInputFormat.class);
    }
    // Utils.recursivelyAddInputPaths(job, new Path(inputPath));
    FileInputFormat.addInputPath(job, new Path(inputPath));
    // Add files that should be available locally at each mapper
    // Utils.addCacheFiles(job, new String[] { });

    // ---- Mapper
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(MyMapper.KOUT);
    job.setMapOutputValueClass(MyMapper.VOUT);

    // ---- Combiner
    job.setCombinerClass(MyCombiner.class);

    // ---- Partitioner
    // job.setPartitionerClass(MyPartitioner.class);

    // ---- Reducer
    // set the number of reducers to influence the number of output files
    job.setNumReduceTasks(1);
    job.setReducerClass(MyReducer.class);
    job.setOutputKeyClass(MyReducer.KOUT);
    job.setOutputValueClass(MyReducer.VOUT);

    // ---- Output Options
    String outputFormat = "text";
    if (outputFormat.equalsIgnoreCase("sequence")) {
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
    } else if (outputFormat.equalsIgnoreCase("text")) {
        job.setOutputFormatClass(TextOutputFormat.class);
    } else if (outputFormat.equalsIgnoreCase("null")) {
        job.setOutputFormatClass(NullOutputFormat.class);
    }
    FileOutputFormat.setOutputPath(job, outDir);
    FileOutputFormat.setCompressOutput(job, false);

    // ---- Start job
    // NOTE(review): the completion status is discarded, so callers cannot
    // tell a failed job from a successful one through this method.
    job.waitForCompletion(true);
}

From source file:nl.utwente.trafficanalyzer.CarCountPerRoadPerDayIncreasedValidity.java

License:Apache License

/**
 * Configures and runs the "increased validity" car-count-per-road-per-day
 * MapReduce job with {@code MyMapper}, {@code MyCombiner} and a single
 * {@code MyReducer} task, reading from {@code inputPath} and writing
 * uncompressed output to {@code outPath}.
 *
 * <p>If {@code outPath} already exists the process prints a message to stderr
 * and exits with status 1, because the "already exists" policy is hard-coded
 * to {@code "exit"}.
 *
 * @param inputPath path of the job input
 * @param outPath   path of the job output directory
 * @throws Exception propagated from job setup or execution
 */
public void run(String inputPath, String outPath) throws Exception {
    Configuration conf = getConf();
    Job job = Job.getInstance(conf);
    job.setJarByClass(CarCountPerRoadPerDayIncreasedValidity.class);
    job.setJobName(String.format("%s [%s, %s]", this.getClass().getName(), inputPath, outPath));

    // -- check if output directory already exists; and optionally delete
    String outputAlreadyExistsOption = "exit";
    Path outDir = new Path(outPath);
    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(outDir)) {
        if (outputAlreadyExistsOption.equalsIgnoreCase("delete")) {
            fs.delete(outDir, true);
        } else {
            System.err.println("Directory " + outPath + " already exists; exiting");
            System.exit(1);
        }
    }

    // ---- Input (Format) Options
    String inputFormat = "text";
    if (inputFormat.equalsIgnoreCase("text")) {
        job.setInputFormatClass(TextInputFormat.class);
    } else if (inputFormat.equalsIgnoreCase("sequence")) {
        // This branch previously re-tested "text" and could never be taken.
        job.setInputFormatClass(SequenceFileInputFormat.class);
    }
    // Utils.recursivelyAddInputPaths(job, new Path(inputPath));
    FileInputFormat.addInputPath(job, new Path(inputPath));
    // Add files that should be available locally at each mapper
    // Utils.addCacheFiles(job, new String[] { });

    // ---- Mapper
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(MyMapper.KOUT);
    job.setMapOutputValueClass(MyMapper.VOUT);

    // ---- Combiner
    job.setCombinerClass(MyCombiner.class);

    // ---- Partitioner
    // job.setPartitionerClass(MyPartitioner.class);
    // ---- Reducer
    // set the number of reducers to influence the number of output files
    job.setNumReduceTasks(1);
    job.setReducerClass(MyReducer.class);
    job.setOutputKeyClass(MyReducer.KOUT);
    job.setOutputValueClass(MyReducer.VOUT);

    // ---- Output Options
    String outputFormat = "text";
    if (outputFormat.equalsIgnoreCase("sequence")) {
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
    } else if (outputFormat.equalsIgnoreCase("text")) {
        job.setOutputFormatClass(TextOutputFormat.class);
    } else if (outputFormat.equalsIgnoreCase("null")) {
        job.setOutputFormatClass(NullOutputFormat.class);
    }
    FileOutputFormat.setOutputPath(job, outDir);
    FileOutputFormat.setCompressOutput(job, false);

    // ---- Start job
    // NOTE(review): the completion status is discarded, so callers cannot
    // tell a failed job from a successful one through this method.
    job.waitForCompletion(true);
}

From source file:nl.utwente.trafficanalyzer.ReadingsPerSensor.java

License:Apache License

/**
 * Configures and runs the readings-per-sensor MapReduce job with
 * {@code MyMapper}, {@code MyCombiner} and a single {@code MyReducer} task,
 * reading from {@code inputPath} and writing uncompressed output to
 * {@code outPath}.
 *
 * <p>If {@code outPath} already exists the process prints a message to stderr
 * and exits with status 1, because the "already exists" policy is hard-coded
 * to {@code "exit"}.
 *
 * @param inputPath path of the job input
 * @param outPath   path of the job output directory
 * @throws Exception propagated from job setup or execution
 */
public void run(String inputPath, String outPath) throws Exception {
    Configuration conf = getConf();
    Job job = Job.getInstance(conf);
    job.setJarByClass(ReadingsPerSensor.class);
    job.setJobName(String.format("%s [%s, %s]", this.getClass().getName(), inputPath, outPath));

    // -- check if output directory already exists; and optionally delete
    String outputAlreadyExistsOption = "exit";
    Path outDir = new Path(outPath);
    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(outDir)) {
        if (outputAlreadyExistsOption.equalsIgnoreCase("delete")) {
            fs.delete(outDir, true);
        } else {
            System.err.println("Directory " + outPath + " already exists; exiting");
            System.exit(1);
        }
    }

    // ---- Input (Format) Options
    String inputFormat = "text";
    if (inputFormat.equalsIgnoreCase("text")) {
        job.setInputFormatClass(TextInputFormat.class);
    } else if (inputFormat.equalsIgnoreCase("sequence")) {
        // This branch previously re-tested "text" and could never be taken.
        job.setInputFormatClass(SequenceFileInputFormat.class);
    }
    // Utils.recursivelyAddInputPaths(job, new Path(inputPath));
    FileInputFormat.addInputPath(job, new Path(inputPath));
    // Add files that should be available locally at each mapper
    // Utils.addCacheFiles(job, new String[] { });

    // ---- Mapper
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(MyMapper.KOUT);
    job.setMapOutputValueClass(MyMapper.VOUT);

    // ---- Combiner
    job.setCombinerClass(MyCombiner.class);

    // ---- Partitioner
    // job.setPartitionerClass(MyPartitioner.class);

    // ---- Reducer
    // set the number of reducers to influence the number of output files
    job.setNumReduceTasks(1);
    job.setReducerClass(MyReducer.class);
    job.setOutputKeyClass(MyReducer.KOUT);
    job.setOutputValueClass(MyReducer.VOUT);

    // ---- Output Options
    String outputFormat = "text";
    if (outputFormat.equalsIgnoreCase("sequence")) {
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
    } else if (outputFormat.equalsIgnoreCase("text")) {
        job.setOutputFormatClass(TextOutputFormat.class);
    } else if (outputFormat.equalsIgnoreCase("null")) {
        job.setOutputFormatClass(NullOutputFormat.class);
    }
    FileOutputFormat.setOutputPath(job, outDir);
    FileOutputFormat.setCompressOutput(job, false);

    // ---- Start job
    // NOTE(review): the completion status is discarded, so callers cannot
    // tell a failed job from a successful one through this method.
    job.waitForCompletion(true);
}