List of usage examples for org.apache.hadoop.mapreduce.lib.output FileOutputFormat setCompressOutput
public static void setCompressOutput(Job job, boolean compress)
From source file:my.mahout.SequenceFilesFromDirectory.java
License:Apache License
private int runMapReduce(Path input, Path output) throws IOException, ClassNotFoundException, InterruptedException { int chunkSizeInMB = 64; if (hasOption(CHUNK_SIZE_OPTION[0])) { chunkSizeInMB = Integer.parseInt(getOption(CHUNK_SIZE_OPTION[0])); }// ww w. j a v a2 s . c o m String keyPrefix = null; if (hasOption(KEY_PREFIX_OPTION[0])) { keyPrefix = getOption(KEY_PREFIX_OPTION[0]); } String fileFilterClassName = null; if (hasOption(FILE_FILTER_CLASS_OPTION[0])) { fileFilterClassName = getOption(FILE_FILTER_CLASS_OPTION[0]); } PathFilter pathFilter = null; // Prefix Addition is presently handled in the Mapper and unlike // runsequential() // need not be done via a pathFilter if (!StringUtils.isBlank(fileFilterClassName) && !PrefixAdditionFilter.class.getName().equals(fileFilterClassName)) { try { pathFilter = (PathFilter) Class.forName(fileFilterClassName).newInstance(); } catch (InstantiationException e) { throw new IllegalStateException(e); } catch (IllegalAccessException e) { throw new IllegalStateException(e); } } // Prepare Job for submission. Job job = prepareJob(input, output, MultipleTextFileInputFormat.class, SequenceFilesFromDirectoryMapper.class, Text.class, Text.class, SequenceFileOutputFormat.class, "SequenceFilesFromDirectory"); Configuration jobConfig = job.getConfiguration(); jobConfig.set(KEY_PREFIX_OPTION[0], keyPrefix); jobConfig.set(FILE_FILTER_CLASS_OPTION[0], fileFilterClassName); FileSystem fs = FileSystem.get(jobConfig); FileStatus fsFileStatus = fs.getFileStatus(input); String inputDirList; if (pathFilter != null) { inputDirList = HadoopUtil.buildDirList(fs, fsFileStatus, pathFilter); } else { inputDirList = HadoopUtil.buildDirList(fs, fsFileStatus); } jobConfig.set(BASE_INPUT_PATH, input.toString()); long chunkSizeInBytes = chunkSizeInMB * 1024 * 1024; // set the max split locations, otherwise we get nasty debug stuff jobConfig.set("mapreduce.job.max.split.locations", String.valueOf(MAX_JOB_SPLIT_LOCATIONS)); FileInputFormat.setInputPaths(job, inputDirList); // need to set this to a multiple of the block size, or no split happens FileInputFormat.setMaxInputSplitSize(job, chunkSizeInBytes); FileOutputFormat.setCompressOutput(job, true); boolean succeeded = job.waitForCompletion(true); if (!succeeded) { return -1; } return 0; }
From source file:nl.naward04.hadoop.country.Country.java
License:Apache License
@Override public int run(String[] args) throws Exception { Configuration conf = this.getConf(); // Set compress type to compress BLOCKs (not RECORDs) // https://hadoop.apache.org/docs/r2.4.0/hadoop-mapreduce-client/hadoop-mapreduce-client-core/mapred-default.xml // http://hadoop.apache.org/docs/r2.4.0/api/org/apache/hadoop/io/SequenceFile.html conf.set(FileOutputFormat.COMPRESS_TYPE, "BLOCK"); Job job = Job.getInstance(conf, "Find the country based on domain name or IP address."); job.setJarByClass(Country.class); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.setMapperClass(CountryLookup.class); job.setInputFormatClass(WarcInputFormat.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); // Enable compression FileOutputFormat.setCompressOutput(job, true); FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class); // Execute job and return status return job.waitForCompletion(true) ? 0 : 1; }
From source file:nl.naward05.hadoop.MergeFiles.java
License:Apache License
@Override public int run(String[] args) throws Exception { Configuration conf = this.getConf(); // Set compress type to compress BLOCKs (not RECORDs) // https://hadoop.apache.org/docs/r2.4.0/hadoop-mapreduce-client/hadoop-mapreduce-client-core/mapred-default.xml // http://hadoop.apache.org/docs/r2.4.0/api/org/apache/hadoop/io/SequenceFile.html conf.set(FileOutputFormat.COMPRESS_TYPE, "BLOCK"); Job job = Job.getInstance(conf, "Merge countries and songs"); job.setJarByClass(MergeFiles.class); FileInputFormat.addInputPath(job, new Path(args[0])); FileInputFormat.addInputPath(job, new Path(args[1])); FileOutputFormat.setOutputPath(job, new Path(args[2])); job.setReducerClass(MergeReducer.class); job.setInputFormatClass(SequenceFileInputFormat.class); job.setMapOutputValueClass(Text.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(LongWritable.class); // Enable compression FileOutputFormat.setCompressOutput(job, true); FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class); // Execute job and return status return job.waitForCompletion(true) ? 0 : 1; }
From source file:nl.surfsara.warcexamples.hadoop.rr.RR.java
License:Apache License
@Override public int run(String[] args) throws Exception { Configuration conf = this.getConf(); conf.set(FileOutputFormat.COMPRESS_TYPE, "BLOCK"); Job job = Job.getInstance(conf, "Record Recognizer"); job.setJarByClass(RR.class); FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.setMapperClass(RRMapper.class); job.setInputFormatClass(WarcInputFormat.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); // job.setOutputValueClass(LongWritable.class); // job.setReducerClass(LongSumReducer.class); // Enable compression FileOutputFormat.setCompressOutput(job, true); FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class); // Execute job and return status return job.waitForCompletion(true) ? 0 : 1; }
From source file:nl.utwente.bigdata.TemplateTool.java
License:Apache License
public void run(String inputPath, String outPath) throws Exception { Configuration conf = getConf(); Job job = Job.getInstance(conf);// w w w .j av a2 s . c o m job.setJarByClass(TemplateTool.class); job.setJobName(String.format("%s [%s, %s]", this.getClass().getName(), inputPath, outPath)); // -- check if output directory already exists; and optionally delete String outputAlreadyExistsOption = "exit"; Path outDir = new Path(outPath); if (FileSystem.get(conf).exists(outDir)) { if (outputAlreadyExistsOption.equalsIgnoreCase("delete")) { FileSystem.get(conf).delete(outDir, true); } else { System.err.println("Directory " + outPath + " already exists; exiting"); System.exit(1); } } // ---- Input (Format) Options String inputFormat = "text"; if (inputFormat.equalsIgnoreCase("text")) { job.setInputFormatClass(TextInputFormat.class); } else if (inputFormat.equalsIgnoreCase("text")) { job.setInputFormatClass(SequenceFileInputFormat.class); } // Utils.recursivelyAddInputPaths(job, new Path(inputPath)); FileInputFormat.addInputPath(job, new Path(inputPath)); // Add files that should be available localy at each mapper // Utils.addCacheFiles(job, new String[] { }); // ---- Mapper job.setMapperClass(MyMapper.class); job.setMapOutputKeyClass(MyMapper.KOUT); job.setMapOutputValueClass(MyMapper.VOUT); // ---- Combiner job.setCombinerClass(MyCombiner.class); // ---- Partitioner // job.setPartitionerClass(MyPartitioner.class); // ---- Reducer // set the number of reducers to influence the number of output files // job.setNumReduceTasks(100); job.setReducerClass(MyReducer.class); job.setOutputKeyClass(MyReducer.KOUT); job.setOutputValueClass(MyReducer.VOUT); // ---- Output Options String outputFormat = "text"; if (outputFormat.equalsIgnoreCase("sequence")) { job.setOutputFormatClass(SequenceFileOutputFormat.class); } else if (outputFormat.equalsIgnoreCase("text")) { job.setOutputFormatClass(TextOutputFormat.class); } else if (outputFormat.equalsIgnoreCase("null")) { job.setOutputFormatClass(NullOutputFormat.class); } FileOutputFormat.setOutputPath(job, outDir); FileOutputFormat.setCompressOutput(job, false); // ---- Start job job.waitForCompletion(true); return; }
From source file:nl.utwente.mirex.AnchorExtract.java
License:Open Source License
/** * Runs the MapReduce job "anchor text extraction" * @param args 0: path to web collection on HDFS; 1: (non-existing) path that will contain anchor texts * @usage. //from w w w . ja v a2 s . co m * <code> hadoop jar mirex-0.2.jar nl.utwente.mirex.AnchorExtract /user/hadoop/ClueWeb09_English/*/ /user/hadoop/ClueWeb09_Anchors </code> */ public static void main(String[] args) throws Exception { // Set job configuration Configuration conf = new Configuration(); conf.setLong("mapred.task.timeout", 1800 * 1000L); // 30 minutes timeout Job job = new Job(conf, "AnchorExtract"); job.setJarByClass(AnchorExtract.class); if (args.length != 2) { System.out.printf("Usage: %s inputFiles outputFile\n", AnchorExtract.class.getSimpleName()); System.out.println(" inputFiles: path to data"); System.out.println(" outputFile: directory where anchor text is stored"); System.exit(1); } int argc = 0; String inputFiles = args[argc++]; String outputFile = args[argc++]; job.setMapperClass(Map.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); job.setCombinerClass(Combine.class); job.setReducerClass(Reduce.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); job.setInputFormatClass(WarcFileInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); FileInputFormat.setInputPaths(job, new Path(inputFiles)); // '(conf, args[0])' to accept comma-separated list. FileOutputFormat.setOutputPath(job, new Path(outputFile)); FileOutputFormat.setCompressOutput(job, true); FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class); job.waitForCompletion(true); }
From source file:nl.utwente.trafficanalyzer.CarCount.java
License:Apache License
public void run(String inputPath, String outPath) throws Exception { Configuration conf = getConf(); Job job = Job.getInstance(conf);/*from w w w . ja v a2 s. c o m*/ job.setJarByClass(CarCount.class); job.setJobName(String.format("%s [%s, %s]", this.getClass().getName(), inputPath, outPath)); // -- check if output directory already exists; and optionally delete String outputAlreadyExistsOption = "exit"; Path outDir = new Path(outPath); if (FileSystem.get(conf).exists(outDir)) { if (outputAlreadyExistsOption.equalsIgnoreCase("delete")) { FileSystem.get(conf).delete(outDir, true); } else { System.err.println("Directory " + outPath + " already exists; exiting"); System.exit(1); } } // ---- Input (Format) Options String inputFormat = "text"; if (inputFormat.equalsIgnoreCase("text")) { job.setInputFormatClass(TextInputFormat.class); } else if (inputFormat.equalsIgnoreCase("text")) { job.setInputFormatClass(SequenceFileInputFormat.class); } // Utils.recursivelyAddInputPaths(job, new Path(inputPath)); FileInputFormat.addInputPath(job, new Path(inputPath)); // Add files that should be available localy at each mapper // Utils.addCacheFiles(job, new String[] { }); // ---- Mapper job.setMapperClass(MyMapper.class); job.setMapOutputKeyClass(MyMapper.KOUT); job.setMapOutputValueClass(MyMapper.VOUT); // ---- Combiner //job.setCombinerClass(MyCombiner.class); // ---- Partitioner // job.setPartitionerClass(MyPartitioner.class); // ---- Reducer // set the number of reducers to influence the number of output files // job.setNumReduceTasks(100); job.setReducerClass(MyReducer.class); job.setOutputKeyClass(MyReducer.KOUT); job.setOutputValueClass(MyReducer.VOUT); // ---- Output Options String outputFormat = "text"; if (outputFormat.equalsIgnoreCase("sequence")) { job.setOutputFormatClass(SequenceFileOutputFormat.class); } else if (outputFormat.equalsIgnoreCase("text")) { job.setOutputFormatClass(TextOutputFormat.class); } else if (outputFormat.equalsIgnoreCase("null")) { job.setOutputFormatClass(NullOutputFormat.class); } FileOutputFormat.setOutputPath(job, outDir); FileOutputFormat.setCompressOutput(job, false); // ---- Start job job.waitForCompletion(true); return; }
From source file:nl.utwente.trafficanalyzer.CarCountPerRoadPerDay.java
License:Apache License
public void run(String inputPath, String outPath) throws Exception { Configuration conf = getConf(); Job job = Job.getInstance(conf);//from ww w . j ava 2 s .c o m job.setJarByClass(CarCountPerRoadPerDay.class); job.setJobName(String.format("%s [%s, %s]", this.getClass().getName(), inputPath, outPath)); // -- check if output directory already exists; and optionally delete String outputAlreadyExistsOption = "exit"; Path outDir = new Path(outPath); if (FileSystem.get(conf).exists(outDir)) { if (outputAlreadyExistsOption.equalsIgnoreCase("delete")) { FileSystem.get(conf).delete(outDir, true); } else { System.err.println("Directory " + outPath + " already exists; exiting"); System.exit(1); } } // ---- Input (Format) Options String inputFormat = "text"; if (inputFormat.equalsIgnoreCase("text")) { job.setInputFormatClass(TextInputFormat.class); } else if (inputFormat.equalsIgnoreCase("text")) { job.setInputFormatClass(SequenceFileInputFormat.class); } // Utils.recursivelyAddInputPaths(job, new Path(inputPath)); FileInputFormat.addInputPath(job, new Path(inputPath)); // Add files that should be available localy at each mapper // Utils.addCacheFiles(job, new String[] { }); // ---- Mapper job.setMapperClass(MyMapper.class); job.setMapOutputKeyClass(MyMapper.KOUT); job.setMapOutputValueClass(MyMapper.VOUT); // ---- Combiner job.setCombinerClass(MyCombiner.class); // ---- Partitioner // job.setPartitionerClass(MyPartitioner.class); // ---- Reducer // set the number of reducers to influence the number of output files job.setNumReduceTasks(1); job.setReducerClass(MyReducer.class); job.setOutputKeyClass(MyReducer.KOUT); job.setOutputValueClass(MyReducer.VOUT); // ---- Output Options String outputFormat = "text"; if (outputFormat.equalsIgnoreCase("sequence")) { job.setOutputFormatClass(SequenceFileOutputFormat.class); } else if (outputFormat.equalsIgnoreCase("text")) { job.setOutputFormatClass(TextOutputFormat.class); } else if (outputFormat.equalsIgnoreCase("null")) { job.setOutputFormatClass(NullOutputFormat.class); } FileOutputFormat.setOutputPath(job, outDir); FileOutputFormat.setCompressOutput(job, false); // ---- Start job job.waitForCompletion(true); return; }
From source file:nl.utwente.trafficanalyzer.CarCountPerRoadPerDayIncreasedValidity.java
License:Apache License
public void run(String inputPath, String outPath) throws Exception { Configuration conf = getConf(); Job job = Job.getInstance(conf);/*from ww w. j a v a2 s . c o m*/ job.setJarByClass(CarCountPerRoadPerDayIncreasedValidity.class); job.setJobName(String.format("%s [%s, %s]", this.getClass().getName(), inputPath, outPath)); // -- check if output directory already exists; and optionally delete String outputAlreadyExistsOption = "exit"; Path outDir = new Path(outPath); if (FileSystem.get(conf).exists(outDir)) { if (outputAlreadyExistsOption.equalsIgnoreCase("delete")) { FileSystem.get(conf).delete(outDir, true); } else { System.err.println("Directory " + outPath + " already exists; exiting"); System.exit(1); } } // ---- Input (Format) Options String inputFormat = "text"; if (inputFormat.equalsIgnoreCase("text")) { job.setInputFormatClass(TextInputFormat.class); } else if (inputFormat.equalsIgnoreCase("text")) { job.setInputFormatClass(SequenceFileInputFormat.class); } // Utils.recursivelyAddInputPaths(job, new Path(inputPath)); FileInputFormat.addInputPath(job, new Path(inputPath)); // Add files that should be available localy at each mapper // Utils.addCacheFiles(job, new String[] { }); // ---- Mapper job.setMapperClass(MyMapper.class); job.setMapOutputKeyClass(MyMapper.KOUT); job.setMapOutputValueClass(MyMapper.VOUT); // ---- Combiner job.setCombinerClass(MyCombiner.class); // ---- Partitioner // job.setPartitionerClass(MyPartitioner.class); // ---- Reducer // set the number of reducers to influence the number of output files job.setNumReduceTasks(1); job.setReducerClass(MyReducer.class); job.setOutputKeyClass(MyReducer.KOUT); job.setOutputValueClass(MyReducer.VOUT); // ---- Output Options String outputFormat = "text"; if (outputFormat.equalsIgnoreCase("sequence")) { job.setOutputFormatClass(SequenceFileOutputFormat.class); } else if (outputFormat.equalsIgnoreCase("text")) { job.setOutputFormatClass(TextOutputFormat.class); } else if (outputFormat.equalsIgnoreCase("null")) { job.setOutputFormatClass(NullOutputFormat.class); } FileOutputFormat.setOutputPath(job, outDir); FileOutputFormat.setCompressOutput(job, false); // ---- Start job job.waitForCompletion(true); return; }
From source file:nl.utwente.trafficanalyzer.ReadingsPerSensor.java
License:Apache License
public void run(String inputPath, String outPath) throws Exception { Configuration conf = getConf(); Job job = Job.getInstance(conf);//from w ww . ja v a 2 s .c o m job.setJarByClass(ReadingsPerSensor.class); job.setJobName(String.format("%s [%s, %s]", this.getClass().getName(), inputPath, outPath)); // -- check if output directory already exists; and optionally delete String outputAlreadyExistsOption = "exit"; Path outDir = new Path(outPath); if (FileSystem.get(conf).exists(outDir)) { if (outputAlreadyExistsOption.equalsIgnoreCase("delete")) { FileSystem.get(conf).delete(outDir, true); } else { System.err.println("Directory " + outPath + " already exists; exiting"); System.exit(1); } } // ---- Input (Format) Options String inputFormat = "text"; if (inputFormat.equalsIgnoreCase("text")) { job.setInputFormatClass(TextInputFormat.class); } else if (inputFormat.equalsIgnoreCase("text")) { job.setInputFormatClass(SequenceFileInputFormat.class); } // Utils.recursivelyAddInputPaths(job, new Path(inputPath)); FileInputFormat.addInputPath(job, new Path(inputPath)); // Add files that should be available localy at each mapper // Utils.addCacheFiles(job, new String[] { }); // ---- Mapper job.setMapperClass(MyMapper.class); job.setMapOutputKeyClass(MyMapper.KOUT); job.setMapOutputValueClass(MyMapper.VOUT); // ---- Combiner job.setCombinerClass(MyCombiner.class); // ---- Partitioner // job.setPartitionerClass(MyPartitioner.class); // ---- Reducer // set the number of reducers to influence the number of output files job.setNumReduceTasks(1); job.setReducerClass(MyReducer.class); job.setOutputKeyClass(MyReducer.KOUT); job.setOutputValueClass(MyReducer.VOUT); // ---- Output Options String outputFormat = "text"; if (outputFormat.equalsIgnoreCase("sequence")) { job.setOutputFormatClass(SequenceFileOutputFormat.class); } else if (outputFormat.equalsIgnoreCase("text")) { job.setOutputFormatClass(TextOutputFormat.class); } else if (outputFormat.equalsIgnoreCase("null")) { job.setOutputFormatClass(NullOutputFormat.class); } FileOutputFormat.setOutputPath(job, outDir); FileOutputFormat.setCompressOutput(job, false); // ---- Start job job.waitForCompletion(true); return; }