List of usage examples for org.apache.hadoop.mapreduce.lib.output.FileOutputFormat#setOutputCompressorClass
public static void setOutputCompressorClass(Job job, Class<? extends CompressionCodec> codecClass)
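A minimal sketch of a typical call site (the job name, output path, and codec choice here are illustrative, not taken from any example below). Note that setCompressOutput must also be enabled, as every example on this page does:

// Assumes org.apache.hadoop.mapreduce.Job, org.apache.hadoop.fs.Path,
// org.apache.hadoop.io.compress.GzipCodec and FileOutputFormat are imported.
Job job = Job.getInstance(new Configuration(), "compression-example"); // illustrative name
FileOutputFormat.setOutputPath(job, new Path("/tmp/out"));             // illustrative path
FileOutputFormat.setCompressOutput(job, true);   // compression must be switched on as well
FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);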
From source file: gov.jgi.meta.pig.storage.FastaOutput.java
License: Open Source License

public void setStoreLocation(String location, Job job) throws IOException {
    job.getConfiguration().set("mapred.textoutputformat.separator", "");
    FileOutputFormat.setOutputPath(job, new Path(location));
    if (location.endsWith(".bz2")) {
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
    } else if (location.endsWith(".gz")) {
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    }
}
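An alternative to hard-coding the extension checks above is Hadoop's CompressionCodecFactory, which maps a path's extension to a registered codec. A sketch, assuming the desired codecs are registered via io.compression.codecs:

CompressionCodecFactory factory = new CompressionCodecFactory(job.getConfiguration());
CompressionCodec codec = factory.getCodec(new Path(location)); // null if no codec matches the extension
if (codec != null) {
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, codec.getClass());
}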
From source file: gr.ntua.h2rdf.LoadTriples.DistinctIds.java
License: Open Source License

public Job createSubmittableJob(String[] args) throws IOException, ClassNotFoundException {
    //io.compression.codecs
    Job job = new Job();
    job.setInputFormatClass(TextInputFormat.class);
    Configuration conf = new Configuration();
    Path blockProjection = new Path("blockIds/");
    Path translations = new Path("translations/");
    Path sample = new Path("sample/");
    Path temp = new Path("temp/");
    Path uniqueIds = new Path("uniqueIds/");
    FileSystem fs;
    try {
        fs = FileSystem.get(conf);
        if (fs.exists(uniqueIds)) {
            fs.delete(uniqueIds, true);
        }
        if (fs.exists(translations)) {
            fs.delete(translations, true);
        }
        if (fs.exists(blockProjection)) {
            fs.delete(blockProjection, true);
        }
        if (fs.exists(sample)) {
            fs.delete(sample, true);
        }
        if (fs.exists(temp)) {
            fs.delete(temp, true);
        }
        FileOutputFormat.setOutputPath(job, uniqueIds);
        Path inp = new Path(args[0]);
        FileInputFormat.setInputPaths(job, inp);

        double type = 1;
        double datasetSize = 0;
        if (fs.isFile(inp)) {
            datasetSize = fs.getFileStatus(inp).getLen();
        } else if (fs.isDirectory(inp)) {
            FileStatus[] s = fs.listStatus(inp);
            for (int i = 0; i < s.length; i++) {
                if (s[i].getPath().getName().toString().endsWith(".gz"))
                    type = 27;
                if (s[i].getPath().getName().toString().endsWith(".snappy"))
                    type = 10;
                datasetSize += s[i].getLen();
            }
        } else {
            FileStatus[] s = fs.globStatus(inp);
            for (int i = 0; i < s.length; i++) {
                if (s[i].getPath().getName().toString().endsWith(".gz"))
                    type = 27;
                if (s[i].getPath().getName().toString().endsWith(".snappy"))
                    type = 10;
                datasetSize += s[i].getLen();
            }
        }
        datasetSize = datasetSize * type;
        System.out.println("type: " + type);
        System.out.println("datasetSize: " + datasetSize);
        samplingRate = (double) sampleChunk / (double) datasetSize;
        if (samplingRate >= 0.1) {
            samplingRate = 0.1;
        }
        if (samplingRate <= 0.001) {
            samplingRate = 0.001;
        }
        numReducers = (int) (datasetSize / ReducerChunk);
        if (numReducers == 0)
            numReducers = 1;
        numReducers++;
    } catch (IOException e) {
        e.printStackTrace();
    }

    HBaseAdmin hadmin = new HBaseAdmin(conf);
    HTableDescriptor desc = new HTableDescriptor(TABLE_NAME);
    HColumnDescriptor family = new HColumnDescriptor("counter");
    desc.addFamily(family);
    if (!hadmin.tableExists(TABLE_NAME)) {
        hadmin.createTable(desc);
    }

    job.setNumReduceTasks(numReducers);
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(ImmutableBytesWritable.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setJarByClass(DistinctIds.class);
    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);
    job.setPartitionerClass(SamplingPartitioner.class);

    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    job.getConfiguration().set("mapred.compress.map.output", "true");
    job.getConfiguration().set("mapred.map.output.compression.codec",
            "org.apache.hadoop.io.compress.SnappyCodec");
    //job.setCombinerClass(Combiner.class);

    job.setJobName("Distinct Id Wordcount");
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
    job.getConfiguration().setInt("io.sort.mb", 100);
    job.getConfiguration().setInt("io.file.buffer.size", 131072);
    job.getConfiguration().setInt("mapred.job.reuse.jvm.num.tasks", -1);
    return job;
}
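Note: the mapred.* keys used above are the old (pre-Hadoop 2) property names, kept working by Hadoop's configuration deprecation layer. A sketch of the same map-output compression with the Hadoop 2.x key names:

job.getConfiguration().setBoolean("mapreduce.map.output.compress", true);
job.getConfiguration().set("mapreduce.map.output.compress.codec",
        "org.apache.hadoop.io.compress.SnappyCodec");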
From source file: gr.ntua.h2rdf.loadTriples.Translate.java
License: Apache License

public static Job createSubmittableJob(String[] args) throws IOException {
    Job job = new Job();
    Configuration conf = job.getConfiguration();
    FileSystem fs;
    int reducers = 0;
    try {
        fs = FileSystem.get(conf);
        FileStatus[] p = fs.listStatus(new Path("blockIds/"));
        reducers = p.length;
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setNumReduceTasks(reducers);
        Path out = new Path("translations");
        if (fs.exists(out)) {
            fs.delete(out, true);
        }
        FileOutputFormat.setOutputPath(job, out);
        FileInputFormat.addInputPath(job, new Path("temp"));
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, SnappyCodec.class);

        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(ImmutableBytesWritable.class);
        job.setOutputKeyClass(ImmutableBytesWritable.class);
        job.setOutputValueClass(ImmutableBytesWritable.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setJarByClass(Translate.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setPartitionerClass(IdPartitioner.class);
        job.setJobName("Translate");

        job.getConfiguration().set("mapred.compress.map.output", "true");
        job.getConfiguration().set("mapred.map.output.compression.codec",
                "org.apache.hadoop.io.compress.SnappyCodec");
        job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
        job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
        job.getConfiguration().setInt("io.sort.mb", 100);
        job.getConfiguration().setInt("io.file.buffer.size", 131072);
        job.getConfiguration().setInt("mapred.job.reuse.jvm.num.tasks", -1);
        job.getConfiguration().setInt("hbase.hregion.max.filesize", 67108864);
        //job.getConfiguration().setInt("hbase.hregion.max.filesize", 33554432);
    } catch (IOException e) {
        e.printStackTrace();
    }
    return job;
}
From source file: nl.naward04.hadoop.country.Country.java
License: Apache License

@Override
public int run(String[] args) throws Exception {
    Configuration conf = this.getConf();

    // Set compress type to compress BLOCKs (not RECORDs)
    // https://hadoop.apache.org/docs/r2.4.0/hadoop-mapreduce-client/hadoop-mapreduce-client-core/mapred-default.xml
    // http://hadoop.apache.org/docs/r2.4.0/api/org/apache/hadoop/io/SequenceFile.html
    conf.set(FileOutputFormat.COMPRESS_TYPE, "BLOCK");

    Job job = Job.getInstance(conf, "Find the country based on domain name or IP address.");
    job.setJarByClass(Country.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(CountryLookup.class);
    job.setInputFormatClass(WarcInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // Enable compression
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    // Execute job and return status
    return job.waitForCompletion(true) ? 0 : 1;
}
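Setting the raw FileOutputFormat.COMPRESS_TYPE key works; an equivalent, typed way to request BLOCK compression for sequence-file output (the form used by some of the later examples on this page) is the helper on SequenceFileOutputFormat:

// Equivalent to conf.set(FileOutputFormat.COMPRESS_TYPE, "BLOCK") for sequence file output
SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);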
From source file: nl.naward05.hadoop.MergeFiles.java
License: Apache License

@Override
public int run(String[] args) throws Exception {
    Configuration conf = this.getConf();

    // Set compress type to compress BLOCKs (not RECORDs)
    // https://hadoop.apache.org/docs/r2.4.0/hadoop-mapreduce-client/hadoop-mapreduce-client-core/mapred-default.xml
    // http://hadoop.apache.org/docs/r2.4.0/api/org/apache/hadoop/io/SequenceFile.html
    conf.set(FileOutputFormat.COMPRESS_TYPE, "BLOCK");

    Job job = Job.getInstance(conf, "Merge countries and songs");
    job.setJarByClass(MergeFiles.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileInputFormat.addInputPath(job, new Path(args[1]));
    FileOutputFormat.setOutputPath(job, new Path(args[2]));

    job.setReducerClass(MergeReducer.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    // Enable compression
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    // Execute job and return status
    return job.waitForCompletion(true) ? 0 : 1;
}
From source file: nl.surfsara.warcexamples.hadoop.rr.RR.java
License: Apache License

@Override
public int run(String[] args) throws Exception {
    Configuration conf = this.getConf();
    conf.set(FileOutputFormat.COMPRESS_TYPE, "BLOCK");

    Job job = Job.getInstance(conf, "Record Recognizer");
    job.setJarByClass(RR.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(RRMapper.class);
    job.setInputFormatClass(WarcInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    // job.setOutputValueClass(LongWritable.class);
    // job.setReducerClass(LongSumReducer.class);

    // Enable compression
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    // Execute job and return status
    return job.waitForCompletion(true) ? 0 : 1;
}
From source file: nl.utwente.mirex.AnchorExtract.java
License: Open Source License

/**
 * Runs the MapReduce job "anchor text extraction"
 * @param args 0: path to web collection on HDFS; 1: (non-existing) path that will contain anchor texts
 * @usage.
 * <code> hadoop jar mirex-0.2.jar nl.utwente.mirex.AnchorExtract /user/hadoop/ClueWeb09_English/*/ /user/hadoop/ClueWeb09_Anchors </code>
 */
public static void main(String[] args) throws Exception {
    // Set job configuration
    Configuration conf = new Configuration();
    conf.setLong("mapred.task.timeout", 1800 * 1000L); // 30 minutes timeout
    Job job = new Job(conf, "AnchorExtract");
    job.setJarByClass(AnchorExtract.class);

    if (args.length != 2) {
        System.out.printf("Usage: %s inputFiles outputFile\n", AnchorExtract.class.getSimpleName());
        System.out.println(" inputFiles: path to data");
        System.out.println(" outputFile: directory where anchor text is stored");
        System.exit(1);
    }
    int argc = 0;
    String inputFiles = args[argc++];
    String outputFile = args[argc++];

    job.setMapperClass(Map.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setCombinerClass(Combine.class);
    job.setReducerClass(Reduce.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setInputFormatClass(WarcFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileInputFormat.setInputPaths(job, new Path(inputFiles)); // '(conf, args[0])' to accept comma-separated list.
    FileOutputFormat.setOutputPath(job, new Path(outputFile));
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

    job.waitForCompletion(true);
}
From source file: nthu.scopelab.tsqr.ssvd.VJob.java
License: Apache License

public void start(Configuration conf, Path inputPathBt, Path inputUHatPath, Path inputSigmaPath,
        Path outputPath, int k, int numReduceTasks, int subRowSize, boolean vHalfSigma, int mis)
        throws ClassNotFoundException, InterruptedException, IOException {
    job = new Job(conf);
    job.setJobName("V-job");
    job.setJarByClass(VJob.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileInputFormat.setInputPaths(job, inputPathBt);

    FileSystem fs = FileSystem.get(job.getConfiguration());
    fileGather fgather = new fileGather(inputPathBt, "", fs);
    mis = Checker.checkMis(mis, fgather.getInputSize(), fs);
    FileInputFormat.setMaxInputSplitSize(job, mis * 1024 * 1024);

    FileOutputFormat.setOutputPath(job, outputPath);
    // Warn: tight hadoop integration here:
    job.getConfiguration().set("mapreduce.output.basename", OUTPUT_V);
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(LMatrixWritable.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(LMatrixWritable.class);

    job.setMapperClass(VMapper.class);

    job.getConfiguration().set(PROP_UHAT_PATH, inputUHatPath.toString());
    job.getConfiguration().set(PROP_SIGMA_PATH, inputSigmaPath.toString());
    if (vHalfSigma) {
        job.getConfiguration().set(PROP_V_HALFSIGMA, "y");
    }
    job.getConfiguration().setInt(QJob.PROP_K, k);
    job.getConfiguration().setInt(SUB_ROW_SIZE, subRowSize);
    job.setNumReduceTasks(0);

    job.submit();
    //job.waitForCompletion(true);
}
From source file: org.apache.jena.grande.pig.RdfStorage.java
License: Apache License

@SuppressWarnings("unchecked")
@Override
public void setStoreLocation(String location, Job job) throws IOException {
    log.debug("setStoreLocation({}, {})", location, job);
    job.getConfiguration().set("mapred.textoutputformat.separator", "");
    FileOutputFormat.setOutputPath(job, new Path(location));
    if ("true".equals(job.getConfiguration().get("output.compression.enabled"))) {
        FileOutputFormat.setCompressOutput(job, true);
        String codec = job.getConfiguration().get("output.compression.codec");
        try {
            FileOutputFormat.setOutputCompressorClass(job,
                    (Class<? extends CompressionCodec>) Class.forName(codec));
        } catch (ClassNotFoundException e) {
            throw new RuntimeException("Class not found: " + codec);
        }
    } else {
        if (location.endsWith(".bz2") || location.endsWith(".bz")) {
            FileOutputFormat.setCompressOutput(job, true);
            FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
        } else if (location.endsWith(".gz")) {
            FileOutputFormat.setCompressOutput(job, true);
            FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
        } else {
            FileOutputFormat.setCompressOutput(job, false);
        }
    }
}
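A short usage sketch for the configuration-driven branch above. The property names are the ones the method reads; the codec value is illustrative and can be any codec class on the classpath:

Configuration conf = job.getConfiguration();
conf.set("output.compression.enabled", "true");
conf.set("output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec"); // illustrative choice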
From source file: org.apache.jena.hadoop.rdf.stats.jobs.JobFactory.java
License: Apache License

/**
 * Gets a sequence of jobs that can be used to compute characteristic sets
 * for RDF triples
 *
 * @param config Configuration
 * @param inputPaths Input paths
 * @param intermediateOutputPath Intermediate output path
 * @param outputPath Final output path
 * @return Sequence of jobs
 * @throws IOException
 */
public static Job[] getTripleCharacteristicSetJobs(Configuration config, String[] inputPaths,
        String intermediateOutputPath, String outputPath) throws IOException {
    Job[] jobs = new Job[2];

    Job job = Job.getInstance(config);
    job.setJarByClass(JobFactory.class);
    job.setJobName("RDF Triples Characteristic Set (Generation)");

    // Map/Reduce classes
    job.setMapperClass(TripleGroupBySubjectMapper.class);
    job.setMapOutputKeyClass(NodeWritable.class);
    job.setMapOutputValueClass(TripleWritable.class);
    job.setReducerClass(TripleCharacteristicSetGeneratingReducer.class);
    job.setOutputKeyClass(CharacteristicSetWritable.class);
    job.setOutputValueClass(NullWritable.class);

    // Input and Output
    job.setInputFormatClass(TriplesInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths));
    FileOutputFormat.setOutputPath(job, new Path(intermediateOutputPath));
    SequenceFileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    jobs[0] = job;

    job = Job.getInstance(config);
    job.setJarByClass(JobFactory.class);
    job.setJobName("RDF Triples Characteristic Set (Reduction)");

    // Map/Reduce classes
    job.setMapperClass(KeyMapper.class);
    job.setMapOutputKeyClass(CharacteristicSetWritable.class);
    job.setMapOutputValueClass(CharacteristicSetWritable.class);
    job.setReducerClass(CharacteristicSetReducer.class);
    job.setOutputKeyClass(CharacteristicSetWritable.class);
    job.setOutputValueClass(CharacteristicSetWritable.class);

    // Input and Output
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    FileInputFormat.setInputPaths(job, intermediateOutputPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    jobs[1] = job;
    return jobs;
}