Usage examples for org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat#setOutputCompressionType
public static void setOutputCompressionType(Job job, CompressionType style)

Sets the SequenceFile CompressionType (NONE, RECORD, or BLOCK) to use for the job's output.
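Before the examples from real projects below, here is a minimal, self-contained sketch of the typical call pattern. The driver class name, the map-only identity job, and the DefaultCodec choice are illustrative assumptions, not taken from any of the projects listed.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

public class CompressedSeqFileDemo {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "compressed-seqfile-demo");
        job.setJarByClass(CompressedSeqFileDemo.class);

        // Map-only identity job: TextInputFormat emits (LongWritable, Text),
        // which the default Mapper passes straight through.
        job.setInputFormatClass(TextInputFormat.class);
        job.setNumReduceTasks(0);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Write compressed SequenceFiles: enable compression, pick a codec,
        // then choose how records are packed (NONE, RECORD, or BLOCK).
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
        SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

BLOCK usually gives the best compression ratio for many small records because batches of records are compressed together; RECORD compresses each record value individually; NONE writes records uncompressed.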
From source file: org.apache.mahout.math.hadoop.stochasticsvd.YtYJob.java
License: Apache License
public static void run(Configuration conf, Path[] inputPaths, Path outputPath, int k, int p, long seed)
        throws ClassNotFoundException, InterruptedException, IOException {
    Job job = new Job(conf);
    job.setJobName("YtY-job");
    job.setJarByClass(YtYJob.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileInputFormat.setInputPaths(job, inputPaths);
    FileOutputFormat.setOutputPath(job, outputPath);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(YtYMapper.class);

    job.getConfiguration().setLong(PROP_OMEGA_SEED, seed);
    job.getConfiguration().setInt(PROP_K, k);
    job.getConfiguration().setInt(PROP_P, p);

    /*
     * We must reduce to just one matrix, which means we need only one reducer.
     * But that is fine, since each mapper outputs only one vector (a packed
     * UpperTriangular), so even if there are thousands of mappers, one reducer
     * should cope just fine.
     */
    job.setNumReduceTasks(1);

    job.submit();
    job.waitForCompletion(false);

    if (!job.isSuccessful()) {
        throw new IOException("YtY job unsuccessful.");
    }
}
From source file: org.apache.sqoop.mapreduce.ImportJobBase.java
License: Apache License
/**
 * Configure the output format to use for the job.
 */
@Override
protected void configureOutputFormat(Job job, String tableName, String tableClassName)
        throws ClassNotFoundException, IOException {
    job.setOutputFormatClass(getOutputFormatClass());

    if (isHCatJob) {
        LOG.debug("Configuring output format for HCatalog import job");
        SqoopHCatUtilities.configureImportOutputFormat(options, job, getContext().getConnManager(),
                tableName, job.getConfiguration());
        return;
    }

    if (options.getFileLayout() == SqoopOptions.FileLayout.SequenceFile) {
        job.getConfiguration().set("mapred.output.value.class", tableClassName);
    }

    if (options.shouldUseCompression()) {
        FileOutputFormat.setCompressOutput(job, true);

        String codecName = options.getCompressionCodec();
        Class<? extends CompressionCodec> codecClass;
        if (codecName == null) {
            codecClass = GzipCodec.class;
        } else {
            Configuration conf = job.getConfiguration();
            codecClass = CodecMap.getCodec(codecName, conf).getClass();
        }
        FileOutputFormat.setOutputCompressorClass(job, codecClass);

        if (options.getFileLayout() == SqoopOptions.FileLayout.SequenceFile) {
            SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);
        }

        // SQOOP-428: Avro expects not a fully qualified class name but a "short"
        // name instead (e.g. "snappy"), and it needs to be set in a custom
        // configuration option called "avro.output.codec".
        // The default codec is "deflate".
        if (options.getFileLayout() == SqoopOptions.FileLayout.AvroDataFile) {
            if (codecName != null) {
                String shortName = CodecMap.getCodecShortNameByName(codecName, job.getConfiguration());
                // Avro only knows about "deflate" and not "default"
                if (shortName.equalsIgnoreCase("default")) {
                    shortName = "deflate";
                }
                job.getConfiguration().set(AvroJob.OUTPUT_CODEC, shortName);
            } else {
                job.getConfiguration().set(AvroJob.OUTPUT_CODEC, DataFileConstants.DEFLATE_CODEC);
            }
        }
    }

    Path outputPath = context.getDestination();
    FileOutputFormat.setOutputPath(job, outputPath);
}
From source file: org.opensextant.mapreduce.XponentsTaggerDemo.java
License: Apache License
/**
 * Returns 0 = SUCCESS; anything other than 0 is a FAILURE mode.
 *
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
@Override
public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    /*
     * Run:
     *   /input/path /output/path phase CC yyyymmdd:yyyymmdd
     *
     * phase = geotag | xtax
     */
    LongOpt[] options = { new LongOpt("in", LongOpt.REQUIRED_ARGUMENT, null, 'i'),
            new LongOpt("out", LongOpt.REQUIRED_ARGUMENT, null, 'o'),
            new LongOpt("phase", LongOpt.REQUIRED_ARGUMENT, null, 'p'),
            new LongOpt("cc", LongOpt.REQUIRED_ARGUMENT, null, 'c'),
            new LongOpt("date-range", LongOpt.REQUIRED_ARGUMENT, null, 'd'),
            new LongOpt("utc-range", LongOpt.REQUIRED_ARGUMENT, null, 'u'),
            new LongOpt("log4j-extra-config", LongOpt.REQUIRED_ARGUMENT, null, 'L') };
    gnu.getopt.Getopt opts = new gnu.getopt.Getopt("socgeo", args, "", options);

    String inPath = null;
    String outPath = null;
    String phase = null;
    String cc = null;
    String dateRange = null;
    String utcRange = null;
    String log4jExtraConfig = null;

    System.out.println(Arrays.toString(args));
    int c;
    while ((c = opts.getopt()) != -1) {
        switch (c) {
        case 0:
            // 0 = long option processed.
            break;
        case 'i':
            inPath = opts.getOptarg();
            break;
        case 'o':
            outPath = opts.getOptarg();
            break;
        case 'p':
            phase = opts.getOptarg();
            break;
        case 'c':
            cc = opts.getOptarg();
            break;
        case 'd':
            dateRange = opts.getOptarg();
            break;
        case 'u':
            utcRange = opts.getOptarg();
            break;
        case 'L':
            log4jExtraConfig = opts.getOptarg();
            break;
        default:
            return -2;
        }
    }

    /* Helper resources -- possibly just replaced by existing SocGeoBase utilities. */
    XponentsTaggerDemo.initResources();

    /* Job App Configuration */
    Job job = Job.getInstance(getConf(), "Xponents-Tagging");
    job.setJarByClass(XponentsTaggerDemo.class);

    // REQUIRED:
    if (cc == null || phase == null) {
        System.err.println("Phase and CC must be set");
        return -2;
    }
    job.getConfiguration().set("phase", phase);
    job.getConfiguration().set("country", cc);
    if (dateRange != null) {
        job.getConfiguration().set("date-range", dateRange);
    }
    if (utcRange != null) {
        job.getConfiguration().set("utc-range", utcRange);
    }

    /* Mapper */
    if (phase.equalsIgnoreCase("geotag")) {
        job.setMapperClass(GeoTaggerMapper.class);
    } else if (phase.equalsIgnoreCase("xtax")) {
        job.setMapperClass(KeywordTaggerMapper.class);
    }

    if (log4jExtraConfig != null) {
        job.getConfiguration().set(LOG4J_SUPPLEMENTARY_CONFIGURATION, log4jExtraConfig);
    }

    /* Single reduce task */
    job.setNumReduceTasks(1);

    /* Job Input */
    job.setInputFormatClass(SequenceFileInputFormat.class);
    SequenceFileInputFormat.addInputPaths(job, inPath);

    /* Map phase output */
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.getConfiguration().setBoolean("mapreduce.map.output.compress", true);
    job.getConfiguration().set("mapreduce.map.output.compress.codec",
            "org.apache.hadoop.io.compress.SnappyCodec");

    /* Job Output */
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(job, new Path(outPath));
    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);
    job.getConfiguration().set("mapreduce.output.fileoutputformat.compress.codec",
            "org.apache.hadoop.io.compress.SnappyCodec");
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file: org.rdfhdt.mrbuilder.HDTBuilderDriver.java
License: Open Source License
protected boolean runDictionaryJobSampling() throws IOException, ClassNotFoundException, InterruptedException {
    boolean jobOK;
    Job job = null;

    // if input path does not exist, fail
    if (!this.inputFS.exists(this.conf.getInputPath())) {
        System.out.println("Dictionary input path does not exist: " + this.conf.getInputPath());
        System.exit(-1);
    }

    // if samples path exists...
    if (this.dictionaryFS.exists(this.conf.getDictionarySamplesPath())) {
        if (this.conf.getDeleteDictionarySamplesPath()) { // ... and option provided, delete recursively
            this.dictionaryFS.delete(this.conf.getDictionarySamplesPath(), true);
        } else { // ... and option not provided, fail
            System.out.println("Dictionary samples path does exist: " + this.conf.getDictionarySamplesPath());
            System.out.println("Select other path or use option -ds to overwrite");
            System.exit(-1);
        }
    }

    // Job to create a SequenceInputFormat with Roles
    job = new Job(this.conf.getConfigurationObject(), this.conf.getDictionaryJobName() + " phase 1");
    job.setJarByClass(HDTBuilderDriver.class);

    System.out.println("input = " + this.conf.getInputPath());
    System.out.println("samples = " + this.conf.getDictionarySamplesPath());

    FileInputFormat.addInputPath(job, this.conf.getInputPath());
    FileOutputFormat.setOutputPath(job, this.conf.getDictionarySamplesPath());

    job.setInputFormatClass(LzoTextInputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

    job.setMapperClass(DictionarySamplerMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setCombinerClass(DictionarySamplerReducer.class);
    job.setReducerClass(DictionarySamplerReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setNumReduceTasks(this.conf.getDictionarySampleReducers());

    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressorClass(job, com.hadoop.compression.lzo.LzoCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    jobOK = job.waitForCompletion(true);

    return jobOK;
}
From source file: org.rdfhdt.mrbuilder.HDTBuilderDriver.java
License: Open Source License
protected boolean runDictionaryJob()
        throws ClassNotFoundException, IOException, InterruptedException, URISyntaxException {
    boolean jobOK;
    Job job = null;
    BufferedWriter bufferedWriter;

    // if output path exists...
    if (this.dictionaryFS.exists(this.conf.getDictionaryOutputPath())) {
        if (this.conf.getDeleteDictionaryOutputPath()) { // ... and option provided, delete recursively
            this.dictionaryFS.delete(this.conf.getDictionaryOutputPath(), true);
        } else { // ... and option not provided, fail
            System.out.println("Dictionary output path does exist: " + this.conf.getDictionaryOutputPath());
            System.out.println("Select other path or use option -dd to overwrite");
            System.exit(-1);
        }
    }

    // Sample the SequenceInputFormat to do TotalSort and create final output
    job = new Job(this.conf.getConfigurationObject(), this.conf.getDictionaryJobName() + " phase 2");
    job.setJarByClass(HDTBuilderDriver.class);

    System.out.println("samples = " + this.conf.getDictionarySamplesPath());
    System.out.println("output = " + this.conf.getDictionaryOutputPath());

    FileInputFormat.addInputPath(job, this.conf.getDictionarySamplesPath());
    FileOutputFormat.setOutputPath(job, this.conf.getDictionaryOutputPath());

    job.setInputFormatClass(SequenceFileInputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

    // Identity Mapper
    // job.setMapperClass(Mapper.class);
    job.setCombinerClass(DictionaryCombiner.class);
    job.setPartitionerClass(TotalOrderPartitioner.class);
    job.setReducerClass(DictionaryReducer.class);
    job.setNumReduceTasks(this.conf.getDictionaryReducers());

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    System.out.println("Sampling started");
    InputSampler.writePartitionFile(job,
            new InputSampler.IntervalSampler<Text, Text>(this.conf.getSampleProbability()));
    String partitionFile = TotalOrderPartitioner.getPartitionFile(job.getConfiguration());
    URI partitionUri = new URI(partitionFile + "#" + TotalOrderPartitioner.DEFAULT_PATH);
    DistributedCache.addCacheFile(partitionUri, job.getConfiguration());
    DistributedCache.createSymlink(job.getConfiguration());
    System.out.println("Sampling finished");

    MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.SHARED, SequenceFileOutputFormat.class,
            Text.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.SUBJECTS, SequenceFileOutputFormat.class,
            Text.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.PREDICATES, SequenceFileOutputFormat.class,
            Text.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.OBJECTS, SequenceFileOutputFormat.class,
            Text.class, NullWritable.class);

    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressorClass(job, com.hadoop.compression.lzo.LzoCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    jobOK = job.waitForCompletion(true);

    this.numShared = job.getCounters().findCounter(Counters.Shared).getValue();
    this.numSubjects = job.getCounters().findCounter(Counters.Subjects).getValue();
    this.numPredicates = job.getCounters().findCounter(Counters.Predicates).getValue();
    this.numObjects = job.getCounters().findCounter(Counters.Objects).getValue();

    bufferedWriter = new BufferedWriter(
            new OutputStreamWriter(this.dictionaryFS.create(this.conf.getDictionaryCountersFile())));

    bufferedWriter.write(HDTBuilderConfiguration.SHARED + "=" + this.numShared + "\n");
    bufferedWriter.write(HDTBuilderConfiguration.SUBJECTS + "=" + this.numSubjects + "\n");
    bufferedWriter.write(HDTBuilderConfiguration.PREDICATES + "=" + this.numPredicates + "\n");
    bufferedWriter.write(HDTBuilderConfiguration.OBJECTS + "=" + this.numObjects + "\n");
    bufferedWriter.close();

    return jobOK;
}
From source file: org.rdfhdt.mrbuilder.HDTBuilderDriver.java
License: Open Source License
protected boolean runTriplesJobSampling() throws ClassNotFoundException, IOException, InterruptedException {
    Job job = null;
    boolean jobOK;
    BufferedWriter bufferedWriter;

    // if input path does not exist, fail
    if (!this.inputFS.exists(this.conf.getInputPath())) {
        System.out.println("Dictionary input path does not exist: " + this.conf.getInputPath());
        System.exit(-1);
    }

    // if dictionary output path does not exist, fail
    if (!this.dictionaryFS.exists(this.conf.getDictionaryOutputPath())) {
        System.out.println("Dictionary output path does not exist: " + this.conf.getDictionaryOutputPath());
        System.exit(-1);
    }

    // if samples path exists...
    if (this.dictionaryFS.exists(this.conf.getTriplesSamplesPath())) {
        if (this.conf.getDeleteTriplesSamplesPath()) { // ... and option provided, delete recursively
            this.dictionaryFS.delete(this.conf.getTriplesSamplesPath(), true);
        } else { // ... and option not provided, fail
            System.out.println("Triples samples path does exist: " + this.conf.getTriplesSamplesPath());
            System.out.println("Select other path or use option -dst to overwrite");
            System.exit(-1);
        }
    }

    this.conf.setProperty("mapred.child.java.opts",
            "-XX:ErrorFile=/home/hadoop/tmp/hs_err_pid%p.log -Xmx2500m");

    // Job to create a SequenceInputFormat
    job = new Job(this.conf.getConfigurationObject(), this.conf.getTriplesJobName() + " phase 1");
    job.setJarByClass(HDTBuilderDriver.class);

    FileInputFormat.addInputPath(job, this.conf.getInputPath());
    FileOutputFormat.setOutputPath(job, this.conf.getTriplesSamplesPath());

    job.setInputFormatClass(LzoTextInputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

    job.setMapperClass(TriplesSPOMapper.class);
    job.setSortComparatorClass(TripleSPOComparator.class);
    job.setGroupingComparatorClass(TripleSPOComparator.class);
    job.setMapOutputKeyClass(TripleSPOWritable.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setOutputKeyClass(TripleSPOWritable.class);
    job.setOutputValueClass(NullWritable.class);

    job.setNumReduceTasks(this.conf.getTriplesReducers());

    DistributedCache.addCacheFile(this.conf.getDictionaryFile().toUri(), job.getConfiguration());

    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressorClass(job, com.hadoop.compression.lzo.LzoCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    jobOK = job.waitForCompletion(true);

    this.numTriples = job.getCounters().findCounter(Counters.Triples).getValue();
    bufferedWriter = new BufferedWriter(
            new OutputStreamWriter(this.triplesFS.create(this.conf.getTriplesCountersFile())));
    bufferedWriter.write(this.numTriples.toString() + "\n");
    bufferedWriter.close();

    return jobOK;
}
From source file: org.rdfhdt.mrbuilder.HDTBuilderDriver.java
License: Open Source License
protected boolean runTriplesJob()
        throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
    Job job = null;
    boolean jobOK;

    // if triples output path exists...
    if (this.triplesFS.exists(this.conf.getTriplesOutputPath())) {
        if (this.conf.getDeleteTriplesOutputPath()) { // ... and option provided, delete recursively
            this.triplesFS.delete(this.conf.getTriplesOutputPath(), true);
        } else { // ... and option not provided, fail
            System.out.println("Triples output path does exist: " + this.conf.getTriplesOutputPath());
            System.out.println("Select other path or use option -dt to overwrite");
            System.exit(-1);
        }
    }

    job = new Job(this.conf.getConfigurationObject(), this.conf.getTriplesJobName() + " phase 2");
    job.setJarByClass(HDTBuilderDriver.class);

    FileInputFormat.addInputPath(job, this.conf.getTriplesSamplesPath());
    FileOutputFormat.setOutputPath(job, this.conf.getTriplesOutputPath());

    job.setInputFormatClass(SequenceFileInputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

    job.setSortComparatorClass(TripleSPOComparator.class);
    job.setGroupingComparatorClass(TripleSPOComparator.class);
    job.setPartitionerClass(TotalOrderPartitioner.class);

    job.setOutputKeyClass(TripleSPOWritable.class);
    job.setOutputValueClass(NullWritable.class);

    job.setNumReduceTasks(this.conf.getTriplesReducers());

    System.out.println("Sampling started");
    InputSampler.writePartitionFile(job,
            new InputSampler.IntervalSampler<Text, Text>(this.conf.getSampleProbability()));
    String partitionFile = TotalOrderPartitioner.getPartitionFile(job.getConfiguration());
    URI partitionUri = new URI(partitionFile + "#" + TotalOrderPartitioner.DEFAULT_PATH);
    DistributedCache.addCacheFile(partitionUri, job.getConfiguration());
    DistributedCache.createSymlink(job.getConfiguration());
    System.out.println("Sampling finished");

    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressorClass(job, com.hadoop.compression.lzo.LzoCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    jobOK = job.waitForCompletion(true);

    return jobOK;
}
From source file: wikiduper.clir.rp.RepackText.java
License: Apache License
@SuppressWarnings("static-access")
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("XML dump file").create(INPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output location")
            .create(OUTPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("mapping file")
            .create(MAPPING_FILE_OPTION));
    options.addOption(OptionBuilder.withArgName("block|record|none").hasArg()
            .withDescription("compression type").create(COMPRESSION_TYPE_OPTION));
    options.addOption(OptionBuilder.withArgName("en|sv|de").hasArg().withDescription("two-letter language code")
            .create(LANGUAGE_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)
            || !cmdline.hasOption(MAPPING_FILE_OPTION) || !cmdline.hasOption(COMPRESSION_TYPE_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT_OPTION);
    String outputPath = cmdline.getOptionValue(OUTPUT_OPTION);
    String mappingFile = cmdline.getOptionValue(MAPPING_FILE_OPTION);
    String compressionType = cmdline.getOptionValue(COMPRESSION_TYPE_OPTION);

    if (!"block".equals(compressionType) && !"record".equals(compressionType)
            && !"none".equals(compressionType)) {
        System.err.println("Error: \"" + compressionType + "\" unknown compression type!");
        return -1;
    }

    String language = null;
    if (cmdline.hasOption(LANGUAGE_OPTION)) {
        language = cmdline.getOptionValue(LANGUAGE_OPTION);
        if (language.length() != 2) {
            System.err.println("Error: \"" + language + "\" unknown language!");
            return -1;
        }
    }

    // this is the default block size
    int blocksize = 1000000;

    Job job = Job.getInstance(getConf());
    job.setJarByClass(RepackText.class);
    job.setJobName(String.format("RepackText[%s: %s, %s: %s, %s: %s, %s: %s]", INPUT_OPTION, inputPath,
            OUTPUT_OPTION, outputPath, COMPRESSION_TYPE_OPTION, compressionType, LANGUAGE_OPTION, language));
    job.getConfiguration().set(DOCNO_MAPPING_FIELD, mappingFile);

    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - XML dump file: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - docno mapping data file: " + mappingFile);
    LOG.info(" - compression type: " + compressionType);
    LOG.info(" - language: " + language);
    if ("block".equals(compressionType)) {
        LOG.info(" - block size: " + blocksize);
    }

    job.setNumReduceTasks(0);

    SequenceFileInputFormat.addInputPath(job, new Path(inputPath));
    SequenceFileOutputFormat.setOutputPath(job, new Path(outputPath));

    if ("none".equals(compressionType)) {
        SequenceFileOutputFormat.setCompressOutput(job, false);
    } else {
        SequenceFileOutputFormat.setCompressOutput(job, true);
        if ("record".equals(compressionType)) {
            SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.RECORD);
        } else {
            SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
            job.getConfiguration().setInt("io.seqfile.compress.blocksize", blocksize);
        }
    }

    if (language != null) {
        job.getConfiguration().set("wiki.language", language);
    }

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(TextDocument.class);

    job.setMapperClass(MyMapper.class);

    // Delete the output directory if it exists already.
    FileSystem.get(getConf()).delete(new Path(outputPath), true);

    job.waitForCompletion(true);

    return 0;
}