Usage examples for org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat.setOutputCompressionType
public static void setOutputCompressionType(Job job, CompressionType style)
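Before the project-specific examples, here is a minimal, self-contained sketch that is not taken from any of the source files below. It shows the call pattern the examples all share: enable compression, choose a codec, then pick the compression type. The class name CompressedSeqFileJob, the job name, and the reliance on the default TextInputFormat with identity mapper/reducer are illustrative assumptions.

// A minimal sketch (not from any source file below): a Hadoop 2.x
// mapreduce-API job that writes gzip, block-compressed SequenceFile
// output using the default identity mapper/reducer over text input.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

public class CompressedSeqFileJob {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "compressed-seqfile");
        job.setJarByClass(CompressedSeqFileJob.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Default TextInputFormat plus identity mapper/reducer yields
        // <LongWritable, Text> output records.
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        // The three calls that belong together: turn compression on,
        // pick a codec, then choose record- vs. block-level compression.
        SequenceFileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
        SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

CompressionType can be NONE, RECORD, or BLOCK; as the examples below show, BLOCK is the usual choice because it compresses runs of records together and tends to give the best ratio for SequenceFiles.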
From source file: crunch.MaxTemperature.java
License: Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = JobBuilder.parseInputAndOutput(this, getConf(), args);
    if (job == null) {
        return -1;
    }

    job.setMapperClass(CleanerMapper.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);
    job.setNumReduceTasks(0);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file: crunch.MaxTemperature.java
License: Apache License

@Override
public int run(String[] args) throws IOException {
    JobConf conf = JobBuilder.parseInputAndOutput(this, getConf(), args);
    if (conf == null) {
        return -1;
    }

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputFormat(MapFileOutputFormat.class);

    SequenceFileOutputFormat.setCompressOutput(conf, true);
    SequenceFileOutputFormat.setOutputCompressorClass(conf, GzipCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK);

    JobClient.runJob(conf);
    return 0;
}
From source file: crunch.MaxTemperature.java
License: Apache License

@Override
public int run(String[] args) throws IOException {
    JobConf conf = JobBuilder.parseInputAndOutput(this, getConf(), args);
    if (conf == null) {
        return -1;
    }

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    SequenceFileOutputFormat.setCompressOutput(conf, true);
    SequenceFileOutputFormat.setOutputCompressorClass(conf, GzipCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK);

    JobClient.runJob(conf);
    return 0;
}
From source file: crunch.MaxTemperature.java
License: Apache License

@Override
public int run(String[] args) throws Exception {
    JobConf conf = JobBuilder.parseInputAndOutput(this, getConf(), args);
    if (conf == null) {
        return -1;
    }

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    SequenceFileOutputFormat.setCompressOutput(conf, true);
    SequenceFileOutputFormat.setOutputCompressorClass(conf, GzipCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK);

    conf.setPartitionerClass(TotalOrderPartitioner.class);

    InputSampler.Sampler<IntWritable, Text> sampler = new InputSampler.RandomSampler<IntWritable, Text>(0.1,
            10000, 10);

    Path input = FileInputFormat.getInputPaths(conf)[0];
    input = input.makeQualified(input.getFileSystem(conf));

    Path partitionFile = new Path(input, "_partitions");
    TotalOrderPartitioner.setPartitionFile(conf, partitionFile);
    InputSampler.writePartitionFile(conf, sampler);

    // Add to DistributedCache
    URI partitionUri = new URI(partitionFile.toString() + "#_partitions");
    DistributedCache.addCacheFile(partitionUri, conf);
    DistributedCache.createSymlink(conf);

    JobClient.runJob(conf);
    return 0;
}
From source file: crunch.MaxTemperature.java
License: Apache License

@Override
public int run(String[] args) throws IOException {
    JobConf conf = JobBuilder.parseInputAndOutput(this, getConf(), args);
    if (conf == null) {
        return -1;
    }

    conf.setMapperClass(CleanerMapper.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(Text.class);
    conf.setNumReduceTasks(0);

    conf.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setCompressOutput(conf, true);
    SequenceFileOutputFormat.setOutputCompressorClass(conf, GzipCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK);

    JobClient.runJob(conf);
    return 0;
}
From source file: edu.indiana.d2i.htrc.corpus.clean.CleanCorpusDriver.java
License: Apache License

@Override
public int run(String[] args) throws Exception {
    /*
     * Specify the # of reducers through -D mapred.reduce.tasks=<numOfReducers>
     * on the hadoop command line. Specify whether compression is used through
     * -D user.args.compression=<true/false>.
     */
    if (args.length != 2) {
        System.err.printf("Usage: %s [generic options] </path/to/input/directory> </path/to/output/directory>\n",
                getClass().getSimpleName());
        ToolRunner.printGenericCommandUsage(System.err);
        return -1;
    }

    Configuration conf = getConf();
    Job job = new Job(conf, "HTRC Cleaning Raw Corpus");

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setJarByClass(CleanCorpusDriver.class);
    job.setMapperClass(CleanCorpusMapper.class);
    job.setReducerClass(CleanCorpusReducer.class);

    if (conf.getBoolean("user.args.compression", false)) {
        /* use compression */
        SequenceFileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);
        SequenceFileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
    }

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(TextArrayWritable.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file: edu.indiana.d2i.htrc.corpus.retrieve.RetrieveRawCorpusDriver.java
License: Apache License

@Override
public int run(String[] args) throws Exception {
    /*
     * Specify the # of reducers through -D mapred.reduce.tasks=<numOfReducers>
     * on the hadoop command line. Enable compression through
     * -D user.args.compression=true.
     */
    if (args.length != 3) {
        System.err.printf(
                "Usage: %s [generic options] </path/to/input/directory> </path/to/output/directory> </path/to/property/file>\n",
                getClass().getSimpleName());
        ToolRunner.printGenericCommandUsage(System.err);
        return -1;
    }

    Configuration conf = getConf();

    Properties prop = new Properties();
    prop.load(new FileInputStream(args[2]));

    // set configuration parameters

    // data api related parameters
    conf.set(Constants.DATA_API_EPR, prop.getProperty(Constants.DATA_API_EPR));
    conf.set(Constants.DATA_API_CONCAT, prop.getProperty(Constants.DATA_API_CONCAT));
    conf.set(Constants.DATA_API_SELFSIGN, prop.getProperty(Constants.DATA_API_SELFSIGN));
    conf.set(Constants.DATA_API_DELIMITER, prop.getProperty(Constants.DATA_API_DELIMITER));
    conf.set(Constants.DATA_API_VOL_PREFIX, prop.getProperty(Constants.DATA_API_VOL_PREFIX));
    conf.set(Constants.DATA_API_PAGE_PREFIX, prop.getProperty(Constants.DATA_API_PAGE_PREFIX));
    conf.set(Constants.DATA_API_REQ_SIZE, prop.getProperty(Constants.DATA_API_REQ_SIZE));

    // oauth2 related parameters
    conf.set(Constants.OAUTH2_EPR, prop.getProperty(Constants.OAUTH2_EPR));
    conf.set(Constants.OAUTH2_USER_NAME, prop.getProperty(Constants.OAUTH2_USER_NAME));
    conf.set(Constants.OAUTH2_USER_PASSWORD, prop.getProperty(Constants.OAUTH2_USER_PASSWORD));

    // set # of lines (volumes in our case) to be processed by one map task
    conf.set("mapreduce.input.lineinputformat.linespermap",
            prop.getProperty(Constants.NUM_VOLUMES_PER_MAPPER));

    Job job = new Job(conf, "HTRC Retrieving Raw Corpus");

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setJarByClass(RetrieveRawCorpusDriver.class);
    job.setMapperClass(RetrieveRawCorpusMapper.class);
    job.setReducerClass(RetrieveRawCorpusReducer.class);

    if (conf.getBoolean("user.args.compression", false)) {
        /* use compression */
        SequenceFileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);
        SequenceFileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
    }

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(TextArrayWritable.class);

    job.setInputFormatClass(NLineInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file: edu.indiana.d2i.htrc.corpus.transform.CorpusTransformDriver.java
License: Apache License

@Override
public int run(String[] args) throws Exception {
    /*
     * Specify the # of reducers through -D mapred.reduce.tasks=<numOfReducers>
     * on the hadoop command line. Specify whether compression is used through
     * -D user.args.compression=true. Use -D user.args.wordset.filename=<wordset_filename>
     * to set the wordset filename, and -files </local/path/to/wordset_file> to
     * distribute wordset_file to each compute node.
     */
    if (args.length != 2) {
        System.err.printf("Usage: %s [generic options] </path/to/input/directory> </path/to/output/directory>\n",
                getClass().getSimpleName());
        ToolRunner.printGenericCommandUsage(System.err);
        return -1;
    }

    Configuration conf = getConf();
    Job job = new Job(conf, "HTRC Transforming Corpus");

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setJarByClass(CorpusTransformDriver.class);
    job.setMapperClass(CorpusTransformMapper.class);
    job.setReducerClass(CorpusTransformReducer.class);

    if (conf.getBoolean("user.args.compression", false)) {
        /* use compression */
        SequenceFileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);
        SequenceFileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
    }

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(TextArrayWritable.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file: edu.isi.mavuno.app.mine.HarvestContextPatternPairs.java
License: Apache License

@SuppressWarnings({ "unchecked", "rawtypes" })
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.CorpusPath", conf);
    String corpusClass = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.CorpusClass", conf);
    String extractorClass = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.ExtractorClass", conf);
    String extractorArgs = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.ExtractorArgs", conf);
    String minMatches = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.MinMatches", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.OutputPath", conf);

    sLogger.info("Tool name: HarvestContextPatternPairs");
    sLogger.info(" - Corpus path: " + corpusPath);
    sLogger.info(" - Corpus class: " + corpusClass);
    sLogger.info(" - Extractor class: " + extractorClass);
    sLogger.info(" - Extractor args: " + extractorArgs);
    sLogger.info(" - Min matches: " + minMatches);
    sLogger.info(" - Output path: " + outputPath);

    Job job = new Job(conf);
    job.setJobName("HarvestContextPatternPairs");

    MavunoUtils.recursivelyAddInputPaths(job, corpusPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass));
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    job.setMapOutputKeyClass(ContextPatternWritable.class);
    job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
    job.setPartitionerClass(ContextPatternWritable.FullPartitioner.class);
    job.setMapOutputValueClass(LongWritable.class);

    job.setOutputKeyClass(ContextPatternWritable.class);
    job.setOutputValueClass(LongWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyCombiner.class);
    job.setReducerClass(MyReducer.class);

    job.waitForCompletion(true);
    return 0;
}
From source file: edu.isi.mavuno.app.nlp.ProcessStanfordNLP.java
License: Apache License

@SuppressWarnings({ "unchecked", "rawtypes" })
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    // required parameters
    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.ProcessStanfordNLP.CorpusPath", conf);
    String corpusClass = MavunoUtils.getRequiredParam("Mavuno.ProcessStanfordNLP.CorpusClass", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.ProcessStanfordNLP.OutputPath", conf);

    // optional parameters
    String suTime = MavunoUtils.getOptionalParam("Mavuno.ProcessStanfordNLP.UseSUTime", conf);
    String textOutput = MavunoUtils.getOptionalParam("Mavuno.ProcessStanfordNLP.TextOutputFormat", conf);

    sLogger.info("Tool name: ProcessStanfordNLP");
    sLogger.info(" - Input path: " + corpusPath);
    sLogger.info(" - Corpus class: " + corpusClass);
    sLogger.info(" - Output path: " + outputPath);

    if (suTime != null && Boolean.parseBoolean(suTime)) {
        sLogger.info("- SUTime enabled");
    }

    boolean textOutputFormat = false;
    if (textOutput != null && Boolean.parseBoolean(textOutput)) {
        sLogger.info("- Text output format enabled");
        textOutputFormat = true;
    }

    Job job = new Job(conf);
    job.setJobName("ProcessStanfordNLP");

    MavunoUtils.recursivelyAddInputPaths(job, corpusPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass));

    // output format -- either plain text or sequencefile (default)
    if (textOutputFormat) {
        job.setOutputFormatClass(TextOutputFormat.class);
    } else {
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        FileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
    }

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(StanfordParsedDocument.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(StanfordParsedDocument.class);

    job.setMapperClass(MyMapper.class);
    job.setJarByClass(ProcessStanfordNLP.class);

    // no reducers needed
    job.setNumReduceTasks(0);

    // run job
    job.waitForCompletion(true);

    // print job statistics
    Counters counters = job.getCounters();
    sLogger.info(" - Total documents: " + counters.findCounter(MyCounters.TOTAL_DOCUMENTS).getValue());
    sLogger.info(" - Total sentences: " + counters.findCounter(MyCounters.TOTAL_SENTENCES).getValue());
    sLogger.info(" - Total tokens: " + counters.findCounter(MyCounters.TOTAL_TOKENS).getValue());

    return 0;
}