List of usage examples for org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.setCompressOutput
public static void setCompressOutput(Job job, boolean compress)
From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.utils.PagesByURLExtractor.java
License:Apache License
@Override public int run(String[] args) throws Exception { Job job = Job.getInstance(getConf()); for (Map.Entry<String, String> next : job.getConfiguration()) { System.out.println(next.getKey() + ": " + next.getValue()); }//from www. j a v a2 s .c om job.setJarByClass(PagesByURLExtractor.class); job.setJobName(PagesByURLExtractor.class.getName()); // mapper job.setMapperClass(MapperClass.class); // input job.setInputFormatClass(WARCInputFormat.class); // output job.setOutputFormatClass(WARCOutputFormat.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(WARCWritable.class); FileOutputFormat.setCompressOutput(job, true); // paths String commaSeparatedInputFiles = args[0]; String outputPath = args[1]; // load IDs to be searched for job.getConfiguration().set(MAPREDUCE_MAPPER_URLS, loadURLs(args[2])); FileInputFormat.addInputPaths(job, commaSeparatedInputFiles); FileOutputFormat.setOutputPath(job, new Path(outputPath)); return job.waitForCompletion(true) ? 0 : 1; }
From source file:de.tudarmstadt.ukp.dkpro.c4corpus.hadoop.utils.URIExtractor.java
License:Apache License
/** * {@inheritDoc}/*from w ww . ja v a2s. co m*/ */ @Override public int run(String[] args) throws Exception { Job job = Job.getInstance(getConf()); // set from the command line job.setJarByClass(URIExtractor.class); job.setJobName(URIExtractor.class.getName()); // mapper job.setMapperClass(URIExtractorMapper.class); job.setReducerClass(URIExtractorReducer.class); // input-output is warc job.setInputFormatClass(WARCInputFormat.class); // is necessary, so that Hadoop does not mix the map input format up. job.setOutputKeyClass(Text.class); job.setOutputValueClass(NullWritable.class); // set output compression to GZip FileOutputFormat.setCompressOutput(job, true); FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class); FileInputFormat.addInputPaths(job, args[0]); FileOutputFormat.setOutputPath(job, new Path(args[1])); return job.waitForCompletion(true) ? 0 : 1; }
From source file:de.tudarmstadt.ukp.experiments.dip.hadoop.ClueWebTRECIdFileExtractor.java
License:Apache License
@Override public int run(String[] args) throws Exception { Job job = Job.getInstance(getConf()); for (Map.Entry<String, String> next : job.getConfiguration()) { System.out.println(next.getKey() + ": " + next.getValue()); }// w w w . ja v a 2s .c o m job.setJarByClass(ClueWebTRECIdFileExtractor.class); job.setJobName(ClueWebTRECIdFileExtractor.class.getName()); // mapper job.setMapperClass(MapperClass.class); // input job.setInputFormatClass(WARCInputFormat.class); // output job.setOutputFormatClass(WARCOutputFormat.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(WARCWritable.class); FileOutputFormat.setCompressOutput(job, true); // paths String commaSeparatedInputFiles = args[0]; String outputPath = args[1]; // load IDs to be searched for job.getConfiguration().set(MAPREDUCE_MAPPER_TREC_IDS, loadTrecIds(args[2])); FileInputFormat.addInputPaths(job, commaSeparatedInputFiles); FileOutputFormat.setOutputPath(job, new Path(outputPath)); return job.waitForCompletion(true) ? 0 : 1; }
From source file:edu.isi.mavuno.app.mine.HarvestContextPatternPairs.java
License:Apache License
/**
 * Configures and runs the HarvestContextPatternPairs job using the
 * {@code Mavuno.HarvestContextPatternPairs.*} parameters of the current
 * configuration. Output is written as block-compressed sequence files.
 *
 * @return 0 if the job completed successfully, 1 otherwise
 * @throws ClassNotFoundException if the configured corpus class cannot be loaded
 * @throws InterruptedException if the job is interrupted while running
 * @throws IOException if job setup or submission fails
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.CorpusPath", conf);
    String corpusClass = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.CorpusClass", conf);
    String extractorClass = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.ExtractorClass", conf);
    String extractorArgs = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.ExtractorArgs", conf);
    String minMatches = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.MinMatches", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.HarvestContextPatternPairs.OutputPath", conf);

    sLogger.info("Tool name: HarvestContextPatternPairs");
    sLogger.info(" - Corpus path: " + corpusPath);
    sLogger.info(" - Corpus class: " + corpusClass);
    sLogger.info(" - Extractor class: " + extractorClass);
    sLogger.info(" - Extractor args: " + extractorArgs);
    sLogger.info(" - Min matches: " + minMatches);
    sLogger.info(" - Output path: " + outputPath);

    Job job = new Job(conf);
    job.setJobName("HarvestContextPatternPairs");

    MavunoUtils.recursivelyAddInputPaths(job, corpusPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    // The input format is chosen at runtime from the configured class name.
    job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass));
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    FileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    job.setMapOutputKeyClass(ContextPatternWritable.class);
    job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
    job.setPartitionerClass(ContextPatternWritable.FullPartitioner.class);
    job.setMapOutputValueClass(LongWritable.class);

    job.setOutputKeyClass(ContextPatternWritable.class);
    job.setOutputValueClass(LongWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyCombiner.class);
    job.setReducerClass(MyReducer.class);

    // Report job failure through the return code instead of always returning 0.
    boolean succeeded = job.waitForCompletion(true);
    return succeeded ? 0 : 1;
}
From source file:edu.isi.mavuno.app.nlp.ProcessStanfordNLP.java
License:Apache License
@SuppressWarnings({ "unchecked", "rawtypes" }) public int run() throws ClassNotFoundException, InterruptedException, IOException { Configuration conf = getConf(); // required parameters String corpusPath = MavunoUtils.getRequiredParam("Mavuno.ProcessStanfordNLP.CorpusPath", conf); String corpusClass = MavunoUtils.getRequiredParam("Mavuno.ProcessStanfordNLP.CorpusClass", conf); String outputPath = MavunoUtils.getRequiredParam("Mavuno.ProcessStanfordNLP.OutputPath", conf); // optional parameters String suTime = MavunoUtils.getOptionalParam("Mavuno.ProcessStanfordNLP.UseSUTime", conf); String textOutput = MavunoUtils.getOptionalParam("Mavuno.ProcessStanfordNLP.TextOutputFormat", conf); sLogger.info("Tool name: ProcessStanfordNLP"); sLogger.info(" - Input path: " + corpusPath); sLogger.info(" - Corpus class: " + corpusClass); sLogger.info(" - Output path: " + outputPath); if (suTime != null && Boolean.parseBoolean(suTime)) { sLogger.info("- SUTime enabled"); }/* ww w. j a v a 2 s. co m*/ boolean textOutputFormat = false; if (textOutput != null && Boolean.parseBoolean(textOutput)) { sLogger.info("- Text output format enabled"); textOutputFormat = true; } Job job = new Job(conf); job.setJobName("ProcessStanfordNLP"); MavunoUtils.recursivelyAddInputPaths(job, corpusPath); FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.setInputFormatClass((Class<? 
extends InputFormat>) Class.forName(corpusClass)); // output format -- either plain text or sequencefile (default) if (textOutputFormat) { job.setOutputFormatClass(TextOutputFormat.class); } else { job.setOutputFormatClass(SequenceFileOutputFormat.class); FileOutputFormat.setCompressOutput(job, true); SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK); } job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(StanfordParsedDocument.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(StanfordParsedDocument.class); job.setMapperClass(MyMapper.class); job.setJarByClass(ProcessStanfordNLP.class); // no reducers needed job.setNumReduceTasks(0); // run job job.waitForCompletion(true); // print job statistics Counters counters = job.getCounters(); sLogger.info(" - Total documents: " + counters.findCounter(MyCounters.TOTAL_DOCUMENTS).getValue()); sLogger.info(" - Total sentences: " + counters.findCounter(MyCounters.TOTAL_SENTENCES).getValue()); sLogger.info(" - Total tokens: " + counters.findCounter(MyCounters.TOTAL_TOKENS).getValue()); return 0; }
From source file:edu.isi.mavuno.app.nlp.TratzParse.java
License:Apache License
@SuppressWarnings({ "unchecked", "rawtypes" }) public int run() throws ClassNotFoundException, InterruptedException, IOException { Configuration conf = getConf(); String corpusPath = MavunoUtils.getRequiredParam("Mavuno.Parse.CorpusPath", conf); String corpusClass = MavunoUtils.getRequiredParam("Mavuno.Parse.CorpusClass", conf); String outputPath = MavunoUtils.getRequiredParam("Mavuno.Parse.OutputPath", conf); // optional parameter that allows the parsed documents to be output in text format String textOutput = MavunoUtils.getOptionalParam("Mavuno.Parse.TextOutputFormat", conf); boolean textOutputFormat = false; if (textOutput != null && Boolean.parseBoolean(textOutput)) { textOutputFormat = true;/*from w ww.j av a2 s . co m*/ } sLogger.info("Tool name: TratzParse"); sLogger.info(" - Corpus path: " + corpusPath); sLogger.info(" - Corpus class: " + corpusClass); sLogger.info(" - Output path: " + outputPath); Job job = new Job(conf); job.setJobName("TratzParse"); MavunoUtils.recursivelyAddInputPaths(job, corpusPath); FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.setInputFormatClass((Class<? 
extends InputFormat>) Class.forName(corpusClass)); // output format -- either plain text or sequencefile (default) if (textOutputFormat) { job.setOutputFormatClass(TextOutputFormat.class); } else { job.setOutputFormatClass(SequenceFileOutputFormat.class); FileOutputFormat.setCompressOutput(job, true); SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK); } job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(TratzParsedDocument.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(TratzParsedDocument.class); job.setMapperClass(MyMapper.class); job.setJarByClass(TratzParse.class); // no reducers needed job.setNumReduceTasks(0); // run job job.waitForCompletion(true); // print job statistics Counters counters = job.getCounters(); sLogger.info(" - Total documents: " + counters.findCounter(StatCounters.TOTAL_DOCUMENTS).getValue()); sLogger.info(" - Total sentences: " + counters.findCounter(StatCounters.TOTAL_SENTENCES).getValue()); sLogger.info(" - Total tokens: " + counters.findCounter(StatCounters.TOTAL_TOKENS).getValue()); sLogger.info(" - Total dropped sentences: " + counters.findCounter(StatCounters.TOTAL_DROPPED_SENTENCES).getValue()); sLogger.info( " - Total tokenization time (ms): " + counters.findCounter(StatCounters.TOKENIZE_TIME).getValue()); sLogger.info( " - Total POS tagging time (ms): " + counters.findCounter(StatCounters.POSTAG_TIME).getValue()); sLogger.info(" - Total chunking time (ms): " + counters.findCounter(StatCounters.CHUNK_TIME).getValue()); sLogger.info(" - Total named entity tagging time (ms): " + counters.findCounter(StatCounters.NETAG_TIME).getValue()); sLogger.info(" - Total parse time (ms): " + counters.findCounter(StatCounters.PARSE_TIME).getValue()); return 0; }
From source file:edu.isi.mavuno.extract.CombineGlobalStats.java
License:Apache License
/**
 * Configures and runs the CombineGlobalStats job. Input is read from the
 * numbered sub-directories {@code <InputPath>/0 .. <InputPath>/(TotalSplits-1)}
 * as sequence files; output is written as block-compressed sequence files.
 *
 * @return 0 if the job completed successfully, 1 otherwise
 * @throws ClassNotFoundException declared by the job API
 * @throws InterruptedException if the job is interrupted while running
 * @throws IOException if job setup or submission fails
 */
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String inputPath = MavunoUtils.getRequiredParam("Mavuno.CombineGlobalStats.InputPath", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.CombineGlobalStats.OutputPath", conf);
    // Defaults to a single split when the parameter is not set.
    int numSplits = conf.getInt("Mavuno.CombineGlobalStats.TotalSplits", 1);

    sLogger.info("Tool name: CombineGlobalStats");
    sLogger.info(" - Input path: " + inputPath);
    sLogger.info(" - Output path: " + outputPath);
    sLogger.info(" - Number of splits: " + numSplits);

    Job job = new Job(conf);
    job.setJobName("CombineGlobalStats");

    // One input directory per split, named by its index.
    for (int split = 0; split < numSplits; split++) {
        FileInputFormat.addInputPath(job, new Path(inputPath + "/" + split));
    }
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    FileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    job.setMapOutputKeyClass(ContextPatternWritable.class);
    job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
    job.setMapOutputValueClass(ContextPatternStatsWritable.class);

    job.setOutputKeyClass(ContextPatternWritable.class);
    job.setOutputValueClass(ContextPatternStatsWritable.class);

    job.setMapperClass(MyMapper.class);
    // The reducer class doubles as the combiner.
    job.setCombinerClass(MyReducer.class);
    job.setReducerClass(MyReducer.class);

    // Report job failure through the return code instead of always returning 0.
    boolean succeeded = job.waitForCompletion(true);
    return succeeded ? 0 : 1;
}
From source file:edu.isi.mavuno.extract.CombineSplits.java
License:Apache License
public int run() throws ClassNotFoundException, InterruptedException, IOException { Configuration conf = getConf(); String examplesPath = MavunoUtils.getRequiredParam("Mavuno.CombineSplits.ExamplesPath", conf); String exampleStatsPath = MavunoUtils.getRequiredParam("Mavuno.CombineSplits.ExampleStatsPath", conf); String splitKey = MavunoUtils.getRequiredParam("Mavuno.CombineSplits.SplitKey", conf).toLowerCase(); int numSplits = conf.getInt("Mavuno.CombineSplits.TotalSplits", 1); String outputPath = MavunoUtils.getRequiredParam("Mavuno.CombineSplits.OutputPath", conf); sLogger.info("Tool name: CombineSplits"); sLogger.info(" - Examples path: " + examplesPath); sLogger.info(" - Example stats path: " + exampleStatsPath); sLogger.info(" - Split key: " + splitKey); sLogger.info(" - Total splits: " + numSplits); sLogger.info(" - Output path: " + outputPath); Job job = new Job(conf); job.setJobName("CombineSplits"); for (int split = 0; split < numSplits; split++) { FileInputFormat.addInputPath(job, new Path(examplesPath + "/" + split)); }/*from w ww .j a va 2 s. c om*/ if (MavunoUtils.pathExists(conf, exampleStatsPath)) { FileInputFormat.addInputPath(job, new Path(exampleStatsPath)); } FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.setInputFormatClass(SequenceFileInputFormat.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); FileOutputFormat.setCompressOutput(job, true); SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK); job.setMapOutputKeyClass(ContextPatternWritable.class); if ("pattern".equals(splitKey)) { job.setSortComparatorClass(ContextPatternWritable.Comparator.class); } else if ("context".equals(splitKey)) { job.setSortComparatorClass(ContextPatternWritable.IdPatternComparator.class); } else if ("pattern+context".equals(splitKey)) { job.setSortComparatorClass(ContextPatternWritable.Comparator.class); } else { throw new RuntimeException("Invalid SplitKey in CombineSplits! 
-- " + splitKey); } job.setMapOutputValueClass(ContextPatternStatsWritable.class); job.setOutputKeyClass(ContextPatternWritable.class); job.setOutputValueClass(ContextPatternStatsWritable.class); job.setMapperClass(MyMapper.class); job.setReducerClass(MyReducer.class); job.waitForCompletion(true); return 0; }
From source file:edu.isi.mavuno.extract.Extract.java
License:Apache License
/**
 * Configures and runs the Extract job using the {@code Mavuno.Extract.*}
 * parameters of the current configuration. Output is written as
 * block-compressed sequence files.
 *
 * <p>The sort comparator and partitioner are selected by the (lower-cased)
 * extractor target, which must be {@code "pattern"} or {@code "context"}.
 *
 * @return 0 if the job completed successfully, 1 otherwise
 * @throws RuntimeException if the extractor target is not supported
 * @throws NumberFormatException if {@code Mavuno.Extract.MinMatches} is not an integer
 * @throws ClassNotFoundException if the configured corpus class cannot be loaded
 * @throws InterruptedException if the job is interrupted while running
 * @throws IOException if job setup or submission fails
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String inputPath = MavunoUtils.getRequiredParam("Mavuno.Extract.InputPath", conf);
    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.Extract.CorpusPath", conf);
    String corpusClass = MavunoUtils.getRequiredParam("Mavuno.Extract.CorpusClass", conf);
    String extractorClass = MavunoUtils.getRequiredParam("Mavuno.Extract.ExtractorClass", conf);
    String extractorArgs = MavunoUtils.getRequiredParam("Mavuno.Extract.ExtractorArgs", conf);
    String extractorTarget = MavunoUtils.getRequiredParam("Mavuno.Extract.ExtractorTarget", conf).toLowerCase();
    int minContextMatches = Integer.parseInt(MavunoUtils.getRequiredParam("Mavuno.Extract.MinMatches", conf));
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.Extract.OutputPath", conf);

    sLogger.info("Tool name: Extract");
    sLogger.info(" - Input path: " + inputPath);
    sLogger.info(" - Corpus path: " + corpusPath);
    sLogger.info(" - Corpus class: " + corpusClass);
    sLogger.info(" - Extractor class: " + extractorClass);
    sLogger.info(" - Extractor arguments: " + extractorArgs);
    sLogger.info(" - Extractor target: " + extractorTarget);
    sLogger.info(" - Min context matches: " + minContextMatches);
    sLogger.info(" - Output path: " + outputPath);

    Job job = new Job(conf);
    job.setJobName("Extract");

    MavunoUtils.recursivelyAddInputPaths(job, corpusPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    // The input format is chosen at runtime from the configured class name.
    job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass));
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    FileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    // Sort order and partitioning depend on what is being extracted.
    if ("pattern".equals(extractorTarget)) {
        job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
        job.setPartitionerClass(ContextPatternWritable.IdContextPartitioner.class);
    } else if ("context".equals(extractorTarget)) {
        job.setSortComparatorClass(ContextPatternWritable.IdPatternComparator.class);
        job.setPartitionerClass(ContextPatternWritable.IdPatternPartitioner.class);
    } else {
        throw new RuntimeException("Invalid extractor target in Extract -- " + extractorTarget);
    }

    job.setMapOutputKeyClass(ContextPatternWritable.class);
    job.setMapOutputValueClass(ContextPatternStatsWritable.class);

    job.setOutputKeyClass(ContextPatternWritable.class);
    job.setOutputValueClass(ContextPatternStatsWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyCombiner.class);
    job.setReducerClass(MyReducer.class);

    // Report job failure through the return code instead of always returning 0.
    boolean succeeded = job.waitForCompletion(true);
    return succeeded ? 0 : 1;
}
From source file:edu.isi.mavuno.extract.ExtractGlobalStats.java
License:Apache License
/**
 * Runs the ExtractGlobalStats pipeline:
 * <ol>
 *   <li>runs {@code Split} on the input, writing to {@code <OutputPath>/../split};</li>
 *   <li>runs one ExtractGlobalStats job per {@code *.examples} file found in
 *       that directory, writing to {@code <OutputPath>/../split/<index>};</li>
 *   <li>runs {@code CombineGlobalStats} over those per-split outputs into
 *       {@code <OutputPath>};</li>
 *   <li>removes the intermediate split directory.</li>
 * </ol>
 *
 * @return 0 on success; 1 if one of the per-split jobs failed (in which case
 *         the combine step is skipped and the intermediate split directory is
 *         left in place)
 * @throws ClassNotFoundException if the configured corpus class cannot be loaded
 * @throws InterruptedException if a job is interrupted while running
 * @throws IOException if job setup or submission fails
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String inputPath = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.InputPath", conf);
    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.CorpusPath", conf);
    String corpusClass = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.CorpusClass", conf);
    String extractorClass = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.ExtractorClass", conf);
    String extractorArgs = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.ExtractorArgs", conf);
    String extractorTarget = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.ExtractorTarget", conf)
            .toLowerCase();
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.OutputPath", conf);

    // Intermediate directory holding the split examples and the per-split job outputs.
    String splitPath = outputPath + "/../split";

    // split examples
    conf.set("Mavuno.Split.InputPath", inputPath);
    conf.set("Mavuno.Split.OutputPath", splitPath);
    conf.set("Mavuno.Split.SplitKey", extractorTarget);
    new Split(conf).run();

    // get splits
    FileStatus[] files = MavunoUtils.getDirectoryListing(conf, splitPath);

    int split = 0;
    for (FileStatus file : files) {
        // Only the ".examples" files produced by the split step are processed.
        if (!file.getPath().getName().endsWith(".examples")) {
            continue;
        }

        conf.set("Mavuno.ExtractGlobalStats.ExamplesPath", file.getPath().toString());

        sLogger.info("Tool name: ExtractGlobalStats");
        sLogger.info(" - Input path: " + inputPath);
        sLogger.info(" - Examples path: " + file.getPath());
        sLogger.info(" - Example split: " + split);
        sLogger.info(" - Corpus path: " + corpusPath);
        sLogger.info(" - Corpus class: " + corpusClass);
        sLogger.info(" - Extractor class: " + extractorClass);
        sLogger.info(" - Extractor args: " + extractorArgs);
        sLogger.info(" - Extractor target: " + extractorTarget);
        sLogger.info(" - Output path: " + outputPath);

        // The job is created after ExamplesPath is set on conf for this split.
        Job job = new Job(conf);
        job.setJobName("ExtractGlobalStats");

        MavunoUtils.recursivelyAddInputPaths(job, corpusPath);
        FileOutputFormat.setOutputPath(job, new Path(splitPath + "/" + split));

        // The input format is chosen at runtime from the configured class name.
        job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass));
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        FileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

        job.setMapOutputKeyClass(ContextPatternWritable.class);
        job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
        job.setPartitionerClass(ContextPatternWritable.FullPartitioner.class);
        job.setMapOutputValueClass(ContextPatternStatsWritable.class);

        job.setOutputKeyClass(ContextPatternWritable.class);
        job.setOutputValueClass(ContextPatternStatsWritable.class);

        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);

        // Stop on the first failed split: combining incomplete per-split
        // output would yield wrong global statistics.
        if (!job.waitForCompletion(true)) {
            return 1;
        }

        split++;
    }

    // combine splits
    conf.setInt("Mavuno.CombineGlobalStats.TotalSplits", split);
    conf.set("Mavuno.CombineGlobalStats.InputPath", splitPath + "/");
    conf.set("Mavuno.CombineGlobalStats.OutputPath", outputPath);

    new CombineGlobalStats(conf).run();

    MavunoUtils.removeDirectory(conf, splitPath);

    return 0;
}