Usage examples for org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat#setOutputCompressionType
public static void setOutputCompressionType(Job job, CompressionType style)
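setOutputCompressionType chooses the SequenceFile.CompressionType (NONE, RECORD, or BLOCK) that is used when the job's SequenceFile output is compressed; it is normally paired with setCompressOutput and, optionally, setOutputCompressorClass. The following is a minimal sketch, not taken from any of the source files below; the class name, job name, paths, and key/value types are illustrative placeholders.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

public class CompressedSequenceFileOutputExample {
    // Configures (but does not submit) a job that writes block-compressed
    // SequenceFile output. All names and paths here are placeholders.
    public static Job configure(Configuration conf) throws Exception {
        Job job = Job.getInstance(conf, "block-compressed-output");
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        SequenceFileOutputFormat.setOutputPath(job, new Path("/tmp/output"));
        // Enable compression, pick a codec, and compress blocks of records
        // (BLOCK) rather than each record individually (RECORD).
        SequenceFileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
        return job;
    }
}

BLOCK compression generally yields the smallest files for many small records, while RECORD compresses each value independently; the examples below use both, depending on the collection being repacked.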
From source file:edu.umd.cloud9.collection.ExtractHTMLFieldCollection.java
License:Apache License
@SuppressWarnings({ "unchecked", "rawtypes" }) @Override/* ww w . j av a2s .c o m*/ public int runTool() throws Exception { Configuration conf = getConf(); Job job = new Job(conf); String inputPath = conf.get("Cloud9.InputPath"); String inputFormat = conf.get("Cloud9.InputFormat"); String outputPath = conf.get("Cloud9.OutputPath"); String tag = conf.get("Cloud9.TargetTag"); job.setJobName("ExtractFieldCollection"); job.setJarByClass(ExtractHTMLFieldCollection.class); job.setMapperClass(MyMapper.class); job.setReducerClass(Reducer.class); job.setNumReduceTasks(200); job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(inputFormat)); recursivelyAddInputPaths(job, inputPath); FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.setOutputFormatClass(SequenceFileOutputFormat.class); SequenceFileOutputFormat.setCompressOutput(job, true); SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK); job.setOutputKeyClass(LongWritable.class); job.setOutputValueClass(TextDocument.class); LOG.info("ExtractFieldCollection - " + tag); LOG.info(" - Input path: " + inputPath); LOG.info(" - Input format: " + inputFormat); LOG.info(" - Output path: " + outputPath); LOG.info(" - Target tag: " + tag); job.waitForCompletion(true); return 0; }
From source file:edu.umd.cloud9.collection.trecweb.RepackTrecWebCollection.java
License:Apache License
/**
 * Runs this tool.
 */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("path").hasArg()
        .withDescription("(required) collection path").create(COLLECTION_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg()
        .withDescription("(required) output path").create(OUTPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("type").hasArg()
        .withDescription("(required) compression type: 'block', 'record', or 'none'")
        .create(COMPRESSION_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(COLLECTION_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)
            || !cmdline.hasOption(COMPRESSION_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String collection = cmdline.getOptionValue(COLLECTION_OPTION);
    String output = cmdline.getOptionValue(OUTPUT_OPTION);
    String compressionType = cmdline.getOptionValue(COMPRESSION_OPTION);

    if (!compressionType.equals("block") && !compressionType.equals("record")
            && !compressionType.equals("none")) {
        System.err.println("Error: \"" + compressionType + "\" unknown compression type!");
        System.exit(-1);
    }

    // This is the default block size.
    int blocksize = 1000000;

    Job job = new Job(getConf(), RepackTrecWebCollection.class.getSimpleName() + ":" + collection);
    FileSystem fs = FileSystem.get(job.getConfiguration());

    job.setJarByClass(RepackTrecWebCollection.class);

    LOG.info("Tool name: " + RepackTrecWebCollection.class.getCanonicalName());
    LOG.info(" - collection path: " + collection);
    LOG.info(" - output path: " + output);
    LOG.info(" - compression type: " + compressionType);

    if (compressionType.equals("block")) {
        LOG.info(" - block size: " + blocksize);
    }

    Path collectionPath = new Path(collection);
    for (FileStatus status : fs.listStatus(collectionPath)) {
        if (status.isDirectory()) {
            for (FileStatus s : fs.listStatus(status.getPath())) {
                FileInputFormat.addInputPath(job, s.getPath());
            }
        } else {
            FileInputFormat.addInputPath(job, status.getPath());
        }
    }

    // Hack to figure out number of reducers.
    int numReducers = 100;
    if (collection.toLowerCase().contains("wt10g")) {
        numReducers = 50;
    } else if (collection.toLowerCase().contains("gov2")) {
        numReducers = 200;
    }
    LOG.info(" - number of reducers: " + numReducers);
    job.setNumReduceTasks(numReducers);

    FileOutputFormat.setOutputPath(job, new Path(output));

    if (compressionType.equals("none")) {
        SequenceFileOutputFormat.setCompressOutput(job, false);
    } else {
        SequenceFileOutputFormat.setCompressOutput(job, true);
        if (compressionType.equals("record")) {
            SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.RECORD);
        } else {
            SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
            job.getConfiguration().setInt("io.seqfile.compress.blocksize", blocksize);
        }
    }

    job.setInputFormatClass(TrecWebDocumentInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(TrecWebDocument.class);
    job.setMapperClass(MyMapper.class);

    // Delete the output directory if it exists already.
    fs.delete(new Path(output), true);

    try {
        job.waitForCompletion(true);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }

    return 0;
}
From source file:edu.umd.cloud9.collection.wikipedia.RepackWikipedia.java
License:Apache License
@SuppressWarnings("static-access") @Override/*from w w w .j av a 2 s. c o m*/ public int run(String[] args) throws Exception { Options options = new Options(); options.addOption( OptionBuilder.withArgName("path").hasArg().withDescription("XML dump file").create(INPUT_OPTION)); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output location") .create(OUTPUT_OPTION)); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("mapping file") .create(MAPPING_FILE_OPTION)); options.addOption(OptionBuilder.withArgName("block|record|none").hasArg() .withDescription("compression type").create(COMPRESSION_TYPE_OPTION)); options.addOption(OptionBuilder .withArgName("en|sv|nl|de|fr|ru|it|es|vi|pl|ja|pt|zh|uk|ca|fa|no|fi|id|ar|sr|ko|hi|zh_yue|cs|tr") .hasArg().withDescription("two-letter or six-letter language code").create(LANGUAGE_OPTION)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION) || !cmdline.hasOption(MAPPING_FILE_OPTION) || !cmdline.hasOption(COMPRESSION_TYPE_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } String inputPath = cmdline.getOptionValue(INPUT_OPTION); String outputPath = cmdline.getOptionValue(OUTPUT_OPTION); String mappingFile = cmdline.getOptionValue(MAPPING_FILE_OPTION); String compressionType = cmdline.getOptionValue(COMPRESSION_TYPE_OPTION); if (!"block".equals(compressionType) && !"record".equals(compressionType) && !"none".equals(compressionType)) { System.err.println("Error: \"" + compressionType + "\" unknown compression type!"); return -1; } String language = null; if (cmdline.hasOption(LANGUAGE_OPTION)) { language = cmdline.getOptionValue(LANGUAGE_OPTION); if (!(language.length() == 2 || language.length() == 6)) { // Added length check for 6 to include languages like zh_yue System.err.println("Error: \"" + language + "\" unknown language!"); return -1; } } // this is the default block size int blocksize = 1000000; Job job = Job.getInstance(getConf()); job.setJarByClass(RepackWikipedia.class); job.setJobName(String.format("RepackWikipedia[%s: %s, %s: %s, %s: %s, %s: %s]", INPUT_OPTION, inputPath, OUTPUT_OPTION, outputPath, COMPRESSION_TYPE_OPTION, compressionType, LANGUAGE_OPTION, language)); job.getConfiguration().set(DOCNO_MAPPING_FIELD, mappingFile); LOG.info("Tool name: " + this.getClass().getName()); LOG.info(" - XML dump file: " + inputPath); LOG.info(" - output path: " + outputPath); LOG.info(" - docno mapping data file: " + mappingFile); LOG.info(" - compression type: " + compressionType); LOG.info(" - language: " + language); if ("block".equals(compressionType)) { LOG.info(" - block size: " + blocksize); } job.setNumReduceTasks(0); SequenceFileInputFormat.addInputPath(job, new Path(inputPath)); SequenceFileOutputFormat.setOutputPath(job, new Path(outputPath)); if ("none".equals(compressionType)) { SequenceFileOutputFormat.setCompressOutput(job, false); } else { SequenceFileOutputFormat.setCompressOutput(job, true); if ("record".equals(compressionType)) { SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.RECORD); } else { SequenceFileOutputFormat.setOutputCompressionType(job, 
SequenceFile.CompressionType.BLOCK); job.getConfiguration().setInt("io.seqfile.compress.blocksize", blocksize); } } if (language != null) { job.getConfiguration().set("wiki.language", language); } job.setInputFormatClass(WikipediaPageInputFormat.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(WikipediaPageFactory.getWikipediaPageClass(language)); job.setMapperClass(MyMapper.class); // Delete the output directory if it exists already. FileSystem.get(getConf()).delete(new Path(outputPath), true); job.waitForCompletion(true); return 0; }
From source file:edu.umd.cloud9.webgraph.driver.wt10g.GenericExtractLinks.java
License:Apache License
@Override
public int runTool() throws Exception {
    Configuration conf = getConf();
    Job job = new Job(conf);

    int numReducers = conf.getInt("Cloud9.Reducers", 200);

    String inputPath = conf.get("Cloud9.InputPath");
    String outputPath = conf.get("Cloud9.OutputPath");
    String mappingFile = conf.get("Cloud9.DocnoMappingFile");

    FileSystem fs = FileSystem.get(conf);
    if (!fs.exists(new Path(mappingFile))) {
        throw new RuntimeException("Error: Docno mapping data file " + mappingFile + " doesn't exist!");
    }

    DistributedCache.addCacheFile(new Path(mappingFile).toUri(), job.getConfiguration());

    job.setJobName("ExtractLinks");
    conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.setInt("mapred.task.timeout", 60000000);

    job.setNumReduceTasks(numReducers);

    job.setMapperClass(GenericExtractLinks.Map.class);
    job.setCombinerClass(GenericExtractLinks.Reduce.class);
    job.setReducerClass(GenericExtractLinks.Reduce.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(ArrayListWritable.class);

    configer.applyJobConfig(job);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    recursivelyAddInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    LOG.info("ExtractLinks");
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - mapping file: " + mappingFile);
    LOG.info(" - include internal links? " + conf.getBoolean("Cloud9.IncludeInternalLinks", false));

    job.waitForCompletion(true);
    return 0;
}
From source file:edu.umd.cloud9.webgraph.TrecExtractLinks.java
License:Apache License
@Override
public int runTool() throws Exception {
    Configuration conf = getConf();
    conf.set("mapred.child.java.opts", "-Xmx3072m");
    conf.setInt("mapred.task.timeout", 60000000);

    Job job = new Job(conf);

    int numReducers = conf.getInt("Cloud9.Reducers", 200);

    String inputPath = conf.get("Cloud9.InputPath");
    String outputPath = conf.get("Cloud9.OutputPath");
    String mappingFile = conf.get("Cloud9.DocnoMappingFile");

    FileSystem fs = FileSystem.get(conf);
    if (!fs.exists(new Path(mappingFile))) {
        throw new RuntimeException("Error: Docno mapping data file " + mappingFile + " doesn't exist!");
    }

    DistributedCache.addCacheFile(new Path(mappingFile).toUri(), job.getConfiguration());

    job.setJobName("ExtractLinks");
    job.setNumReduceTasks(numReducers);
    job.setJarByClass(TrecExtractLinks.class);

    job.setMapperClass(TrecExtractLinks.Map.class);
    job.setCombinerClass(TrecExtractLinks.Reduce.class);
    job.setReducerClass(TrecExtractLinks.Reduce.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(ArrayListWritable.class);

    configer.applyJobConfig(job);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    recursivelyAddInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    LOG.info("ExtractLinks");
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - mapping file: " + mappingFile);
    LOG.info(" - include internal links? " + conf.getBoolean("Cloud9.IncludeInternalLinks", false));

    job.waitForCompletion(true);
    return 0;
}
From source file:eu.scape_project.tb.lsdr.seqfileutility.hadoop.HadoopJob.java
License:Apache License
/**
 * Run hadoop job.
 *
 * @param strings Command line arguments
 * @return Success indicator
 * @throws Exception
 */
@Override
public int run(String[] strings) throws Exception {
    try {
        String hdfsInputDir = null;
        FileSystem hdfs = FileSystem.get(conf);

        // hdfs input path is given as command parameter
        if (pc.getHdfsInputPath() != null) {
            hdfsInputDir = pc.getHdfsInputPath();
        // hdfs input file is created
        } else {
            hdfsInputDir = "input/" + System.currentTimeMillis() + "sfu/";
            String[] extensions = null;
            if (pc.getExtStr() != null) {
                StringTokenizer st = new StringTokenizer(pc.getExtStr(), ",");
                extensions = new String[st.countTokens()];
                int i = 0;
                while (st.hasMoreTokens()) {
                    extensions[i] = st.nextToken();
                    i++;
                }
            }
            hdfs.mkdirs(new Path(hdfsInputDir));
            String hdfsIinputPath = hdfsInputDir + "inputpaths.txt";
            Path path = new Path(hdfsIinputPath);
            FSDataOutputStream outputStream = hdfs.create(path);
            List<String> dirs = StringUtils.getStringListFromString(pc.getDirsStr(), ",");
            for (String dir : dirs) {
                File directory = new File(dir);
                if (directory.isDirectory()) {
                    // Alternatively, the java traverse method can be used
                    // for creating the file paths:
                    //traverse(directory, outputStream);
                    writeFilePaths(directory, outputStream);
                } else {
                    logger.warn("Parameter \"" + dir + "\" is not a directory " + "(skipped)");
                }
            }
            outputStream.close();
            if (hdfs.exists(path)) {
                logger.info("Input paths created in \"" + hdfs.getHomeDirectory() + "/" + path.toString() + "\"");
            } else {
                logger.error("Input paths have not been created in hdfs.");
                return 1;
            }
        }

        String hadoopJobName = "Hadoop_sequence_file_creation";
        if (pc.getHadoopJobName() != null && !pc.getHadoopJobName().equals(""))
            hadoopJobName = pc.getHadoopJobName();

        Job job = new Job(conf, hadoopJobName);
        job.setJarByClass(SequenceFileUtility.class);
        job.setMapperClass(SmallFilesSequenceFileMapper.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BytesWritable.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        TextInputFormat.addInputPath(job, new Path(hdfsInputDir));

        // todo: support absolute paths
        String hdfsOutputDir = pc.getOutputDirectory() != null ? pc.getOutputDirectory()
                : "output/" + System.currentTimeMillis() + "sfu/";

        SequenceFileOutputFormat.setOutputPath(job, new Path(hdfsOutputDir));
        SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.get(pc.getCompressionType()));

        int success = job.waitForCompletion(true) ? 0 : 1;
        boolean seqFileExists = hdfs.exists(new Path(hdfsOutputDir + "part-r-00000"));
        if (success == 0 && seqFileExists) {
            logger.info("Sequence file created: \""
                    //+ hdfs.getHomeDirectory() + "/"
                    + new Path(hdfsOutputDir).toString() + "/part-r-00000" + "\"");
            pc.setOutputDirectory(hdfsOutputDir);
            return 0;
        } else {
            logger.error("Sequence file not created in hdfs");
            return 1;
        }
    } catch (Exception e) {
        logger.error("IOException occurred", e);
    } finally {
    }
    return 0;
}
From source file:gaffer.accumulo.splitpoints.EstimateSplitPointsDriver.java
License:Apache License
@Override public int run(String[] args) throws Exception { if (args.length < 5) { System.err.println("Usage: " + this.getClass().getName() + " <mapred_output_directory> <proportion_to_sample> <number_of_tablet_servers> <resulting_split_file> <input_path1>..."); return 1; }//from www . ja v a 2 s . co m // Parse arguments Path outputPath = new Path(args[0]); float proportionToSample = Float.parseFloat(args[1]); int numberTabletServers = Integer.parseInt(args[2]); Path resultingSplitsFile = new Path(args[3]); Path[] inputPaths = new Path[args.length - 4]; for (int i = 0; i < inputPaths.length; i++) { inputPaths[i] = new Path(args[i + 4]); } // Conf and job Configuration conf = getConf(); conf.setFloat("proportion_to_sample", proportionToSample); String jobName = "Estimate split points: input = "; for (int i = 0; i < inputPaths.length; i++) { jobName += inputPaths[i] + ", "; } jobName += "output = " + outputPath; Job job = Job.getInstance(conf, jobName); job.setJarByClass(getClass()); // Input job.setInputFormatClass(SequenceFileInputFormat.class); for (int i = 0; i < inputPaths.length; i++) { SequenceFileInputFormat.addInputPath(job, inputPaths[i]); } // Mapper job.setMapperClass(EstimateSplitPointsMapper.class); job.setMapOutputKeyClass(Key.class); job.setMapOutputValueClass(Value.class); // Reducer job.setReducerClass(EstimateSplitPointsReducer.class); job.setOutputKeyClass(Key.class); job.setOutputValueClass(Value.class); job.setNumReduceTasks(1); // Output job.setOutputFormatClass(SequenceFileOutputFormat.class); SequenceFileOutputFormat.setOutputPath(job, outputPath); SequenceFileOutputFormat.setCompressOutput(job, true); SequenceFileOutputFormat.setOutputCompressorClass(job, GzipCodec.class); SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK); // Run job job.waitForCompletion(true); // Successful? if (!job.isSuccessful()) { System.err.println("Error running job"); return 1; } // Number of records output // NB In the following line use mapred.Task.Counter.REDUCE_OUTPUT_RECORDS rather than // mapreduce.TaskCounter.REDUCE_OUTPUT_RECORDS as this is more compatible with earlier // versions of Hadoop. @SuppressWarnings("deprecation") Counter counter = job.getCounters() .findCounter(org.apache.hadoop.mapred.Task.Counter.REDUCE_OUTPUT_RECORDS); long recordsOutput = counter.getValue(); System.out.println("Number of records output = " + recordsOutput); // Work out when to output a split point. The number of split points // needed is the number of tablet servers minus 1 (because you don't // have to output the start of the first tablet or the end of the // last tablet). long outputEveryNthRecord = recordsOutput / (numberTabletServers - 1); // Read through resulting file, pick out the split points and write to // file. 
FileSystem fs = FileSystem.get(conf); Path resultsFile = new Path(outputPath, "part-r-00000"); @SuppressWarnings("deprecation") SequenceFile.Reader reader = new SequenceFile.Reader(fs, resultsFile, conf); PrintStream splitsWriter = new PrintStream(new BufferedOutputStream(fs.create(resultingSplitsFile, true))); Key key = new Key(); Value value = new Value(); long count = 0; int numberSplitPointsOutput = 0; while (reader.next(key, value) && numberSplitPointsOutput < numberTabletServers - 1) { count++; if (count % outputEveryNthRecord == 0) { numberSplitPointsOutput++; splitsWriter.println(new String(Base64.encodeBase64(key.getRow().getBytes()))); System.out.println("Written split point: " + key.getRow()); } } reader.close(); splitsWriter.close(); System.out.println("Number of split points output = " + numberSplitPointsOutput); return 0; }
From source file:gaffer.accumulostore.operation.hdfs.handler.job.factory.SampleDataForSplitPointsJobFactory.java
License:Apache License
private void setupOutput(final Job job, final SampleDataForSplitPoints operation, final Store store)
        throws IOException {
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(job, new Path(operation.getOutputPath()));
    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
}
From source file:gaffer.accumulostore.operation.hdfs.handler.job.SampleDataForSplitPointsJobFactory.java
License:Apache License
private void setupOutput(final Job job, final SampleDataForSplitPoints operation, final Store store)
        throws IOException {
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(job, operation.getOutputPath());
    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
}
From source file:hadoop.TrainingDriver.java
License:Open Source License
public int run(String[] args) throws Exception { Configuration conf = getConf(); String input = conf.get("gc.TrainingDriver.input"); String output = conf.get("gc.TrainingDriver.output"); String dataset = conf.get("gc.TrainingDriver.dataset"); String jobname = conf.get("gc.TrainingDriver.name"); if (input == null || output == null || dataset == null || jobname == null) { System.out.println(" Incorrect parameters "); System.exit(0);// w w w . ja va2 s . c o m } conf = addPathToDC(conf, conf.get("gc.TrainingDriver.dataset") + "*"); Job job = new Job(conf); job.setJarByClass(TrainingDriver.class); job.setJobName(jobname); job.setOutputKeyClass(IntWritable.class); job.setOutputValueClass(WeightParameter.class); job.setMapperClass(TrainingDriverMapper.class); job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(WeightParameter.class); job.setInputFormatClass(TextInputFormat.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.RECORD); job.setNumReduceTasks(0); FileInputFormat.setInputPaths(job, input); FileOutputFormat.setOutputPath(job, new Path(output)); System.out.println(" Input dir = " + input); System.out.println(" Output dir = " + output); System.out.println(" Training Input = " + dataset); System.out.println(" Name = " + jobname); if (job.waitForCompletion(true) == false) { System.err.println(" Job " + jobname + " Failed (miserably)"); System.exit(2); } return 0; }