Usage examples for org.apache.hadoop.mapred.FileOutputFormat.setOutputCompressorClass
public static void setOutputCompressorClass(JobConf conf, Class<? extends CompressionCodec> codecClass)
From source file:org.apache.flink.batch.connectors.hive.HiveTableOutputFormat.java
License:Apache License
private HivePartitionWriter writerForLocation(String location) throws IOException { JobConf clonedConf = new JobConf(jobConf); clonedConf.set(OUTDIR, location);//ww w . ja v a 2 s . c o m OutputFormat outputFormat; try { StorageDescriptor sd = hiveTablePartition.getStorageDescriptor(); Class outputFormatClz = Class.forName(sd.getOutputFormat(), true, Thread.currentThread().getContextClassLoader()); outputFormatClz = HiveFileFormatUtils.getOutputFormatSubstitute(outputFormatClz); outputFormat = (OutputFormat) outputFormatClz.newInstance(); } catch (InstantiationException | IllegalAccessException | ClassNotFoundException e) { throw new FlinkRuntimeException("Unable to instantiate the hadoop output format", e); } ReflectionUtils.setConf(outputFormat, clonedConf); OutputCommitter outputCommitter = clonedConf.getOutputCommitter(); JobContext jobContext = new JobContextImpl(clonedConf, new JobID()); outputCommitter.setupJob(jobContext); final boolean isCompressed = clonedConf.getBoolean(HiveConf.ConfVars.COMPRESSRESULT.varname, false); if (isCompressed) { String codecStr = clonedConf.get(HiveConf.ConfVars.COMPRESSINTERMEDIATECODEC.varname); if (!StringUtils.isNullOrWhitespaceOnly(codecStr)) { try { Class<? extends CompressionCodec> codec = (Class<? 
extends CompressionCodec>) Class .forName(codecStr, true, Thread.currentThread().getContextClassLoader()); FileOutputFormat.setOutputCompressorClass(clonedConf, codec); } catch (ClassNotFoundException e) { throw new RuntimeException(e); } } String typeStr = clonedConf.get(HiveConf.ConfVars.COMPRESSINTERMEDIATETYPE.varname); if (!StringUtils.isNullOrWhitespaceOnly(typeStr)) { SequenceFile.CompressionType style = SequenceFile.CompressionType.valueOf(typeStr); SequenceFileOutputFormat.setOutputCompressionType(clonedConf, style); } } String taskPartition = String.valueOf(clonedConf.getInt("mapreduce.task.partition", -1)); Path taskPath = FileOutputFormat.getTaskOutputPath(clonedConf, taskPartition); FileSinkOperator.RecordWriter recordWriter; try { recordWriter = HiveFileFormatUtils.getRecordWriter(clonedConf, outputFormat, outputClass, isCompressed, tblProperties, taskPath, Reporter.NULL); } catch (HiveException e) { throw new IOException(e); } return new HivePartitionWriter(clonedConf, outputFormat, recordWriter, outputCommitter); }
From source file:org.archive.wayback.hadoop.CDXSort.java
License:Apache License
/** * The main driver for sort program. Invoke this method to submit the * map/reduce job.//from w w w . ja v a2 s. c o m * * @throws IOException * When there is communication problems with the job tracker. */ public int run(String[] args) throws Exception { boolean compressOutput = false; boolean dereferenceInputs = false; boolean canonicalize = false; boolean funkyInput = false; JobConf jobConf = new JobConf(getConf(), CDXSort.class); jobConf.setJobName("cdxsort"); jobConf.setMapperClass(IdentityMapper.class); jobConf.setReducerClass(IdentityReducer.class); JobClient client = new JobClient(jobConf); ClusterStatus cluster = client.getClusterStatus(); List<String> otherArgs = new ArrayList<String>(); for (int i = 0; i < args.length; ++i) { try { if ("-m".equals(args[i])) { jobConf.setNumMapTasks(Integer.parseInt(args[++i])); } else if ("--compress-output".equals(args[i])) { compressOutput = true; } else if ("--funky-input".equals(args[i])) { funkyInput = true; } else if ("--dereference-inputs".equals(args[i])) { dereferenceInputs = true; } else if ("--canonicalize".equals(args[i])) { canonicalize = true; } else { otherArgs.add(args[i]); } } catch (NumberFormatException except) { System.out.println("ERROR: Integer expected instead of " + args[i]); return printUsage(); } catch (ArrayIndexOutOfBoundsException except) { System.out.println("ERROR: Required parameter missing from " + args[i - 1]); return printUsage(); // exits } } // Make sure there are exactly 3 parameters left: split input output if (otherArgs.size() != 3) { System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 3."); return printUsage(); } String splitPath = otherArgs.get(0); String inputPath = otherArgs.get(1); String outputPath = otherArgs.get(2); // load the split file, find and set the number of reduces AlphaPartitioner partitioner = new AlphaPartitioner(); File localSplitFile = new File(splitPath); FileInputStream fis = new FileInputStream(localSplitFile); 
InputStreamReader isr = new InputStreamReader(fis, ByteOp.UTF8); BufferedReader bis = new BufferedReader(isr); // try { // partitioner.loadBoundaries(bis); // } catch (IOException except) { // System.err.println("ERROR: Problem loading file " + splitPath); // return printUsage(); // exits // } // jobConf.setNumReduceTasks(partitioner.getNumPartitions()); // // // copy the split file into the FS, add to the DistributedCache: //// AlphaPartitioner.setPartitionFile(jobConf, localSplitFile); // AlphaPartitioner.setSplitCache(jobConf, localSplitFile); // System.err.println("uploaded split file to FS and DistributedCache"); // // // Set job configs: // jobConf.setInputFormat(TextInputFormat.class); // // jobConf.setOutputFormat(TextOutputFormat.class); // if (canonicalize) { // jobConf.setMapperClass(CDXCanonicalizerMapClass.class); // } else { // jobConf.setMapperClass(CDXMapClass.class); // } // jobConf.setOutputKeyClass(Text.class); // jobConf.setOutputValueClass(Text.class); // jobConf.set("mapred.textoutputformat.separator", " "); // jobConf.setPartitionerClass(AlphaPartitioner.class); int inputCount = 0; // Set job input: if (dereferenceInputs) { // SO SLOW... can't add one at a time... 
// FileReader is2 = new FileReader(new File(inputPath)); // BufferedReader bis2 = new BufferedReader(is2); // while (true) { // String line = bis2.readLine(); // if (line == null) { // break; // } // FileInputFormat.addInputPath(jobConf, new Path(line)); // inputCount++; // System.err.println("Added path(" + inputCount + "): " + line); // } // PASS 2: // FileReader is2 = new FileReader(new File(inputPath)); // BufferedReader bis2 = new BufferedReader(is2); // ArrayList<String> list = new ArrayList<String>(); // // while (true) { // String line = bis2.readLine(); // if (line == null) { // break; // } // list.add(line); // inputCount++; // } // Path arr[] = new Path[list.size()]; // for(int i=0; i < list.size(); i++) { // arr[i] = new Path(list.get(i)); // } // FileInputFormat.setInputPaths(jobConf, arr); // PASS 3: if (funkyInput) { jobConf.setMapperClass(FunkyDeReffingCDXCanonicalizerMapClass.class); } else { jobConf.setMapperClass(DeReffingCDXCanonicalizerMapClass.class); } FileInputFormat.setInputPaths(jobConf, new Path(inputPath)); inputCount = 1; } else { FileInputFormat.setInputPaths(jobConf, new Path(inputPath)); inputCount = 1; } // Set job output: FileOutputFormat.setOutputPath(jobConf, new Path(outputPath)); if (compressOutput) { FileOutputFormat.setCompressOutput(jobConf, true); FileOutputFormat.setOutputCompressorClass(jobConf, GzipCodec.class); } // System.out.println("Running on " + cluster.getTaskTrackers() // + " nodes, processing " + inputCount + " files/directories" // + " into " + outputPath + " with " // + partitioner.getNumPartitions() + " reduces."); Date startTime = new Date(); System.out.println("Job started: " + startTime); jobResult = JobClient.runJob(jobConf); Date end_time = new Date(); System.out.println("Job ended: " + end_time); System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds."); return 0; }
From source file:org.terrier.utility.io.HadoopUtility.java
License:Mozilla Public License
/** Utility method to set JobOutputCompression if possible. * In general, I find that JobOutputCompression fails for * local job trackers, so this code checks the job tracker * location first./*from ww w . ja v a 2s .c o m*/ * @param conf JobConf of job. * @return true if JobOutputCompression was set. */ public static boolean setJobOutputCompression(JobConf conf) { if (!conf.get("mapred.job.tracker").equals("local")) { FileOutputFormat.setCompressOutput(conf, true); FileOutputFormat.setOutputCompressorClass(conf, GzipCodec.class); return true; } return false; }