List of usage examples for org.apache.hadoop.io.compress.CompressionCodec.getDefaultExtension()
String getDefaultExtension();
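getDefaultExtension() returns the filename suffix conventionally attached to files written with the codec, leading dot included: ".gz" for GzipCodec, ".deflate" for DefaultCodec, ".bz2" for BZip2Codec. The examples below either append the extension when writing compressed output or strip it (via CompressionCodecFactory.removeSuffix) when reading compressed input. Here is a minimal sketch of the call, assuming only the Hadoop client libraries on the classpath; the class name DefaultExtensionDemo is ours for illustration:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.util.ReflectionUtils;

public class DefaultExtensionDemo {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Instantiate the codec through ReflectionUtils so it picks up the Configuration.
        CompressionCodec codec = ReflectionUtils.newInstance(GzipCodec.class, conf);
        // The extension includes the leading dot, so it can be appended to a base name directly.
        System.out.println("part-00000" + codec.getDefaultExtension()); // prints part-00000.gz
    }
}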
From source file:com.datascience.hadoop.CsvOutputFormat.java
License:Apache License
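Appends the configured codec's default extension to the task output path when compressed output is enabled: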
@Override
public RecordWriter<LongWritable, ListWritable<Text>> getRecordWriter(FileSystem fileSystem, JobConf conf,
        String name, Progressable progress) throws IOException {
    String charsetName = conf.get(CHARSET);
    Charset charset = charsetName != null ? Charset.forName(charsetName) : StandardCharsets.UTF_8;

    Path path;
    if (FileOutputFormat.getCompressOutput(conf)) {
        Class<? extends CompressionCodec> codecClass = FileOutputFormat.getOutputCompressorClass(conf,
                GzipCodec.class);
        CompressionCodec codec = ReflectionUtils.newInstance(codecClass, conf);
        path = FileOutputFormat.getTaskOutputPath(conf, name + codec.getDefaultExtension());
    } else {
        path = FileOutputFormat.getTaskOutputPath(conf, name);
    }

    return new CsvRecordWriter(new OutputStreamWriter(path.getFileSystem(conf).create(path, progress), charset),
            createFormat(conf));
}
From source file:com.flipkart.fdp.migration.distcp.core.MirrorUtils.java
License:Apache License
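Looks up the codec matching a path's suffix and, despite the method's name, returns the codec's default extension, or null when no codec matches: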
public static String getCodecNameFromPath(Configuration conf, String path) {
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf);
    CompressionCodec codec = compressionCodecs.getCodec(new Path(path));
    if (codec == null) {
        return null;
    }
    return codec.getDefaultExtension();
}
From source file:com.flipkart.fdp.migration.distcp.core.MirrorUtils.java
License:Apache License
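Resolves a codec by name and wraps an output stream for compression, printing the codec's default extension along the way: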
public static OutputStream getCodecOutputStream(Configuration conf, String codecName, OutputStream out)
        throws IOException {
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf);
    String codecClassName = codecName;
    CodecType codecType = CodecType.getCodecType(codecName);
    if (codecType != null) {
        codecClassName = codecType.getIOCompressionCodecs();
    }
    System.out.println("codec class : " + codecClassName);
    CompressionCodec codec = compressionCodecs.getCodecByName(codecClassName);
    if (codec == null) {
        return out;
    }
    System.out.println("Getting OutputStream : " + codec.getDefaultExtension());
    System.out.println("Getting OutputStream : " + codec);
    Compressor compressor = codec.createCompressor();
    return codec.createOutputStream(out, compressor);
}
From source file:com.flipkart.fdp.migration.distcp.core.MirrorUtils.java
License:Apache License
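Wraps an input stream for decompression when the path's suffix matches a registered codec, passing the stream through unchanged otherwise: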
public static InputStream getCodecInputStream(Configuration conf, String path, InputStream in)
        throws IOException {
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf);
    CompressionCodec codec = compressionCodecs.getCodec(new Path(path));
    if (codec == null) {
        return in;
    }
    System.out.println("Getting InputStream : " + codec.getDefaultExtension());
    System.out.println("Getting InputStream : " + codec);
    Decompressor decompressor = codec.createDecompressor();
    in = codec.createInputStream(in, decompressor);
    return in;
}
From source file:com.hdfs.concat.crush.Crush.java
License:Apache License
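Parses the Crush job's command-line options; when output compression is enabled, it instantiates the chosen codec and records its default extension: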
boolean createJobConfAndParseArgs(String... args) throws ParseException, IOException {

    job = new JobConf(getConf(), Crush.class);

    /*
     * Turn off speculative execution because that's just wasting network io.
     */
    job.setMapSpeculativeExecution(false);
    job.setReduceSpeculativeExecution(false);

    /*
     * Turn off pre-emption because we don't want to kill a task after two hours of network io.
     */
    job.set("mapred.fairscheduler.preemption", "false");

    tmpDir = new Path("tmp/crush-" + UUID.randomUUID());
    outDir = new Path(tmpDir, "out");

    double threshold = 0.75;

    List<String> regexes = asList(".+");
    List<String> replacements = asList("crushed_file-${crush.timestamp}-${crush.task.num}-${crush.file.num}");
    List<String> inFormats = asList(SequenceFileInputFormat.class.getName());
    List<String> outFormats = asList(SequenceFileOutputFormat.class.getName());

    String crushTimestamp;

    Options options = buildOptions();
    CommandLine cli = new GnuParser().parse(options, args);

    if (cli.hasOption("?")) {
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(getClass().getClassLoader().getResourceAsStream("help.txt")));
        try {
            String line;
            while (null != (line = reader.readLine())) {
                System.out.println(line);
            }
        } finally {
            reader.close();
        }
        return false;
    }

    if (cli.hasOption("verbose")) {
        console = Verbosity.VERBOSE;
    } else if (cli.hasOption("info")) {
        console = Verbosity.INFO;
    } else {
        console = Verbosity.NONE;
    }

    if (cli.hasOption("ignore-regex")) {
        ignoredFiles = Pattern.compile(cli.getOptionValue("ignore-regex")).matcher("");
    }

    excludeSingleFileDirs = !cli.hasOption("include-single-file-dirs");

    String[] nonOptions = cli.getArgs();

    if (2 == nonOptions.length) {
        /*
         * Stand alone mode accepts two arguments.
         */
        mode = Mode.STAND_ALONE;

        srcDir = new Path(nonOptions[0]);
        dest = new Path(nonOptions[1]);

        if (cli.hasOption("input-format")) {
            inFormats = asList(cli.getOptionValue("input-format"));
        }

        if (cli.hasOption("output-format")) {
            outFormats = asList(cli.getOptionValue("output-format"));
        }

        replacements = asList(dest.getName());

        crushTimestamp = Long.toString(currentTimeMillis());

    } else {
        /*
         * The previous version expected three or four arguments. The third one specified the number of tasks to use,
         * which is an integral number, just like the third argument in the new version, which is a timestamp. We tell
         * the two apart by looking at the value of the argument. A timestamp is going to be a huge, 14-digit number
         * while the number of tasks should be much smaller.
         */
        if ((args.length == 4 || args.length == 3) && args.length == nonOptions.length
                && args[2].length() != 14) {
            int maxTasks = Integer.parseInt(args[2]);

            if (maxTasks <= 0 || maxTasks > 4000) {
                throw new IllegalArgumentException("Tasks must be in the range [1, 4000]: " + maxTasks);
            }

            job.setInt("mapred.reduce.tasks", maxTasks);

            maxFileBlocks = Integer.MAX_VALUE;

            crushTimestamp = Long.toString(currentTimeMillis());

            srcDir = new Path(args[0]);
            dest = new Path(args[1]);

            mode = Mode.CLONE;

            if (args.length == 4) {
                if (args[3].equals("TEXT")) {
                    /*
                     * These are the defaults except with text input and output formats.
                     */
                    inFormats = asList(TextInputFormat.class.getName());
                    outFormats = asList(TextOutputFormat.class.getName());

                } else if (!args[3].equals("SEQUENCE")) {
                    throw new IllegalArgumentException("Type must be either TEXT or SEQUENCE: " + args[3]);
                }
            }
        } else {
            /*
             * V2 style arguments.
             */
            if (cli.hasOption("threshold")) {
                threshold = Double.parseDouble(cli.getOptionValue("threshold"));

                if (0 >= threshold || 1 < threshold || Double.isInfinite(threshold)
                        || Double.isNaN(threshold)) {
                    throw new IllegalArgumentException("Block size threshold must be in (0, 1]: " + threshold);
                }
            }

            if (cli.hasOption("max-file-blocks")) {
                int maxFileBlocksOption = Integer.parseInt(cli.getOptionValue("max-file-blocks"));

                if (0 > maxFileBlocksOption) {
                    throw new IllegalArgumentException(
                            "Maximum file size in blocks must be positive: " + maxFileBlocksOption);
                }

                maxFileBlocks = maxFileBlocksOption;
            } else {
                maxFileBlocks = 8;
            }

            if (cli.hasOption("regex")) {
                regexes = asList(cli.getOptionValues("regex"));
            }

            if (cli.hasOption("replacement")) {
                replacements = asList(cli.getOptionValues("replacement"));
            }

            if (cli.hasOption("input-format")) {
                inFormats = asList(cli.getOptionValues("input-format"));
            }

            if (cli.hasOption("output-format")) {
                outFormats = asList(cli.getOptionValues("output-format"));
            }

            if (3 != nonOptions.length) {
                throw new IllegalArgumentException(
                        "Could not find source directory, out directory, and job timestamp");
            }

            srcDir = new Path(nonOptions[0]);
            dest = new Path(nonOptions[1]);

            crushTimestamp = nonOptions[2];

            if (cli.hasOption("clone")) {
                mode = Mode.CLONE;
            } else {
                mode = Mode.MAP_REDUCE;
            }

            if (!crushTimestamp.matches("\\d{14}")) {
                throw new IllegalArgumentException(
                        "Crush timestamp must be 14 digits yyyymmddhhMMss: " + crushTimestamp);
            }
        }

        dfsBlockSize = parseDfsBlockSize(job);
        maxEligibleSize = (long) (dfsBlockSize * threshold);
    }

    /*
     * Add the crush specs and compression options to the configuration.
     */
    job.set("crush.timestamp", crushTimestamp);

    if (ignoredFiles != null) {
        job.set("crush.ignore-regex", ignoredFiles.pattern().pattern());
    }

    if (regexes.size() != replacements.size() || replacements.size() != inFormats.size()
            || inFormats.size() != outFormats.size()) {
        throw new IllegalArgumentException(
                "Must be an equal number of regex, replacement, in-format, and out-format options");
    }

    job.setInt("crush.num.specs", regexes.size());

    matchers = new ArrayList<Matcher>(regexes.size());

    for (int i = 0; i < regexes.size(); i++) {
        job.set(format("crush.%d.regex", i), regexes.get(i));

        matchers.add(Pattern.compile(regexes.get(i)).matcher("dummy"));

        job.set(format("crush.%d.regex.replacement", i), replacements.get(i));

        String inFmt = inFormats.get(i);

        if ("sequence".equals(inFmt)) {
            inFmt = SequenceFileInputFormat.class.getName();
        } else if ("text".equals(inFmt)) {
            inFmt = TextInputFormat.class.getName();
        } else {
            try {
                if (!FileInputFormat.class.isAssignableFrom(Class.forName(inFmt))) {
                    throw new IllegalArgumentException("Not a FileInputFormat:" + inFmt);
                }
            } catch (ClassNotFoundException e) {
                throw new IllegalArgumentException("Not a FileInputFormat:" + inFmt);
            }
        }

        job.set(format("crush.%d.input.format", i), inFmt);

        String outFmt = outFormats.get(i);

        if ("sequence".equals(outFmt)) {
            outFmt = SequenceFileOutputFormat.class.getName();
        } else if ("text".equals(outFmt)) {
            outFmt = TextOutputFormat.class.getName();
        } else {
            try {
                if (!FileOutputFormat.class.isAssignableFrom(Class.forName(outFmt))) {
                    throw new IllegalArgumentException("Not a FileOutputFormat:" + outFmt);
                }
            } catch (ClassNotFoundException e) {
                throw new IllegalArgumentException("Not a FileOutputFormat:" + outFmt);
            }
        }

        job.set(format("crush.%d.output.format", i), outFmt);
    }

    String codec = cli.getOptionValue("compress");

    if (null == codec) {
        codec = DefaultCodec.class.getName();
    } else if ("none".equals(codec)) {
        codec = null;
    } else if ("gzip".equals(codec)) {
        codec = GzipCodec.class.getName();
    } else {
        try {
            if (!CompressionCodec.class.isAssignableFrom(Class.forName(codec))) {
                throw new IllegalArgumentException("Not a CompressionCodec: " + codec);
            }
        } catch (ClassNotFoundException e) {
            throw new IllegalArgumentException("Not a CompressionCodec: " + codec);
        }
    }

    if (null == codec) {
        job.setBoolean("mapred.output.compress", false);
    } else {
        job.setBoolean("mapred.output.compress", true);
        job.set("mapred.output.compression.type", "BLOCK");
        job.set("mapred.output.compression.codec", codec);

        try {
            CompressionCodec instance = (CompressionCodec) Class.forName(codec).newInstance();
            codecExtension = instance.getDefaultExtension();
        } catch (Exception e) {
            throw new AssertionError();
        }
    }

    return true;
}
From source file:com.hdfs.concat.crush.integration.CrushMapReduceTest.java
License:Apache License
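A test helper that locates compressed crush output by appending the codec's default extension to the expected file mask before globbing: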
/**
 * Verifies that the work dir has the expected output.
 */
private void verifyOutput(String dir, String crushOutMask, Format inFmt, Format outFmt, CompressionCodec codec,
        String... fileNames) throws IOException {

    /*
     * Read format table
     *
     *             \ out format
     *   in format \   seq    | text
     *   ------------------------------
     *         seq |  Custom  | ascii
     *   ------------------------------
     *        text |  Text    | ascii
     *   ------------------------------
     */
    if (Format.TEXT == outFmt) {
        /*
         * TextInputFormat will produce keys that are byte offsets and values that are the line. This is not actually
         * what we want. We want to preserve the actual keys and values in the files, just like
         * SequenceFileInputFormat. So, either way, the keys and values should be the text representations of what
         * went in.
         */
        BufferedReader reader;
        Path crushOut;

        if (null == codec) {
            Path path = new Path(dir + "/" + crushOutMask);
            FileStatus[] globStatus = getFileSystem().globStatus(path);

            if (globStatus == null || 1 != globStatus.length || globStatus[0].isDir()) {
                fail(crushOutMask + " was not found in " + path);
            }

            crushOut = globStatus[0].getPath();
            reader = new BufferedReader(new InputStreamReader(getFileSystem().open(crushOut)));
        } else {
            Path path = new Path(dir + "/" + crushOutMask + codec.getDefaultExtension());
            FileStatus[] globStatus = getFileSystem().globStatus(path);

            if (globStatus == null || 1 != globStatus.length || globStatus[0].isDir()) {
                fail(crushOutMask);
            }

            crushOut = globStatus[0].getPath();
            reader = new BufferedReader(
                    new InputStreamReader(codec.createInputStream(getFileSystem().open(crushOut))));
        }

        Set<String> expected = new HashSet<String>();
        Set<String> actual = new HashSet<String>();

        for (String fileName : fileNames) {
            int max = Integer.parseInt(fileName.substring(4));

            for (int key = 1, value = max * 100 + 1; key <= max; key++, value++) {
                String expectedLine = String.format("%d\t%d", key, value);
                assertThat(expectedLine, expected.add(expectedLine), is(true));

                String actualLine = reader.readLine();
                assertThat(actualLine, actual.add(actualLine), is(true));
            }
        }

        assertThat("Should be at end of crush output file " + crushOut, reader.readLine(), nullValue());

        reader.close();

        assertThat(actual, equalTo(expected));

    } else if (Format.SEQUENCE == inFmt && Format.SEQUENCE == outFmt) {
        /*
         * Record reader will produce keys that are custom writables and values that are custom writable.
         */
        FileStatus[] globStatus = getFileSystem().globStatus(new Path(dir + "/" + crushOutMask));

        if (globStatus == null || 1 != globStatus.length || globStatus[0].isDir()) {
            fail(crushOutMask);
        }

        Path crushOut = globStatus[0].getPath();

        Reader reader = new Reader(getFileSystem(), crushOut, getFileSystem().getConf());

        assertThat(reader.isBlockCompressed(), is(true));
        assertThat(reader.getCompressionCodec().getClass(), equalTo((Object) codec.getClass()));

        CustomWritable key = new CustomWritable();
        CustomWritable value = new CustomWritable();

        Set<String> expected = new HashSet<String>();
        Set<String> actual = new HashSet<String>();

        for (String fileName : fileNames) {
            int max = Integer.parseInt(fileName.substring(4));

            for (int k = 1, v = max * 100 + 1; k <= max; k++, v++) {
                reader.next(key, value);

                assertThat(expected.add(String.format("%s\t%s", k, v)), is(true));
                assertThat(actual.add(String.format("%s\t%s", key, value)), is(true));
            }
        }

        assertThat("Should be at end of crush output file " + crushOut, reader.next(key, value), is(false));

        reader.close();

        assertThat(actual, equalTo(expected));

    } else if (Format.TEXT == inFmt && Format.SEQUENCE == outFmt) {

        FileStatus[] globStatus = getFileSystem().globStatus(new Path(dir + "/" + crushOutMask));

        if (globStatus == null || 1 != globStatus.length || globStatus[0].isDir()) {
            fail(crushOutMask);
        }

        Path crushOut = globStatus[0].getPath();

        Reader reader = new Reader(getFileSystem(), crushOut, getFileSystem().getConf());

        assertThat(reader.isCompressed(), is(true));
        assertThat(reader.isBlockCompressed(), is(true));
        assertThat(reader.getCompressionCodec().getClass(), equalTo((Object) codec.getClass()));

        Text key = new Text();
        Text value = new Text();

        Set<String> expected = new HashSet<String>();
        Set<String> actual = new HashSet<String>();

        for (String fileName : fileNames) {
            int max = Integer.parseInt(fileName.substring(4));

            for (int k = 1, v = max * 100 + 1; k <= max; k++, v++) {
                reader.next(key, value);

                assertThat(expected.add(String.format("%s\t%s", k, v)), is(true));
                assertThat(actual.add(String.format("%s\t%s", key, value)), is(true));
            }
        }

        assertThat("Should be at end of crush output file " + crushOut, reader.next(key, value), is(false));

        reader.close();

        assertThat(actual, equalTo(expected));

    } else {
        fail();
    }
}
From source file:com.hp.hpit.cs.MyTextOutputFormat.java
License:Apache License
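A TextOutputFormat variant that derives the work file's extension from the configured output compressor: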
public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
    Configuration conf = job.getConfiguration();
    boolean isCompressed = getCompressOutput(job);
    String keyValueSeparator = conf.get("mapred.textoutputformat.separator", "\t");
    CompressionCodec codec = null;
    String extension = "";
    if (isCompressed) {
        Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(job, GzipCodec.class);
        codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
        extension = codec.getDefaultExtension();
    }
    Path file = getDefaultWorkFile(job, extension);
    FileSystem fs = file.getFileSystem(conf);
    if (!isCompressed) {
        FSDataOutputStream fileOut = fs.create(file, false);
        return new LineRecordWriter<K, V>(fileOut, keyValueSeparator);
    } else {
        FSDataOutputStream fileOut = fs.create(file, false);
        return new LineRecordWriter<K, V>(new DataOutputStream(codec.createOutputStream(fileOut)),
                keyValueSeparator);
    }
}
From source file:com.inmobi.conduit.CompressedFileReaderTest.java
License:Apache License
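Uncompresses a local file, using removeSuffix() with the codec's default extension to derive the output name: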
private void uncompress(String fileName) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs;
    fs = FileSystem.getLocal(conf);
    CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
    CompressionCodec codec = codecFactory.getCodec(new Path(fileName));
    if (codec == null) {
        System.out.println("cant find codec");
        System.exit(1);
    }
    LOG.info("Using compression codec [" + codec.toString() + "]");
    CompressionInputStream is = codec.createInputStream(fs.open(new Path(fileName)));
    OutputStream out = null;
    try {
        String outputURI = CompressionCodecFactory.removeSuffix(fileName, codec.getDefaultExtension());
        out = fs.create(new Path(outputURI + "-uncompressed"));
        org.apache.hadoop.io.IOUtils.copyBytes(is, out, conf);
    } finally {
        org.apache.hadoop.io.IOUtils.closeStream(out);
        IOUtils.closeStream(is);
    }
}
From source file:com.jeffy.hdfs.compression.FileCompressor.java
License:Apache License
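Compresses each input file with a pooled Compressor, building each output filename from the input name plus the codec's default extension: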
/**
 * Compresses each input file with the codec named by the first argument, writing
 * the result alongside the input with the codec's default extension appended.
 *
 * @param args args[0] is the codec name or alias (e.g. "gzip"); the remaining
 *             arguments are the files to compress
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    // For example for the 'GzipCodec' codec class name the alias are 'gzip' and 'gzipcodec'.
    CompressionCodec codec = factory.getCodecByName(args[0]);
    if (codec == null) {
        System.err.println("Compression codec not found for " + args[0]);
        System.exit(1);
    }
    String ext = codec.getDefaultExtension();
    Compressor compressor = null;
    try {
        // Obtain a Compressor from the CodecPool so it can be reused across files.
        compressor = CodecPool.getCompressor(codec);
        for (int i = 1; i < args.length; i++) {
            String filename = args[i] + ext;
            System.out.println("Compressing the file " + filename);
            try (FileSystem outFs = FileSystem.get(URI.create(filename), conf);
                    FileSystem inFs = FileSystem.get(URI.create(args[i]), conf);
                    InputStream in = inFs.open(new Path(args[i]))) {
                // Create a compressing output stream that uses the pooled Compressor.
                CompressionOutputStream out = codec.createOutputStream(outFs.create(new Path(filename)),
                        compressor);
                IOUtils.copy(in, out);
                // finish() flushes the compressed data without closing the underlying stream.
                out.finish();
                // Reset the compressor between files; reusing it without a reset throws
                // java.io.IOException: write beyond end of stream.
                compressor.reset();
            }
        }
    } finally {
        // Return the Compressor to the pool.
        CodecPool.returnCompressor(compressor);
    }
}
From source file:com.jeffy.hdfs.compression.FileDecompressor.java
License:Apache License
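Decompresses each input URI, stripping the codec's default extension with removeSuffix() to name the output: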
/**
 * Decompresses each file URI given on the command line, inferring the codec from
 * the file extension and stripping that extension to name the output.
 *
 * @param args the URIs of the compressed files
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    for (String uri : args) {
        FileSystem fs = FileSystem.get(URI.create(uri), conf);
        Path inputPath = new Path(uri);
        // Infer the codec from the file suffix; codecs are registered via io.compression.codecs.
        CompressionCodec codec = factory.getCodec(inputPath);
        if (codec == null) {
            System.err.println("No codec found for " + uri);
            continue;
        }
        // Strip the codec's default extension to build the output file name.
        String outputUri = CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());
        try (InputStream in = codec.createInputStream(fs.open(inputPath));
                OutputStream out = fs.create(new Path(outputUri))) {
            IOUtils.copyBytes(in, out, conf);
        }
    }
}