List of usage examples for org.apache.hadoop.io.compress.CompressionCodec#getDefaultExtension
String getDefaultExtension();
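The method returns the file name extension conventionally associated with a codec, including the leading dot (for example ".gz" for GzipCodec), so callers can append it directly to an output file name. A minimal sketch of the pattern the examples below share, assuming only Hadoop's common libraries on the classpath (the demo class name is illustrative):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.util.ReflectionUtils;

public class GetDefaultExtensionDemo {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Codecs are instantiated via ReflectionUtils so they pick up the Configuration.
        CompressionCodec codec = ReflectionUtils.newInstance(GzipCodec.class, conf);
        // getDefaultExtension() includes the leading dot, e.g. ".gz" for GzipCodec.
        String fileName = "part-00000" + codec.getDefaultExtension();
        System.out.println(fileName); // prints "part-00000.gz"
    }
}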
From source file:com.linkedin.cubert.io.rubix.RubixOutputFormat.java
License:Open Source License
@Override
public RecordWriter<K, V> getRecordWriter(TaskAttemptContext context) throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();
    String extension = RubixConstants.RUBIX_EXTENSION;

    CompressionCodec codec = null;
    boolean isCompressed = getCompressOutput(context);
    if (isCompressed) {
        Class<?> codecClass = getOutputCompressorClass(context, DefaultCodec.class);
        codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
        extension += codec.getDefaultExtension();
    }

    Path file = getDefaultWorkFile(context, extension);
    FileSystem fs = file.getFileSystem(conf);
    FSDataOutputStream fileOut = fs.create(file, false);

    return new RubixRecordWriter<K, V>(conf, fileOut, context.getOutputKeyClass(),
            context.getOutputValueClass(), codec);
}
From source file:com.m6d.filecrush.crush.Crush.java
License:Apache License
boolean createJobConfAndParseArgs(String... args) throws ParseException, IOException {
    job = new JobConf(getConf(), Crush.class);

    /*
     * Turn off speculative execution because that's just wasting network io.
     */
    job.setMapSpeculativeExecution(false);
    job.setReduceSpeculativeExecution(false);

    /*
     * Turn off pre-emption because we don't want to kill a task after two hours of network io.
     */
    job.set("mapred.fairscheduler.preemption", "false");

    tmpDir = new Path("tmp/crush-" + UUID.randomUUID());
    outDir = new Path(tmpDir, "out");

    double threshold = 0.75;

    List<String> regexes = asList(".+");
    List<String> replacements = asList("crushed_file-${crush.timestamp}-${crush.task.num}-${crush.file.num}");
    List<String> inFormats = asList(SequenceFileInputFormat.class.getName());
    List<String> outFormats = asList(SequenceFileOutputFormat.class.getName());

    String crushTimestamp;

    Options options = buildOptions();
    CommandLine cli = new GnuParser().parse(options, args);

    if (cli.hasOption("?")) {
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(getClass().getClassLoader().getResourceAsStream("help.txt")));
        try {
            String line;
            while (null != (line = reader.readLine())) {
                System.out.println(line);
            }
        } finally {
            reader.close();
        }
        return false;
    }

    if (cli.hasOption("verbose")) {
        console = Verbosity.VERBOSE;
    } else if (cli.hasOption("info")) {
        console = Verbosity.INFO;
    } else {
        console = Verbosity.NONE;
    }

    if (cli.hasOption("ignore-regex")) {
        ignoredFilesMatcher = Pattern.compile(cli.getOptionValue("ignore-regex")).matcher("");
    }

    if (cli.hasOption("skip-regex")) {
        skippedFilesMatcher = Pattern.compile(cli.getOptionValue("skip-regex")).matcher("");
    }

    if (cli.hasOption("max-tasks")) {
        maxTasks = Integer.parseInt(cli.getOptionValue("max-tasks"));
    }

    if (cli.hasOption("job-name")) {
        job.set("mapreduce.job.name", cli.getOptionValue("job-name"));
    }

    removeEmptyFiles = cli.hasOption("remove-empty-files");
    excludeSingleFileDirs = !cli.hasOption("include-single-file-dirs");

    String[] nonOptions = cli.getArgs();

    if (2 == nonOptions.length) {
        /*
         * Stand alone mode accepts two arguments.
         */
        mode = Mode.STAND_ALONE;

        srcDir = new Path(nonOptions[0]);
        dest = new Path(nonOptions[1]);

        if (cli.hasOption("input-format")) {
            inFormats = asList(cli.getOptionValue("input-format"));
        }

        if (cli.hasOption("output-format")) {
            outFormats = asList(cli.getOptionValue("output-format"));
        }

        replacements = asList(dest.getName());

        crushTimestamp = Long.toString(currentTimeMillis());
    } else {
        /*
         * The previous version expected three or four arguments. The third one specified the number of tasks to use,
         * which is an integral number, just like the third argument in the new version, which is a timestamp. We tell
         * the two apart by looking at the value of the argument. A timestamp is going to be a huge, 14-digit number
         * while the number of tasks should be much smaller.
         */
        if ((args.length == 4 || args.length == 3) && args.length == nonOptions.length && args[2].length() != 14) {
            int maxTasks = Integer.parseInt(args[2]);

            if (maxTasks <= 0 || maxTasks > 4000) {
                throw new IllegalArgumentException("Tasks must be in the range [1, 4000]: " + maxTasks);
            }

            job.setInt("mapreduce.job.reduces", maxTasks);

            maxFileBlocks = Integer.MAX_VALUE;

            crushTimestamp = Long.toString(currentTimeMillis());

            srcDir = new Path(args[0]);
            dest = new Path(args[1]);

            mode = Mode.CLONE;

            if (args.length == 4) {
                if (args[3].equals("TEXT")) {
                    /*
                     * These are the defaults except with text input and output formats.
                     */
                    inFormats = asList(TextInputFormat.class.getName());
                    outFormats = asList(TextOutputFormat.class.getName());
                } else if (!args[3].equals("SEQUENCE")) {
                    throw new IllegalArgumentException("Type must be either TEXT or SEQUENCE: " + args[3]);
                }
            }
        } else {
            /*
             * V2 style arguments.
             */
            if (cli.hasOption("threshold")) {
                threshold = Double.parseDouble(cli.getOptionValue("threshold"));

                if (0 >= threshold || 1 < threshold || Double.isInfinite(threshold) || Double.isNaN(threshold)) {
                    throw new IllegalArgumentException("Block size threshold must be in (0, 1]: " + threshold);
                }
            }

            if (cli.hasOption("max-file-blocks")) {
                int maxFileBlocksOption = Integer.parseInt(cli.getOptionValue("max-file-blocks"));

                if (0 > maxFileBlocksOption) {
                    throw new IllegalArgumentException(
                            "Maximum file size in blocks must be positive: " + maxFileBlocksOption);
                }

                maxFileBlocks = maxFileBlocksOption;
            } else {
                maxFileBlocks = 8;
            }

            if (cli.hasOption("regex")) {
                regexes = asList(cli.getOptionValues("regex"));
            }

            if (cli.hasOption("replacement")) {
                replacements = asList(cli.getOptionValues("replacement"));
            }

            if (cli.hasOption("input-format")) {
                inFormats = asList(cli.getOptionValues("input-format"));
            }

            if (cli.hasOption("output-format")) {
                outFormats = asList(cli.getOptionValues("output-format"));
            }

            if (3 != nonOptions.length) {
                throw new IllegalArgumentException(
                        "Could not find source directory, out directory, and job timestamp");
            }

            srcDir = new Path(nonOptions[0]);
            dest = new Path(nonOptions[1]);

            crushTimestamp = nonOptions[2];

            if (cli.hasOption("clone")) {
                mode = Mode.CLONE;
            } else {
                mode = Mode.MAP_REDUCE;
            }

            if (!crushTimestamp.matches("\\d{14}")) {
                throw new IllegalArgumentException(
                        "Crush timestamp must be 14 digits yyyymmddhhMMss: " + crushTimestamp);
            }
        }

        dfsBlockSize = parseDfsBlockSize(job);
        maxEligibleSize = (long) (dfsBlockSize * threshold);
        print(Verbosity.INFO, format("\nSmall file threshold: "
                + NumberFormat.getNumberInstance(Locale.US).format(maxEligibleSize) + " bytes\n"));
    }

    /*
     * Add the crush specs and compression options to the configuration.
     */
    job.set("crush.timestamp", crushTimestamp);

    if (ignoredFilesMatcher != null) {
        job.set("crush.ignore-regex", ignoredFilesMatcher.pattern().pattern());
    }

    if (skippedFilesMatcher != null) {
        job.set("crush.skip-regex", skippedFilesMatcher.pattern().pattern());
    }

    if (regexes.size() != replacements.size() || replacements.size() != inFormats.size()
            || inFormats.size() != outFormats.size()) {
        throw new IllegalArgumentException(
                "Must be an equal number of regex, replacement, in-format, and out-format options");
    }

    job.setInt("crush.num.specs", regexes.size());

    matchers = new ArrayList<Matcher>(regexes.size());

    for (int i = 0; i < regexes.size(); i++) {
        job.set(format("crush.%d.regex", i), regexes.get(i));
        matchers.add(Pattern.compile(regexes.get(i)).matcher("dummy"));
        job.set(format("crush.%d.regex.replacement", i), replacements.get(i));

        String inFmt = inFormats.get(i);

        if ("sequence".equals(inFmt)) {
            inFmt = SequenceFileInputFormat.class.getName();
        } else if ("text".equals(inFmt)) {
            inFmt = TextInputFormat.class.getName();
        } else if ("avro".equals(inFmt)) {
            inFmt = AvroContainerInputFormat.class.getName();
        } else if ("parquet".equals(inFmt)) {
            inFmt = MapredParquetInputFormat.class.getName();
        } else {
            try {
                if (!FileInputFormat.class.isAssignableFrom(Class.forName(inFmt))) {
                    throw new IllegalArgumentException("Not a FileInputFormat:" + inFmt);
                }
            } catch (ClassNotFoundException e) {
                throw new IllegalArgumentException("Not a FileInputFormat:" + inFmt);
            }
        }

        job.set(format("crush.%d.input.format", i), inFmt);

        String outFmt = outFormats.get(i);

        if ("sequence".equals(outFmt)) {
            outFmt = SequenceFileOutputFormat.class.getName();
        } else if ("text".equals(outFmt)) {
            outFmt = TextOutputFormat.class.getName();
        } else if ("avro".equals(outFmt)) {
            outFmt = AvroContainerOutputFormat.class.getName();
        } else if ("parquet".equals(outFmt)) {
            outFmt = MapredParquetOutputFormat.class.getName();
        } else {
            try {
                if (!FileOutputFormat.class.isAssignableFrom(Class.forName(outFmt))) {
                    throw new IllegalArgumentException("Not a FileOutputFormat:" + outFmt);
                }
            } catch (ClassNotFoundException e) {
                throw new IllegalArgumentException("Not a FileOutputFormat:" + outFmt);
            }
        }

        job.set(format("crush.%d.output.format", i), outFmt);
    }

    String codec = cli.getOptionValue("compress");
    String codecClassName = null;

    if (null == codec || "deflate".equals(codec)) {
        codecClassName = DefaultCodec.class.getName();
    } else if ("none".equals(codec)) {
        codecClassName = null;
    } else if ("gzip".equals(codec)) {
        codecClassName = GzipCodec.class.getName();
    } else if ("snappy".equals(codec)) {
        codecClassName = SnappyCodec.class.getName();
    } else if ("bzip2".equals(codec)) {
        codecClassName = BZip2Codec.class.getName();
    } else {
        codecClassName = codec;
        try {
            if (!CompressionCodec.class.isAssignableFrom(Class.forName(codec))) {
                throw new IllegalArgumentException("Not a CompressionCodec: " + codec);
            }
        } catch (ClassNotFoundException e) {
            throw new IllegalArgumentException("Not a CompressionCodec: " + codec);
        }
    }

    if (null == codecClassName) {
        job.setBoolean("mapreduce.output.fileoutputformat.compress", false);
        job.set("avro.output.codec", "null");
        job.set("parquet.compression", "uncompressed");
    } else {
        job.setBoolean("mapreduce.output.fileoutputformat.compress", true);
        job.set("mapreduce.output.fileoutputformat.compress.type", "BLOCK");
        job.set("mapreduce.output.fileoutputformat.compress.codec", codecClassName);
        job.set("avro.output.codec", codec);
        job.set("parquet.compression", codec);

        try {
            CompressionCodec instance = (CompressionCodec) Class.forName(codecClassName).newInstance();
            codecExtension = instance.getDefaultExtension();
        } catch (Exception e) {
            // The codec class was already validated above, so instantiation should not fail.
            throw new AssertionError(e);
        }
    }

    return true;
}
From source file:com.m6d.filecrush.crush.TextOutputFormat.java
License:Apache License
public RecordWriter<K, V> getRecordWriter(FileSystem ignored, JobConf job, String name, Progressable progress)
        throws IOException {
    boolean isCompressed = getCompressOutput(job);
    String keyValueSeparator = job.get("mapreduce.output.textoutputformat.separator", "");

    if (!isCompressed) {
        Path file = FileOutputFormat.getTaskOutputPath(job, name);
        FileSystem fs = file.getFileSystem(job);
        FSDataOutputStream fileOut = fs.create(file, progress);
        return new LineRecordWriter<K, V>(fileOut, keyValueSeparator);
    } else {
        Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(job, GzipCodec.class);
        // create the named codec
        CompressionCodec codec = ReflectionUtils.newInstance(codecClass, job);
        // build the filename including the extension
        Path file = FileOutputFormat.getTaskOutputPath(job, name + codec.getDefaultExtension());
        FileSystem fs = file.getFileSystem(job);
        FSDataOutputStream fileOut = fs.create(file, progress);
        return new LineRecordWriter<K, V>(new DataOutputStream(codec.createOutputStream(fileOut)),
                keyValueSeparator);
    }
}
From source file:com.metamx.druid.indexer.Utils.java
License:Open Source License
public static OutputStream makePathAndOutputStream(JobContext job, Path outputPath, boolean deleteExisting)
        throws IOException {
    OutputStream retVal;
    FileSystem fs = outputPath.getFileSystem(job.getConfiguration());

    if (fs.exists(outputPath)) {
        if (deleteExisting) {
            fs.delete(outputPath, false);
        } else {
            throw new ISE("outputPath[%s] must not exist.", outputPath);
        }
    }

    if (!FileOutputFormat.getCompressOutput(job)) {
        retVal = fs.create(outputPath, false);
    } else {
        Class<? extends CompressionCodec> codecClass = FileOutputFormat.getOutputCompressorClass(job,
                GzipCodec.class);
        CompressionCodec codec = ReflectionUtils.newInstance(codecClass, job.getConfiguration());
        outputPath = new Path(outputPath.toString() + codec.getDefaultExtension());
        retVal = codec.createOutputStream(fs.create(outputPath, false));
    }

    return retVal;
}
From source file:com.panguso.lc.analysis.format.mapreduce.TextOutputFormat.java
License:Open Source License
/**
 * @param job job
 * @throws IOException IOException
 * @throws InterruptedException InterruptedException
 */
public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
    Configuration conf = job.getConfiguration();
    boolean isCompressed = getCompressOutput(job);
    String keyValueSeparator = conf.get("mapred.textoutputformat.separator", separate);

    CompressionCodec codec = null;
    String extension = "";
    if (isCompressed) {
        Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(job, GzipCodec.class);
        codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
        extension = codec.getDefaultExtension();
    }

    Path file = getDefaultWorkFile(job, extension);
    FileSystem fs = file.getFileSystem(conf);
    if (!isCompressed) {
        FSDataOutputStream fileOut = fs.create(file, false);
        return new LineRecordWriter<K, V>(fileOut, keyValueSeparator);
    } else {
        FSDataOutputStream fileOut = fs.create(file, false);
        return new LineRecordWriter<K, V>(new DataOutputStream(codec.createOutputStream(fileOut)),
                keyValueSeparator);
    }
}
From source file:com.pinterest.secor.parser.PartitionFinalizer.java
License:Apache License
public PartitionFinalizer(SecorConfig config) throws Exception {
    mConfig = config;
    mKafkaClient = new KafkaClient(mConfig);
    mZookeeperConnector = new ZookeeperConnector(mConfig);
    mThriftMessageParser = new ThriftMessageParser(mConfig);
    mQuboleClient = new QuboleClient(mConfig);
    if (mConfig.getCompressionCodec() != null && !mConfig.getCompressionCodec().isEmpty()) {
        CompressionCodec codec = (CompressionCodec) ReflectionUtil
                .createCompressionCodec(mConfig.getCompressionCodec());
        mFileExtension = codec.getDefaultExtension();
    } else {
        mFileExtension = "";
    }
}
From source file:com.pinterest.secor.uploader.Uploader.java
License:Apache License
private void trim(LogFilePath srcPath, long startOffset) throws Exception {
    if (startOffset == srcPath.getOffset()) {
        return;
    }
    Configuration config = new Configuration();
    FileSystem fs = FileSystem.get(config);
    String srcFilename = srcPath.getLogFilePath();
    Path srcFsPath = new Path(srcFilename);
    SequenceFile.Reader reader = null;
    SequenceFile.Writer writer = null;
    LogFilePath dstPath = null;
    int copiedMessages = 0;
    // Deleting the writer closes its stream, flushing all pending data to the disk.
    mFileRegistry.deleteWriter(srcPath);
    try {
        reader = createReader(fs, srcFsPath, config);
        LongWritable key = (LongWritable) reader.getKeyClass().newInstance();
        BytesWritable value = (BytesWritable) reader.getValueClass().newInstance();
        CompressionCodec codec = null;
        String extension = "";
        if (mConfig.getCompressionCodec() != null && !mConfig.getCompressionCodec().isEmpty()) {
            codec = (CompressionCodec) ReflectionUtil.createCompressionCodec(mConfig.getCompressionCodec());
            extension = codec.getDefaultExtension();
        }
        while (reader.next(key, value)) {
            if (key.get() >= startOffset) {
                if (writer == null) {
                    String localPrefix = mConfig.getLocalPath() + '/' + IdUtil.getLocalMessageDir();
                    dstPath = new LogFilePath(localPrefix, srcPath.getTopic(), srcPath.getPartitions(),
                            srcPath.getGeneration(), srcPath.getKafkaPartition(), startOffset, extension);
                    writer = mFileRegistry.getOrCreateWriter(dstPath, codec);
                }
                writer.append(key, value);
                copiedMessages++;
            }
        }
    } finally {
        if (reader != null) {
            reader.close();
        }
    }
    mFileRegistry.deletePath(srcPath);
    if (dstPath == null) {
        LOG.info("removed file " + srcPath.getLogFilePath());
    } else {
        LOG.info("trimmed " + copiedMessages + " messages from " + srcFilename + " to "
                + dstPath.getLogFilePath() + " with start offset " + startOffset);
    }
}
From source file:com.ricemap.spateDB.core.GridRecordWriter.java
License:Apache License
/**
 * Returns the path to the file in which the final cell will be written.
 * @param cellIndex index of the cell to be written
 * @return path to the output file for this cell
 * @throws IOException
 */
protected Path getFinalCellPath(int cellIndex) throws IOException {
    Path path = null;
    do {
        String filename = counter == 0 ? String.format("data_%05d", cellIndex)
                : String.format("data_%05d_%d", cellIndex, counter);
        boolean isCompressed = jobConf != null && FileOutputFormat.getCompressOutput(jobConf);
        if (isCompressed) {
            Class<? extends CompressionCodec> codecClass = FileOutputFormat.getOutputCompressorClass(jobConf,
                    GzipCodec.class);
            // create the named codec
            CompressionCodec codec = ReflectionUtils.newInstance(codecClass, jobConf);
            filename += codec.getDefaultExtension();
        }
        path = getFilePath(filename);
        counter++;
    } while (fileSystem.exists(path));
    return path;
}
From source file:com.ricemap.spateDB.mapred.TextOutputFormat.java
License:Apache License
public RecordWriter<K, V> getRecordWriter(FileSystem ignored, JobConf job, String name, Progressable progress)
        throws IOException {
    boolean isCompressed = getCompressOutput(job);
    String keyValueSeparator = job.get("mapred.textoutputformat.separator", "\t");

    if (!isCompressed) {
        Path file = FileOutputFormat.getTaskOutputPath(job, name);
        FileSystem fs = file.getFileSystem(job);
        FSDataOutputStream fileOut = fs.create(file, progress);
        return new LineRecordWriter<K, V>(fileOut, keyValueSeparator);
    } else {
        Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(job, GzipCodec.class);
        // create the named codec
        CompressionCodec codec = ReflectionUtils.newInstance(codecClass, job);
        // build the filename including the extension
        Path file = FileOutputFormat.getTaskOutputPath(job, name + codec.getDefaultExtension());
        FileSystem fs = file.getFileSystem(job);
        FSDataOutputStream fileOut = fs.create(file, progress);
        return new LineRecordWriter<K, V>(new DataOutputStream(codec.createOutputStream(fileOut)),
                keyValueSeparator);
    }
}
From source file:com.streamsets.pipeline.stage.destination.hdfs.writer.TestRecordWriterManager.java
License:Apache License
private void testPath(CompressionCodec compressionCodec) throws Exception {
    URI uri = new URI("file:///");
    Configuration conf = new HdfsConfiguration();
    String prefix = "prefix";
    String template = getTestDir().toString()
            + "/${YYYY()}/${YY()}/${MM()}/${DD()}/${hh()}/${mm()}/${ss()}/${record:value('/')}";
    TimeZone timeZone = TimeZone.getTimeZone("UTC");
    long cutOffSecs = 10;
    long cutOffSize = 20;
    long cutOffRecords = 2;
    HdfsFileType fileType = HdfsFileType.TEXT;
    SequenceFile.CompressionType compressionType = (compressionCodec == null)
            ? SequenceFile.CompressionType.NONE
            : SequenceFile.CompressionType.BLOCK;
    String keyEL = "uuid()";
    DataGeneratorFactory generatorFactory = new DummyDataGeneratorFactory(null);
    RecordWriterManager mgr = new RecordWriterManager(uri, conf, prefix, template, timeZone, cutOffSecs,
            cutOffSize, cutOffRecords, fileType, compressionCodec, compressionType, keyEL, generatorFactory,
            targetContext, "dirPathTemplate");
    Assert.assertTrue(mgr.validateDirTemplate("g", "dirPathTemplate", new ArrayList<Stage.ConfigIssue>()));

    if (compressionCodec == null) {
        Assert.assertEquals("", mgr.getExtension());
    } else {
        Assert.assertEquals(compressionCodec.getDefaultExtension(), mgr.getExtension());
    }

    Date date = getFixedDate();
    Record record = RecordCreator.create();
    record.set(Field.create("a"));
    Assert.assertTrue(mgr.getPath(date, record).toString()
            .startsWith(new Path(getTestDir(), "2015/15/01/20/09/56/01/a/_tmp_" + prefix).toString()));
}