List of usage examples for org.apache.hadoop.io.compress CompressionCodecFactory getCodec
public CompressionCodec getCodec(Path file)
From source file:org.hedera.util.SeekableInputStream.java
License:Apache License
public static SeekableInputStream getInstance(Path path, long start, long end, FileSystem fs, CompressionCodecFactory compressionCodecs) throws IOException { CompressionCodec codec = compressionCodecs.getCodec(path); FSDataInputStream din = fs.open(path); if (codec != null) { Decompressor decompressor = CodecPool.getDecompressor(codec); if (codec instanceof SplittableCompressionCodec) { SplittableCompressionCodec scodec = (SplittableCompressionCodec) codec; SplitCompressionInputStream cin = scodec.createInputStream(din, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK); return new SeekableInputStream(cin); } else {//www. j ava 2s. c om // non-splittable compression input stream // no seeking or offsetting is needed assert start == 0; CompressionInputStream cin = codec.createInputStream(din, decompressor); return new SeekableInputStream(cin, din); } } else { // non compression input stream // we seek to the start of the split din.seek(start); return new SeekableInputStream(din); } }
From source file:org.hipi.tools.downloader.Downloader.java
License:Open Source License
public int run(String[] args) throws Exception { // try to parse command line arguments CommandLine line = null;//from w w w. j a v a 2s .c o m try { line = parser.parse(options, args); } catch (ParseException exp) { usage(); } if (line == null) { usage(); } String[] leftArgs = line.getArgs(); if (leftArgs.length != 2) { usage(); } String inputDir = leftArgs[0]; String outputHib = leftArgs[1]; boolean yfcc100m = line.hasOption("yfcc100m"); int numDownloadNodes = (yfcc100m ? 1 : ((line.hasOption("num-nodes") ? Integer.parseInt(line.getOptionValue("num-nodes")) : 1))); if (numDownloadNodes < 1) { System.err.println("Invalid number of download nodes specified [" + numDownloadNodes + "]"); System.exit(1); } boolean overwrite = line.hasOption("force"); System.out.println("Source directory: " + inputDir); System.out.println("Output HIB: " + outputHib); System.out.println("Overwrite output HIB if it exists: " + (overwrite ? "true" : "false")); System.out.println("YFCC100M format: " + (yfcc100m ? "true" : "false")); System.out.println("Number of download nodes: " + numDownloadNodes); Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(conf); // Remove existing HIB if overwrite is specified and HIB exists if (!overwrite) { if (fs.exists(new Path(outputHib))) { System.err.println( "HIB [" + outputHib + "] already exists. 
Use the \"--force\" argument to overwrite."); System.exit(1); } } else { // overwrite if (fs.exists(new Path(outputHib))) { System.out.println("Found that output HIB already exists, deleting."); } } fs.delete(new Path(outputHib), true); fs.delete(new Path(outputHib + ".dat"), true); fs.delete(new Path(outputHib + "_output"), true); // Scan source directory for list of input files FileStatus[] inputFiles = fs.listStatus(new Path(inputDir)); if (inputFiles == null || inputFiles.length == 0) { System.err.println("Failed to find any files in source directory: " + inputDir); System.exit(1); } // Validate list of input files ArrayList<Path> sourceFiles = new ArrayList<Path>(); for (FileStatus file : inputFiles) { Path path = file.getPath(); if (yfcc100m) { String[] tokens = path.getName().split("-"); if (tokens == null || tokens.length == 0) { System.out.println(" Skipping source file (does not follow YFCC100M file name convention): " + file.getPath()); continue; } } try { // If it exists, get the relevant compression codec CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf); CompressionCodec codec = codecFactory.getCodec(path); FSDataInputStream fis = fs.open(path); // If the codec was found, use it to create an decompressed input stream. // Otherwise, assume input stream is already decompressed BufferedReader reader = null; if (codec != null) { reader = new BufferedReader(new InputStreamReader(codec.createInputStream(fis))); } else { reader = new BufferedReader(new InputStreamReader(fis)); } String fileLine = reader.readLine(); String[] lineFields = (yfcc100m ? 
fileLine.split("\t") : fileLine.split("\\s+")); if (yfcc100m) { if (lineFields.length != 23) { System.out.println(" Skipping source file (does not follow YFCC100M source file format): " + file.getPath()); String imageUri = null; } else { System.out.println(" Adding source file: " + file.getPath()); sourceFiles.add(path); } } else { if (lineFields.length != 1) { System.out.println( " Skipping source file (contains multiple fields per line where only one is expected): " + file.getPath()); if (lineFields.length == 23) { System.out.println(" Did you mean to use \"--yfcc100m\"?"); } String imageUri = null; } else { System.out.println(" Adding source file: " + file.getPath()); sourceFiles.add(path); } } fis.close(); reader = null; } catch (Exception e) { System.err.println("Skipping source file (unable to open and parse first line: " + file.getPath()); continue; } } if (sourceFiles.size() == 0) { System.err.println("Failed to find any valid files in source directory: " + inputDir); System.exit(1); } // Construct path to directory containing outputHib String outputPath = outputHib.substring(0, outputHib.lastIndexOf('/') + 1); // Attaching job parameters to global Configuration object conf.setInt("downloader.nodes", numDownloadNodes); conf.setStrings("downloader.outfile", outputHib); conf.setStrings("downloader.outpath", outputPath); conf.setBoolean("downloader.yfcc100m", yfcc100m); Job job = Job.getInstance(conf, "hibDownload"); job.setJarByClass(Downloader.class); job.setMapperClass(DownloaderMapper.class); job.setReducerClass(DownloaderReducer.class); job.setInputFormatClass(DownloaderInputFormat.class); job.setOutputKeyClass(BooleanWritable.class); job.setOutputValueClass(Text.class); job.setNumReduceTasks(1); FileOutputFormat.setOutputPath(job, new Path(outputHib + "_output")); Path[] inputPaths = new Path[sourceFiles.size()]; inputPaths = sourceFiles.toArray(inputPaths); DownloaderInputFormat.setInputPaths(job, inputPaths); return job.waitForCompletion(true) ? 
0 : 1; }
From source file:org.icgc.dcc.release.core.hadoop.FileGlobInputStream.java
License:Open Source License
/**
 * Opens {@code file} and, when {@code factory} resolves a codec for it, wraps the raw stream in
 * that codec's decompressing stream.
 *
 * <p>If wrapping fails, the raw stream is closed before the failure propagates.
 *
 * @param fileSystem file system that owns {@code file}
 * @param file path to open
 * @param factory codec lookup; a {@code null} codec means the file is read as-is
 * @return the raw stream for files without a codec, otherwise the decompressing stream
 * @throws IOException if the file cannot be opened or the codec stream cannot be created
 */
private static InputStream createDecodedInputStream(FileSystem fileSystem, Path file,
        CompressionCodecFactory factory) throws IOException {
    val codec = factory.getCodec(file);
    val rawStream = fileSystem.open(file);
    if (codec == null) {
        // No codec registered for this path: the content is already decoded.
        return rawStream;
    }

    try {
        return codec.createInputStream(rawStream);
    } catch (IOException | RuntimeException e) {
        // Nobody else holds the raw stream yet, so it must be released here.
        try {
            rawStream.close();
        } catch (IOException closeFailure) {
            e.addSuppressed(closeFailure);
        }
        throw e;
    }
}
From source file:org.mrgeo.hdfs.ingest.HdfsImageIngestDataProvider.java
License:Apache License
@Override public InputStream openImage() throws IOException { Path path = new Path(getResourceName()); final FileSystem fs = HadoopFileUtils.getFileSystem(conf, path); if (fs.exists(path)) { final InputStream stream = fs.open(path, 131072); // give open a 128K buffer Configuration localConf = HadoopUtils.createConfiguration(); // see if were compressed final CompressionCodecFactory factory = new CompressionCodecFactory(localConf); final CompressionCodec codec = factory.getCodec(path); if (codec != null) { return new HadoopFileUtils.CompressedSeekableStream(codec.createInputStream(stream)); }//from w ww . ja va 2s. c o m return stream; } throw new FileNotFoundException("File not found: " + path.toUri().toString()); }
From source file:org.mrgeo.image.geotools.GeotoolsRasterUtils.java
License:Apache License
private static InputStream openImageStream(String name) throws IOException { Path path = new Path(name); final FileSystem fs = HadoopFileUtils.getFileSystem(path); if (fs.exists(path)) { final InputStream stream = fs.open(path, 131072); // give open a 128K buffer Configuration localConf = HadoopUtils.createConfiguration(); // see if were compressed final CompressionCodecFactory factory = new CompressionCodecFactory(localConf); final CompressionCodec codec = factory.getCodec(path); if (codec != null) { return new HadoopFileUtils.CompressedSeekableStream(codec.createInputStream(stream)); }/*from www . j a v a 2 s .com*/ return stream; } throw new FileNotFoundException("File not found: " + path.toUri().toString()); }
From source file:org.rassee.omniture.hadoop.mapred.OmnitureDataFileRecordReader.java
License:Open Source License
/**
 * Creates a reader over one file split.
 *
 * <p>The maximum line length is read from {@code mapred.escapedlinereader.maxlength}
 * (default {@code Integer.MAX_VALUE}). For a file with a registered codec the whole
 * decompressed stream is read and {@code end} is set to {@code Long.MAX_VALUE}. For an
 * uncompressed file whose split does not begin at offset 0, the stream is positioned one byte
 * before the split and the first line read is discarded.
 *
 * <p>If setup fails after the file has been opened, the file stream is closed before the
 * failure propagates, since the caller never receives a reader to close.
 *
 * @param job job configuration
 * @param split the file split this reader covers
 * @throws IOException if the file cannot be opened, positioned or read
 */
public OmnitureDataFileRecordReader(Configuration job, FileSplit split) throws IOException {
    this.maxLineLength = job.getInt("mapred.escapedlinereader.maxlength", Integer.MAX_VALUE);
    this.start = split.getStart();
    this.end = start + split.getLength();

    final Path file = split.getPath();
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // Open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());

    boolean initialized = false;
    try {
        boolean skipFirstLine = false;
        if (codec != null) {
            lineReader = new EscapedLineReader(codec.createInputStream(fileIn), job);
            end = Long.MAX_VALUE;
        } else {
            if (start != 0) {
                // Back up one byte and drop the first line read below; presumably this keeps a
                // line that begins exactly at the split boundary from being lost.
                skipFirstLine = true;
                --start;
                fileIn.seek(start);
            }
            lineReader = new EscapedLineReader(fileIn, job);
        }

        if (skipFirstLine) {
            start += lineReader.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
        }
        this.pos = start;
        initialized = true;
    } finally {
        if (!initialized) {
            try {
                fileIn.close();
            } catch (IOException ignored) {
                // Keep the original failure; the close error is secondary.
            }
        }
    }
}
From source file:org.utils.TarballReader.java
License:Apache License
/**
 * Prepares this reader for the given split: resets position state, records the tarball's file
 * name, opens the file and wraps it in a {@code TarInputStream}.
 *
 * <p>When a compression codec is registered for the file, the tar stream reads from the
 * decompressed stream; otherwise it reads the file directly. An {@link IOException} is logged
 * and then rethrown so the caller does not continue with an unopened reader.
 *
 * @param isplit the split to read; must be a {@code FileSplit}
 * @param context task context supplying the configuration
 * @throws IOException if the file cannot be opened or the codec stream cannot be created
 * @throws InterruptedException declared by the overridden method
 */
@Override
public void initialize(InputSplit isplit, TaskAttemptContext context) throws IOException, InterruptedException {
    try {
        pos = 0;
        end = Long.MAX_VALUE;
        key = new TarballEntry();
        value = new Text();

        FileSplit split = (FileSplit) isplit;
        Path file = split.getPath();
        tarball = file.getName();

        Configuration conf = context.getConfiguration();
        CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf);
        CompressionCodec codec = compressionCodecs.getCodec(file);

        FileSystem fs = file.getFileSystem(conf);
        FSDataInputStream fileIn = fs.open(split.getPath());

        boolean opened = false;
        try {
            if (codec != null) {
                in = new TarInputStream(codec.createInputStream(fileIn));
            } else {
                // No codec registered for this file name: read it as an uncompressed tar.
                in = new TarInputStream(fileIn);
            }
            opened = true;
        } finally {
            if (!opened) {
                try {
                    fileIn.close();
                } catch (IOException ignored) {
                    // Keep the original failure; the close error is secondary.
                }
            }
        }
    } catch (IOException ex) {
        Logger.getLogger(TarballReader.class.getName()).log(Level.SEVERE, null, ex);
        throw ex;
    }
}
From source file:pl.edu.icm.coansys.commons.pig.udf.RichSequenceFileLoader.java
License:Open Source License
/**
 * Configures the job's file output compression to match the codec resolved for {@code path}.
 *
 * <p>When a codec is found, output compression is enabled with that codec's class; otherwise
 * output compression is disabled.
 *
 * @param path path whose codec is looked up
 * @param job job whose configuration supplies the codec list and receives the output settings
 */
private void setCompression(Path path, Job job) {
    CompressionCodec detected = new CompressionCodecFactory(job.getConfiguration()).getCodec(path);
    boolean compressed = detected != null;

    FileOutputFormat.setCompressOutput(job, compressed);
    if (compressed) {
        FileOutputFormat.setOutputCompressorClass(job, detected.getClass());
    }
}