Usage examples for org.apache.hadoop.io.compress.CompressionCodec.createInputStream
CompressionInputStream createInputStream(InputStream in) throws IOException;
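The examples below share one pattern: a CompressionCodecFactory inspects the file name, getCodec returns a matching CompressionCodec (or null for an uncompressed file), and createInputStream wraps the raw stream for transparent decompression. Here is a minimal self-contained sketch of that pattern; the input path /tmp/example.gz is a placeholder, not taken from any example below.

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class CreateInputStreamSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path path = new Path("/tmp/example.gz"); // hypothetical input path

        // Pick a codec based on the file extension; returns null if none matches.
        CompressionCodecFactory factory = new CompressionCodecFactory(conf);
        CompressionCodec codec = factory.getCodec(path);

        // Wrap the raw stream for decompression only when a codec was found;
        // otherwise read the file as-is.
        InputStream in = (codec != null)
                ? codec.createInputStream(fs.open(path))
                : fs.open(path);

        try (BufferedReader reader = new BufferedReader(new InputStreamReader(in))) {
            System.out.println(reader.readLine());
        }
    }
}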
From source file:org.apache.tajo.engine.query.TestInsertQuery.java
License:Apache License
@Test
public final void testInsertOverwritePathWithNonFromQuery() throws Exception {
    ResultSet res = executeString("insert overwrite into location "
            + "'/tajo-data/testInsertOverwritePathWithNonFromQuery' "
            + "USING csv WITH ('csvfile.delimiter'='|','compression.codec'='org.apache.hadoop.io.compress.DeflateCodec') "
            + "select 1::INT4, 2.1::FLOAT4, 'test'");
    res.close();

    FileSystem fs = FileSystem.get(testingCluster.getConfiguration());
    Path path = new Path("/tajo-data/testInsertOverwritePathWithNonFromQuery");
    assertTrue(fs.exists(path));
    assertEquals(1, fs.listStatus(path).length);

    CompressionCodecFactory factory = new CompressionCodecFactory(testingCluster.getConfiguration());
    FileStatus file = fs.listStatus(path)[0];
    CompressionCodec codec = factory.getCodec(file.getPath());
    assertTrue(codec instanceof DeflateCodec);

    BufferedReader reader = new BufferedReader(
            new InputStreamReader(codec.createInputStream(fs.open(file.getPath()))));
    try {
        String line = reader.readLine();
        assertNotNull(line);
        String[] tokens = line.split("\\|");
        assertEquals(3, tokens.length);
        assertEquals("1", tokens[0]);
        assertEquals("2.1", tokens[1]);
        assertEquals("test", tokens[2]);
    } finally {
        reader.close();
    }
}
From source file:org.apache.tinkerpop.gremlin.hadoop.structure.hdfs.HDFSTools.java
License:Apache License
public static void decompressFile(final FileSystem fs, final String inFile, final String outFile,
        boolean deletePrevious) throws IOException {
    final Path inPath = new Path(inFile);
    final Path outPath = new Path(outFile);
    final CompressionCodecFactory factory = new CompressionCodecFactory(new Configuration());
    final CompressionCodec codec = factory.getCodec(inPath);
    final OutputStream out = fs.create(outPath);
    final InputStream in = codec.createInputStream(fs.open(inPath));
    IOUtils.copyBytes(in, out, 8192);
    IOUtils.closeStream(in);
    IOUtils.closeStream(out);
    if (deletePrevious)
        fs.delete(new Path(inFile), true);
}
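For context, a brief usage sketch of this helper; the HDFS paths are hypothetical. Note that the helper assumes getCodec finds a codec for the input path: if none matches, codec is null and the createInputStream call would throw a NullPointerException.

// Decompress a gzipped file on HDFS, keeping the original.
FileSystem fs = FileSystem.get(new Configuration());
HDFSTools.decompressFile(fs, "/data/logs/events.log.gz", "/data/logs/events.log", false);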
From source file:org.bdgenomics.adam.io.FastqRecordReader.java
License:Apache License
/**
 * Builds a new record reader given a config file and an input split.
 *
 * @param conf The Hadoop configuration object. Used for gaining access
 *             to the underlying file system.
 * @param split The file split to read.
 */
protected FastqRecordReader(final Configuration conf, final FileSplit split) throws IOException {
    maxLineLength = conf.getInt(MAX_READ_LENGTH_PROPERTY, DEFAULT_MAX_READ_LENGTH);
    file = split.getPath();
    start = split.getStart();
    end = start + split.getLength();

    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream fileIn = fs.open(file);

    CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
    CompressionCodec codec = codecFactory.getCodec(file);

    // If our codec is splittable, we can (tentatively) say that
    // we too are splittable.
    //
    // If we get a BGZFEnhancedCodec, the codec might not actually
    // be splittable. However, if we get a non-splittable gz file,
    // several things happen:
    //
    // 1. The input format will detect this, and will not split the
    //    file.
    // 2. The BGZFEnhancedCodec will check the underlying data type
    //    (BGZF vs GZIP) at input stream creation time, and will
    //    apply the appropriate codec.
    //
    // If we get an unsplittable codec, really all that we do differently
    // is skip the positioning check, since we know that we're at the
    // start of the file and can get to reading immediately.
    isSplittable = (codec instanceof SplittableCompressionCodec);

    if (codec == null) {
        // no codec; uncompressed file
        int bytesToSkip = positionAtFirstRecord(fileIn, null);
        inputStream = fileIn;
        inputStream.skip(bytesToSkip);
        lineReader = new LineReader(inputStream);
    } else if (isSplittable) {
        // file is compressed, but uses a splittable codec
        isCompressed = true;
        int bytesToSkip = positionAtFirstRecord(fileIn, codec);

        // apparent fun finding: if you don't seek back to 0,
        // SplittableCompressionCodec.createInputStream will seek in the stream
        // to a start position, and funny things happen...
        fileIn.seek(0);
        inputStream = ((SplittableCompressionCodec) codec).createInputStream(fileIn,
                codec.createDecompressor(), start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
        inputStream.skip(bytesToSkip);
        lineReader = new ResettableCompressedSplitLineReader((SplitCompressionInputStream) inputStream, conf);
    } else {
        // unsplittable compressed file:
        // expect a single split, first record at offset 0
        isCompressed = true;
        inputStream = codec.createInputStream(fileIn);
        end = Long.MAX_VALUE; // read until the end of the file
        lineReader = new LineReader(inputStream);
    }
}
From source file:org.bgi.flexlab.gaea.data.mapreduce.input.adaptor.AdaptorRecordReader.java
License:Open Source License
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    System.out.println(split.toString());
    Configuration job = context.getConfiguration();
    System.err.println(split.getPath().toString());
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new LineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LineReader(fileIn, job);
    }
    if (skipFirstLine) {
        // skip first line and re-establish "start"
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
From source file:org.bgi.flexlab.gaea.data.mapreduce.input.fastq.FastqBasicReader.java
License:Open Source License
public FastqBasicReader(Configuration job, FileSplit split, byte[] recordDelimiter) throws IOException {
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(split.getPath());

    String multiSampleList = job.get("multiSampleList");
    // note: the original compared with `multiSampleList != ""`, which is a
    // reference comparison in Java; isEmpty() is the intended check
    if (multiSampleList != null && !multiSampleList.isEmpty()) {
        FastqMultipleSample samplelist = new FastqMultipleSample(multiSampleList, false);
        FastqSample slist = samplelist.getID(split.getPath().toString());
        if (slist != null) {
            sampleID = String.valueOf(slist.getId());
        } else {
            sampleID = "+";
        }
    }

    start = split.getStart();
    end = split.getStart() + split.getLength();

    // open the file and seek to the start of the split
    FileSystem fs = split.getPath().getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(codec.createInputStream(fileIn), job);
        } else {
            in = new LineReader(codec.createInputStream(fileIn), job, this.recordDelimiterBytes);
        }
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(fileIn, job);
        } else {
            in = new LineReader(fileIn, job, this.recordDelimiterBytes);
        }
    }
    if (skipFirstLine) {
        // skip first line and re-establish "start"
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    getFirstFastqLine();
    this.pos = start;
}
From source file:org.deepak.joins.CustomRecordReader.java
License:Apache License
public CustomRecordReader(Configuration job, FileSplit split) throws IOException {
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    curSplit = split;
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new LineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LineReader(fileIn, job);
    }
    if (skipFirstLine) {
        // skip first line and re-establish "start"
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
From source file:org.gestore.hadoop.LongRecordReader.java
License:Apache License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(split.getPath());
    // set for uncompressed mid-file splits; this reader does not itself
    // perform the first-line skip
    boolean skipFirstLine = false;
    if (codec != null) {
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(codec.createInputStream(fileIn), job);
        } else {
            // the original passed the same arguments in both branches,
            // dropping the custom delimiter; pass it through here
            in = new LineReader(codec.createInputStream(fileIn), job, this.recordDelimiterBytes);
        }
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(fileIn, job);
        } else {
            in = new LineReader(fileIn, job, this.recordDelimiterBytes);
        }
    }
    this.pos = start;
    lastLine = new Text();
}
From source file:org.godhuli.rhipe.RXLineRecordReader.java
License:Apache License
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new LineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LineReader(fileIn, job);
    }
    if (skipFirstLine) {
        // skip first line and re-establish "start"
        start += in.readLine((new RHText()).getText(), 0,
                (int) Math.min((long) Integer.MAX_VALUE, end - start));
        // linecounter ++ ;
    }
    this.pos = start;
}
From source file:org.hipi.tools.downloader.Downloader.java
License:Open Source License
public int run(String[] args) throws Exception {
    // try to parse command line arguments
    CommandLine line = null;
    try {
        line = parser.parse(options, args);
    } catch (ParseException exp) {
        usage();
    }
    if (line == null) {
        usage();
    }

    String[] leftArgs = line.getArgs();
    if (leftArgs.length != 2) {
        usage();
    }

    String inputDir = leftArgs[0];
    String outputHib = leftArgs[1];

    boolean yfcc100m = line.hasOption("yfcc100m");
    int numDownloadNodes = (yfcc100m ? 1
            : ((line.hasOption("num-nodes") ? Integer.parseInt(line.getOptionValue("num-nodes")) : 1)));
    if (numDownloadNodes < 1) {
        System.err.println("Invalid number of download nodes specified [" + numDownloadNodes + "]");
        System.exit(1);
    }

    boolean overwrite = line.hasOption("force");

    System.out.println("Source directory: " + inputDir);
    System.out.println("Output HIB: " + outputHib);
    System.out.println("Overwrite output HIB if it exists: " + (overwrite ? "true" : "false"));
    System.out.println("YFCC100M format: " + (yfcc100m ? "true" : "false"));
    System.out.println("Number of download nodes: " + numDownloadNodes);

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    // Remove existing HIB if overwrite is specified and HIB exists
    if (!overwrite) {
        if (fs.exists(new Path(outputHib))) {
            System.err.println(
                    "HIB [" + outputHib + "] already exists. Use the \"--force\" argument to overwrite.");
            System.exit(1);
        }
    } else { // overwrite
        if (fs.exists(new Path(outputHib))) {
            System.out.println("Found that output HIB already exists, deleting.");
        }
    }

    fs.delete(new Path(outputHib), true);
    fs.delete(new Path(outputHib + ".dat"), true);
    fs.delete(new Path(outputHib + "_output"), true);

    // Scan source directory for list of input files
    FileStatus[] inputFiles = fs.listStatus(new Path(inputDir));
    if (inputFiles == null || inputFiles.length == 0) {
        System.err.println("Failed to find any files in source directory: " + inputDir);
        System.exit(1);
    }

    // Validate list of input files
    ArrayList<Path> sourceFiles = new ArrayList<Path>();
    for (FileStatus file : inputFiles) {
        Path path = file.getPath();

        if (yfcc100m) {
            String[] tokens = path.getName().split("-");
            if (tokens == null || tokens.length == 0) {
                System.out.println(" Skipping source file (does not follow YFCC100M file name convention): "
                        + file.getPath());
                continue;
            }
        }

        try {
            // If it exists, get the relevant compression codec
            CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
            CompressionCodec codec = codecFactory.getCodec(path);

            FSDataInputStream fis = fs.open(path);

            // If the codec was found, use it to create a decompressed input stream.
            // Otherwise, assume the input stream is already decompressed.
            BufferedReader reader = null;
            if (codec != null) {
                reader = new BufferedReader(new InputStreamReader(codec.createInputStream(fis)));
            } else {
                reader = new BufferedReader(new InputStreamReader(fis));
            }

            String fileLine = reader.readLine();
            String[] lineFields = (yfcc100m ? fileLine.split("\t") : fileLine.split("\\s+"));

            if (yfcc100m) {
                if (lineFields.length != 23) {
                    System.out.println(" Skipping source file (does not follow YFCC100M source file format): "
                            + file.getPath());
                } else {
                    System.out.println(" Adding source file: " + file.getPath());
                    sourceFiles.add(path);
                }
            } else {
                if (lineFields.length != 1) {
                    System.out.println(
                            " Skipping source file (contains multiple fields per line where only one is expected): "
                                    + file.getPath());
                    if (lineFields.length == 23) {
                        System.out.println(" Did you mean to use \"--yfcc100m\"?");
                    }
                } else {
                    System.out.println(" Adding source file: " + file.getPath());
                    sourceFiles.add(path);
                }
            }

            fis.close();
            reader = null;
        } catch (Exception e) {
            System.err.println("Skipping source file (unable to open and parse first line): " + file.getPath());
            continue;
        }
    }

    if (sourceFiles.size() == 0) {
        System.err.println("Failed to find any valid files in source directory: " + inputDir);
        System.exit(1);
    }

    // Construct path to directory containing outputHib
    String outputPath = outputHib.substring(0, outputHib.lastIndexOf('/') + 1);

    // Attach job parameters to the global Configuration object
    conf.setInt("downloader.nodes", numDownloadNodes);
    conf.setStrings("downloader.outfile", outputHib);
    conf.setStrings("downloader.outpath", outputPath);
    conf.setBoolean("downloader.yfcc100m", yfcc100m);

    Job job = Job.getInstance(conf, "hibDownload");
    job.setJarByClass(Downloader.class);
    job.setMapperClass(DownloaderMapper.class);
    job.setReducerClass(DownloaderReducer.class);
    job.setInputFormatClass(DownloaderInputFormat.class);
    job.setOutputKeyClass(BooleanWritable.class);
    job.setOutputValueClass(Text.class);
    job.setNumReduceTasks(1);

    FileOutputFormat.setOutputPath(job, new Path(outputHib + "_output"));

    Path[] inputPaths = new Path[sourceFiles.size()];
    inputPaths = sourceFiles.toArray(inputPaths);
    DownloaderInputFormat.setInputPaths(job, inputPaths);

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:org.huahinframework.core.lib.input.SimpleRecordReader.java
License:Apache License
/**
 * {@inheritDoc}
 */
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new LineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LineReader(fileIn, job);
    }
    // skip first line and re-establish "start"
    if (skipFirstLine) {
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }

    this.fileName = file.getName();
    this.fileLength = fs.getFileStatus(file).getLen();
    this.conf = context.getConfiguration();
    this.pos = start;
    this.separator = conf.get(SimpleJob.SEPARATOR, StringUtil.COMMA);
    this.regex = conf.getBoolean(SimpleJob.SEPARATOR_REGEX, false);
    init();
}