List of usage examples for org.apache.hadoop.io.compress.CompressionCodecFactory.getCodec
public CompressionCodec getCodec(Path file)
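Before the project-specific examples, here is a minimal, self-contained sketch of the typical getCodec pattern (the class name GetCodecExample and the path /tmp/example.txt.gz are placeholders, not taken from any project below): build the factory from a Configuration, let getCodec resolve a codec from the file name suffix, and treat a null result as an uncompressed file.

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class GetCodecExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path file = new Path("/tmp/example.txt.gz"); // hypothetical path
        FileSystem fs = file.getFileSystem(conf);

        // getCodec matches on the file name suffix (e.g. ".gz" -> GzipCodec)
        // and returns null when no registered codec matches.
        CompressionCodecFactory factory = new CompressionCodecFactory(conf);
        CompressionCodec codec = factory.getCodec(file);

        // Wrap the raw stream in a decompressor only when a codec was found.
        InputStream in = (codec != null)
                ? codec.createInputStream(fs.open(file))
                : fs.open(file);
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(in, StandardCharsets.UTF_8))) {
            System.out.println(reader.readLine());
        }
    }
}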
From source file: io.prestosql.plugin.hive.HiveUtil.java
License: Apache License
public static Optional<CompressionCodec> getCompressionCodec(TextInputFormat inputFormat, Path file)
{
    CompressionCodecFactory compressionCodecFactory;

    try {
        compressionCodecFactory = (CompressionCodecFactory) COMPRESSION_CODECS_FIELD.get(inputFormat);
    }
    catch (IllegalAccessException e) {
        throw new PrestoException(GENERIC_INTERNAL_ERROR,
                "Failed to find compressionCodec for inputFormat: " + inputFormat.getClass().getName(), e);
    }

    if (compressionCodecFactory == null) {
        return Optional.empty();
    }

    return Optional.ofNullable(compressionCodecFactory.getCodec(file));
}
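The Presto example above reads the factory out of TextInputFormat's private field via reflection. Below is a hedged sketch of how such a COMPRESSION_CODECS_FIELD constant could be initialised; it assumes org.apache.hadoop.mapred.TextInputFormat stores its CompressionCodecFactory in a private field named "compressionCodecs", and the wrapper class name is hypothetical, not quoted from the Presto source.

import java.lang.reflect.Field;

import org.apache.hadoop.mapred.TextInputFormat;

final class CompressionCodecsFieldAccess {
    // Assumption: TextInputFormat declares "private CompressionCodecFactory compressionCodecs".
    static final Field COMPRESSION_CODECS_FIELD;

    static {
        try {
            COMPRESSION_CODECS_FIELD = TextInputFormat.class.getDeclaredField("compressionCodecs");
            COMPRESSION_CODECS_FIELD.setAccessible(true);
        } catch (NoSuchFieldException e) {
            throw new ExceptionInInitializerError(e);
        }
    }

    private CompressionCodecsFieldAccess() {
    }
}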
From source file: ml.shifu.shifu.util.HdfsGlobalFile.java
License: Apache License
private InputStream openPartFileAsStream(FileStatus fileStatus) throws IOException {
    CompressionCodecFactory compressionFactory = new CompressionCodecFactory(new Configuration());
    InputStream is = null;
    FileSystem fs = ShifuFileUtils.getFileSystemBySourceType(sourceType);

    CompressionCodec codec = compressionFactory.getCodec(fileStatus.getPath());
    if (codec != null) {
        is = codec.createInputStream(fs.open(fileStatus.getPath()));
    } else {
        is = fs.open(fileStatus.getPath());
    }
    return is;
}
From source file: nyu.cs.webgraph.MRhelpers.LzoTabSeperatedLineRecordReader.java
License: Open Source License
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    FileSplit split = (FileSplit) genericSplit;
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    Configuration job = context.getConfiguration();

    FileSystem fs = file.getFileSystem(job);
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);
    if (codec == null) {
        throw new IOException("Codec for file " + file + " not found, cannot run");
    }

    // open the file and seek to the start of the split
    fileIn = fs.open(split.getPath());

    // creates input stream and also reads the file header
    in = new LineReader(codec.createInputStream(fileIn), job);

    if (start != 0) {
        fileIn.seek(start);

        // read and ignore the first line
        in.readLine(new Text());
        start = fileIn.getPos();
    }

    this.pos = start;
}
From source file: org.apache.jena.hadoop.rdf.io.input.readers.AbstractBlockBasedNodeTupleReader.java
License: Apache License
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    LOG.debug("initialize({}, {})", genericSplit, context);

    // Assuming file split
    if (!(genericSplit instanceof FileSplit))
        throw new IOException("This record reader only supports FileSplit inputs");
    FileSplit split = (FileSplit) genericSplit;

    // Configuration
    Configuration config = context.getConfiguration();
    this.ignoreBadTuples = config.getBoolean(RdfIOConstants.INPUT_IGNORE_BAD_TUPLES, true);
    if (this.ignoreBadTuples)
        LOG.warn(
                "Configured to ignore bad tuples, parsing errors will be logged and further parsing aborted but no user visible errors will be thrown. Consider setting {} to false to disable this behaviour",
                RdfIOConstants.INPUT_IGNORE_BAD_TUPLES);

    // Figure out what portion of the file to read
    start = split.getStart();
    long end = start + split.getLength();
    final Path file = split.getPath();
    long totalLength = file.getFileSystem(context.getConfiguration()).getFileStatus(file).getLen();
    boolean readToEnd = end == totalLength;
    CompressionCodecFactory factory = new CompressionCodecFactory(config);
    this.compressionCodecs = factory.getCodec(file);

    LOG.info(String.format("Got split with start %d and length %d for file with total length of %d",
            new Object[] { start, split.getLength(), totalLength }));

    // Open the file and prepare the input stream
    FileSystem fs = file.getFileSystem(config);
    FSDataInputStream fileIn = fs.open(file);
    this.length = split.getLength();
    if (start > 0)
        fileIn.seek(start);

    if (this.compressionCodecs != null) {
        // Compressed input
        // For compressed input NLineInputFormat will have failed to find
        // any line breaks and will give us a split from 0 -> (length - 1)
        // Add 1 and re-verify readToEnd so we can abort correctly if ever
        // given a partial split of a compressed file
        end++;
        readToEnd = end == totalLength;
        if (start > 0 || !readToEnd)
            throw new IOException(
                    "This record reader can only be used with compressed input where the split is a whole file");
        input = new TrackedInputStream(this.compressionCodecs.createInputStream(fileIn));
    } else {
        // Uncompressed input
        if (readToEnd) {
            input = new TrackedInputStream(fileIn);
        } else {
            // Need to limit the portion of the file we are reading
            input = new BlockInputStream(fileIn, split.getLength());
        }
    }

    // Set up background thread for parser
    iter = this.getPipedIterator();
    this.stream = this.getPipedStream(iter, this.input);
    RDFParserBuilder builder = RdfIOUtils.createRDFParserBuilder(context, file);
    Runnable parserRunnable = this.createRunnable(this, this.input, stream, this.getRdfLanguage(), builder);
    this.parserThread = new Thread(parserRunnable);
    this.parserThread.setDaemon(true);
    this.parserThread.start();
}
From source file: org.apache.jena.hadoop.rdf.io.input.readers.AbstractWholeFileNodeTupleReader.java
License: Apache License
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    LOG.debug("initialize({}, {})", genericSplit, context);

    // Assuming file split
    if (!(genericSplit instanceof FileSplit))
        throw new IOException("This record reader only supports FileSplit inputs");
    FileSplit split = (FileSplit) genericSplit;

    // Configuration
    Configuration config = context.getConfiguration();
    this.ignoreBadTuples = config.getBoolean(RdfIOConstants.INPUT_IGNORE_BAD_TUPLES, true);
    if (this.ignoreBadTuples)
        LOG.warn(
                "Configured to ignore bad tuples, parsing errors will be logged and further parsing aborted but no user visible errors will be thrown. Consider setting {} to false to disable this behaviour",
                RdfIOConstants.INPUT_IGNORE_BAD_TUPLES);

    // Figure out what portion of the file to read
    if (split.getStart() > 0)
        throw new IOException("This record reader requires a file split which covers the entire file");
    final Path file = split.getPath();
    long totalLength = file.getFileSystem(context.getConfiguration()).getFileStatus(file).getLen();
    CompressionCodecFactory factory = new CompressionCodecFactory(config);
    this.compressionCodecs = factory.getCodec(file);

    LOG.info(String.format("Got split with start %d and length %d for file with total length of %d",
            new Object[] { split.getStart(), split.getLength(), totalLength }));

    if (totalLength > split.getLength())
        throw new IOException("This record reader requires a file split which covers the entire file");

    // Open the file and prepare the input stream
    FileSystem fs = file.getFileSystem(config);
    FSDataInputStream fileIn = fs.open(file);
    this.length = split.getLength();

    if (this.compressionCodecs != null) {
        // Compressed input
        input = new TrackedInputStream(this.compressionCodecs.createInputStream(fileIn));
    } else {
        // Uncompressed input
        input = new TrackedInputStream(fileIn);
    }

    // Set up background thread for parser
    iter = this.getPipedIterator();
    this.stream = this.getPipedStream(iter, this.input);
    RDFParserBuilder builder = RdfIOUtils.createRDFParserBuilder(context, file);
    Runnable parserRunnable = this.createRunnable(this, this.input, stream, this.getRdfLanguage(), builder);
    this.parserThread = new Thread(parserRunnable);
    this.parserThread.setDaemon(true);
    this.parserThread.start();
}
From source file: org.apache.kylin.job.hadoop.cardinality.HiveColumnCardinalityUpdateJob.java
License: Apache License
private static List<String> readLines(Path location, Configuration conf) throws Exception {
    FileSystem fileSystem = FileSystem.get(location.toUri(), conf);
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    FileStatus[] items = fileSystem.listStatus(location);
    if (items == null)
        return new ArrayList<String>();
    List<String> results = new ArrayList<String>();
    for (FileStatus item : items) {

        // ignoring files like _SUCCESS
        if (item.getPath().getName().startsWith("_")) {
            continue;
        }

        CompressionCodec codec = factory.getCodec(item.getPath());
        InputStream stream = null;

        // check if we have a compression codec we need to use
        if (codec != null) {
            stream = codec.createInputStream(fileSystem.open(item.getPath()));
        } else {
            stream = fileSystem.open(item.getPath());
        }

        StringWriter writer = new StringWriter();
        IOUtils.copy(stream, writer, "UTF-8");
        String raw = writer.toString();
        for (String str : raw.split("\n")) {
            results.add(str);
        }
    }
    return results;
}
From source file: org.apache.lens.lib.query.TestAbstractFileFormatter.java
License: Apache License
/**
 * Read compressed file.
 *
 * @param finalPath the final path
 * @param conf      the conf
 * @param encoding  the encoding
 * @return the list
 * @throws IOException Signals that an I/O exception has occurred.
 */
protected List<String> readCompressedFile(Path finalPath, Configuration conf, String encoding)
        throws IOException {
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf);
    compressionCodecs = new CompressionCodecFactory(conf);
    final CompressionCodec codec = compressionCodecs.getCodec(finalPath);
    FileSystem fs = finalPath.getFileSystem(conf);
    return readFromStream(new InputStreamReader(codec.createInputStream(fs.open(finalPath)), encoding));
}
From source file: org.apache.nifi.processors.hadoop.FetchHDFS.java
License: Apache License
@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
    FlowFile flowFile = session.get();
    if (flowFile == null) {
        return;
    }

    final FileSystem hdfs = getFileSystem();
    final UserGroupInformation ugi = getUserGroupInformation();
    final String filenameValue = context.getProperty(FILENAME).evaluateAttributeExpressions(flowFile)
            .getValue();

    final Path path;
    try {
        path = new Path(filenameValue);
    } catch (IllegalArgumentException e) {
        getLogger().error("Failed to retrieve content from {} for {} due to {}; routing to failure",
                new Object[] { filenameValue, flowFile, e });
        flowFile = session.putAttribute(flowFile, "hdfs.failure.reason", e.getMessage());
        flowFile = session.penalize(flowFile);
        session.transfer(flowFile, REL_FAILURE);
        return;
    }

    final StopWatch stopWatch = new StopWatch(true);
    final FlowFile finalFlowFile = flowFile;

    ugi.doAs(new PrivilegedAction<Object>() {
        @Override
        public Object run() {
            InputStream stream = null;
            CompressionCodec codec = null;
            Configuration conf = getConfiguration();
            final CompressionCodecFactory compressionCodecFactory = new CompressionCodecFactory(conf);
            final CompressionType compressionType = CompressionType
                    .valueOf(context.getProperty(COMPRESSION_CODEC).toString());
            final boolean inferCompressionCodec = compressionType == CompressionType.AUTOMATIC;

            if (inferCompressionCodec) {
                codec = compressionCodecFactory.getCodec(path);
            } else if (compressionType != CompressionType.NONE) {
                codec = getCompressionCodec(context, getConfiguration());
            }

            FlowFile flowFile = finalFlowFile;
            final Path qualifiedPath = path.makeQualified(hdfs.getUri(), hdfs.getWorkingDirectory());
            try {
                final String outputFilename;
                final String originalFilename = path.getName();
                stream = hdfs.open(path, 16384);

                // Check if compression codec is defined (inferred or otherwise)
                if (codec != null) {
                    stream = codec.createInputStream(stream);
                    outputFilename = StringUtils.removeEnd(originalFilename, codec.getDefaultExtension());
                } else {
                    outputFilename = originalFilename;
                }

                flowFile = session.importFrom(stream, finalFlowFile);
                flowFile = session.putAttribute(flowFile, CoreAttributes.FILENAME.key(), outputFilename);

                stopWatch.stop();
                getLogger().info("Successfully received content from {} for {} in {}",
                        new Object[] { qualifiedPath, flowFile, stopWatch.getDuration() });
                session.getProvenanceReporter().fetch(flowFile, qualifiedPath.toString(),
                        stopWatch.getDuration(TimeUnit.MILLISECONDS));
                session.transfer(flowFile, REL_SUCCESS);
            } catch (final FileNotFoundException | AccessControlException e) {
                getLogger().error("Failed to retrieve content from {} for {} due to {}; routing to failure",
                        new Object[] { qualifiedPath, flowFile, e });
                flowFile = session.putAttribute(flowFile, "hdfs.failure.reason", e.getMessage());
                flowFile = session.penalize(flowFile);
                session.transfer(flowFile, REL_FAILURE);
            } catch (final IOException e) {
                getLogger().error(
                        "Failed to retrieve content from {} for {} due to {}; routing to comms.failure",
                        new Object[] { qualifiedPath, flowFile, e });
                flowFile = session.penalize(flowFile);
                session.transfer(flowFile, REL_COMMS_FAILURE);
            } finally {
                IOUtils.closeQuietly(stream);
            }

            return null;
        }
    });
}
From source file: org.apache.nifi.processors.hadoop.GetHDFS.java
License: Apache License
protected void processBatchOfFiles(final List<Path> files, final ProcessContext context,
        final ProcessSession session) {
    // process the batch of files
    InputStream stream = null;
    CompressionCodec codec = null;
    Configuration conf = getConfiguration();
    FileSystem hdfs = getFileSystem();
    final boolean keepSourceFiles = context.getProperty(KEEP_SOURCE_FILE).asBoolean();
    final Double bufferSizeProp = context.getProperty(BUFFER_SIZE).asDataSize(DataUnit.B);
    int bufferSize = bufferSizeProp != null ? bufferSizeProp.intValue()
            : conf.getInt(BUFFER_SIZE_KEY, BUFFER_SIZE_DEFAULT);
    final Path rootDir = new Path(context.getProperty(DIRECTORY).evaluateAttributeExpressions().getValue());

    final CompressionType compressionType = CompressionType
            .valueOf(context.getProperty(COMPRESSION_CODEC).toString());
    final boolean inferCompressionCodec = compressionType == CompressionType.AUTOMATIC;
    if (inferCompressionCodec || compressionType != CompressionType.NONE) {
        codec = getCompressionCodec(context, getConfiguration());
    }
    final CompressionCodecFactory compressionCodecFactory = new CompressionCodecFactory(conf);

    for (final Path file : files) {
        try {
            if (!getUserGroupInformation().doAs((PrivilegedExceptionAction<Boolean>) () -> hdfs.exists(file))) {
                continue; // if file is no longer there then move on
            }
            final String originalFilename = file.getName();
            final String relativePath = getPathDifference(rootDir, file);

            stream = getUserGroupInformation()
                    .doAs((PrivilegedExceptionAction<FSDataInputStream>) () -> hdfs.open(file, bufferSize));

            final String outputFilename;

            // Check if we should infer compression codec
            if (inferCompressionCodec) {
                codec = compressionCodecFactory.getCodec(file);
            }

            // Check if compression codec is defined (inferred or otherwise)
            if (codec != null) {
                stream = codec.createInputStream(stream);
                outputFilename = StringUtils.removeEnd(originalFilename, codec.getDefaultExtension());
            } else {
                outputFilename = originalFilename;
            }

            FlowFile flowFile = session.create();
            final StopWatch stopWatch = new StopWatch(true);
            flowFile = session.importFrom(stream, flowFile);
            stopWatch.stop();
            final String dataRate = stopWatch.calculateDataRate(flowFile.getSize());
            final long millis = stopWatch.getDuration(TimeUnit.MILLISECONDS);

            flowFile = session.putAttribute(flowFile, CoreAttributes.PATH.key(), relativePath);
            flowFile = session.putAttribute(flowFile, CoreAttributes.FILENAME.key(), outputFilename);

            if (!keepSourceFiles && !getUserGroupInformation()
                    .doAs((PrivilegedExceptionAction<Boolean>) () -> hdfs.delete(file, false))) {
                getLogger().warn("Could not remove {} from HDFS. Not ingesting this file ...",
                        new Object[] { file });
                session.remove(flowFile);
                continue;
            }

            session.getProvenanceReporter().receive(flowFile, file.toString());
            session.transfer(flowFile, REL_SUCCESS);
            getLogger().info("retrieved {} from HDFS {} in {} milliseconds at a rate of {}",
                    new Object[] { flowFile, file, millis, dataRate });
            session.commit();
        } catch (final Throwable t) {
            getLogger().error("Error retrieving file {} from HDFS due to {}", new Object[] { file, t });
            session.rollback();
            context.yield();
        } finally {
            IOUtils.closeQuietly(stream);
            stream = null;
        }
    }
}
From source file: org.apache.tajo.engine.query.TestInsertQuery.java
License: Apache License
@Test
public final void testInsertOverwriteWithCompression() throws Exception {
    String tableName = CatalogUtil.normalizeIdentifier("testInsertOverwriteWithCompression");
    ResultSet res = executeFile("testInsertOverwriteWithCompression_ddl.sql");
    res.close();

    CatalogService catalog = testingCluster.getMaster().getCatalog();
    assertTrue(catalog.existsTable(getCurrentDatabase(), tableName));

    res = executeQuery();
    res.close();
    TableDesc desc = catalog.getTableDesc(getCurrentDatabase(), tableName);
    if (!testingCluster.isHCatalogStoreRunning()) {
        assertEquals(2, desc.getStats().getNumRows().intValue());
    }

    FileSystem fs = FileSystem.get(testingCluster.getConfiguration());
    assertTrue(fs.exists(new Path(desc.getPath())));
    CompressionCodecFactory factory = new CompressionCodecFactory(testingCluster.getConfiguration());

    for (FileStatus file : fs.listStatus(new Path(desc.getPath()))) {
        CompressionCodec codec = factory.getCodec(file.getPath());
        assertTrue(codec instanceof DeflateCodec);
    }

    executeString("DROP TABLE " + tableName + " PURGE");
}