Example usage for org.apache.hadoop.io.compress CompressionCodecFactory getCodec

Introduction

This page collects example usages of CompressionCodecFactory.getCodec from org.apache.hadoop.io.compress, drawn from open-source projects.

Prototype

public CompressionCodec getCodec(Path file) 

Document

Find the relevant compression codec for the given file based on its filename suffix.
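
Before the project examples, here is a minimal self-contained sketch of the typical pattern. It assumes a default Configuration and a hypothetical input path; getCodec returns null when no registered codec matches the filename suffix, so the caller falls back to reading the raw stream.

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class CodecLookupSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path file = new Path("/tmp/example/input.gz"); // hypothetical path

        // The factory maps the filename suffix to a registered codec, or null if none matches.
        CompressionCodecFactory factory = new CompressionCodecFactory(conf);
        CompressionCodec codec = factory.getCodec(file);

        FileSystem fs = file.getFileSystem(conf);
        InputStream in = (codec != null)
                ? codec.createInputStream(fs.open(file)) // decompress while reading
                : fs.open(file);                         // unknown suffix: read as-is

        try (BufferedReader reader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) {
            System.out.println(reader.readLine());
        }
    }
}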

Usage

From source file:io.prestosql.plugin.hive.HiveUtil.java

License:Apache License

public static Optional<CompressionCodec> getCompressionCodec(TextInputFormat inputFormat, Path file) {
    CompressionCodecFactory compressionCodecFactory;

    try {
        compressionCodecFactory = (CompressionCodecFactory) COMPRESSION_CODECS_FIELD.get(inputFormat);
    } catch (IllegalAccessException e) {
        throw new PrestoException(GENERIC_INTERNAL_ERROR,
                "Failed to find compressionCodec for inputFormat: " + inputFormat.getClass().getName(), e);
    }

    if (compressionCodecFactory == null) {
        return Optional.empty();
    }

    return Optional.ofNullable(compressionCodecFactory.getCodec(file));
}

From source file:ml.shifu.shifu.util.HdfsGlobalFile.java

License:Apache License

private InputStream openPartFileAsStream(FileStatus fileStatus) throws IOException {
    CompressionCodecFactory compressionFactory = new CompressionCodecFactory(new Configuration());
    InputStream is = null;

    FileSystem fs = ShifuFileUtils.getFileSystemBySourceType(sourceType);
    CompressionCodec codec = compressionFactory.getCodec(fileStatus.getPath());
    if (codec != null) {
        is = codec.createInputStream(fs.open(fileStatus.getPath()));
    } else {
        is = fs.open(fileStatus.getPath());
    }
    return is;
}

From source file:nyu.cs.webgraph.MRhelpers.LzoTabSeperatedLineRecordReader.java

License:Open Source License

@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    FileSplit split = (FileSplit) genericSplit;
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    Configuration job = context.getConfiguration();

    FileSystem fs = file.getFileSystem(job);
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);
    if (codec == null) {
        throw new IOException("Codec for file " + file + " not found, cannot run");
    }

    // open the file and seek to the start of the split
    fileIn = fs.open(split.getPath());

    // creates input stream and also reads the file header
    in = new LineReader(codec.createInputStream(fileIn), job);

    if (start != 0) {
        fileIn.seek(start);

        // read and ignore the first line
        in.readLine(new Text());
        start = fileIn.getPos();
    }

    this.pos = start;
}

From source file:org.apache.jena.hadoop.rdf.io.input.readers.AbstractBlockBasedNodeTupleReader.java

License:Apache License

@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    LOG.debug("initialize({}, {})", genericSplit, context);

    // Assuming file split
    if (!(genericSplit instanceof FileSplit))
        throw new IOException("This record reader only supports FileSplit inputs");
    FileSplit split = (FileSplit) genericSplit;

    // Configuration
    Configuration config = context.getConfiguration();
    this.ignoreBadTuples = config.getBoolean(RdfIOConstants.INPUT_IGNORE_BAD_TUPLES, true);
    if (this.ignoreBadTuples)
        LOG.warn(
                "Configured to ignore bad tuples, parsing errors will be logged and further parsing aborted but no user visible errors will be thrown.  Consider setting {} to false to disable this behaviour",
                RdfIOConstants.INPUT_IGNORE_BAD_TUPLES);

    // Figure out what portion of the file to read
    start = split.getStart();
    long end = start + split.getLength();
    final Path file = split.getPath();
    long totalLength = file.getFileSystem(context.getConfiguration()).getFileStatus(file).getLen();
    boolean readToEnd = end == totalLength;
    CompressionCodecFactory factory = new CompressionCodecFactory(config);
    this.compressionCodecs = factory.getCodec(file);

    LOG.info(String.format("Got split with start %d and length %d for file with total length of %d",
            new Object[] { start, split.getLength(), totalLength }));

    // Open the file and prepare the input stream
    FileSystem fs = file.getFileSystem(config);
    FSDataInputStream fileIn = fs.open(file);
    this.length = split.getLength();
    if (start > 0)
        fileIn.seek(start);

    if (this.compressionCodecs != null) {
        // Compressed input
        // For compressed input NLineInputFormat will have failed to find
        // any line breaks and will give us a split from 0 -> (length - 1)
        // Add 1 and re-verify readToEnd so we can abort correctly if ever
        // given a partial split of a compressed file
        end++;
        readToEnd = end == totalLength;
        if (start > 0 || !readToEnd)
            throw new IOException(
                    "This record reader can only be used with compressed input where the split is a whole file");
        input = new TrackedInputStream(this.compressionCodecs.createInputStream(fileIn));
    } else {
        // Uncompressed input

        if (readToEnd) {
            input = new TrackedInputStream(fileIn);
        } else {
            // Need to limit the portion of the file we are reading
            input = new BlockInputStream(fileIn, split.getLength());
        }
    }

    // Set up background thread for parser
    iter = this.getPipedIterator();
    this.stream = this.getPipedStream(iter, this.input);
    RDFParserBuilder builder = RdfIOUtils.createRDFParserBuilder(context, file);
    Runnable parserRunnable = this.createRunnable(this, this.input, stream, this.getRdfLanguage(), builder);

    this.parserThread = new Thread(parserRunnable);
    this.parserThread.setDaemon(true);
    this.parserThread.start();
}

From source file:org.apache.jena.hadoop.rdf.io.input.readers.AbstractWholeFileNodeTupleReader.java

License:Apache License

@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    LOG.debug("initialize({}, {})", genericSplit, context);

    // Assuming file split
    if (!(genericSplit instanceof FileSplit))
        throw new IOException("This record reader only supports FileSplit inputs");
    FileSplit split = (FileSplit) genericSplit;

    // Configuration
    Configuration config = context.getConfiguration();
    this.ignoreBadTuples = config.getBoolean(RdfIOConstants.INPUT_IGNORE_BAD_TUPLES, true);
    if (this.ignoreBadTuples)
        LOG.warn(
                "Configured to ignore bad tuples, parsing errors will be logged and further parsing aborted but no user visible errors will be thrown.  Consider setting {} to false to disable this behaviour",
                RdfIOConstants.INPUT_IGNORE_BAD_TUPLES);

    // Figure out what portion of the file to read
    if (split.getStart() > 0)
        throw new IOException("This record reader requires a file split which covers the entire file");
    final Path file = split.getPath();
    long totalLength = file.getFileSystem(context.getConfiguration()).getFileStatus(file).getLen();
    CompressionCodecFactory factory = new CompressionCodecFactory(config);
    this.compressionCodecs = factory.getCodec(file);

    LOG.info(String.format("Got split with start %d and length %d for file with total length of %d",
            new Object[] { split.getStart(), split.getLength(), totalLength }));

    if (totalLength > split.getLength())
        throw new IOException("This record reader requires a file split which covers the entire file");

    // Open the file and prepare the input stream
    FileSystem fs = file.getFileSystem(config);
    FSDataInputStream fileIn = fs.open(file);
    this.length = split.getLength();
    if (this.compressionCodecs != null) {
        // Compressed input
        input = new TrackedInputStream(this.compressionCodecs.createInputStream(fileIn));
    } else {
        // Uncompressed input
        input = new TrackedInputStream(fileIn);
    }

    // Set up background thread for parser
    iter = this.getPipedIterator();
    this.stream = this.getPipedStream(iter, this.input);
    RDFParserBuilder builder = RdfIOUtils.createRDFParserBuilder(context, file);
    Runnable parserRunnable = this.createRunnable(this, this.input, stream, this.getRdfLanguage(), builder);
    this.parserThread = new Thread(parserRunnable);
    this.parserThread.setDaemon(true);
    this.parserThread.start();
}

From source file:org.apache.kylin.job.hadoop.cardinality.HiveColumnCardinalityUpdateJob.java

License:Apache License

private static List<String> readLines(Path location, Configuration conf) throws Exception {
    FileSystem fileSystem = FileSystem.get(location.toUri(), conf);
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    FileStatus[] items = fileSystem.listStatus(location);
    if (items == null)
        return new ArrayList<String>();
    List<String> results = new ArrayList<String>();
    for (FileStatus item : items) {

        // ignoring files like _SUCCESS
        if (item.getPath().getName().startsWith("_")) {
            continue;
        }

        CompressionCodec codec = factory.getCodec(item.getPath());
        InputStream stream = null;

        // check if we have a compression codec we need to use
        if (codec != null) {
            stream = codec.createInputStream(fileSystem.open(item.getPath()));
        } else {
            stream = fileSystem.open(item.getPath());
        }

        StringWriter writer = new StringWriter();
        IOUtils.copy(stream, writer, "UTF-8");
        String raw = writer.toString();
        for (String str : raw.split("\n")) {
            results.add(str);
        }
    }
    return results;
}

From source file:org.apache.lens.lib.query.TestAbstractFileFormatter.java

License:Apache License

/**
 * Read compressed file.
 *
 * @param finalPath the final path
 * @param conf      the conf
 * @param encoding  the encoding
 * @return the list
 * @throws IOException Signals that an I/O exception has occurred.
 */
protected List<String> readCompressedFile(Path finalPath, Configuration conf, String encoding)
        throws IOException {
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf);
    final CompressionCodec codec = compressionCodecs.getCodec(finalPath);
    FileSystem fs = finalPath.getFileSystem(conf);
    return readFromStream(new InputStreamReader(codec.createInputStream(fs.open(finalPath)), encoding));
}

From source file:org.apache.nifi.processors.hadoop.FetchHDFS.java

License:Apache License

@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
    FlowFile flowFile = session.get();
    if (flowFile == null) {
        return;
    }

    final FileSystem hdfs = getFileSystem();
    final UserGroupInformation ugi = getUserGroupInformation();
    final String filenameValue = context.getProperty(FILENAME).evaluateAttributeExpressions(flowFile)
            .getValue();

    final Path path;
    try {
        path = new Path(filenameValue);
    } catch (IllegalArgumentException e) {
        getLogger().error("Failed to retrieve content from {} for {} due to {}; routing to failure",
                new Object[] { filenameValue, flowFile, e });
        flowFile = session.putAttribute(flowFile, "hdfs.failure.reason", e.getMessage());
        flowFile = session.penalize(flowFile);
        session.transfer(flowFile, REL_FAILURE);
        return;
    }

    final StopWatch stopWatch = new StopWatch(true);
    final FlowFile finalFlowFile = flowFile;

    ugi.doAs(new PrivilegedAction<Object>() {
        @Override
        public Object run() {
            InputStream stream = null;
            CompressionCodec codec = null;
            Configuration conf = getConfiguration();
            final CompressionCodecFactory compressionCodecFactory = new CompressionCodecFactory(conf);
            final CompressionType compressionType = CompressionType
                    .valueOf(context.getProperty(COMPRESSION_CODEC).toString());
            final boolean inferCompressionCodec = compressionType == CompressionType.AUTOMATIC;

            if (inferCompressionCodec) {
                codec = compressionCodecFactory.getCodec(path);
            } else if (compressionType != CompressionType.NONE) {
                codec = getCompressionCodec(context, getConfiguration());
            }

            FlowFile flowFile = finalFlowFile;
            final Path qualifiedPath = path.makeQualified(hdfs.getUri(), hdfs.getWorkingDirectory());
            try {
                final String outputFilename;
                final String originalFilename = path.getName();
                stream = hdfs.open(path, 16384);

                // Check if compression codec is defined (inferred or otherwise)
                if (codec != null) {
                    stream = codec.createInputStream(stream);
                    outputFilename = StringUtils.removeEnd(originalFilename, codec.getDefaultExtension());
                } else {
                    outputFilename = originalFilename;
                }

                flowFile = session.importFrom(stream, finalFlowFile);
                flowFile = session.putAttribute(flowFile, CoreAttributes.FILENAME.key(), outputFilename);

                stopWatch.stop();
                getLogger().info("Successfully received content from {} for {} in {}",
                        new Object[] { qualifiedPath, flowFile, stopWatch.getDuration() });
                session.getProvenanceReporter().fetch(flowFile, qualifiedPath.toString(),
                        stopWatch.getDuration(TimeUnit.MILLISECONDS));
                session.transfer(flowFile, REL_SUCCESS);
            } catch (final FileNotFoundException | AccessControlException e) {
                getLogger().error("Failed to retrieve content from {} for {} due to {}; routing to failure",
                        new Object[] { qualifiedPath, flowFile, e });
                flowFile = session.putAttribute(flowFile, "hdfs.failure.reason", e.getMessage());
                flowFile = session.penalize(flowFile);
                session.transfer(flowFile, REL_FAILURE);
            } catch (final IOException e) {
                getLogger().error(
                        "Failed to retrieve content from {} for {} due to {}; routing to comms.failure",
                        new Object[] { qualifiedPath, flowFile, e });
                flowFile = session.penalize(flowFile);
                session.transfer(flowFile, REL_COMMS_FAILURE);
            } finally {
                IOUtils.closeQuietly(stream);
            }

            return null;
        }
    });

}

From source file:org.apache.nifi.processors.hadoop.GetHDFS.java

License:Apache License

protected void processBatchOfFiles(final List<Path> files, final ProcessContext context,
        final ProcessSession session) {
    // process the batch of files
    InputStream stream = null;
    CompressionCodec codec = null;
    Configuration conf = getConfiguration();
    FileSystem hdfs = getFileSystem();
    final boolean keepSourceFiles = context.getProperty(KEEP_SOURCE_FILE).asBoolean();
    final Double bufferSizeProp = context.getProperty(BUFFER_SIZE).asDataSize(DataUnit.B);
    int bufferSize = bufferSizeProp != null ? bufferSizeProp.intValue()
            : conf.getInt(BUFFER_SIZE_KEY, BUFFER_SIZE_DEFAULT);
    final Path rootDir = new Path(context.getProperty(DIRECTORY).evaluateAttributeExpressions().getValue());

    final CompressionType compressionType = CompressionType
            .valueOf(context.getProperty(COMPRESSION_CODEC).toString());
    final boolean inferCompressionCodec = compressionType == CompressionType.AUTOMATIC;
    if (inferCompressionCodec || compressionType != CompressionType.NONE) {
        codec = getCompressionCodec(context, getConfiguration());
    }
    final CompressionCodecFactory compressionCodecFactory = new CompressionCodecFactory(conf);
    for (final Path file : files) {
        try {
            if (!getUserGroupInformation().doAs((PrivilegedExceptionAction<Boolean>) () -> hdfs.exists(file))) {
                continue; // if file is no longer there then move on
            }
            final String originalFilename = file.getName();
            final String relativePath = getPathDifference(rootDir, file);

            stream = getUserGroupInformation()
                    .doAs((PrivilegedExceptionAction<FSDataInputStream>) () -> hdfs.open(file, bufferSize));

            final String outputFilename;
            // Check if we should infer compression codec
            if (inferCompressionCodec) {
                codec = compressionCodecFactory.getCodec(file);
            }
            // Check if compression codec is defined (inferred or otherwise)
            if (codec != null) {
                stream = codec.createInputStream(stream);
                outputFilename = StringUtils.removeEnd(originalFilename, codec.getDefaultExtension());
            } else {
                outputFilename = originalFilename;
            }

            FlowFile flowFile = session.create();

            final StopWatch stopWatch = new StopWatch(true);
            flowFile = session.importFrom(stream, flowFile);
            stopWatch.stop();
            final String dataRate = stopWatch.calculateDataRate(flowFile.getSize());
            final long millis = stopWatch.getDuration(TimeUnit.MILLISECONDS);

            flowFile = session.putAttribute(flowFile, CoreAttributes.PATH.key(), relativePath);
            flowFile = session.putAttribute(flowFile, CoreAttributes.FILENAME.key(), outputFilename);

            if (!keepSourceFiles && !getUserGroupInformation()
                    .doAs((PrivilegedExceptionAction<Boolean>) () -> hdfs.delete(file, false))) {
                getLogger().warn("Could not remove {} from HDFS. Not ingesting this file ...",
                        new Object[] { file });
                session.remove(flowFile);
                continue;
            }

            session.getProvenanceReporter().receive(flowFile, file.toString());
            session.transfer(flowFile, REL_SUCCESS);
            getLogger().info("retrieved {} from HDFS {} in {} milliseconds at a rate of {}",
                    new Object[] { flowFile, file, millis, dataRate });
            session.commit();
        } catch (final Throwable t) {
            getLogger().error("Error retrieving file {} from HDFS due to {}", new Object[] { file, t });
            session.rollback();
            context.yield();
        } finally {
            IOUtils.closeQuietly(stream);
            stream = null;
        }
    }
}

From source file:org.apache.tajo.engine.query.TestInsertQuery.java

License:Apache License

@Test
public final void testInsertOverwriteWithCompression() throws Exception {
    String tableName = CatalogUtil.normalizeIdentifier("testInsertOverwriteWithCompression");
    ResultSet res = executeFile("testInsertOverwriteWithCompression_ddl.sql");
    res.close();

    CatalogService catalog = testingCluster.getMaster().getCatalog();
    assertTrue(catalog.existsTable(getCurrentDatabase(), tableName));

    res = executeQuery();
    res.close();
    TableDesc desc = catalog.getTableDesc(getCurrentDatabase(), tableName);
    if (!testingCluster.isHCatalogStoreRunning()) {
        assertEquals(2, desc.getStats().getNumRows().intValue());
    }

    FileSystem fs = FileSystem.get(testingCluster.getConfiguration());
    assertTrue(fs.exists(new Path(desc.getPath())));
    CompressionCodecFactory factory = new CompressionCodecFactory(testingCluster.getConfiguration());

    for (FileStatus file : fs.listStatus(new Path(desc.getPath()))) {
        CompressionCodec codec = factory.getCodec(file.getPath());
        assertTrue(codec instanceof DeflateCodec);
    }
    executeString("DROP TABLE " + tableName + " PURGE");
}