com.metamx.common.CompressionUtils.java Source code

Introduction

Here is the source code for com.metamx.common.CompressionUtils.java
Source

/*
 * Copyright 2015 Metamarkets Group Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.metamx.common;

import com.google.common.base.Predicate;
import com.google.common.base.Strings;
import com.google.common.base.Throwables;
import com.google.common.io.ByteSink;
import com.google.common.io.ByteSource;
import com.google.common.io.ByteStreams;
import com.google.common.io.Files;
import com.metamx.common.guava.CloseQuietly;
import com.metamx.common.logger.Logger;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Enumeration;
import java.util.concurrent.Callable;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import java.util.zip.ZipInputStream;
import java.util.zip.ZipOutputStream;

/**
 * Static helpers for creating and extracting zip and gzip data, with retry support for
 * sources that can be re-opened (Guava {@link ByteSource}).
 */
public class CompressionUtils {
    private static final Logger log = new Logger(CompressionUtils.class);
    private static final int DEFAULT_RETRY_COUNT = 3;

    public static final String GZ_SUFFIX = ".gz";
    public static final String ZIP_SUFFIX = ".zip";

    /**
     * Zip the contents of directory into the file indicated by outputZipFile. Sub directories are skipped
     *
     * @param directory     The directory whose contents should be added to the zip in the output stream.
     * @param outputZipFile The output file to write the zipped data to
     *
     * @return The number of bytes (uncompressed) read from the input directory.
     *
     * @throws IOException if directory is not a directory, cannot be listed, or on any read/write failure
     */
    public static long zip(File directory, File outputZipFile) throws IOException {
        if (!isZip(outputZipFile.getName())) {
            // Only a warning: the caller may legitimately want a different extension.
            log.warn("No .zip suffix[%s], putting files from [%s] into it anyway.", outputZipFile, directory);
        }

        try (final FileOutputStream out = new FileOutputStream(outputZipFile)) {
            return zip(directory, out);
        }
    }

    /**
     * Zips the contents of the input directory to the output stream. Sub directories are skipped.
     * The output stream is closed once the zip has been written (or writing has failed).
     *
     * @param directory The directory whose contents should be added to the zip in the output stream.
     * @param out       The output stream to write the zip data to.
     *
     * @return The number of bytes (uncompressed) read from the input directory.
     *
     * @throws IOException if directory is not a directory, its contents cannot be listed, a file is
     *                     Integer.MAX_VALUE bytes or larger, or on any read/write failure
     */
    public static long zip(File directory, OutputStream out) throws IOException {
        if (!directory.isDirectory()) {
            throw new IOException(String.format("directory[%s] is not a directory", directory));
        }
        // listFiles() returns null (rather than throwing) when an I/O error occurs.
        final File[] files = directory.listFiles();
        if (files == null) {
            throw new IOException(String.format("Unable to list contents of directory[%s]", directory));
        }

        long totalSize = 0;
        try (final ZipOutputStream zipOut = new ZipOutputStream(out)) {
            for (File file : files) {
                if (file.isDirectory()) {
                    // Documented contract: only the files directly inside the directory are zipped.
                    log.info("Skipping sub directory[%s]", file);
                    continue;
                }
                final long fileLength = file.length();
                log.info("Adding file[%s] with size[%,d].  Total size so far[%,d]", file, fileLength, totalSize);
                if (fileLength >= Integer.MAX_VALUE) {
                    // Finish the archive so the entries written so far remain readable.
                    zipOut.finish();
                    throw new IOException(String.format("file[%s] too large [%,d]", file, fileLength));
                }
                zipOut.putNextEntry(new ZipEntry(file.getName()));
                totalSize += Files.asByteSource(file).copyTo(zipOut);
                zipOut.closeEntry();
            }
        }

        return totalSize;
    }

    /**
     * Unzip the byteSource to the output directory. If cacheLocally is true, the byteSource is cached to local disk before unzipping.
     * This may cause more predictable behavior than trying to unzip a large file directly off a network stream, for example.
     *
     * @param byteSource   The ByteSource which supplies the zip data
     * @param outDir       The output directory to put the contents of the zip
     * @param shouldRetry  A predicate expression to determine if a new InputStream should be acquired from ByteSource and the copy attempted again
     * @param cacheLocally A boolean flag to indicate if the data should be cached locally
     *
     * @return A FileCopyResult containing the result of writing the zip entries to disk
     *
     * @throws IOException if the data cannot be read or written, or an entry would be written outside of outDir
     */
    public static FileUtils.FileCopyResult unzip(final ByteSource byteSource, final File outDir,
            final Predicate<Throwable> shouldRetry, boolean cacheLocally) throws IOException {
        if (!cacheLocally) {
            try {
                // Each attempt opens a fresh stream; unzip(InputStream, File) closes it.
                return RetryUtils.retry(new Callable<FileUtils.FileCopyResult>() {
                    @Override
                    public FileUtils.FileCopyResult call() throws Exception {
                        return unzip(byteSource.openStream(), outDir);
                    }
                }, shouldRetry, DEFAULT_RETRY_COUNT);
            } catch (Exception e) {
                // Surface I/O failures as the declared checked type instead of wrapping them.
                Throwables.propagateIfInstanceOf(e, IOException.class);
                throw Throwables.propagate(e);
            }
        } else {
            final File tmpFile = File.createTempFile("compressionUtilZipCache", ZIP_SUFFIX);
            try {
                FileUtils.retryCopy(byteSource, tmpFile, shouldRetry, DEFAULT_RETRY_COUNT);
                return unzip(tmpFile, outDir);
            } finally {
                if (!tmpFile.delete()) {
                    log.warn("Could not delete zip cache at [%s]", tmpFile.toString());
                }
            }
        }
    }

    /**
     * Unzip the byteSource to the output directory. If cacheLocally is true, the byteSource is cached to local disk before unzipping.
     * This may cause more predictable behavior than trying to unzip a large file directly off a network stream, for example.
     * Retries are governed by FileUtils.IS_EXCEPTION.
     *
     * @param byteSource   The ByteSource which supplies the zip data
     * @param outDir       The output directory to put the contents of the zip
     * @param cacheLocally A boolean flag to indicate if the data should be cached locally
     *
     * @return A FileCopyResult containing the result of writing the zip entries to disk
     *
     * @throws IOException if the data cannot be read or written, or an entry would be written outside of outDir
     */
    public static FileUtils.FileCopyResult unzip(final ByteSource byteSource, final File outDir,
            boolean cacheLocally) throws IOException {
        return unzip(byteSource, outDir, FileUtils.IS_EXCEPTION, cacheLocally);
    }

    /**
     * Unzip the pulled file to an output directory. This is only expected to work on zips with lone files, and is not intended for zips with directory structures.
     *
     * @param pulledFile The file to unzip
     * @param outDir     The directory to store the contents of the file.
     *
     * @return a FileCopyResult of the files which were written to disk
     *
     * @throws IOException if the zip cannot be read, an entry cannot be written, or an entry would be
     *                     written outside of outDir
     * @throws ISE         if outDir does not exist or is not a directory
     */
    public static FileUtils.FileCopyResult unzip(final File pulledFile, final File outDir) throws IOException {
        if (!(outDir.exists() && outDir.isDirectory())) {
            throw new ISE("outDir[%s] must exist and be a directory", outDir);
        }
        log.info("Unzipping file[%s] to [%s]", pulledFile, outDir);
        final FileUtils.FileCopyResult result = new FileUtils.FileCopyResult();
        try (final ZipFile zipFile = new ZipFile(pulledFile)) {
            final Enumeration<? extends ZipEntry> enumeration = zipFile.entries();
            while (enumeration.hasMoreElements()) {
                final ZipEntry entry = enumeration.nextElement();
                final File outFile = resolveEntryFile(outDir, entry);
                // A ByteSource lets retryCopy re-open the entry's stream on each attempt.
                result.addFiles(FileUtils.retryCopy(new ByteSource() {
                    @Override
                    public InputStream openStream() throws IOException {
                        return new BufferedInputStream(zipFile.getInputStream(entry));
                    }
                }, outFile, FileUtils.IS_EXCEPTION, DEFAULT_RETRY_COUNT).getFiles());
            }
        }
        return result;
    }

    /**
     * Unzip from the input stream to the output directory, using the entry's file name as the file name in the output directory.
     * The behavior of directories in the input stream's zip is undefined.
     * If possible, it is recommended to use unzip(ByteSource, File, boolean) instead
     *
     * @param in     The input stream of the zip data. This stream is closed
     * @param outDir The directory to copy the unzipped data to
     *
     * @return The FileUtils.FileCopyResult containing information on all the files which were written
     *
     * @throws IOException if the zip cannot be read, an entry cannot be written, or an entry would be
     *                     written outside of outDir
     */
    public static FileUtils.FileCopyResult unzip(InputStream in, File outDir) throws IOException {
        try (final ZipInputStream zipIn = new ZipInputStream(in)) {
            final FileUtils.FileCopyResult result = new FileUtils.FileCopyResult();
            ZipEntry entry;
            while ((entry = zipIn.getNextEntry()) != null) {
                final File file = resolveEntryFile(outDir, entry);
                // writeFrom reads until the end of the current entry, not the end of the archive.
                Files.asByteSink(file).writeFrom(zipIn);
                result.addFile(file);
                zipIn.closeEntry();
            }
            return result;
        }
    }

    /**
     * Resolves the output file for a zip entry, rejecting entries whose name (e.g. one containing
     * "..") would place the file outside of outDir.
     *
     * @param outDir The directory the entry is being extracted into
     * @param entry  The zip entry being extracted
     *
     * @return new File(outDir, entry.getName())
     *
     * @throws IOException if the resolved file is not located under outDir, or the paths cannot be canonicalized
     */
    private static File resolveEntryFile(File outDir, ZipEntry entry) throws IOException {
        final File outFile = new File(outDir, entry.getName());
        final File canonicalOutDir = outDir.getCanonicalFile();
        final File canonicalOutFile = outFile.getCanonicalFile();
        // Path.startsWith compares whole name elements, so "/out-evil" does not start with "/out".
        if (!canonicalOutFile.toPath().startsWith(canonicalOutDir.toPath())) {
            throw new IOException(String.format("zip entry[%s] resolves to [%s], which is outside of outDir[%s]",
                    entry.getName(), canonicalOutFile, canonicalOutDir));
        }
        return outFile;
    }

    /**
     * gunzip the file to the output file.
     *
     * @param pulledFile The source of the gz data
     * @param outFile    A target file to put the contents
     *
     * @return The result of the file copy
     *
     * @throws IOException declared for API compatibility
     */
    public static FileUtils.FileCopyResult gunzip(final File pulledFile, File outFile) throws IOException {
        return gunzip(Files.asByteSource(pulledFile), outFile);
    }

    /**
     * Unzips the input stream via a gzip filter. use gunzip(ByteSource, File, Predicate) if possible
     *
     * @param in      The input stream to run through the gunzip filter. This stream is closed
     * @param outFile The file to output to
     *
     * @return A FileCopyResult for outFile
     *
     * @throws IOException if the gzip data cannot be read or outFile cannot be written
     */
    public static FileUtils.FileCopyResult gunzip(InputStream in, File outFile) throws IOException {
        // The raw stream is its own resource so that it is closed even when the GZIPInputStream
        // constructor fails while reading the gzip header.
        try (InputStream rawIn = in; GZIPInputStream gzIn = gzipInputStream(rawIn)) {
            Files.asByteSink(outFile).writeFrom(gzIn);
            return new FileUtils.FileCopyResult(outFile);
        }
    }

    /**
     * Fixes java bug 7036144 http://bugs.java.com/bugdatabase/view_bug.do?bug_id=7036144 which affects concatenated GZip
     *
     * @param in The raw input stream
     *
     * @return A GZIPInputStream that can handle concatenated gzip streams in the input
     *
     * @throws IOException if the GZIPInputStream cannot be constructed; in is NOT closed in that case
     */
    public static GZIPInputStream gzipInputStream(final InputStream in) throws IOException {
        return new GZIPInputStream(new FilterInputStream(in) {
            @Override
            public int available() {
                // Hack. Docs say available() should return an estimate, so we estimate about 1KB to work around available == 0 bug in GZIPInputStream
                return 1 << 10;
            }
        });
    }

    /**
     * gunzip from the source stream to the destination stream. Both streams are closed.
     *
     * @param in  The input stream which is to be decompressed
     * @param out The output stream to write to
     *
     * @return The number of bytes written to the output stream.
     *
     * @throws IOException if the gzip data cannot be read or the output cannot be written
     */
    public static long gunzip(InputStream in, OutputStream out) throws IOException {
        // Resources close in reverse order (gzip stream, raw input, then output). A failure while
        // closing is attached as a suppressed exception rather than masking the primary one.
        try (OutputStream closingOut = out;
                InputStream rawIn = in;
                GZIPInputStream gzIn = gzipInputStream(rawIn)) {
            return ByteStreams.copy(gzIn, closingOut);
        }
    }

    /**
     * A gunzip function to store locally
     *
     * @param in          The factory to produce input streams
     * @param outFile     The file to store the result into
     * @param shouldRetry A predicate to indicate if the Throwable is recoverable
     *
     * @return A FileCopyResult of the file written
     */
    public static FileUtils.FileCopyResult gunzip(final ByteSource in, final File outFile,
            Predicate<Throwable> shouldRetry) {
        // Wrap the source so that every retry attempt gets a fresh, gzip-decoding stream.
        return FileUtils.retryCopy(new ByteSource() {
            @Override
            public InputStream openStream() throws IOException {
                return gzipInputStream(in.openStream());
            }
        }, outFile, shouldRetry, DEFAULT_RETRY_COUNT);
    }

    /**
     * Gunzip from the input source to the output file, retrying according to FileUtils.IS_EXCEPTION
     *
     * @param in      The compressed input source to read from
     * @param outFile The file to write the uncompressed results to
     *
     * @return A FileCopyResult of the file written
     */
    public static FileUtils.FileCopyResult gunzip(final ByteSource in, File outFile) {
        return gunzip(in, outFile, FileUtils.IS_EXCEPTION);
    }

    /**
     * Copy inputStream to out while wrapping out in a GZIPOutputStream
     * Closes both input and output
     *
     * @param inputStream The input stream to copy data from
     * @param out         The output stream to wrap in a GZIPOutputStream before copying
     *
     * @return The size of the data copied
     *
     * @throws IOException if reading the input or writing the compressed output fails
     */
    public static long gzip(InputStream inputStream, OutputStream out) throws IOException {
        try (GZIPOutputStream outputStream = new GZIPOutputStream(out)) {
            return ByteStreams.copy(inputStream, outputStream);
        } finally {
            // Covers the case where the GZIPOutputStream could not even be constructed.
            CloseQuietly.close(out);
            CloseQuietly.close(inputStream);
        }
    }

    /**
     * Gzips the input file to the output
     *
     * @param inFile      The file to gzip
     * @param outFile     A target file to write the gzip-compressed contents of inFile to
     * @param shouldRetry Predicate on a potential throwable to determine if the copy should be attempted again.
     *
     * @return The result of the file copy
     *
     * @throws IOException if the copy fails
     */
    public static FileUtils.FileCopyResult gzip(final File inFile, final File outFile,
            Predicate<Throwable> shouldRetry) throws IOException {
        gzip(Files.asByteSource(inFile), Files.asByteSink(outFile), shouldRetry);
        return new FileUtils.FileCopyResult(outFile);
    }

    /**
     * Gzips the data supplied by in into out, re-opening both on each retry attempt.
     *
     * @param in          The source of uncompressed data
     * @param out         The sink whose stream is wrapped in a GZIPOutputStream
     * @param shouldRetry Predicate on a potential throwable to determine if the copy should be attempted again.
     *
     * @return The value returned by StreamUtils.retryCopy (presumably the number of bytes copied)
     *
     * @throws IOException if the copy fails
     */
    public static long gzip(final ByteSource in, final ByteSink out, Predicate<Throwable> shouldRetry)
            throws IOException {
        return StreamUtils.retryCopy(in, new ByteSink() {
            @Override
            public OutputStream openStream() throws IOException {
                return new GZIPOutputStream(out.openStream());
            }
        }, shouldRetry, DEFAULT_RETRY_COUNT);
    }

    /**
     * GZip compress the contents of inFile into outFile
     *
     * @param inFile  The source of data
     * @param outFile The destination for compressed data
     *
     * @return A FileCopyResult of the resulting file at outFile
     *
     * @throws IOException if the copy fails
     */
    public static FileUtils.FileCopyResult gzip(final File inFile, final File outFile) throws IOException {
        return gzip(inFile, outFile, FileUtils.IS_EXCEPTION);
    }

    /**
     * Checks to see if fName is a valid name for a "*.zip" file
     *
     * @param fName The name of the file in question
     *
     * @return True if fName is properly named for a .zip file, false otherwise (including null or empty)
     */
    public static boolean isZip(String fName) {
        if (Strings.isNullOrEmpty(fName)) {
            return false;
        }
        return fName.endsWith(ZIP_SUFFIX); // Technically a file named `.zip` would be fine
    }

    /**
     * Checks to see if fName is a valid name for a "*.gz" file
     *
     * @param fName The name of the file in question
     *
     * @return True if fName is a properly named .gz file, false otherwise (including null, empty, or just ".gz")
     */
    public static boolean isGz(String fName) {
        if (Strings.isNullOrEmpty(fName)) {
            return false;
        }
        return fName.endsWith(GZ_SUFFIX) && fName.length() > GZ_SUFFIX.length();
    }

    /**
     * Get the file name without the .gz extension
     *
     * @param fname The name of the gzip file
     *
     * @return fname without the ".gz" extension
     *
     * @throws com.metamx.common.IAE if fname is not a valid "*.gz" file name (including null)
     */
    public static String getGzBaseName(String fname) {
        // Validate first: isGz is null-safe, whereas stripping the extension from null is not.
        if (isGz(fname)) {
            final String reducedFname = Files.getNameWithoutExtension(fname);
            if (!reducedFname.isEmpty()) {
                return reducedFname;
            }
        }
        throw new IAE("[%s] is not a valid gz file name", fname);
    }
}