Example usage for the org.apache.hadoop.io.compress.SnappyCodec constructor SnappyCodec()

Introduction

On this page you can find example usage of the org.apache.hadoop.io.compress.SnappyCodec constructor, SnappyCodec().

Prototype

public SnappyCodec()
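
As a quick orientation, here is a minimal, self-contained sketch (not taken from any of the projects listed below) of the usual pattern: construct the codec with the no-argument constructor, hand it a Configuration, then wrap a stream. It assumes the native Snappy library is available to Hadoop; the output file name is a placeholder.

import java.io.FileOutputStream;
import java.io.OutputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.CompressionOutputStream;
import org.apache.hadoop.io.compress.SnappyCodec;

public class SnappyCodecSketch {
    public static void main(String[] args) throws Exception {
        // Create the codec with the no-arg constructor, then configure it;
        // without a Configuration the codec cannot resolve its buffer size.
        SnappyCodec codec = new SnappyCodec();
        codec.setConf(new Configuration());

        // Wrap any OutputStream; "example.snappy" is a placeholder path.
        OutputStream raw = new FileOutputStream("example.snappy");
        try (CompressionOutputStream out = codec.createOutputStream(raw)) {
            out.write("hello snappy".getBytes("UTF-8"));
            out.finish();
        }
    }
}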

Usage

From source file: com.gemstone.gemfire.cache.hdfs.internal.hoplog.AbstractHoplog.java

License: Apache License

private static Option withCompression(Logger logger) {
    String prop = System.getProperty(HoplogConfig.COMPRESSION);
    if (prop != null) {
        CompressionCodec codec;
        if (prop.equalsIgnoreCase("SNAPPY")) {
            codec = new SnappyCodec();
        } else if (prop.equalsIgnoreCase("LZ4")) {
            codec = new Lz4Codec();
        } else if (prop.equals("GZ")) {
            codec = new GzipCodec();
        } else {
            throw new IllegalStateException("Unsupported codec: " + prop);
        }
        if (logger.isDebugEnabled())
            logger.debug("{}Using compression codec " + codec, logPrefix);
        return SequenceFile.Writer.compression(CompressionType.BLOCK, codec);
    }
    return SequenceFile.Writer.compression(CompressionType.NONE, null);
}
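
A minimal sketch (not part of the GemFire source) of how a compression option like the one returned above is typically combined with other SequenceFile writer options. The output path and key/value classes are placeholder illustration values, and native Snappy support is assumed.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.SnappyCodec;

public class SnappySequenceFileSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        // Block-compressed SequenceFile writer backed by SnappyCodec;
        // the output path and key/value types are illustration values.
        try (SequenceFile.Writer writer = SequenceFile.createWriter(conf,
                SequenceFile.Writer.file(new Path("/tmp/example.seq")),
                SequenceFile.Writer.keyClass(LongWritable.class),
                SequenceFile.Writer.valueClass(Text.class),
                SequenceFile.Writer.compression(CompressionType.BLOCK, new SnappyCodec()))) {
            writer.append(new LongWritable(1L), new Text("hello"));
        }
    }
}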

From source file: org.apache.carbondata.hadoop.csv.CSVInputFormatTest.java

License: Apache License

/**
 * Generates the compressed test files; this method does not need to be called during normal test runs.
 * @throws Exception
 */
public void generateCompressFiles() throws Exception {
    String pwd = new File("src/test/resources").getCanonicalPath();
    String inputFile = pwd + "/data.csv";
    FileInputStream input = new FileInputStream(inputFile);
    Configuration conf = new Configuration();

    // .gz
    String outputFile = pwd + "/data.csv.gz";
    FileOutputStream output = new FileOutputStream(outputFile);
    GzipCodec gzip = new GzipCodec();
    gzip.setConf(conf);
    CompressionOutputStream outputStream = gzip.createOutputStream(output);
    int i = -1;
    while ((i = input.read()) != -1) {
        outputStream.write(i);
    }
    outputStream.close();
    input.close();

    // .bz2
    input = new FileInputStream(inputFile);
    outputFile = pwd + "/data.csv.bz2";
    output = new FileOutputStream(outputFile);
    BZip2Codec bzip2 = new BZip2Codec();
    bzip2.setConf(conf);
    outputStream = bzip2.createOutputStream(output);
    i = -1;
    while ((i = input.read()) != -1) {
        outputStream.write(i);
    }
    outputStream.close();
    input.close();

    // .snappy
    input = new FileInputStream(inputFile);
    outputFile = pwd + "/data.csv.snappy";
    output = new FileOutputStream(outputFile);
    SnappyCodec snappy = new SnappyCodec();
    snappy.setConf(conf);
    outputStream = snappy.createOutputStream(output);
    i = -1;
    while ((i = input.read()) != -1) {
        outputStream.write(i);
    }
    outputStream.close();
    input.close();

    //.lz4
    input = new FileInputStream(inputFile);
    outputFile = pwd + "/data.csv.lz4";
    output = new FileOutputStream(outputFile);
    Lz4Codec lz4 = new Lz4Codec();
    lz4.setConf(conf);
    outputStream = lz4.createOutputStream(output);
    i = -1;
    while ((i = input.read()) != -1) {
        outputStream.write(i);
    }
    outputStream.close();
    input.close();

}
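
As a hedged counterpart that is not part of the CarbonData test, the data.csv.snappy file produced above could be read back by wrapping the input side with the same codec; the resource path simply mirrors the one used in the test.

import java.io.FileInputStream;
import java.io.InputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.CompressionInputStream;
import org.apache.hadoop.io.compress.SnappyCodec;

public class ReadSnappyCsvSketch {
    public static void main(String[] args) throws Exception {
        SnappyCodec snappy = new SnappyCodec();
        snappy.setConf(new Configuration());

        // Decompress the file written by generateCompressFiles() and echo it
        // to stdout; the path mirrors the test-resource location used above.
        InputStream raw = new FileInputStream("src/test/resources/data.csv.snappy");
        try (CompressionInputStream in = snappy.createInputStream(raw)) {
            byte[] buffer = new byte[4096];
            int n;
            while ((n = in.read(buffer)) != -1) {
                System.out.write(buffer, 0, n);
            }
            System.out.flush();
        }
    }
}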

From source file: org.apache.carbondata.processing.csvload.CSVInputFormatTest.java

License: Apache License

/**
 * Generates the compressed test files; this method does not need to be called during normal test runs.
 * @throws Exception
 */
public void generateCompressFiles() throws Exception {
    String pwd = new File("src/test/resources/csv").getCanonicalPath();
    String inputFile = pwd + "/data.csv";
    FileInputStream input = new FileInputStream(inputFile);
    Configuration conf = new Configuration();

    // .gz
    String outputFile = pwd + "/data.csv.gz";
    FileOutputStream output = new FileOutputStream(outputFile);
    GzipCodec gzip = new GzipCodec();
    gzip.setConf(conf);
    CompressionOutputStream outputStream = gzip.createOutputStream(output);
    int i = -1;
    while ((i = input.read()) != -1) {
        outputStream.write(i);
    }
    outputStream.close();
    input.close();

    // .bz2
    input = new FileInputStream(inputFile);
    outputFile = pwd + "/data.csv.bz2";
    output = new FileOutputStream(outputFile);
    BZip2Codec bzip2 = new BZip2Codec();
    bzip2.setConf(conf);
    outputStream = bzip2.createOutputStream(output);
    i = -1;
    while ((i = input.read()) != -1) {
        outputStream.write(i);
    }
    outputStream.close();
    input.close();

    // .snappy
    input = new FileInputStream(inputFile);
    outputFile = pwd + "/data.csv.snappy";
    output = new FileOutputStream(outputFile);
    SnappyCodec snappy = new SnappyCodec();
    snappy.setConf(conf);
    outputStream = snappy.createOutputStream(output);
    i = -1;
    while ((i = input.read()) != -1) {
        outputStream.write(i);
    }
    outputStream.close();
    input.close();

    //.lz4
    input = new FileInputStream(inputFile);
    outputFile = pwd + "/data.csv.lz4";
    output = new FileOutputStream(outputFile);
    Lz4Codec lz4 = new Lz4Codec();
    lz4.setConf(conf);
    outputStream = lz4.createOutputStream(output);
    i = -1;
    while ((i = input.read()) != -1) {
        outputStream.write(i);
    }
    outputStream.close();
    input.close();

}

From source file: org.apache.ignite.internal.processors.hadoop.HadoopSnappyTest.java

License: Apache License

/**
 * Internal check routine.
 *
 * @throws Throwable If failed.
 */
public static void checkSnappy() throws Throwable {
    try {
        byte[] expBytes = new byte[BYTE_SIZE];
        byte[] actualBytes = new byte[BYTE_SIZE];

        for (int i = 0; i < expBytes.length; i++)
            expBytes[i] = (byte) ThreadLocalRandom.current().nextInt(16);

        SnappyCodec codec = new SnappyCodec();

        codec.setConf(new Configuration());

        ByteArrayOutputStream baos = new ByteArrayOutputStream();

        try (CompressionOutputStream cos = codec.createOutputStream(baos)) {
            cos.write(expBytes);
            cos.flush();
        }

        try (CompressionInputStream cis = codec
                .createInputStream(new ByteArrayInputStream(baos.toByteArray()))) {
            int read = cis.read(actualBytes, 0, actualBytes.length);

            assert read == actualBytes.length;
        }

        assert Arrays.equals(expBytes, actualBytes);
    } catch (Throwable e) {
        System.out.println("Snappy check failed:");
        System.out
                .println("### NativeCodeLoader.isNativeCodeLoaded:  " + NativeCodeLoader.isNativeCodeLoaded());
        System.out
                .println("### SnappyCompressor.isNativeCodeLoaded:  " + SnappyCompressor.isNativeCodeLoaded());

        throw e;
    }
}

From source file: org.commoncrawl.util.CrawlLogSplitter.java

License: Open Source License

public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    FileStatus arcFiles[] = fs.globStatus(new Path("crawl/checkpoint_data/CrawlLog_*"));
    for (FileStatus candidate : arcFiles) {
        if (candidate.getLen() > SPLIT_SIZE) {
            candidateList.add(candidate.getPath());
        }
    }

    LOG.info("Found:" + candidateList.size() + " oversized candidates");

    Path tempOutputDir = new Path(conf.get("mapred.temp.dir", "."));

    while (candidateList.size() != 0) {
        Path candidateName = candidateList.first();
        candidateList.remove(candidateName);

        LOG.info("Processing Candidate:" + candidateName);
        long fileSize = fs.getFileStatus(candidateName).getLen();
        //get crawl log filename components

        ArrayList<Path> splitItems = new ArrayList<Path>();

        int index = 0;

        Path outputPart = buildIncrementalPathGivenPathAndIndex(tempOutputDir, candidateName.getName(), index);

        LOG.info("Initial Output Path is:" + outputPart);

        fs.delete(outputPart, false);

        // create reader 
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, candidateName, conf);
        ValueBytes sourceVB = reader.createValueBytes();
        DataOutputBuffer sourceKeyData = new DataOutputBuffer();

        try {
            // ok create temp file 
            SequenceFile.Writer activeWriter = SequenceFile.createWriter(fs, conf, outputPart, Text.class,
                    CrawlURL.class, CompressionType.BLOCK, new SnappyCodec());

            // add to split items array 
            splitItems.add(outputPart);

            try {
                long recordsWritten = 0;
                while (reader.nextRawKey(sourceKeyData) != -1) {
                    reader.nextRawValue(sourceVB);
                    long lengthPreWrite = activeWriter.getLength();
                    activeWriter.appendRaw(sourceKeyData.getData(), 0, sourceKeyData.getLength(), sourceVB);
                    if (++recordsWritten % 10000 == 0) {
                        LOG.info("Write 10000 records");
                    }
                    long lengthPostWrite = activeWriter.getLength();
                    if (lengthPostWrite != lengthPreWrite) {
                        if (lengthPostWrite >= IDEAL_SIZE) {
                            LOG.info("Hit Split Point. Flushing File:" + outputPart);
                            activeWriter.close();
                            outputPart = buildIncrementalPathGivenPathAndIndex(tempOutputDir,
                                    candidateName.getName(), ++index);
                            LOG.info("Creating New File:" + outputPart);
                            activeWriter = SequenceFile.createWriter(fs, conf, outputPart, Text.class,
                                    CrawlURL.class, CompressionType.BLOCK, new SnappyCodec());
                            splitItems.add(outputPart);
                        }
                    }
                    sourceKeyData.reset();
                }
            } finally {
                activeWriter.close();
            }
        } finally {
            reader.close();
        }
        LOG.info("Rewrote Source:" + candidateName + " into:" + splitItems.size() + " split files");
        for (Path splitItem : splitItems) {
            Path destPath = new Path("crawl/checkpoint_data", splitItem.getName());
            LOG.info("Moving:" + splitItem + " to:" + destPath);
            fs.rename(splitItem, destPath);
        }
        Path sourceMoveLocation = new Path("crawl/checkpoint_data_split", candidateName.getName());
        LOG.info("Moving SOURCE:" + candidateName + " to:" + sourceMoveLocation);
        fs.rename(candidateName, sourceMoveLocation);
    }
}

From source file: org.pentaho.hadoop.shim.common.CommonSnappyShim.java

License: Apache License

/**
 * Gets an InputStream that uses the Snappy codec and wraps the supplied base input stream.
 *
 * @param bufferSize the buffer size for the codec to use (in bytes)
 * @param in         the base input stream to wrap around
 * @return an InputStream that uses the Snappy codec
 * @throws Exception if Snappy is not available or an error occurs during reflection
 */
public InputStream getSnappyInputStream(int bufferSize, InputStream in) throws Exception {
    if (!isHadoopSnappyAvailable()) {
        throw new Exception("Hadoop-snappy does not seem to be available");
    }

    ClassLoader cl = Thread.currentThread().getContextClassLoader();
    Thread.currentThread().setContextClassLoader(getClass().getClassLoader());
    try {
        SnappyCodec c = new SnappyCodec();
        Configuration newConf = new Configuration();
        newConf.set(IO_COMPRESSION_CODEC_SNAPPY_BUFFERSIZE_KEY, "" + bufferSize);
        c.setConf(newConf);
        return c.createInputStream(in);
    } finally {
        Thread.currentThread().setContextClassLoader(cl);
    }
}

From source file: org.pentaho.hadoop.shim.common.CommonSnappyShim.java

License: Apache License

/**
 * Gets an OutputStream that uses the Snappy codec and wraps the supplied base output stream.
 *
 * @param bufferSize the buffer size for the codec to use (in bytes)
 * @param out        the base output stream to wrap around
 * @return an OutputStream that uses the Snappy codec
 * @throws Exception if Snappy is not available or an error occurs during reflection
 */
public OutputStream getSnappyOutputStream(int bufferSize, OutputStream out) throws Exception {
    if (!isHadoopSnappyAvailable()) {
        throw new Exception("Hadoop-snappy does not seem to be available");
    }

    ClassLoader cl = Thread.currentThread().getContextClassLoader();
    Thread.currentThread().setContextClassLoader(getClass().getClassLoader());
    try {
        SnappyCodec c = new SnappyCodec();
        Configuration newConf = new Configuration();
        newConf.set(IO_COMPRESSION_CODEC_SNAPPY_BUFFERSIZE_KEY, "" + bufferSize);
        c.setConf(newConf);
        return c.createOutputStream(out);
    } finally {
        Thread.currentThread().setContextClassLoader(cl);
    }
}
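
A hedged usage sketch, not taken from the Pentaho shim itself: assuming a CommonSnappyShim instance can be created with a no-argument constructor and that Hadoop-snappy is available, the two methods above combine into a simple round trip. The 64 KB buffer size is an arbitrary illustration value.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.io.OutputStream;

public class SnappyShimRoundTrip {
    public static void main(String[] args) throws Exception {
        // Hypothetical driver; CommonSnappyShim is the Pentaho shim shown above
        // and is assumed to have a usable no-arg constructor.
        CommonSnappyShim shim = new CommonSnappyShim();
        int bufferSize = 64 * 1024; // arbitrary illustration value

        // Compress a small payload through the shim.
        ByteArrayOutputStream compressed = new ByteArrayOutputStream();
        try (OutputStream out = shim.getSnappyOutputStream(bufferSize, compressed)) {
            out.write("hello snappy".getBytes("UTF-8"));
        }

        // Decompress it again and print the original text.
        try (InputStream in = shim.getSnappyInputStream(bufferSize,
                new ByteArrayInputStream(compressed.toByteArray()))) {
            byte[] buffer = new byte[bufferSize];
            int n = in.read(buffer);
            System.out.println(new String(buffer, 0, n, "UTF-8"));
        }
    }
}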