Example usage for the org.apache.hadoop.io.compress.SnappyCodec constructor SnappyCodec()

Introduction

On this page you can find example usage of the org.apache.hadoop.io.compress.SnappyCodec constructor, SnappyCodec().

Prototype

public SnappyCodec()
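
As a quick orientation, here is a minimal, self-contained sketch (not taken from any of the projects listed below) of the usual pattern: construct the codec with the no-argument constructor, hand it a Configuration, then wrap a stream. It assumes the native Snappy library is available to Hadoop; the output file name is a placeholder.

import java.io.FileOutputStream;
import java.io.OutputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.CompressionOutputStream;
import org.apache.hadoop.io.compress.SnappyCodec;

public class SnappyCodecSketch {
    public static void main(String[] args) throws Exception {
        // Create the codec with the no-arg constructor, then configure it;
        // without a Configuration the codec cannot resolve its buffer size.
        SnappyCodec codec = new SnappyCodec();
        codec.setConf(new Configuration());

        // Wrap any OutputStream; "example.snappy" is a placeholder path.
        OutputStream raw = new FileOutputStream("example.snappy");
        try (CompressionOutputStream out = codec.createOutputStream(raw)) {
            out.write("hello snappy".getBytes("UTF-8"));
            out.finish();
        }
    }
}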

Usage

From source file: com.gemstone.gemfire.cache.hdfs.internal.hoplog.AbstractHoplog.java

License: Apache License

private static Option withCompression(Logger logger) {
    String prop = System.getProperty(HoplogConfig.COMPRESSION);
    if (prop != null) {
        CompressionCodec codec;
        if (prop.equalsIgnoreCase("SNAPPY")) {
            codec = new SnappyCodec();
        } else if (prop.equalsIgnoreCase("LZ4")) {
            codec = new Lz4Codec();
        } else if (prop.equals("GZ")) {
            codec = new GzipCodec();
        } else {
            throw new IllegalStateException("Unsupported codec: " + prop);
        }
        if (logger.isDebugEnabled())
            logger.debug("{}Using compression codec " + codec, logPrefix);
        return SequenceFile.Writer.compression(CompressionType.BLOCK, codec);
    }
    return SequenceFile.Writer.compression(CompressionType.NONE, null);
}
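
A minimal sketch (not part of the GemFire source) of how a compression option like the one returned above is typically combined with other SequenceFile writer options. The output path and key/value classes are placeholder illustration values, and native Snappy support is assumed.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.SnappyCodec;

public class SnappySequenceFileSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        // Block-compressed SequenceFile writer backed by SnappyCodec;
        // the output path and key/value types are illustration values.
        try (SequenceFile.Writer writer = SequenceFile.createWriter(conf,
                SequenceFile.Writer.file(new Path("/tmp/example.seq")),
                SequenceFile.Writer.keyClass(LongWritable.class),
                SequenceFile.Writer.valueClass(Text.class),
                SequenceFile.Writer.compression(CompressionType.BLOCK, new SnappyCodec()))) {
            writer.append(new LongWritable(1L), new Text("hello"));
        }
    }
}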

From source file: org.apache.carbondata.hadoop.csv.CSVInputFormatTest.java

License: Apache License

/**
 * Generates the compressed test files; this method does not need to be called during normal test runs.
 * @throws Exception
 */
public void generateCompressFiles() throws Exception {
    String pwd = new File("src/test/resources").getCanonicalPath();
    String inputFile = pwd + "/data.csv";
    FileInputStream input = new FileInputStream(inputFile);
    Configuration conf = new Configuration();

    // .gz
    String outputFile = pwd + "/data.csv.gz";
    FileOutputStream output = new FileOutputStream(outputFile);
    GzipCodec gzip = new GzipCodec();
    gzip.setConf(conf);
    CompressionOutputStream outputStream = gzip.createOutputStream(output);
    int i = -1;
    while ((i = input.read()) != -1) {
        outputStream.write(i);
    }
    outputStream.close();
    input.close();

    // .bz2
    input = new FileInputStream(inputFile);
    outputFile = pwd + "/data.csv.bz2";
    output = new FileOutputStream(outputFile);
    BZip2Codec bzip2 = new BZip2Codec();
    bzip2.setConf(conf);
    outputStream = bzip2.createOutputStream(output);
    i = -1;
    while ((i = input.read()) != -1) {
        outputStream.write(i);
    }
    outputStream.close();
    input.close();

    // .snappy
    input = new FileInputStream(inputFile);
    outputFile = pwd + "/data.csv.snappy";
    output = new FileOutputStream(outputFile);
    SnappyCodec snappy = new SnappyCodec();
    snappy.setConf(conf);
    outputStream = snappy.createOutputStream(output);
    i = -1;
    while ((i = input.read()) != -1) {
        outputStream.write(i);
    }
    outputStream.close();
    input.close();

    //.lz4
    input = new FileInputStream(inputFile);
    outputFile = pwd + "/data.csv.lz4";
    output = new FileOutputStream(outputFile);
    Lz4Codec lz4 = new Lz4Codec();
    lz4.setConf(conf);
    outputStream = lz4.createOutputStream(output);
    i = -1;
    while ((i = input.read()) != -1) {
        outputStream.write(i);
    }
    outputStream.close();
    input.close();

}
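
As a hedged counterpart that is not part of the CarbonData test, the data.csv.snappy file produced above could be read back by wrapping the input side with the same codec; the resource path simply mirrors the one used in the test.

import java.io.FileInputStream;
import java.io.InputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.CompressionInputStream;
import org.apache.hadoop.io.compress.SnappyCodec;

public class ReadSnappyCsvSketch {
    public static void main(String[] args) throws Exception {
        SnappyCodec snappy = new SnappyCodec();
        snappy.setConf(new Configuration());

        // Decompress the file written by generateCompressFiles() and echo it
        // to stdout; the path mirrors the test-resource location used above.
        InputStream raw = new FileInputStream("src/test/resources/data.csv.snappy");
        try (CompressionInputStream in = snappy.createInputStream(raw)) {
            byte[] buffer = new byte[4096];
            int n;
            while ((n = in.read(buffer)) != -1) {
                System.out.write(buffer, 0, n);
            }
            System.out.flush();
        }
    }
}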

From source file: org.apache.carbondata.processing.csvload.CSVInputFormatTest.java

License: Apache License

/**
 * Generates the compressed test files; this method does not need to be called during normal test runs.
 * @throws Exception
 */
public void generateCompressFiles() throws Exception {
    String pwd = new File("src/test/resources/csv").getCanonicalPath();
    String inputFile = pwd + "/data.csv";
    FileInputStream input = new FileInputStream(inputFile);
    Configuration conf = new Configuration();

    // .gz
    String outputFile = pwd + "/data.csv.gz";
    FileOutputStream output = new FileOutputStream(outputFile);
    GzipCodec gzip = new GzipCodec();
    gzip.setConf(conf);
    CompressionOutputStream outputStream = gzip.createOutputStream(output);
    int i = -1;
    while ((i = input.read()) != -1) {
        outputStream.write(i);
    }
    outputStream.close();
    input.close();

    // .bz2
    input = new FileInputStream(inputFile);
    outputFile = pwd + "/data.csv.bz2";
    output = new FileOutputStream(outputFile);
    BZip2Codec bzip2 = new BZip2Codec();
    bzip2.setConf(conf);
    outputStream = bzip2.createOutputStream(output);
    i = -1;
    while ((i = input.read()) != -1) {
        outputStream.write(i);
    }
    outputStream.close();
    input.close();

    // .snappy
    input = new FileInputStream(inputFile);
    outputFile = pwd + "/data.csv.snappy";
    output = new FileOutputStream(outputFile);
    SnappyCodec snappy = new SnappyCodec();
    snappy.setConf(conf);
    outputStream = snappy.createOutputStream(output);
    i = -1;
    while ((i = input.read()) != -1) {
        outputStream.write(i);
    }
    outputStream.close();
    input.close();

    //.lz4
    input = new FileInputStream(inputFile);
    outputFile = pwd + "/data.csv.lz4";
    output = new FileOutputStream(outputFile);
    Lz4Codec lz4 = new Lz4Codec();
    lz4.setConf(conf);
    outputStream = lz4.createOutputStream(output);
    i = -1;
    while ((i = input.read()) != -1) {
        outputStream.write(i);
    }
    outputStream.close();
    input.close();

}

From source file: org.apache.ignite.internal.processors.hadoop.HadoopSnappyTest.java

License: Apache License

/**
 * Internal check routine.
 *
 * @throws Throwable If failed.
 */
public static void checkSnappy() throws Throwable {
    try {
        byte[] expBytes = new byte[BYTE_SIZE];
        byte[] actualBytes = new byte[BYTE_SIZE];

        for (int i = 0; i < expBytes.length; i++)
            expBytes[i] = (byte) ThreadLocalRandom.current().nextInt(16);

        SnappyCodec codec = new SnappyCodec();

        codec.setConf(new Configuration());

        ByteArrayOutputStream baos = new ByteArrayOutputStream();

        try (CompressionOutputStream cos = codec.createOutputStream(baos)) {
            cos.write(expBytes);
            cos.flush();
        }

        try (CompressionInputStream cis = codec
                .createInputStream(new ByteArrayInputStream(baos.toByteArray()))) {
            int read = cis.read(actualBytes, 0, actualBytes.length);

            assert read == actualBytes.length;
        }

        assert Arrays.equals(expBytes, actualBytes);
    } catch (Throwable e) {
        System.out.println("Snappy check failed:");
        System.out
                .println("### NativeCodeLoader.isNativeCodeLoaded:  " + NativeCodeLoader.isNativeCodeLoaded());
        System.out
                .println("### SnappyCompressor.isNativeCodeLoaded:  " + SnappyCompressor.isNativeCodeLoaded());

        throw e;
    }
}

From source file: org.commoncrawl.util.CrawlLogSplitter.java

License: Open Source License

public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    FileStatus arcFiles[] = fs.globStatus(new Path("crawl/checkpoint_data/CrawlLog_*"));
    for (FileStatus candidate : arcFiles) {
        if (candidate.getLen() > SPLIT_SIZE) {
            candidateList.add(candidate.getPath());
        }
    }

    LOG.info("Found:" + candidateList.size() + " oversized candidates");

    Path tempOutputDir = new Path(conf.get("mapred.temp.dir", "."));

    while (candidateList.size() != 0) {
        Path candidateName = candidateList.first();
        candidateList.remove(candidateName);

        LOG.info("Processing Candidate:" + candidateName);
        long fileSize = fs.getFileStatus(candidateName).getLen();
        //get crawl log filename components

        ArrayList<Path> splitItems = new ArrayList<Path>();

        int index = 0;

        Path outputPart = buildIncrementalPathGivenPathAndIndex(tempOutputDir, candidateName.getName(), index);

        LOG.info("Initial Output Path is:" + outputPart);

        fs.delete(outputPart, false);

        // create reader 
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, candidateName, conf);
        ValueBytes sourceVB = reader.createValueBytes();
        DataOutputBuffer sourceKeyData = new DataOutputBuffer();

        try {
            // ok create temp file 
            SequenceFile.Writer activeWriter = SequenceFile.createWriter(fs, conf, outputPart, Text.class,
                    CrawlURL.class, CompressionType.BLOCK, new SnappyCodec());

            // add to split items array 
            splitItems.add(outputPart);

            try {
                long recordsWritten = 0;
                while (reader.nextRawKey(sourceKeyData) != -1) {
                    reader.nextRawValue(sourceVB);
                    long lengthPreWrite = activeWriter.getLength();
                    activeWriter.appendRaw(sourceKeyData.getData(), 0, sourceKeyData.getLength(), sourceVB);
                    if (++recordsWritten % 10000 == 0) {
                        LOG.info("Write 10000 records");
                    }
                    long lengthPostWrite = activeWriter.getLength();
                    if (lengthPostWrite != lengthPreWrite) {
                        if (lengthPostWrite >= IDEAL_SIZE) {
                            LOG.info("Hit Split Point. Flushing File:" + outputPart);
                            activeWriter.close();
                            outputPart = buildIncrementalPathGivenPathAndIndex(tempOutputDir,
                                    candidateName.getName(), ++index);
                            LOG.info("Creating New File:" + outputPart);
                            activeWriter = SequenceFile.createWriter(fs, conf, outputPart, Text.class,
                                    CrawlURL.class, CompressionType.BLOCK, new SnappyCodec());
                            splitItems.add(outputPart);
                        }
                    }
                    sourceKeyData.reset();
                }
            } finally {
                activeWriter.close();
            }
        } finally {
            reader.close();
        }
        LOG.info("Rewrote Source:" + candidateName + " into:" + splitItems.size() + " split files");
        for (Path splitItem : splitItems) {
            Path destPath = new Path("crawl/checkpoint_data", splitItem.getName());
            LOG.info("Moving:" + splitItem + " to:" + destPath);
            fs.rename(splitItem, destPath);
        }
        Path sourceMoveLocation = new Path("crawl/checkpoint_data_split", candidateName.getName());
        LOG.info("Moving SOURCE:" + candidateName + " to:" + sourceMoveLocation);
        fs.rename(candidateName, sourceMoveLocation);
    }
}

From source file: org.pentaho.hadoop.shim.common.CommonSnappyShim.java

License: Apache License

/**
 * Gets an InputStream that uses the Snappy codec and wraps the supplied base input stream.
 *
 * @param bufferSize the buffer size for the codec to use (in bytes)
 * @param in         the base input stream to wrap around
 * @return an InputStream that uses the Snappy codec
 * @throws Exception if Snappy is not available or an error occurs during reflection
 */
public InputStream getSnappyInputStream(int bufferSize, InputStream in) throws Exception {
    if (!isHadoopSnappyAvailable()) {
        throw new Exception("Hadoop-snappy does not seem to be available");
    }

    ClassLoader cl = Thread.currentThread().getContextClassLoader();
    Thread.currentThread().setContextClassLoader(getClass().getClassLoader());
    try {
        SnappyCodec c = new SnappyCodec();
        Configuration newConf = new Configuration();
        newConf.set(IO_COMPRESSION_CODEC_SNAPPY_BUFFERSIZE_KEY, "" + bufferSize);
        c.setConf(newConf);
        return c.createInputStream(in);
    } finally {
        Thread.currentThread().setContextClassLoader(cl);
    }
}

From source file: org.pentaho.hadoop.shim.common.CommonSnappyShim.java

License: Apache License

/**
 * Gets an OutputStream that uses the Snappy codec and wraps the supplied base output stream.
 *
 * @param bufferSize the buffer size for the codec to use (in bytes)
 * @param out        the base output stream to wrap around
 * @return an OutputStream that uses the Snappy codec
 * @throws Exception if Snappy is not available or an error occurs during reflection
 */
public OutputStream getSnappyOutputStream(int bufferSize, OutputStream out) throws Exception {
    if (!isHadoopSnappyAvailable()) {
        throw new Exception("Hadoop-snappy does not seem to be available");
    }

    ClassLoader cl = Thread.currentThread().getContextClassLoader();
    Thread.currentThread().setContextClassLoader(getClass().getClassLoader());
    try {
        SnappyCodec c = new SnappyCodec();
        Configuration newConf = new Configuration();
        newConf.set(IO_COMPRESSION_CODEC_SNAPPY_BUFFERSIZE_KEY, "" + bufferSize);
        c.setConf(newConf);
        return c.createOutputStream(out);
    } finally {
        Thread.currentThread().setContextClassLoader(cl);
    }
}
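
A hedged usage sketch, not taken from the Pentaho shim itself: assuming a CommonSnappyShim instance can be created with a no-argument constructor and that Hadoop-snappy is available, the two methods above combine into a simple round trip. The 64 KB buffer size is an arbitrary illustration value.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.io.OutputStream;

public class SnappyShimRoundTrip {
    public static void main(String[] args) throws Exception {
        // Hypothetical driver; CommonSnappyShim is the Pentaho shim shown above
        // and is assumed to have a usable no-arg constructor.
        CommonSnappyShim shim = new CommonSnappyShim();
        int bufferSize = 64 * 1024; // arbitrary illustration value

        // Compress a small payload through the shim.
        ByteArrayOutputStream compressed = new ByteArrayOutputStream();
        try (OutputStream out = shim.getSnappyOutputStream(bufferSize, compressed)) {
            out.write("hello snappy".getBytes("UTF-8"));
        }

        // Decompress it again and print the original text.
        try (InputStream in = shim.getSnappyInputStream(bufferSize,
                new ByteArrayInputStream(compressed.toByteArray()))) {
            byte[] buffer = new byte[bufferSize];
            int n = in.read(buffer);
            System.out.println(new String(buffer, 0, n, "UTF-8"));
        }
    }
}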