com.basistech.rosette.dm.json.array.CompareJsons.java Source code

Java tutorial

Introduction

Here is the source code for com.basistech.rosette.dm.json.array.CompareJsons.java

Source

/*
* Copyright 2014 Basis Technology Corp.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.basistech.rosette.dm.json.array;

import com.basistech.rosette.dm.AnnotatedText;
import com.basistech.rosette.dm.jackson.AnnotatedDataModelModule;
import com.basistech.rosette.dm.jackson.array.AnnotatedDataModelArrayModule;
import com.codahale.metrics.ConsoleReporter;
import com.codahale.metrics.MetricRegistry;
import com.codahale.metrics.Timer;
import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.dataformat.cbor.CBORFactory;
import com.fasterxml.jackson.dataformat.smile.SmileFactory;
import com.google.common.io.ByteStreams;
import org.apache.commons.math3.stat.descriptive.AbstractStorelessUnivariateStatistic;
import org.apache.commons.math3.stat.descriptive.StorelessUnivariateStatistic;
import org.apache.commons.math3.stat.descriptive.moment.Mean;
import org.apache.commons.math3.stat.descriptive.moment.StandardDeviation;
import org.xerial.snappy.Snappy;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.util.concurrent.TimeUnit;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

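/**
 * Command-line benchmark that compares the 'normal' and 'array' Jackson encodings of
 * {@link AnnotatedText} in plain JSON, SMILE and CBOR: serialized size, serialization and
 * deserialization time, and gzip and Snappy compression ratio and time.
 */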
public final class CompareJsons {

    /**
     * Accumulates a running mean and standard deviation over the values passed to
     * {@link #increment(double)}, by forwarding each value to a {@link Mean} and a
     * {@link StandardDeviation} side by side.
     * <p>
     * Only the incremental operations are implemented; {@link #copy()} and
     * {@link #getResult()} always throw {@link UnsupportedOperationException}.
     */
    static class CompressionStats extends AbstractStorelessUnivariateStatistic {
        /** Running average of the observed values. */
        private final Mean average = new Mean();
        /** Running standard deviation of the observed values. */
        private final StandardDeviation spread = new StandardDeviation();

        CompressionStats() {
            // both accumulators are created by the field initializers above
        }

        /** Feeds one observation to both accumulators. */
        @Override
        public void increment(double d) {
            average.increment(d);
            spread.increment(d);
        }

        /** @return the number of observations, as counted by the mean accumulator. */
        @Override
        public long getN() {
            return average.getN();
        }

        /** Resets both accumulators. */
        @Override
        public void clear() {
            average.clear();
            spread.clear();
        }

        /** @return the current mean of the observations. */
        double getMean() {
            return average.getResult();
        }

        /** @return the current standard deviation of the observations. */
        double getStandardDeviation() {
            return spread.getResult();
        }

        /** Not supported: there is no single result; use the two getters instead. */
        @Override
        public double getResult() {
            throw new UnsupportedOperationException();
        }

        /** Not supported. */
        @Override
        public StorelessUnivariateStatistic copy() {
            throw new UnsupportedOperationException();
        }
    }

    /**
     * Supplies the {@link JsonFactory} for one data format. {@code runWithFormat} calls
     * {@link #newFactory()} once for each {@link ObjectMapper} it builds, so the normal
     * and the array mapper each get the result of their own call.
     */
    interface FactoryFactory {
        /** @return the factory an {@link ObjectMapper} is to be built on. */
        JsonFactory newFactory();
    }

    /** Private so the class cannot be instantiated from outside; everything here is static. */
    private CompareJsons() {
        // intentionally empty
    }

    /**
     * Command-line entry point. Reads an array of {@link AnnotatedText} from the JSON file named
     * by the first argument, prints the file length and document count, and then runs the
     * comparison three times: for plain JSON, for SMILE and for CBOR.
     *
     * @param args the first element is the path of the input JSON file; further elements are ignored
     * @throws IllegalArgumentException if no path argument was supplied
     * @throws Exception if the input cannot be read or one of the runs fails
     */
    public static void main(String[] args) throws Exception {
        // Fail with a usable message instead of an ArrayIndexOutOfBoundsException on args[0].
        if (args == null || args.length == 0) {
            throw new IllegalArgumentException("Usage: CompareJsons <annotated-text-array.json>");
        }

        File plenty = new File(args[0]);
        System.out.println(String.format("Original file length %d", plenty.length()));
        ObjectMapper inputMapper = AnnotatedDataModelModule.setupObjectMapper(new ObjectMapper());
        AnnotatedText[] texts = inputMapper.readValue(plenty, AnnotatedText[].class);
        System.out.println(String.format("%d documents", texts.length));

        runWithFormat(texts, new FactoryFactory() {
            @Override
            public JsonFactory newFactory() {
                return new JsonFactory();
            }
        }, "Plain");

        runWithFormat(texts, new FactoryFactory() {
            @Override
            public JsonFactory newFactory() {
                return new SmileFactory();
            }
        }, "SMILE");

        runWithFormat(texts, new FactoryFactory() {
            @Override
            public JsonFactory newFactory() {
                return new CBORFactory();
            }
        }, "CBOR");
    }

    //CHECKSTYLE:OFF
    /**
     * Runs the whole comparison for one data format and prints the outcome to standard output.
     * <p>
     * Every document is serialized and deserialized with both the normal and the array mapper;
     * each serialized form is then gzipped/gunzipped and Snappy-compressed/uncompressed. Each of
     * those steps is timed with its own {@link Timer}, while serialized sizes and compression
     * ratios are accumulated in {@link CompressionStats}. The size and ratio statistics are
     * printed first, followed by the timer report.
     *
     * @param texts          the documents to push through every step
     * @param factoryFactory supplies the {@link JsonFactory} behind each of the two mappers
     * @param format         label of the format, used in the timer names and the printed heading
     * @throws IOException if serializing or deserializing a document fails
     */
    private static void runWithFormat(AnnotatedText[] texts, FactoryFactory factoryFactory, String format)
            throws IOException {

        final MetricRegistry registry = new MetricRegistry();
        final ConsoleReporter console = ConsoleReporter.forRegistry(registry)
                .convertRatesTo(TimeUnit.SECONDS)
                .convertDurationsTo(TimeUnit.MILLISECONDS)
                .build();

        // "normal" is the classic JSON object layout, as opposed to the array layout.
        final ObjectMapper normalMapper = AnnotatedDataModelModule
                .setupObjectMapper(new ObjectMapper(factoryFactory.newFactory()));
        final ObjectMapper arrayMapper = AnnotatedDataModelArrayModule
                .setupObjectMapper(new ObjectMapper(factoryFactory.newFactory()));

        // (de)serialization timers
        final Timer normalWriteTimer = registry.timer(String.format("%s-normal-serial", format));
        final Timer arrayWriteTimer = registry.timer(String.format("%s-array-serial", format));
        final Timer normalReadTimer = registry.timer(String.format("%s-normal-deserial", format));
        final Timer arrayReadTimer = registry.timer(String.format("%s-array-deserial", format));

        // size and compression-ratio accumulators
        final CompressionStats normalSize = new CompressionStats();
        final CompressionStats normalGzipRatio = new CompressionStats();
        final CompressionStats normalSnappyRatio = new CompressionStats();

        final CompressionStats arraySize = new CompressionStats();
        final CompressionStats arrayGzipRatio = new CompressionStats();
        final CompressionStats arraySnappyRatio = new CompressionStats();

        // compression / decompression timers
        final Timer normalGzipTimer = registry.timer(String.format("%s-gzip-compress-normal", format));
        final Timer normalGunzipTimer = registry.timer(String.format("%s-gzip-decompress-normal", format));
        final Timer normalSnappyTimer = registry.timer(String.format("%s-snappy-compress-normal", format));
        final Timer normalUnsnappyTimer = registry.timer(String.format("%s-snappy-decompress-normal", format));
        final Timer arrayGzipTimer = registry.timer(String.format("%s-gzip-compress-array", format));
        final Timer arrayGunzipTimer = registry.timer(String.format("%s-gzip-decompress-array", format));
        final Timer arraySnappyTimer = registry.timer(String.format("%s-snappy-compress-array", format));
        final Timer arrayUnsnappyTimer = registry.timer(String.format("%s-snappy-decompress-array", format));

        for (int i = 0; i < texts.length; i++) {
            final AnnotatedText document = texts[i];
            Timer.Context clock;

            // --- normal layout: write, then read back ---
            clock = normalWriteTimer.time();
            final byte[] normalBytes = normalMapper.writeValueAsBytes(document);
            clock.stop();
            normalSize.increment(normalBytes.length);

            clock = normalReadTimer.time();
            normalMapper.readValue(normalBytes, AnnotatedText.class);
            clock.stop();

            // --- array layout: write, then read back ---
            clock = arrayWriteTimer.time();
            final byte[] arrayBytes = arrayMapper.writeValueAsBytes(document);
            clock.stop();
            arraySize.increment(arrayBytes.length);

            clock = arrayReadTimer.time();
            arrayMapper.readValue(arrayBytes, AnnotatedText.class);
            clock.stop();

            // --- gzip, normal layout ---
            clock = normalGzipTimer.time();
            final byte[] normalGz = gzipCompress(normalBytes);
            final int normalGzLength = normalGz.length;
            clock.stop();
            normalGzipRatio.increment(compressionRatio(normalBytes.length, normalGzLength));

            clock = normalGunzipTimer.time();
            gzipDecompress(normalGz);
            clock.stop();

            // --- gzip, array layout ---
            clock = arrayGzipTimer.time();
            final byte[] arrayGz = gzipCompress(arrayBytes);
            final int arrayGzLength = arrayGz.length;
            clock.stop();
            arrayGzipRatio.increment(compressionRatio(arrayBytes.length, arrayGzLength));

            clock = arrayGunzipTimer.time();
            gzipDecompress(arrayGz);
            clock.stop();

            // --- Snappy, normal layout ---
            clock = normalSnappyTimer.time();
            final byte[] normalSnappy = snappyCompress(normalBytes);
            final int normalSnappyLength = normalSnappy.length;
            clock.stop();
            normalSnappyRatio.increment(compressionRatio(normalBytes.length, normalSnappyLength));

            clock = normalUnsnappyTimer.time();
            snappyDecompress(normalSnappy);
            clock.stop();

            // --- Snappy, array layout ---
            clock = arraySnappyTimer.time();
            final byte[] arraySnappy = snappyCompress(arrayBytes);
            final int arraySnappyLength = arraySnappy.length;
            clock.stop();
            arraySnappyRatio.increment(compressionRatio(arrayBytes.length, arraySnappyLength));

            clock = arrayUnsnappyTimer.time();
            snappyDecompress(arraySnappy);
            clock.stop();
        }

        System.out.println("\nStatistics for " + format);
        System.out.println();
        System.out.printf("Normal Size: mean %.2f stddev %.2f\n",
                normalSize.getMean(), normalSize.getStandardDeviation());
        System.out.printf("Normal GZIP Compression ratio: mean %.2f stddev %.2f\n",
                normalGzipRatio.getMean(), normalGzipRatio.getStandardDeviation());
        System.out.printf("Normal Snappy Compression ratio: mean %.2f stddev %.2f\n",
                normalSnappyRatio.getMean(), normalSnappyRatio.getStandardDeviation());

        System.out.printf("Array Size: mean %.2f stddev %.2f\n",
                arraySize.getMean(), arraySize.getStandardDeviation());
        System.out.printf("Array GZIP Compression ratio: mean %.2f stddev %.2f\n",
                arrayGzipRatio.getMean(), arrayGzipRatio.getStandardDeviation());
        System.out.printf("Array Snappy Compression ratio: mean %.2f stddev %.2f\n",
                arraySnappyRatio.getMean(), arraySnappyRatio.getStandardDeviation());
        System.out.println();

        // Dump all timers collected above, then shut the reporter down.
        console.report();
        console.stop();
    }

    /**
     * Computes the fraction of the original size that compression removed, i.e.
     * {@code (uncompressed - compressed) / uncompressed}. The result is negative when the
     * compressed form is larger than the original.
     *
     * @param uncompressed size before compression
     * @param compressed   size after compression
     * @return the saved fraction as a double
     */
    private static double compressionRatio(int uncompressed, int compressed) {
        final int bytesSaved = uncompressed - compressed;
        return (double) bytesSaved / (double) uncompressed;
    }

    /**
     * Gzips {@code data} entirely in memory.
     *
     * @param data the bytes to compress
     * @return the complete gzip stream produced for {@code data}
     * @throws RuntimeException wrapping any {@link IOException} raised while compressing
     */
    private static byte[] gzipCompress(byte[] data) {
        ByteArrayOutputStream sink = new ByteArrayOutputStream();
        try {
            GZIPOutputStream compressedStream = new GZIPOutputStream(sink);
            try {
                // The input is already an array, so hand it over directly rather than
                // copying it through an intermediate input stream.
                compressedStream.write(data);
            } finally {
                // close() finishes the gzip stream and releases the deflater; doing it in
                // a finally block means the deflater is released even if the write fails.
                compressedStream.close();
            }
            // Only read the sink after close(), otherwise the gzip trailer is missing.
            return sink.toByteArray();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Gunzips {@code data} into an in-memory buffer and throws the result away; it is
     * called only for the time it takes.
     *
     * @param data a complete gzip stream
     * @throws RuntimeException wrapping any {@link IOException} raised while decompressing
     */
    private static void gzipDecompress(byte[] data) {
        ByteArrayOutputStream sink = new ByteArrayOutputStream();
        try {
            GZIPInputStream compressedStream = new GZIPInputStream(new ByteArrayInputStream(data));
            try {
                ByteStreams.copy(compressedStream, sink);
            } finally {
                // Closing releases the stream's inflater right away; without it every call
                // left one behind to be cleaned up by finalization.
                compressedStream.close();
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Compresses {@code data} with {@link Snappy#compress(byte[])}.
     *
     * @param data the bytes to compress
     * @return the Snappy-compressed bytes
     * @throws RuntimeException wrapping any {@link IOException} raised by Snappy
     */
    private static byte[] snappyCompress(byte[] data) {
        final byte[] packed;
        try {
            packed = Snappy.compress(data);
        } catch (IOException failure) {
            throw new RuntimeException(failure);
        }
        return packed;
    }

    /**
     * Uncompresses {@code data} with {@link Snappy#uncompress(byte[])} and discards the
     * result; it is called only for the time it takes.
     *
     * @param data Snappy-compressed bytes
     * @throws RuntimeException wrapping any {@link IOException} raised by Snappy
     */
    private static void snappyDecompress(byte[] data) {
        try {
            Snappy.uncompress(data);
        } catch (IOException failure) {
            throw new RuntimeException(failure);
        }
    }
}