org.apache.orc.TestColumnStatistics.java Source code

Java tutorial

Introduction

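Here is the source code for org.apache.orc.TestColumnStatistics.java, a JUnit test class that checks how ORC column statistics are merged and how null values are reported.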

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.orc;

import static junit.framework.Assert.assertEquals;
import static org.junit.Assume.assumeTrue;

import java.io.File;
import java.io.FileOutputStream;
import java.io.PrintStream;
import java.sql.Timestamp;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.serde2.io.DateWritable;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.orc.impl.ColumnStatisticsImpl;
import org.apache.orc.tools.FileDump;
import org.apache.orc.tools.TestFileDump;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TestName;

/**
 * Tests for ORC column statistics.
 *
 * <p>The {@code test*Merge} methods build two {@code ColumnStatisticsImpl}
 * instances for a single type, merge the second into the first and check the
 * resulting minimum/maximum (and, for strings, the sum). {@link #testHasNull()}
 * writes an ORC file containing null and non-null strings, then checks the
 * {@code hasNull} flag at file and stripe level and compares the output of
 * {@code FileDump} against an expected file.
 */
public class TestColumnStatistics {

    /** Directory used for generated files; overridable with the {@code test.tmp.dir} system property. */
    Path workDir = new Path(
            System.getProperty("test.tmp.dir", "target" + File.separator + "test" + File.separator + "tmp"));

    Configuration conf;
    FileSystem fs;
    Path testFilePath;

    /** Supplies the running test's method name, used to build a per-test file name. */
    @Rule
    public TestName testCaseName = new TestName();

    /**
     * Opens the local file system with {@link #workDir} as its working
     * directory and removes any file left under this test's file name.
     */
    @Before
    public void openFileSystem() throws Exception {
        conf = new Configuration();
        fs = FileSystem.getLocal(conf);
        fs.setWorkingDirectory(workDir);
        testFilePath = new Path("TestOrcFile." + testCaseName.getMethodName() + ".orc");
        fs.delete(testFilePath, false);
    }

    /**
     * Merges two integer statistics and checks the combined bounds, then
     * repeats with the target holding a range that encloses the source's.
     */
    @Test
    public void testLongMerge() throws Exception {
        TypeDescription schema = TypeDescription.createInt();

        ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(schema);
        ColumnStatisticsImpl stats2 = ColumnStatisticsImpl.create(schema);
        stats1.updateInteger(10, 2);
        stats2.updateInteger(1, 1);
        stats2.updateInteger(1000, 1);
        stats1.merge(stats2);
        // typed is another view of stats1, so it also reflects the later reset/merge.
        IntegerColumnStatistics typed = (IntegerColumnStatistics) stats1;
        assertEquals(1, typed.getMinimum());
        assertEquals(1000, typed.getMaximum());

        // stats1 now spans [-10, 10000], which encloses stats2's [1, 1000].
        stats1.reset();
        stats1.updateInteger(-10, 1);
        stats1.updateInteger(10000, 1);
        stats1.merge(stats2);
        assertEquals(-10, typed.getMinimum());
        assertEquals(10000, typed.getMaximum());
    }

    /**
     * Merges two double statistics and checks the combined bounds, then
     * repeats with the target holding a range that encloses the source's.
     */
    @Test
    public void testDoubleMerge() throws Exception {
        TypeDescription schema = TypeDescription.createDouble();

        ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(schema);
        ColumnStatisticsImpl stats2 = ColumnStatisticsImpl.create(schema);
        stats1.updateDouble(10.0);
        stats1.updateDouble(100.0);
        stats2.updateDouble(1.0);
        stats2.updateDouble(1000.0);
        stats1.merge(stats2);
        DoubleColumnStatistics typed = (DoubleColumnStatistics) stats1;
        assertEquals(1.0, typed.getMinimum(), 0.001);
        assertEquals(1000.0, typed.getMaximum(), 0.001);

        // stats1 now spans [-10, 10000], which encloses stats2's [1, 1000].
        stats1.reset();
        stats1.updateDouble(-10);
        stats1.updateDouble(10000);
        stats1.merge(stats2);
        assertEquals(-10, typed.getMinimum(), 0.001);
        assertEquals(10000, typed.getMaximum(), 0.001);
    }

    /**
     * Merges two string statistics and checks minimum, maximum and sum, then
     * repeats the min/max check with the target holding an enclosing range.
     */
    @Test
    public void testStringMerge() throws Exception {
        TypeDescription schema = TypeDescription.createString();

        ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(schema);
        ColumnStatisticsImpl stats2 = ColumnStatisticsImpl.create(schema);
        stats1.updateString(new Text("bob"));
        stats1.updateString(new Text("david"));
        stats1.updateString(new Text("charles"));
        stats2.updateString(new Text("anne"));
        // Bytes 6..9 of this array are the ASCII codes for "erin".
        byte[] erin = new byte[] { 0, 1, 2, 3, 4, 5, 101, 114, 105, 110 };
        stats2.updateString(erin, 6, 4, 5);
        // 24 = 4 ("anne") + 4 * 5, i.e. the 4-byte slice counted five times.
        assertEquals(24, ((StringColumnStatistics) stats2).getSum());
        stats1.merge(stats2);
        StringColumnStatistics typed = (StringColumnStatistics) stats1;
        assertEquals("anne", typed.getMinimum());
        assertEquals("erin", typed.getMaximum());
        // 39 = 3 + 5 + 7 (stats1's own strings) + 24 (merged from stats2).
        assertEquals(39, typed.getSum());

        // "aaa".."zzz" encloses stats2's "anne".."erin".
        stats1.reset();
        stats1.updateString(new Text("aaa"));
        stats1.updateString(new Text("zzz"));
        stats1.merge(stats2);
        assertEquals("aaa", typed.getMinimum());
        assertEquals("zzz", typed.getMaximum());
    }

    /**
     * Merges two date statistics and checks the combined bounds, then
     * repeats with the target holding a range that encloses the source's.
     */
    @Test
    public void testDateMerge() throws Exception {
        TypeDescription schema = TypeDescription.createDate();

        ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(schema);
        ColumnStatisticsImpl stats2 = ColumnStatisticsImpl.create(schema);
        stats1.updateDate(new DateWritable(1000));
        stats1.updateDate(new DateWritable(100));
        stats2.updateDate(new DateWritable(10));
        stats2.updateDate(new DateWritable(2000));
        stats1.merge(stats2);
        DateColumnStatistics typed = (DateColumnStatistics) stats1;
        assertEquals(new DateWritable(10).get(), typed.getMinimum());
        assertEquals(new DateWritable(2000).get(), typed.getMaximum());

        // stats1 now spans [-10, 10000], which encloses stats2's [10, 2000].
        stats1.reset();
        stats1.updateDate(new DateWritable(-10));
        stats1.updateDate(new DateWritable(10000));
        stats1.merge(stats2);
        assertEquals(new DateWritable(-10).get(), typed.getMinimum());
        assertEquals(new DateWritable(10000).get(), typed.getMaximum());
    }

    /**
     * Merges two timestamp statistics and checks the combined bounds, then
     * repeats with the target holding a range that encloses the source's.
     */
    @Test
    public void testTimestampMerge() throws Exception {
        TypeDescription schema = TypeDescription.createTimestamp();

        ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(schema);
        ColumnStatisticsImpl stats2 = ColumnStatisticsImpl.create(schema);
        stats1.updateTimestamp(new Timestamp(10));
        stats1.updateTimestamp(new Timestamp(100));
        stats2.updateTimestamp(new Timestamp(1));
        stats2.updateTimestamp(new Timestamp(1000));
        stats1.merge(stats2);
        TimestampColumnStatistics typed = (TimestampColumnStatistics) stats1;
        assertEquals(1, typed.getMinimum().getTime());
        assertEquals(1000, typed.getMaximum().getTime());

        // stats1 now spans [-10, 10000], which encloses stats2's [1, 1000].
        stats1.reset();
        stats1.updateTimestamp(new Timestamp(-10));
        stats1.updateTimestamp(new Timestamp(10000));
        stats1.merge(stats2);
        assertEquals(-10, typed.getMinimum().getTime());
        assertEquals(10000, typed.getMaximum().getTime());
    }

    /**
     * Merges two decimal statistics and checks the combined bounds, then
     * repeats with the target holding a range that encloses the source's.
     */
    @Test
    public void testDecimalMerge() throws Exception {
        TypeDescription schema = TypeDescription.createDecimal().withPrecision(38).withScale(16);

        ColumnStatisticsImpl stats1 = ColumnStatisticsImpl.create(schema);
        ColumnStatisticsImpl stats2 = ColumnStatisticsImpl.create(schema);
        stats1.updateDecimal(HiveDecimal.create(10));
        stats1.updateDecimal(HiveDecimal.create(100));
        stats2.updateDecimal(HiveDecimal.create(1));
        stats2.updateDecimal(HiveDecimal.create(1000));
        stats1.merge(stats2);
        DecimalColumnStatistics typed = (DecimalColumnStatistics) stats1;
        assertEquals(1, typed.getMinimum().longValue());
        assertEquals(1000, typed.getMaximum().longValue());

        // stats1 now spans [-10, 10000], which encloses stats2's [1, 1000].
        stats1.reset();
        stats1.updateDecimal(HiveDecimal.create(-10));
        stats1.updateDecimal(HiveDecimal.create(10000));
        stats1.merge(stats2);
        assertEquals(-10, typed.getMinimum().longValue());
        assertEquals(10000, typed.getMaximum().longValue());
    }

    /**
     * Builds a {@link BytesWritable} whose content is the given values, each
     * narrowed to a {@code byte}.
     *
     * @param items the byte values, passed as ints for convenience
     * @return a writable of length {@code items.length}
     */
    private static BytesWritable bytes(int... items) {
        BytesWritable result = new BytesWritable();
        result.setSize(items.length);
        // Fetch the backing array once, after it has been sized.
        byte[] buffer = result.getBytes();
        for (int i = 0; i < items.length; ++i) {
            buffer[i] = (byte) items[i];
        }
        return result;
    }

    /**
     * Appends one row to the batch: column 0 receives {@code bytes} and
     * column 1 receives {@code str}; a {@code null} argument marks that
     * column as null for the new row.
     *
     * @param batch the batch to extend; its {@code size} is incremented
     * @param bytes value for column 0, or {@code null}
     * @param str value for column 1, or {@code null}
     */
    void appendRow(VectorizedRowBatch batch, BytesWritable bytes, String str) {
        int row = batch.size++;
        if (bytes == null) {
            batch.cols[0].noNulls = false;
            batch.cols[0].isNull[row] = true;
        } else {
            // The backing array may be longer than the content, hence getLength().
            ((BytesColumnVector) batch.cols[0]).setVal(row, bytes.getBytes(), 0, bytes.getLength());
        }
        if (str == null) {
            batch.cols[1].noNulls = false;
            batch.cols[1].isNull[row] = true;
        } else {
            // Platform default charset; the strings used in this class are ASCII.
            ((BytesColumnVector) batch.cols[1]).setVal(row, str.getBytes());
        }
    }

    /**
     * Appends {@code count} rows holding the bytes {1, 2, 3} and the given
     * string, hands the batch to the writer and resets it for reuse.
     *
     * @param writer the writer receiving the batch
     * @param batch the reusable batch; must have room for {@code count} more rows
     * @param count number of rows to append
     * @param str value for the string column, or {@code null} for null rows
     */
    private void writeRows(Writer writer, VectorizedRowBatch batch, int count, String str) throws Exception {
        for (int i = 0; i < count; i++) {
            appendRow(batch, bytes(1, 2, 3), str);
        }
        writer.addRowBatch(batch);
        batch.reset();
    }

    /**
     * Asserts the {@code hasNull} flag of the leading columns of one stripe.
     *
     * @param stripeStats statistics of all stripes in the file
     * @param stripe zero-based index of the stripe to check
     * @param expected expected {@code hasNull} value per column, in column order
     */
    private static void assertStripeHasNull(List<StripeStatistics> stripeStats, int stripe, boolean... expected) {
        ColumnStatistics[] columns = stripeStats.get(stripe).getColumnStatistics();
        for (int col = 0; col < expected.length; ++col) {
            assertEquals("stripe " + (stripe + 1) + ", column " + col + " hasNull",
                    expected[col], columns[col].hasNull());
        }
    }

    /**
     * Writes a file whose string column alternates between null and non-null
     * batches, then verifies the {@code hasNull} flags in the file-level and
     * stripe-level statistics and compares the {@code FileDump} output with
     * the expected dump.
     */
    @Test
    public void testHasNull() throws Exception {
        TypeDescription schema = TypeDescription.createStruct().addField("bytes1", TypeDescription.createBinary())
                .addField("string1", TypeDescription.createString());
        Writer writer = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf).setSchema(schema)
                .rowIndexStride(1000).stripeSize(10000).bufferSize(10000));
        VectorizedRowBatch batch = schema.createRowBatch(5000);
        // STRIPE 1
        // RG1
        writeRows(writer, batch, 1000, "RG1");
        // RG2
        writeRows(writer, batch, 1000, null);
        // RG3
        writeRows(writer, batch, 1000, "RG3");
        // RG4
        writeRows(writer, batch, 1000, null);
        // RG5
        writeRows(writer, batch, 1000, null);
        // STRIPE 2
        writeRows(writer, batch, 5000, null);
        // STRIPE 3
        writeRows(writer, batch, 5000, "STRIPE-3");
        // STRIPE 4
        writeRows(writer, batch, 5000, null);
        writer.close();
        Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));

        // check the file level stats
        // 20000 rows were written in total; 7000 of them carry a non-null
        // string (1000 "RG1" + 1000 "RG3" + 5000 "STRIPE-3").
        ColumnStatistics[] stats = reader.getStatistics();
        assertEquals(20000, stats[0].getNumberOfValues());
        assertEquals(20000, stats[1].getNumberOfValues());
        assertEquals(7000, stats[2].getNumberOfValues());
        assertEquals(false, stats[0].hasNull());
        assertEquals(false, stats[1].hasNull());
        assertEquals(true, stats[2].hasNull());

        // check the stripe level stats: only stripe 3 has no null strings
        List<StripeStatistics> stripeStats = reader.getStripeStatistics();
        assertStripeHasNull(stripeStats, 0, false, false, true);
        assertStripeHasNull(stripeStats, 1, false, false, true);
        assertStripeHasNull(stripeStats, 2, false, false, false);
        assertStripeHasNull(stripeStats, 3, false, false, true);

        // Test file dump
        PrintStream origOut = System.out;
        String outputFilename = "orc-file-has-null.out";
        String outputPath = workDir + File.separator + outputFilename;
        PrintStream dumpOut = new PrintStream(new FileOutputStream(outputPath));

        // replace stdout and run command
        System.setOut(dumpOut);
        try {
            FileDump.main(new String[] { testFilePath.toString(), "--rowindex=2" });
        } finally {
            // Always restore stdout, even if the dump fails, so later tests
            // are not silently redirected; closing also flushes the dump
            // file and releases it before it is read back below.
            System.setOut(origOut);
            dumpOut.close();
        }
        // If called with an expression evaluating to false, the test will halt
        // and be ignored.
        assumeTrue(!System.getProperty("os.name").startsWith("Windows"));
        TestFileDump.checkOutput(outputFilename, outputPath);
    }
}