Java tutorial: exercising ORC's FileDump utility

The listing below is the TestFileDump JUnit suite from the com.facebook.hive.orc project. Each test writes a small ORC file with WriterImpl, runs FileDump over it while stdout is redirected to a file, and compares the captured output line by line against a checked-in expected dump. The later tests tweak OrcConf thresholds to show when dictionary encoding is enabled or disabled.
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.facebook.hive.orc;

import static junit.framework.Assert.assertEquals;
import static junit.framework.Assert.assertNull;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.PrintStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Random;

import com.facebook.hive.orc.compression.CompressionKind;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.serde2.ReaderWriterProfiler;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.junit.Before;
import org.junit.Test;

import com.google.common.io.Resources;

public class TestFileDump {

  Path workDir = new Path(System.getProperty("test.tmp.dir",
      "target" + File.separator + "test" + File.separator + "tmp"));

  Configuration conf;
  FileSystem fs;
  Path testFilePath;

  @Before
  public void openFileSystem() throws Exception {
    conf = new Configuration();
    fs = FileSystem.getLocal(conf);
    fs.mkdirs(workDir);
    testFilePath = new Path(workDir, "TestFileDump.testDump.orc");
    fs.delete(testFilePath, false);
  }

  static class MyRecord {
    int i;
    long l;
    String s;

    MyRecord(int i, long l, String s) {
      this.i = i;
      this.l = l;
      this.s = s;
    }
  }

  private static void checkOutput(String expected, String actual) throws Exception {
    BufferedReader eStream = new BufferedReader(new FileReader(expected));
    BufferedReader aStream = new BufferedReader(new FileReader(actual));
    String line = eStream.readLine();
    while (line != null) {
      assertEquals(line, aStream.readLine());
      line = eStream.readLine();
    }
    assertNull(eStream.readLine());
    assertNull(aStream.readLine());
  }
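  // The helper below captures FileDump's console output by swapping System.out
  // for a PrintStream backed by a file. A minimal sketch of that pattern (the
  // tool name here is hypothetical; the real test invokes FileDump.main):
  //
  //   PrintStream origOut = System.out;
  //   System.setOut(new PrintStream(new FileOutputStream("captured.txt")));
  //   SomeTool.main(new String[] { "some-arg" });  // hypothetical tool
  //   System.out.flush();
  //   System.setOut(origOut);  // always restore stdout afterwards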
  /**
   * Calls FileDump on testFilePath and verifies the results match the contents of fileName.
   */
  private void checkOutput(String fileName) throws Exception {
    PrintStream origOut = System.out;
    URL expectedFileUrl = Resources.getResource(fileName);
    String outputFilename = workDir + File.separator + fileName;
    FileOutputStream myOut = new FileOutputStream(outputFilename);

    // Replace stdout and run the command
    System.setOut(new PrintStream(myOut));
    FileDump.main(new String[] { "-hiveconf", "conf1=val1", testFilePath.toString() });
    System.out.flush();
    System.setOut(origOut);

    checkOutput(expectedFileUrl.getPath(), outputFilename);
  }

  @Test
  public void testDump() throws Exception {
    ObjectInspector inspector;
    synchronized (TestOrcFile.class) {
      inspector = ObjectInspectorFactory.getReflectionObjectInspector(MyRecord.class,
          ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    ReaderWriterProfiler.setProfilerOptions(conf);
    Writer writer = new WriterImpl(fs, testFilePath, conf, inspector, 100000,
        CompressionKind.SNAPPY, 10000, 10000, new MemoryManager(conf));
    Random r1 = new Random(1);
    String[] words = new String[] { "It", "was", "the", "best", "of", "times,", "it", "was",
        "the", "worst", "of", "times,", "it", "was", "the", "age", "of", "wisdom,", "it",
        "was", "the", "age", "of", "foolishness,", "it", "was", "the", "epoch", "of",
        "belief,", "it", "was", "the", "epoch", "of", "incredulity,", "it", "was", "the",
        "season", "of", "Light,", "it", "was", "the", "season", "of", "Darkness,", "it",
        "was", "the", "spring", "of", "hope,", "it", "was", "the", "winter", "of",
        "despair,", "we", "had", "everything", "before", "us,", "we", "had", "nothing",
        "before", "us,", "we", "were", "all", "going", "direct", "to", "Heaven,", "we",
        "were", "all", "going", "direct", "the", "other", "way" };
    for (int i = 0; i < 21000; ++i) {
      int curNum = r1.nextInt(words.length);
      writer.addRow(new MyRecord(curNum, (long) curNum + (long) Integer.MAX_VALUE,
          words[curNum]));
    }
    writer.close();
    checkOutput("orc-file-dump.out");
  }
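  // Note on the addRow loop above: the casts to long before adding
  // Integer.MAX_VALUE matter, because plain int arithmetic would overflow first.
  // A quick worked example:
  //
  //   int curNum = 5;
  //   long bad  = curNum + Integer.MAX_VALUE;                // wraps to -2147483644
  //   long good = (long) curNum + (long) Integer.MAX_VALUE;  // 2147483652
  //
  // so the long column always holds values outside the int range.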
  private void testDictionary(Configuration conf, String expectedOutputFilename)
      throws Exception {
    ObjectInspector inspector;
    synchronized (TestOrcFile.class) {
      inspector = ObjectInspectorFactory.getReflectionObjectInspector(MyRecord.class,
          ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    // Turn off the approximate-entropy heuristic, which can otherwise turn off
    // dictionary encoding
    OrcConf.setFloatVar(conf, OrcConf.ConfVars.HIVE_ORC_ENTROPY_KEY_STRING_SIZE_THRESHOLD, -1);
    ReaderWriterProfiler.setProfilerOptions(conf);
    Writer writer = new WriterImpl(fs, testFilePath, conf, inspector, 100000,
        CompressionKind.SNAPPY, 10000, 10000, new MemoryManager(conf));
    Random r1 = new Random(1);
    String[] words = new String[] { "It", "was", "the", "best", "of", "times,", "it", "was",
        "the", "worst", "of", "times,", "it", "was", "the", "age", "of", "wisdom,", "it",
        "was", "the", "age", "of", "foolishness,", "it", "was", "the", "epoch", "of",
        "belief,", "it", "was", "the", "epoch", "of", "incredulity,", "it", "was", "the",
        "season", "of", "Light,", "it", "was", "the", "season", "of", "Darkness,", "it",
        "was", "the", "spring", "of", "hope,", "it", "was", "the", "winter", "of",
        "despair,", "we", "had", "everything", "before", "us,", "we", "had", "nothing",
        "before", "us,", "we", "were", "all", "going", "direct", "to", "Heaven,", "we",
        "were", "all", "going", "direct", "the", "other", "way" };
    int nextInt = 0;
    int nextNumIdx = 0;
    int numRows = 21000;
    List<Integer> intVals = new ArrayList<Integer>(words.length);
    List<Long> longVals = new ArrayList<Long>(words.length);
    for (int i = 0; i < numRows; i++) {
      intVals.add(i);
      longVals.add((long) i + (long) Integer.MAX_VALUE);
    }
    Collections.shuffle(intVals, r1);
    Collections.shuffle(longVals, r1);
    for (int i = 0; i < numRows; ++i) {
      // Write each string out twice in a row; this guarantees the fraction of rows
      // with distinct strings is 0.5
      if (i % 2 == 0) {
        nextInt = r1.nextInt(words.length);
        nextNumIdx = i;
        // Append the value of i to the word; this guarantees that when an index or
        // word repeats, the actual string is still unique.
        words[nextInt] += "-" + i;
      }
      writer.addRow(new MyRecord(intVals.get(nextNumIdx), longVals.get(nextNumIdx),
          words[nextInt]));
    }
    writer.close();
    checkOutput(expectedOutputFilename);
  }

  // Tests that if the number of distinct characters in distinct strings is less than
  // the configured threshold, dictionary encoding is turned off. If dictionary
  // encoding is turned off, the length of the dictionary stream for the column will
  // be 0 in the ORC file dump.
  @Test
  public void testEntropyThreshold() throws Exception {
    ObjectInspector inspector;
    synchronized (TestOrcFile.class) {
      inspector = ObjectInspectorFactory.getReflectionObjectInspector(MyRecord.class,
          ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    OrcConf.setFloatVar(conf, OrcConf.ConfVars.HIVE_ORC_ENTROPY_KEY_STRING_SIZE_THRESHOLD, 1);
    OrcConf.setIntVar(conf, OrcConf.ConfVars.HIVE_ORC_ENTROPY_STRING_THRESHOLD, 11);
    // Make sure having too few distinct values won't turn off dictionary encoding
    OrcConf.setFloatVar(conf, OrcConf.ConfVars.HIVE_ORC_DICTIONARY_STRING_KEY_SIZE_THRESHOLD, 1);
    ReaderWriterProfiler.setProfilerOptions(conf);
    Writer writer = new WriterImpl(fs, testFilePath, conf, inspector, 100000,
        CompressionKind.SNAPPY, 10000, 10000, new MemoryManager(conf));
    Random r1 = new Random(1);
    for (int i = 0; i < 21000; ++i) {
      writer.addRow(new MyRecord(r1.nextInt(), r1.nextLong(), Integer.toString(r1.nextInt())));
    }
    writer.close();
    checkOutput("orc-file-dump-entropy-threshold.out");
  }

  // Tests that if the fraction of rows that have distinct strings is greater than the
  // configured threshold, dictionary encoding is turned off. If dictionary encoding
  // is turned off, the length of the dictionary stream for the column will be 0 in
  // the ORC file dump.
  @Test
  public void testDictionaryThreshold() throws Exception {
    OrcConf.setFloatVar(conf, OrcConf.ConfVars.HIVE_ORC_DICTIONARY_STRING_KEY_SIZE_THRESHOLD, 0.49f);
    OrcConf.setFloatVar(conf, OrcConf.ConfVars.HIVE_ORC_DICTIONARY_NUMERIC_KEY_SIZE_THRESHOLD, 0.49f);
    testDictionary(conf, "orc-file-dump-dictionary-threshold.out");
  }

  @Test
  public void testUnsortedDictionary() throws Exception {
    OrcConf.setFloatVar(conf, OrcConf.ConfVars.HIVE_ORC_DICTIONARY_STRING_KEY_SIZE_THRESHOLD, 0.49f);
    OrcConf.setFloatVar(conf, OrcConf.ConfVars.HIVE_ORC_DICTIONARY_NUMERIC_KEY_SIZE_THRESHOLD, 0.49f);
    OrcConf.setBoolVar(conf, OrcConf.ConfVars.HIVE_ORC_DICTIONARY_SORT_KEYS, false);
    testDictionary(conf, "orc-file-dump-dictionary-threshold-unsorted.out");
  }

  @Test
  public void testUnsortedDictionary2() throws Exception {
    OrcConf.setFloatVar(conf, OrcConf.ConfVars.HIVE_ORC_DICTIONARY_STRING_KEY_SIZE_THRESHOLD, 0.51f);
    OrcConf.setFloatVar(conf, OrcConf.ConfVars.HIVE_ORC_DICTIONARY_NUMERIC_KEY_SIZE_THRESHOLD, 0.51f);
    OrcConf.setBoolVar(conf, OrcConf.ConfVars.HIVE_ORC_DICTIONARY_SORT_KEYS, false);
    testDictionary(conf, "orc-file-dump-dictionary-threshold-unsorted2.out");
  }
}
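// To run just this suite, something like the following should work, assuming a
// standard Maven Surefire setup (the build layout is an assumption, not taken
// from this file):
//
//   mvn test -Dtest=TestFileDump
//
// The expected-output files (orc-file-dump*.out) must be on the test classpath,
// since checkOutput loads them via Resources.getResource.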