org.apache.orc.mapred.TestOrcFileEvolution.java Source code

Introduction

Here is the source code for org.apache.orc.mapred.TestOrcFileEvolution.java, a JUnit test that checks how ORC reads a file back when the reader schema differs from the schema the file was written with.
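
The whole test reduces to one pattern: write rows under a writer schema, then read the same file back under a different reader schema and check what the reader hands back. Below is a minimal, self-contained sketch of that pattern outside the test harness; the class name, file location, and row values are illustrative only, and the class is placed in the org.apache.orc.mapred package so the mapred record reader and writer used by the test are accessible.

package org.apache.orc.mapred;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.Reporter;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;

import java.io.IOException;

public class SchemaEvolutionExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.getLocal(conf);
        Path path = new Path("target/evolution-example.orc"); // illustrative location
        fs.delete(path, false);

        // Write one row with the original schema struct<a:int,b:string>.
        TypeDescription writeSchema = TypeDescription.fromString("struct<a:int,b:string>");
        Writer writer = OrcFile.createWriter(path,
                OrcFile.writerOptions(conf).setSchema(writeSchema));
        OrcMapredRecordWriter<OrcStruct> recordWriter = new OrcMapredRecordWriter<>(writer);
        OrcStruct row = new OrcStruct(writeSchema);
        row.setFieldValue(0, new IntWritable(1));
        row.setFieldValue(1, new Text("foo"));
        recordWriter.write(NullWritable.get(), row);
        recordWriter.close(Reporter.NULL);

        // Read the same file back with an evolved schema that appends column c.
        TypeDescription readSchema =
                TypeDescription.fromString("struct<a:int,b:string,c:double>");
        Reader reader = OrcFile.createReader(path,
                OrcFile.readerOptions(conf).filesystem(fs));
        OrcMapredRecordReader<OrcStruct> recordReader = new OrcMapredRecordReader<>(reader,
                reader.options().schema(readSchema));
        OrcStruct result = recordReader.createValue();
        recordReader.next(recordReader.createKey(), result);
        // The row now has three fields; the added column c is read back as null.
        System.out.println(result);
    }
}

The checkEvolution() helper in the test below follows exactly this sequence, additionally passing tolerateMissingSchema for the pre-HIVE-4243 cases and comparing the result against an expected OrcStruct.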

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * <p>
 * http://www.apache.org/licenses/LICENSE-2.0
 * <p>
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.orc.mapred;

import com.google.common.base.Preconditions;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.Reporter;
import org.apache.orc.*;
import org.apache.orc.TypeDescription.Category;
import org.apache.orc.impl.SchemaEvolution;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.ExpectedException;
import org.junit.rules.TestName;

import java.io.File;
import java.io.IOException;
import java.util.*;

import static org.junit.Assert.assertEquals;
import static org.mockito.Mockito.mock;

/**
 * Test the behavior of ORC's schema evolution: adding, removing, and reordering
 * struct fields (including fields nested in structs, map keys and values, and
 * list elements), and positional matching of pre-HIVE-4243 files whose columns
 * are named _col0, _col1, ...
 */
public class TestOrcFileEvolution {

    // These utility methods exist only to make writing tests easier. The values
    // created here are not fed directly to the ORC writers; they are converted
    // into Writables within checkEvolution().
    private List<Object> struct(Object... fields) {
        return list(fields);
    }

    private List<Object> list(Object... elements) {
        return Arrays.asList(elements);
    }

    private Map<Object, Object> map(Object... kvs) {
        if (kvs.length % 2 != 0) {
            throw new IllegalArgumentException("Map must be provided an even number of arguments");
        }

        Map<Object, Object> result = new HashMap<>();
        for (int i = 0; i < kvs.length; i += 2) {
            result.put(kvs[i], kvs[i + 1]);
        }
        return result;
    }

    Path workDir = new Path(
            System.getProperty("test.tmp.dir", "target" + File.separator + "test" + File.separator + "tmp"));

    Configuration conf;
    FileSystem fs;
    Path testFilePath;

    @Rule
    public TestName testCaseName = new TestName();

    @Rule
    public ExpectedException thrown = ExpectedException.none();

    @Before
    public void openFileSystem() throws Exception {
        conf = new Configuration();
        fs = FileSystem.getLocal(conf);
        testFilePath = new Path(workDir, "TestOrcFile." + testCaseName.getMethodName() + ".orc");
        fs.delete(testFilePath, false);
    }

    @Test
    public void testAddFieldToEnd() {
        checkEvolution("struct<a:int,b:string>", "struct<a:int,b:string,c:double>", struct(1, "foo"),
                struct(1, "foo", null));
    }

    @Test
    public void testAddFieldBeforeEnd() {
        checkEvolution("struct<a:int,b:string>", "struct<a:int,c:double,b:string>", struct(1, "foo"),
                struct(1, null, "foo"));
    }

    @Test
    public void testRemoveLastField() {
        checkEvolution("struct<a:int,b:string,c:double>", "struct<a:int,b:string>", struct(1, "foo", 3.14),
                struct(1, "foo"));
    }

    @Test
    public void testRemoveFieldBeforeEnd() {
        checkEvolution("struct<a:int,b:string,c:double>", "struct<a:int,c:double>", struct(1, "foo", 3.14),
                struct(1, 3.14));
    }

    @Test
    public void testRemoveAndAddField() {
        checkEvolution("struct<a:int,b:string>", "struct<a:int,c:double>", struct(1, "foo"), struct(1, null));
    }

    @Test
    public void testReorderFields() {
        checkEvolution("struct<a:int,b:string>", "struct<b:string,a:int>", struct(1, "foo"), struct("foo", 1));
    }

    @Test
    public void testAddFieldEndOfStruct() {
        checkEvolution("struct<a:struct<b:int>,c:string>", "struct<a:struct<b:int,d:double>,c:string>",
                struct(struct(2), "foo"), struct(struct(2, null), "foo"));
    }

    @Test
    public void testAddFieldBeforeEndOfStruct() {
        checkEvolution("struct<a:struct<b:int>,c:string>", "struct<a:struct<d:double,b:int>,c:string>",
                struct(struct(2), "foo"), struct(struct(null, 2), "foo"));
    }

    @Test
    public void testAddSimilarField() {
        checkEvolution("struct<a:struct<b:int>>", "struct<a:struct<b:int>,c:struct<b:int>>", struct(struct(2)),
                struct(struct(2), null));
    }

    @Test
    public void testConvergentEvolution() {
        checkEvolution("struct<a:struct<a:int,b:string>,c:struct<a:int>>",
                "struct<a:struct<a:int,b:string>,c:struct<a:int,b:string>>", struct(struct(2, "foo"), struct(3)),
                struct(struct(2, "foo"), struct(3, null)));
    }

    @Test
    public void testMapKeyEvolution() {
        checkEvolution("struct<a:map<struct<a:int>,int>>", "struct<a:map<struct<a:int,b:string>,int>>",
                struct(map(struct(1), 2)), struct(map(struct(1, null), 2)));
    }

    @Test
    public void testMapValueEvolution() {
        checkEvolution("struct<a:map<int,struct<a:int>>>", "struct<a:map<int,struct<a:int,b:string>>>",
                struct(map(2, struct(1))), struct(map(2, struct(1, null))));
    }

    @Test
    public void testListEvolution() {
        checkEvolution("struct<a:array<struct<b:int>>>", "struct<a:array<struct<b:int,c:string>>>",
                struct(list(struct(1), struct(2))), struct(list(struct(1, null), struct(2, null))));
    }

    @Test
    public void testPreHive4243CheckEqual() {
        // Expect success on equal schemas. The trailing null in the expected row is
        // ignored: assembleStruct only consumes as many values as the schema has fields.
        checkEvolution("struct<_col0:int,_col1:string>", "struct<_col0:int,_col1:string>", struct(1, "foo"),
                struct(1, "foo", null), false);
    }

    @Test
    public void testPreHive4243Check() {
        // Expect exception on strict compatibility check
        thrown.expectMessage("HIVE-4243");
        checkEvolution("struct<_col0:int,_col1:string>", "struct<_col0:int,_col1:string,_col2:double>",
                struct(1, "foo"), struct(1, "foo", null), false);
    }

    @Test
    public void testPreHive4243AddColumn() {
        checkEvolution("struct<_col0:int,_col1:string>", "struct<_col0:int,_col1:string,_col2:double>",
                struct(1, "foo"), struct(1, "foo", null), true);
    }

    @Test
    public void testPreHive4243AddColumnMiddle() {
        // Expect exception on type mismatch
        thrown.expect(SchemaEvolution.IllegalEvolutionException.class);
        checkEvolution("struct<_col0:int,_col1:double>", "struct<_col0:int,_col1:date,_col2:double>",
                struct(1, 1.0), null, true);
    }

    @Test
    public void testPreHive4243AddColumnWithFix() {
        checkEvolution("struct<_col0:int,_col1:string>", "struct<a:int,b:string,c:double>", struct(1, "foo"),
                struct(1, "foo", null), true);
    }

    @Test
    public void testPreHive4243AddColumnMiddleWithFix() {
        // Expect exception on type mismatch
        thrown.expect(SchemaEvolution.IllegalEvolutionException.class);
        checkEvolution("struct<_col0:int,_col1:double>", "struct<a:int,b:date,c:double>", struct(1, 1.0), null,
                true);
    }

    private void checkEvolution(String writerType, String readerType, Object inputRow, Object expectedOutput) {
        checkEvolution(writerType, readerType, inputRow, expectedOutput,
                (boolean) OrcConf.TOLERATE_MISSING_SCHEMA.getDefaultValue());
    }

    /**
     * Writes a single row of type writerType, reads the file back with the
     * readerType schema, and asserts that the row that comes back equals
     * expectedOutput. When tolerateSchema is true, files whose columns carry the
     * pre-HIVE-4243 _colN names are matched to the reader schema by position.
     */
    private void checkEvolution(String writerType, String readerType, Object inputRow, Object expectedOutput,
            boolean tolerateSchema) {
        TypeDescription readTypeDescr = TypeDescription.fromString(readerType);
        TypeDescription writerTypeDescr = TypeDescription.fromString(writerType);

        OrcStruct inputStruct = assembleStruct(writerTypeDescr, inputRow);
        OrcStruct expectedStruct = assembleStruct(readTypeDescr, expectedOutput);
        try {
            Writer writer = OrcFile.createWriter(testFilePath,
                    OrcFile.writerOptions(conf).setSchema(writerTypeDescr).stripeSize(100000).bufferSize(10000)
                            .version(OrcFile.Version.CURRENT));

            OrcMapredRecordWriter<OrcStruct> recordWriter = new OrcMapredRecordWriter<OrcStruct>(writer);
            recordWriter.write(NullWritable.get(), inputStruct);
            recordWriter.close(mock(Reporter.class));
            Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
            OrcMapredRecordReader<OrcStruct> recordReader = new OrcMapredRecordReader<>(reader,
                    reader.options().schema(readTypeDescr).tolerateMissingSchema(tolerateSchema));
            OrcStruct result = recordReader.createValue();
            recordReader.next(recordReader.createKey(), result);
            assertEquals(expectedStruct, result);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    private OrcStruct assembleStruct(TypeDescription type, Object row) {
        Preconditions.checkArgument(type.getCategory() == Category.STRUCT, "Top level type must be STRUCT");

        return (OrcStruct) assembleRecord(type, row);
    }

    /**
     * Recursively converts a plain Java value (a List for struct and list types,
     * a Map for map types, boxed primitives and String for leaf types) into the
     * corresponding Writable tree.
     */
    private WritableComparable assembleRecord(TypeDescription type, Object row) {
        if (row == null) {
            return null;
        }
        switch (type.getCategory()) {
        case STRUCT:
            OrcStruct structResult = new OrcStruct(type);
            for (int i = 0; i < structResult.getNumFields(); i++) {
                List<TypeDescription> childTypes = type.getChildren();
                structResult.setFieldValue(i, assembleRecord(childTypes.get(i), ((List<Object>) row).get(i)));
            }
            return structResult;
        case LIST:
            OrcList<WritableComparable> listResult = new OrcList<>(type);
            TypeDescription elemType = type.getChildren().get(0);
            List<Object> elems = (List<Object>) row;
            for (int i = 0; i < elems.size(); i++) {
                listResult.add(assembleRecord(elemType, elems.get(i)));
            }
            return listResult;
        case MAP:
            OrcMap<WritableComparable, WritableComparable> mapResult = new OrcMap<>(type);
            TypeDescription keyType = type.getChildren().get(0);
            TypeDescription valueType = type.getChildren().get(1);
            for (Map.Entry<Object, Object> entry : ((Map<Object, Object>) row).entrySet()) {
                mapResult.put(assembleRecord(keyType, entry.getKey()), assembleRecord(valueType, entry.getValue()));
            }
            return mapResult;
        case INT:
            return new IntWritable((Integer) row);
        case DOUBLE:
            return new DoubleWritable((Double) row);
        case STRING:
            return new Text((String) row);
        default:
            throw new UnsupportedOperationException(
                    String.format("Not expecting to have a field of type %s in unit tests", type.getCategory()));
        }
    }
}