org.apache.orc.mapreduce.TestMrUnit.java Source code

Introduction

Here is the source code for org.apache.orc.mapreduce.TestMrUnit.java. The test drives an ORC-based MapReduce pipeline through MRUnit's MapReduceDriver: the mapper splits each input OrcStruct into an OrcKey/OrcValue pair for the shuffle, the reducer glues the pairs back together, and a custom Serialization implementation lets MRUnit deep-copy ORC records between stages.
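
To make the data flow concrete, here is a small standalone sketch (the class name SchemaFlow is ours, not part of the test) that derives the shuffle schemas from the input schema using the orc-core TypeDescription API:

import org.apache.orc.TypeDescription;

public class SchemaFlow {
    public static void main(String[] args) {
        // Schema of each record handed to the mapper
        TypeDescription input = TypeDescription.fromString(
                "struct<one:struct<x:int,y:int>,two:struct<z:string>>");
        // MyMapper splits it into a shuffle key and a shuffle value
        TypeDescription shuffleKey = input.getChildren().get(0);   // struct<x:int,y:int>
        TypeDescription shuffleValue = input.getChildren().get(1); // struct<z:string>
        // MyReducer glues them back together under new field names
        TypeDescription output = TypeDescription.fromString(
                "struct<first:struct<x:int,y:int>,second:struct<z:string>>");
        System.out.println(shuffleKey + " + " + shuffleValue + " -> " + output);
    }
}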

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.orc.mapreduce;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.serializer.Deserializer;
import org.apache.hadoop.io.serializer.Serialization;
import org.apache.hadoop.io.serializer.Serializer;
import org.apache.hadoop.io.serializer.WritableSerialization;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mrunit.mapreduce.MapReduceDriver;
import org.apache.orc.OrcConf;
import org.apache.orc.TypeDescription;
import org.apache.orc.mapred.OrcKey;
import org.apache.orc.mapred.OrcStruct;
import org.apache.orc.mapred.OrcValue;
import org.junit.Test;

import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

public class TestMrUnit {
    JobConf conf = new JobConf();

    /**
     * Split the input struct into its two parts.
     */
    public static class MyMapper extends Mapper<NullWritable, OrcStruct, OrcKey, OrcValue> {
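        // Reused across map() calls; OrcKey and OrcValue carry no schema of
        // their own and rely on the shuffle schemas registered in the job
        // configuration (see testMapred below).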
        private OrcKey keyWrapper = new OrcKey();
        private OrcValue valueWrapper = new OrcValue();

        @Override
        protected void map(NullWritable key, OrcStruct value, Context context)
                throws IOException, InterruptedException {
            keyWrapper.key = value.getFieldValue(0);
            valueWrapper.value = value.getFieldValue(1);
            context.write(keyWrapper, valueWrapper);
        }
    }

    /**
     * Glue the key and values back together.
     */
    public static class MyReducer extends Reducer<OrcKey, OrcValue, NullWritable, OrcStruct> {
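        // Reused output record, rebuilt from the shuffled key and each
        // shuffled value in turn.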
        private OrcStruct output = new OrcStruct(
                TypeDescription.fromString("struct<first:struct<x:int,y:int>,second:struct<z:string>>"));
        private final NullWritable nada = NullWritable.get();

        @Override
        protected void reduce(OrcKey key, Iterable<OrcValue> values, Context context)
                throws IOException, InterruptedException {
            output.setFieldValue(0, key.key);
            for (OrcValue value : values) {
                output.setFieldValue(1, value.value);
                context.write(nada, output);
            }
        }
    }

    /**
     * This class is intended to support MRUnit's object copying for input and
     * output objects.
     *
     * Real mapreduce contexts should NEVER use this class.
     *
     * The type string is serialized before each value.
     */
    public static class OrcStructSerialization implements Serialization<OrcStruct> {

        @Override
        public boolean accept(Class<?> cls) {
            return OrcStruct.class.isAssignableFrom(cls);
        }

        @Override
        public Serializer<OrcStruct> getSerializer(Class<OrcStruct> aClass) {
            return new Serializer<OrcStruct>() {
                DataOutputStream dataOut;

                @Override
                public void open(OutputStream out) {
                    if (out instanceof DataOutputStream) {
                        dataOut = (DataOutputStream) out;
                    } else {
                        dataOut = new DataOutputStream(out);
                    }
                }

                @Override
                public void serialize(OrcStruct w) throws IOException {
                    // Write the schema string ahead of the record so that
                    // deserialize() can rebuild the struct without context.
                    Text.writeString(dataOut, w.getSchema().toString());
                    w.write(dataOut);
                }

                @Override
                public void close() throws IOException {
                    dataOut.close();
                }
            };
        }

        @Override
        public Deserializer<OrcStruct> getDeserializer(Class<OrcStruct> aClass) {
            return new Deserializer<OrcStruct>() {
                DataInputStream input;

                @Override
                public void open(InputStream inputStream) throws IOException {
                    if (inputStream instanceof DataInputStream) {
                        input = (DataInputStream) inputStream;
                    } else {
                        input = new DataInputStream(inputStream);
                    }
                }

                @Override
                public OrcStruct deserialize(OrcStruct orcStruct) throws IOException {
                    // Ignore the reusable object: read back the schema string
                    // and build a fresh struct of exactly that type.
                    String typeStr = Text.readString(input);
                    OrcStruct result = new OrcStruct(TypeDescription.fromString(typeStr));
                    result.readFields(input);
                    return result;
                }

                @Override
                public void close() throws IOException {
                    // PASS
                }
            };
        }
    }
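    // MRUnit deep-copies each input and output by round-tripping it through
    // the serializations registered under "io.serializations"; the class
    // above exists so that copy works for OrcStruct, whose Writable form
    // alone does not record its schema.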

    @Test
    public void testMapred() throws IOException {
        conf.set("io.serializations",
                OrcStructSerialization.class.getName() + "," + WritableSerialization.class.getName());
        OrcConf.MAPRED_SHUFFLE_KEY_SCHEMA.setString(conf, "struct<x:int,y:int>");
        OrcConf.MAPRED_SHUFFLE_VALUE_SCHEMA.setString(conf, "struct<z:string>");
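        // OrcStructSerialization is listed ahead of WritableSerialization so
        // MRUnit can copy OrcStructs, while OrcKey/OrcValue (plain Writables)
        // keep the standard path. The shuffle schemas tell OrcKey/OrcValue
        // which record type to instantiate when deserializing.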
        MyMapper mapper = new MyMapper();
        MyReducer reducer = new MyReducer();
        MapReduceDriver<NullWritable, OrcStruct, OrcKey, OrcValue, NullWritable, OrcStruct> driver = new MapReduceDriver<>(
                mapper, reducer);
        driver.setConfiguration(conf);
        NullWritable nada = NullWritable.get();
        OrcStruct input = (OrcStruct) OrcStruct
                .createValue(TypeDescription.fromString("struct<one:struct<x:int,y:int>,two:struct<z:string>>"));
        IntWritable x = (IntWritable) ((OrcStruct) input.getFieldValue(0)).getFieldValue(0);
        IntWritable y = (IntWritable) ((OrcStruct) input.getFieldValue(0)).getFieldValue(1);
        Text z = (Text) ((OrcStruct) input.getFieldValue(1)).getFieldValue(0);

        // generate the inputs: five key groups (x = 100 down to 96), four rows each
        for (int r = 0; r < 20; ++r) {
            x.set(100 - (r / 4));
            y.set(r * 2);
            z.set(Integer.toHexString(r));
            driver.withInput(nada, input);
        }

        // generate the expected outputs: the shuffle sorts ascending by key,
        // so the groups arrive with x running from 96 up to 100, i.e. g
        // counting down from 4 to 0 (OrcStruct equality compares field
        // values rather than field names, so reusing the input struct as
        // the expected output works)
        for (int g = 4; g >= 0; --g) {
            x.set(100 - g);
            for (int i = 0; i < 4; ++i) {
                int r = g * 4 + i;
                y.set(r * 2);
                z.set(Integer.toHexString(r));
                driver.withOutput(nada, input);
            }
        }
        driver.runTest();
    }
}
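
Notes

The same mapper, reducer and shuffle configuration can drive a real MapReduce job. The sketch below is ours, not part of the ORC project: the class name OrcShuffleJob is hypothetical, it reuses the test's nested classes purely for illustration, and it assumes orc-mapreduce and its Hadoop dependencies are on the classpath. OrcStructSerialization is deliberately absent; as its javadoc warns, it exists only so MRUnit can copy objects.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.orc.OrcConf;
import org.apache.orc.mapred.OrcKey;
import org.apache.orc.mapred.OrcStruct;
import org.apache.orc.mapred.OrcValue;
import org.apache.orc.mapreduce.OrcInputFormat;
import org.apache.orc.mapreduce.OrcOutputFormat;
import org.apache.orc.mapreduce.TestMrUnit;

public class OrcShuffleJob {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Same shuffle schemas as the test; OrcKey and OrcValue instantiate
        // their payload from these settings during the shuffle.
        OrcConf.MAPRED_SHUFFLE_KEY_SCHEMA.setString(conf, "struct<x:int,y:int>");
        OrcConf.MAPRED_SHUFFLE_VALUE_SCHEMA.setString(conf, "struct<z:string>");
        // Schema of the ORC file written by the output format.
        OrcConf.MAPRED_OUTPUT_SCHEMA.setString(conf,
                "struct<first:struct<x:int,y:int>,second:struct<z:string>>");

        Job job = Job.getInstance(conf, "orc shuffle example");
        job.setJarByClass(OrcShuffleJob.class);
        job.setMapperClass(TestMrUnit.MyMapper.class);
        job.setReducerClass(TestMrUnit.MyReducer.class);
        job.setMapOutputKeyClass(OrcKey.class);
        job.setMapOutputValueClass(OrcValue.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(OrcStruct.class);
        job.setInputFormatClass(OrcInputFormat.class);
        job.setOutputFormatClass(OrcOutputFormat.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}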