org.springframework.data.hadoop.serialization.HdfsWriterTest.java Source code

Here is the source code for org.springframework.data.hadoop.serialization.HdfsWriterTest.java, an integration test for Spring for Apache Hadoop's SerializationFormat API covering simple and compressed writes of files and objects to HDFS. A short usage sketch distilled from the test follows the listing.

Source

/*
 * Copyright 2013 the original author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.springframework.data.hadoop.serialization;

import static org.apache.hadoop.io.IOUtils.closeStream;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertSame;
import static org.junit.Assert.assertTrue;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.util.ReflectionUtils;
import org.junit.Before;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.core.io.Resource;
import org.springframework.data.hadoop.fs.HdfsResourceLoader;
import org.springframework.data.hadoop.serialization.SerializationFormatOperations.SerializationWriterCallback;
import org.springframework.expression.AccessException;
import org.springframework.test.context.ContextConfiguration;
import org.springframework.test.context.junit4.SpringJUnit4ClassRunner;
import org.springframework.util.ClassUtils;

/**
 * Integration test for {@link SerializationFormat} covering simple and compressed writes of files and objects to HDFS.
 * 
 * @author Alex Savov
 */
@RunWith(SpringJUnit4ClassRunner.class)
@ContextConfiguration
public class HdfsWriterTest {

    @Autowired
    private HdfsResourceLoader hdfsResourceLoader;

    @Autowired
    private Configuration configuration;

    /* The source file that is written to HDFS, with and without compression. */
    @Value("classpath:/data/apache-short.txt")
    private Resource sourceResource;

    /* All output files are written to this HDFS directory. */
    @Value("${hdfs.writer.output.dir}")
    private String hdfsOutputDir;

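    /* Factory used by SerializationFormatTemplate in hdfsWrite() below; created in setUp(). */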
    private SerializationWriterObjectFactory sfObjectFactory;

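    /* The serialization formats under test; all are initialized in setUp(). */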
    private ResourceSerializationFormat RESOURCE_FORMAT;

    private MultiResourceSerializationFormat MULTI_RESOURCE_FORMAT;
    private MultiResourceSerializationFormat2 MULTI_RESOURCE_FORMAT2;

    private AvroFormat<PojoSerializable> AVRO;

    private SequenceFileFormat<PojoSerializable> SEQUENCE_FILE_JAVA;

    private SequenceFileFormat<PojoWritable> SEQUENCE_FILE_WRITABLE;

    private AvroSequenceFileFormat<PojoSerializable> SEQUENCE_FILE_AVRO;

    @Before
    public void setUp() throws Exception {

        RESOURCE_FORMAT = new ResourceSerializationFormat();
        RESOURCE_FORMAT.setConfiguration(configuration);
        RESOURCE_FORMAT.afterPropertiesSet();

        MULTI_RESOURCE_FORMAT = new MultiResourceSerializationFormat();
        MULTI_RESOURCE_FORMAT.setHdfsResourceLoader(hdfsResourceLoader);

        MULTI_RESOURCE_FORMAT2 = new MultiResourceSerializationFormat2();
        MULTI_RESOURCE_FORMAT2.setHdfsResourceLoader(hdfsResourceLoader);
        MULTI_RESOURCE_FORMAT2.afterPropertiesSet();

        // TODO: If we decide to clone Configuration for every SerializationFormat then this code is required.
        /*
         * Collection<String> serializations = hdfs.configuration.getStringCollection(HADOOP_IO_SERIALIZATIONS);
         *
         * Class<?>[] sClasses = { WritableSerialization.class, JavaSerialization.class, AvroSerialization.class };
         *
         * for (Class<?> serializationClass : sClasses) {
         *     if (!serializations.contains(serializationClass.getName())) {
         *         serializations.add(serializationClass.getName());
         *     }
         * }
         *
         * hdfs.configuration.setStrings(HADOOP_IO_SERIALIZATIONS,
         *         serializations.toArray(new String[serializations.size()]));
         */

        // NOTE: So far we share the same Configuration between SerializationFormats.

        sfObjectFactory = new SerializationWriterObjectFactory(hdfsResourceLoader);

        SEQUENCE_FILE_WRITABLE = new SequenceFileFormat<PojoWritable>(PojoWritable.class);
        SEQUENCE_FILE_WRITABLE.setConfiguration(configuration);
        SEQUENCE_FILE_WRITABLE.afterPropertiesSet();

        SEQUENCE_FILE_JAVA = new SequenceFileFormat<PojoSerializable>(PojoSerializable.class);
        SEQUENCE_FILE_JAVA.setConfiguration(configuration);
        SEQUENCE_FILE_JAVA.afterPropertiesSet();

        SEQUENCE_FILE_AVRO = new AvroSequenceFileFormat<PojoSerializable>(PojoSerializable.class);
        SEQUENCE_FILE_AVRO.setConfiguration(configuration);
        SEQUENCE_FILE_AVRO.afterPropertiesSet();

        AVRO = new AvroFormat<PojoSerializable>(PojoSerializable.class);
    }

    /**
     * Test write from a source file to an HDFS destination.
     */
    @Test
    public void testWriteOfResource() throws Exception {

        testResourceWrite(RESOURCE_FORMAT, 1, /* no compression */ null, /* doesn't matter */ false);
    }

    /**
     * Test write from multiple source files (in our case copies of the same one) to a single HDFS destination.
     */
    @Test
    public void testWriteOfMultipleResources() throws Exception {
        testResourceWrite(RESOURCE_FORMAT, 5, /* no compression */ null, /* doesn't matter */ false);
    }

    /**
     * Test write from multiple source files (in our case copies of the same one) to a single Avro container.
     */
    @SuppressWarnings("unchecked")
    @Test
    public void testWriteOfMultipleResourcesToAvroContainer() throws Exception {

        for (SerializationFormatSupport<Resource> resourceFormat : new SerializationFormatSupport[] {
                MULTI_RESOURCE_FORMAT, MULTI_RESOURCE_FORMAT2 }) {

            int numberOfResources = 5;

            String avroContainer = testResourceWrite(resourceFormat, numberOfResources, /* no compression */ null,
                    /* doesn't matter */ false);

            SerializationReader<Resource> reader = resourceFormat.getReader(avroContainer);

            for (Resource resource; (resource = reader.read()) != null; numberOfResources--) {

                InputStream originalIS = sourceResource.getInputStream();
                InputStream readIS = resource.getInputStream();

                assertEquals(sourceResource.getDescription(), resource.getDescription());

                try {
                    assertTrue(IOUtils.contentEquals(originalIS, readIS));
                } finally {
                    IOUtils.closeQuietly(originalIS);
                    IOUtils.closeQuietly(readIS);
                }
            }

            assertEquals(0, numberOfResources);
        }
    }

    /**
     * Test write of a collection of POJOs using Writable serialization.
     */
    @Test
    public void testWriteOfWritableToSeqFile() throws Exception {

        testSerializationWrite(PojoWritable.class, SEQUENCE_FILE_WRITABLE, /* compress */false);
    }

    /**
     * Test compressed write of a collection of POJOs using Writable serialization.
     */
    @Test
    public void testCompressedWriteOfWritableToSeqFile() throws Exception {

        testSerializationWrite(PojoWritable.class, SEQUENCE_FILE_WRITABLE, /* compress */true);
    }

    /**
     * Test write of a collection of POJOs using Java serialization.
     */
    @Test
    public void testWriteOfSerializableToSeqFile() throws Exception {

        testSerializationWrite(PojoSerializable.class, SEQUENCE_FILE_JAVA, /* compress */false);
    }

    /**
     * Test compressed write of a collection of POJOs using Java serialization.
     */
    @Test
    public void testCompressedWriteOfSerializableToSeqFile() throws Exception {

        testSerializationWrite(PojoSerializable.class, SEQUENCE_FILE_JAVA, /* compress */true);
    }

    /**
     * Test write of a collection of POJOs using Avro serialization to a SequenceFile.
     */
    @Test
    public void testWriteOfAvroToSeqFile() throws Exception {

        SEQUENCE_FILE_AVRO.setSerializationKeyProvider(new SerializationKeyProvider() {

            public Void getKey(Object object) {
                return (Void) null;
            }

            public Class<Void> getKeyClass(Class<?> objectClass) {
                return Void.class;
            }
        });

        testSerializationWrite(PojoSerializable.class, SEQUENCE_FILE_AVRO, /* compress */false);
    }

    /**
     * Test compressed write of a collection of POJOs using Avro serialization to a SequenceFile.
     */
    @Test
    public void testCompressedWriteOfAvroToSeqFile() throws Exception {

        testSerializationWrite(PojoSerializable.class, SEQUENCE_FILE_AVRO, /* compress */true);
    }

    /**
     * Test write of a collection of POJOs using Avro serialization to an Avro container file.
     */
    @Test
    public void testWriteOfPojoToAvroFile() throws Exception {

        testSerializationWrite(PojoSerializable.class, AVRO, /* compress */false);
    }

    /**
     * Test compressed write of a collection of POJOs using Avro serialization to an Avro container file.
     */
    @Test
    public void testCompressedWriteOfPojoToAvroFile() throws Exception {

        testSerializationWrite(PojoSerializable.class, AVRO, /* compress */true);
    }

    /**
     * Test compressed write from source file to HDFS destination using a codec alias as configured within Hadoop.
     */
    @Test
    public void testCompressedWriteUsingHadoopCodecAlias() throws Exception {

        // DefaultCodec is configured by Hadoop by default
        final CompressionCodec codec = new CompressionCodecFactory(configuration)
                .getCodecByName(DefaultCodec.class.getSimpleName());

        testResourceWrite(RESOURCE_FORMAT, 1, codec, /* useCodecAlias */true);
    }

    /**
     * Test compressed write from source file to HDFS destination using a codec class name as configured within Hadoop.
     */
    @Test
    public void testCompressedWriteUsingHadoopCodecClassName() throws Exception {

        // GzipCodec is configured by Hadoop by default
        final CompressionCodec codec = new CompressionCodecFactory(configuration)
                .getCodecByName(GzipCodec.class.getSimpleName());

        testResourceWrite(RESOURCE_FORMAT, 1, codec, /* useCodecAlias */false);
    }

    /**
     * Test compressed write from source file to HDFS destination using a user-provided codec loaded from the classpath.
     */
    @Test
    public void testCompressedWriteUsingUserCodecClassName() throws Exception {

        // CustomCompressionCodec is NOT supported by Hadoop, but is provided by the
        // client on the classpath
        final CompressionCodec codec = new CustomCompressionCodec();

        testResourceWrite(RESOURCE_FORMAT, 1, codec, /* useCodecAlias */false);
    }

    /**
     * Test compressed write of source file against ALL codecs supported by Hadoop.
     */
    @Test
    public void testCompressedWriteUsingHadoopCodecs() {
        // Might be re-worked to support parameterized tests.
        // See @Parameterized and Parameterized.Parameters

        hdfsOutputDir += "hadoop-codecs/";

        final StringBuilder exceptions = new StringBuilder();

        // Get a list of all codecs supported by Hadoop
        for (Class<? extends CompressionCodec> codecClass : CompressionCodecFactory
                .getCodecClasses(configuration)) {
            try {
                testResourceWrite(RESOURCE_FORMAT, 1,
                        ReflectionUtils.newInstance(codecClass, configuration), /* useCodecAlias */
                        true);
            } catch (Exception exc) {
                exceptions.append(codecClass.getName() + " not supported. Details: " + exc.getMessage() + "\n");
            }
        }

        assertTrue(exceptions.toString(), exceptions.length() == 0);
    }

    /**
     * Test {@link ReflectiveSerializationKeyProvider}.
     */
    @Test
    public void testReflectiveSerializationKeyProvider() throws AccessException {

        // Test field key
        {
            SerializationKeyProvider keyProvider = new ReflectiveSerializationKeyProvider(PojoSerializable.class,
                    "id");

            assertSame(Integer.class, keyProvider.getKeyClass(PojoSerializable.class));

            PojoSerializable pojo = new PojoSerializable();

            assertEquals((Integer) pojo.id, keyProvider.getKey(pojo));
        }

        // Test getter key
        {
            SerializationKeyProvider keyProvider = new ReflectiveSerializationKeyProvider(PojoSerializable.class,
                    "name");

            assertSame(String.class, keyProvider.getKeyClass(PojoSerializable.class));

            PojoSerializable pojo = new PojoSerializable();

            assertEquals(pojo.getName(), keyProvider.getKey(pojo));
        }
    }

    /**
     * Test core Resource write logic, with optional compression.
     * 
     * @param codec Used ONLY to get codec extension and its class name or alias in a type-safe manner.
     * @param useAlias If <code>true</code> uses <code>codec.getClass().getSimpleName()</code> as a codec alias.
     * Otherwise uses <code>codec.getClass().getName()</code> as a codec class name.
     */
    private String testResourceWrite(SerializationFormatSupport<Resource> resourceFormat, int resourceCopies,
            CompressionCodec codec, boolean useAlias) throws Exception {

        if (codec != null) {
            // configure compression
            resourceFormat
                    .setCompressionAlias(useAlias ? codec.getClass().getSimpleName() : codec.getClass().getName());
        }

        // Calculate the HDFS destination path for the source Resource.
        String destination;
        {
            destination = hdfsOutputDir;

            // add file name
            destination += sourceResource.getFilename();
            // add files count
            destination += "_" + resourceCopies;
            // add serialization format name
            destination += "_" + resourceFormat.getClass().getSimpleName();
        }

        hdfsWrite(resourceFormat, Collections.nCopies(resourceCopies, sourceResource), destination);

        // expected destination on hdfs should have codec extension appended
        assertHdfsFileExists(destination + resourceFormat.getExtension());

        return destination;
    }

    /**
     * Test core write-of-objects logic.
     */
    private <T> void testSerializationWrite(Class<T> objectClass, SerializationFormatSupport<T> serializationFormat,
            boolean compress) throws Exception {

        List<T> objects = createPojoList(objectClass, 5000);

        String destination;
        {
            destination = hdfsOutputDir;

            // add class name
            destination += objectClass.getSimpleName();
            // add objects count
            destination += "_" + objects.size();
            // add serialization format name
            destination += "_" + serializationFormat.getClass().getSimpleName();
            // add compression flag
            destination += (compress ? "_compressed" : "");
            // add serialization format extension
            destination += serializationFormat.getExtension();
        }

        if (compress) {
            // Use Hadoop's default compression via its "deflate" alias, which is also supported by Avro.
            serializationFormat.setCompressionAlias("deflate");
        }

        hdfsWrite(serializationFormat, objects, destination);

        assertHdfsFileExists(destination);

        List<T> readObjects = new ArrayList<T>();
        {
            // The resource loader is required for reading (as opposed to writing).
            serializationFormat.setHdfsResourceLoader(hdfsResourceLoader);

            SerializationReader<T> reader = serializationFormat.getReader(destination);

            for (T readObject = reader.read(); readObject != null; readObject = reader.read()) {
                readObjects.add(readObject);
            }

            closeStream(reader);
        }

        assertEquals(objects, readObjects);
    }

    private <T> void hdfsWrite(SerializationFormat<T> serializationCreator, final Iterable<T> sources,
            String destination) throws Exception {

        // Delegate to core SerializationFormat logic.
        sfObjectFactory.setSerializationFormat(serializationCreator);

        new SerializationFormatTemplate(sfObjectFactory).write(destination, new SerializationWriterCallback<T>() {
            @Override
            public void doInSerializationFormat(SerializationWriter<T> serializationFormat) throws IOException {
                for (T source : sources) {
                    serializationFormat.write(source);
                }
            }
        });
    }

    private void assertHdfsFileExists(String hdfsFile) {
        assertTrue("'" + hdfsFile + "' file is not present on HDFS.",
                hdfsResourceLoader.getResource(hdfsFile).exists());
    }

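    /** Codec deliberately NOT registered with Hadoop; stands in for a user-provided codec loaded from the classpath. */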
    public static class CustomCompressionCodec extends DefaultCodec {

        @Override
        public String getDefaultExtension() {
            return ".cusTom";
        }
    }

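    /** Creates {@code size} instances of {@code objectClass} through its no-arg constructor. */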
    public static <T> List<T> createPojoList(Class<T> objectClass, int size) throws Exception {

        List<T> objects = new ArrayList<T>();

        for (int i = 0; i < size; i++) {
            objects.add(objectClass.newInstance());
        }

        return objects;
    }

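    /** Simple POJO used as the test payload; equality is based on id, name and relations. */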
    public static class PojoSerializable implements Serializable {

        private static final long serialVersionUID = 4225081912489347353L;

        private static int COUNTER = 0;

        protected int id = COUNTER++;

        private String name = "[" + id + "]";

        private String description = "...here goes Pojo's description :)";

        private List<Integer> relations = new ArrayList<Integer>();
        {
            relations.add(id);
        }

        public String getName() {
            return name;
        }

        public void setName(String name) {
            this.name = name;
        }

        public String getDescription() {
            return description;
        }

        public void setDescription(String description) {
            this.description = description;
        }

        public List<Integer> getRelations() {
            return relations;
        }

        public void setRelations(List<Integer> relations) {
            this.relations = relations;
        }

        @Override
        public String toString() {
            return getClass().getSimpleName() + ":" + name;
        }

        @Override
        public int hashCode() {
            final int prime = 31;
            int result = 1;
            result = prime * result + id;
            result = prime * result + ((name == null) ? 0 : name.hashCode());
            result = prime * result + ((relations == null) ? 0 : relations.hashCode());
            return result;
        }

        @Override
        public boolean equals(Object obj) {
            if (this == obj)
                return true;
            if (obj == null)
                return false;
            if (getClass() != obj.getClass())
                return false;
            PojoSerializable other = (PojoSerializable) obj;
            if (id != other.id)
                return false;
            if (name == null) {
                if (other.name != null)
                    return false;
            } else if (!name.equals(other.name))
                return false;
            if (relations == null) {
                if (other.relations != null)
                    return false;
            } else if (!relations.equals(other.relations))
                return false;
            return true;
        }
    }

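    /** The same POJO, additionally implementing Hadoop's {@link Writable} contract. */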
    public static class PojoWritable extends PojoSerializable implements Writable {

        private static final long serialVersionUID = -1196188141912933846L;

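        /* Writes the fields in a fixed order; readFields(DataInput) must mirror this order exactly. */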
        public void write(DataOutput out) throws IOException {
            out.writeInt(id);
            out.writeUTF(getName());
            out.writeUTF(getDescription());

            {
                out.writeUTF(getRelations().getClass().getName());
                out.writeInt(getRelations().size());

                for (Integer relation : getRelations()) {
                    out.writeInt(relation);
                }
            }
        }

        public void readFields(DataInput in) throws IOException {
            id = in.readInt();
            setName(in.readUTF());
            setDescription(in.readUTF());

            try {
                String listClassName = in.readUTF();

                Class<?> listClass = ClassUtils.resolveClassName(listClassName, getClass().getClassLoader());

                @SuppressWarnings("unchecked")
                List<Integer> relations = (List<Integer>) listClass.newInstance();

                for (int i = in.readInt(); i > 0; i--) {
                    relations.add(in.readInt());
                }

                setRelations(relations);

            } catch (Exception e) {
                throw new RuntimeException(e.getMessage(), e);
            }
        }
    }

}
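
For reference, here is how the write-then-read flow exercised above might be used outside of the test. This is a minimal sketch distilled from the hdfsWrite(..) and testSerializationWrite(..) helpers, not canonical API usage: the destination path is hypothetical, PojoSerializable is the test's nested POJO, and the Configuration and HdfsResourceLoader are assumed to come from a Spring context, as with the test's @Autowired fields. Imports match those of the listing.

public class HdfsWriterSketch {

    // Writes a few POJOs to a SequenceFile on HDFS and reads them back.
    public static void writeAndReadBack(Configuration configuration,
            HdfsResourceLoader hdfsResourceLoader) throws Exception {

        // Configure the serialization format for the POJO type.
        SequenceFileFormat<PojoSerializable> format =
                new SequenceFileFormat<PojoSerializable>(PojoSerializable.class);
        format.setConfiguration(configuration);
        format.setHdfsResourceLoader(hdfsResourceLoader); // required for reading
        format.afterPropertiesSet();

        SerializationWriterObjectFactory factory =
                new SerializationWriterObjectFactory(hdfsResourceLoader);
        factory.setSerializationFormat(format);

        // Hypothetical destination; the extension is included up front,
        // just as testSerializationWrite(..) builds its destination.
        final String destination = "/tmp/pojos" + format.getExtension();

        // Write through the template, mirroring the test's hdfsWrite(..) helper.
        new SerializationFormatTemplate(factory).write(destination,
                new SerializationWriterCallback<PojoSerializable>() {
                    @Override
                    public void doInSerializationFormat(SerializationWriter<PojoSerializable> writer)
                            throws IOException {
                        for (int i = 0; i < 3; i++) {
                            writer.write(new PojoSerializable());
                        }
                    }
                });

        // Read the objects back through the same format.
        SerializationReader<PojoSerializable> reader = format.getReader(destination);
        try {
            for (PojoSerializable pojo; (pojo = reader.read()) != null;) {
                System.out.println(pojo);
            }
        } finally {
            closeStream(reader); // org.apache.hadoop.io.IOUtils.closeStream
        }
    }
}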