cascading.tuple.hadoop.SpillableTupleHadoopTest.java Source code

Introduction

Here is the source code for cascading.tuple.hadoop.SpillableTupleHadoopTest.java, a Cascading test that exercises the spill-to-disk behavior of HadoopSpillableTupleList and HadoopSpillableTupleMap, both with and without compression.

Source

/*
 * Copyright (c) 2007-2015 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cascading.tuple.hadoop;

import java.util.HashSet;
import java.util.Iterator;
import java.util.Random;
import java.util.Set;

import cascading.CascadingTestCase;
import cascading.flow.hadoop.HadoopFlowProcess;
import cascading.tuple.Tuple;
import cascading.tuple.collect.SpillableProps;
import cascading.tuple.hadoop.collect.HadoopSpillableTupleList;
import cascading.tuple.hadoop.collect.HadoopSpillableTupleMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.io.serializer.WritableSerialization;
import org.apache.hadoop.util.ReflectionUtils;
import org.junit.Test;

/**
 * Tests that {@link HadoopSpillableTupleList} and {@link HadoopSpillableTupleMap} spill tuples to
 * disk once their in-memory thresholds are exceeded, both with and without a compression codec.
 */
public class SpillableTupleHadoopTest extends CascadingTestCase {
    public SpillableTupleHadoopTest() {
        super();
    }

    @Test
    public void testSpillList() {
        long time = System.currentTimeMillis();
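
        // Arguments are (size, threshold, codec, expectedSpills); with a threshold of 50,
        // the expected spill counts below follow floor((size - 1) / threshold).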

        performListTest(5, 50, null, 0);
        performListTest(49, 50, null, 0);
        performListTest(50, 50, null, 0);
        performListTest(51, 50, null, 1);
        performListTest(499, 50, null, 9);
        performListTest(500, 50, null, 9);
        performListTest(501, 50, null, 10);

        System.out.println("time = " + (System.currentTimeMillis() - time));
    }

    @Test
    public void testSpillListCompressed() {
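        // ReflectionUtils.newInstance passes the Configuration through to the codec,
        // since GzipCodec implements Configurable.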
        GzipCodec codec = ReflectionUtils.newInstance(GzipCodec.class, new Configuration());

        long time = System.currentTimeMillis();

        performListTest(5, 50, codec, 0);
        performListTest(49, 50, codec, 0);
        performListTest(50, 50, codec, 0);
        performListTest(51, 50, codec, 1);
        performListTest(499, 50, codec, 9);
        performListTest(500, 50, codec, 9);
        performListTest(501, 50, codec, 10);

        System.out.println("time = " + (System.currentTimeMillis() - time));
    }

    private void performListTest(int size, int threshold, CompressionCodec codec, int spills) {
        Configuration jobConf = new Configuration();

        jobConf.set("io.serializations",
                TestSerialization.class.getName() + "," + WritableSerialization.class.getName()); // disable/replace WritableSerialization class
        jobConf.set("cascading.serialization.tokens",
                "1000=" + BooleanWritable.class.getName() + ",10001=" + Text.class.getName()); // not using Text, just testing parsing

        HadoopSpillableTupleList list = new HadoopSpillableTupleList(threshold, codec, jobConf);
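        // the list holds at most 'threshold' tuples in memory; further adds spill batches
        // to local disk, compressed when a codec is supplied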

        for (int i = 0; i < size; i++) {
            String aString = "string number " + i;
            double random = Math.random();

            list.add(new Tuple(i, aString, random, new Text(aString), new TestText(aString),
                    new Tuple("inner tuple", new BytesWritable(aString.getBytes()))));
        }

        assertEquals("not equal: list.size();", size, list.size());

        assertEquals("not equal: list.getNumFiles()", spills, list.spillCount());

        int i = -1;
        int count = 0;
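        // iterate across both spilled and in-memory tuples, verifying order and the round-tripped values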
        for (Tuple tuple : list) {
            int value = tuple.getInteger(0);
            assertTrue("wrong diff", value - i == 1);
            assertEquals("wrong value", "string number " + count, tuple.getObject(3).toString());
            assertEquals("wrong value", "string number " + count, tuple.getObject(4).toString());
            assertTrue("wrong type", tuple.getObject(5) instanceof Tuple);

            BytesWritable bytesWritable = (BytesWritable) ((Tuple) tuple.getObject(5)).getObject(1);
            byte[] bytes = bytesWritable.getBytes();
            String actual = new String(bytes, 0, bytesWritable.getLength());

            assertEquals("wrong value", "string number " + count, actual);

            i = value;
            count++;
        }

        assertEquals("not equal: list.size();", size, count);

        Iterator<Tuple> iterator = list.iterator();
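        // a fresh iterator starts over from the first tuple, re-reading any spill files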

        assertEquals("not equal: iterator.next().get(1)", "string number 0", iterator.next().getObject(1));
        assertEquals("not equal: iterator.next().get(1)", "string number 1", iterator.next().getObject(1));
    }

    @Test
    public void testSpillMap() {
        long time = System.currentTimeMillis();

        Configuration jobConf = new Configuration();

        performMapTest(5, 5, 100, 20, jobConf);
        performMapTest(5, 50, 100, 20, jobConf);
        performMapTest(50, 5, 200, 20, jobConf);
        performMapTest(500, 50, 7000, 20, jobConf);

        System.out.println("time = " + (System.currentTimeMillis() - time));
    }

    @Test
    public void testSpillMapCompressed() {
        long time = System.currentTimeMillis();

        Configuration jobConf = new Configuration();

        jobConf.set(SpillableProps.SPILL_CODECS, "org.apache.hadoop.io.compress.GzipCodec");
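        // SpillableProps.SPILL_CODECS takes a comma-separated list of codec class names, tried in order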

        performMapTest(5, 5, 100, 20, jobConf);
        performMapTest(5, 50, 100, 20, jobConf);
        performMapTest(50, 5, 200, 20, jobConf);
        performMapTest(500, 50, 7000, 20, jobConf);

        System.out.println("time = " + (System.currentTimeMillis() - time));
    }

    private void performMapTest(int numKeys, int listSize, int mapThreshold, int listThreshold,
            Configuration jobConf) {
        jobConf.set("io.serializations",
                TestSerialization.class.getName() + "," + WritableSerialization.class.getName()); // disable/replace WritableSerialization class
        jobConf.set("cascading.serialization.tokens",
                "1000=" + BooleanWritable.class.getName() + ",10001=" + Text.class.getName()); // not using Text, just testing parsing

        HadoopFlowProcess flowProcess = new HadoopFlowProcess(jobConf);
        HadoopSpillableTupleMap map = new HadoopSpillableTupleMap(SpillableProps.defaultMapInitialCapacity,
                SpillableProps.defaultMapLoadFactor, mapThreshold, listThreshold, flowProcess);
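        // the initial capacity and load factor size the backing map; mapThreshold and listThreshold
        // bound how many tuples are held in memory before the map and its per-key lists spill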

        Set<Integer> keySet = new HashSet<Integer>();
        Random gen = new Random(1);

        for (int i = 0; i < listSize * numKeys; i++) {
            String aString = "string number " + i;
            double random = Math.random();

            double keys = numKeys / 3.0;
            int key = (int) (gen.nextDouble() * keys + gen.nextDouble() * keys + gen.nextDouble() * keys);
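            // summing three uniform draws yields a roughly bell-shaped key distribution over [0, numKeys)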

            Tuple tuple = new Tuple(i, aString, random, new Text(aString), new TestText(aString),
                    new Tuple("inner tuple", new BytesWritable(aString.getBytes())));

            map.get(new Tuple(key)).add(tuple);
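            // get() returns the spillable list for this key, creating it on first access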

            keySet.add(key);
        }

        // the list test above verifies the contents are being serialized, the Map is just a container of lists.
        assertEquals("not equal: map.size();", keySet.size(), map.size());
    }
}