org.kitesdk.data.filesystem.DatasetTestUtilities.java Source code

Java tutorial

Introduction

Here is the source code for org.kitesdk.data.filesystem.DatasetTestUtilities.java

Source

/**
 * Copyright 2013 Cloudera Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.kitesdk.data.filesystem;

import org.kitesdk.data.Dataset;
import org.kitesdk.data.DatasetReader;
import org.kitesdk.data.DatasetWriter;
import org.kitesdk.data.PartitionKey;
import com.google.common.base.Function;
import com.google.common.collect.Iterables;
import com.google.common.collect.Sets;
import com.google.common.io.Resources;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.NoSuchElementException;
import java.util.Set;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecordBuilder;
import org.junit.Assert;

public class DatasetTestUtilities {

    public final static Schema STRING_SCHEMA = loadSchema("schema/string.avsc");
    public final static Schema USER_SCHEMA = loadSchema("schema/user.avsc");
    public final static URI USER_SCHEMA_URL = findSchemaURI("schema/user.avsc");

    private static Schema loadSchema(String resource) {
        try {
            return new Schema.Parser().parse(Resources.getResource(resource).openStream());
        } catch (IOException e) {
            throw new IllegalStateException("Cannot load " + resource);
        }
    }

    private static URI findSchemaURI(String resource) {
        try {
            return Resources.getResource(resource).toURI();
        } catch (URISyntaxException e) {
            throw new IllegalStateException("Cannot load " + resource);
        }
    }

    public static void writeTestUsers(Dataset<GenericData.Record> ds, int count) {
        writeTestUsers(ds, count, 0);
    }

    public static void writeTestUsers(Dataset<GenericData.Record> ds, int count, int start) {
        writeTestUsers(ds, count, start, "email");
    }

    public static void writeTestUsers(Dataset<GenericData.Record> ds, int count, int start, String... fields) {
        DatasetWriter<GenericData.Record> writer = null;
        try {
            writer = ds.newWriter();
            writer.open();
            for (int i = start; i < count + start; i++) {
                GenericRecordBuilder recordBuilder = new GenericRecordBuilder(ds.getDescriptor().getSchema())
                        .set("username", "test-" + i);
                for (String field : fields) {
                    recordBuilder.set(field, field + "-" + i);
                }
                writer.write(recordBuilder.build());
            }
            writer.flush();
        } finally {
            if (writer != null) {
                writer.close();
            }
        }
    }

    public static void checkTestUsers(Dataset<GenericData.Record> ds, int count) {
        checkTestUsers(ds, count, "email");
    }

    public static void checkTestUsers(Dataset<GenericData.Record> ds, int count, int start) {
        checkTestUsers(ds, count, start, "email");
    }

    public static void checkTestUsers(Dataset<GenericData.Record> ds, int count, final String... fields) {
        checkTestUsers(ds, count, 0, fields);
    }

    public static void checkTestUsers(Dataset<GenericData.Record> ds, int count, int start,
            final String... fields) {
        final Set<String> usernames = Sets.newHashSet();
        for (int i = start; i < count + start; i++) {
            usernames.add("test-" + i);
        }

        checkReaderBehavior(ds.newReader(), count, new RecordValidator<GenericData.Record>() {
            @Override
            public void validate(GenericData.Record record, int recordNum) {
                Assert.assertTrue(usernames.remove((String) record.get("username")));
                for (String field : fields) {
                    Assert.assertNotNull(record.get(field));
                }
            }
        });

        Assert.assertTrue(usernames.isEmpty());
    }

    public static void checkTestUsers(Set<GenericData.Record> records, int count) {
        Assert.assertEquals("Wrong number of records", count, records.size());
        // record order is not guaranteed, so check that we have read all the
        // records
        Set<String> usernames = Sets.newHashSet();
        for (int i = 0; i < count; i++) {
            usernames.add("test-" + i);
        }
        for (GenericData.Record actualRecord : records) {
            Assert.assertTrue(usernames.remove((String) actualRecord.get("username")));
            Assert.assertNotNull(actualRecord.get("email"));
        }
        Assert.assertTrue(usernames.isEmpty());
    }

    public static <E> Set<E> materialize(Dataset<E> ds) {
        Set<E> records = Sets.newHashSet();
        DatasetReader<E> reader = null;
        try {
            reader = ds.newReader();
            reader.open();
            for (E record : reader) {
                records.add(record);
            }
        } finally {
            if (reader != null) {
                reader.close();
            }
        }
        return records;
    }

    public static <E> int datasetSize(Dataset<E> ds) {
        return materialize(ds).size();
    }

    @SuppressWarnings("deprecation")
    public static <E> void testPartitionKeysAreEqual(Dataset<E> ds, PartitionKey... expectedKeys) {
        Set<PartitionKey> expected = Sets.newHashSet(expectedKeys);
        Set<PartitionKey> actual = Sets
                .newHashSet(Iterables.transform(ds.getPartitions(), new Function<Dataset, PartitionKey>() {
                    @Override
                    public PartitionKey apply(Dataset input) {
                        return ((FileSystemDataset) input).getPartitionKey();
                    }
                }));
        Assert.assertEquals(expected, actual);
    }

    public static interface RecordValidator<R> {
        void validate(R record, int recordNum);
    }

    public static <R> void checkReaderBehavior(DatasetReader<R> reader, int totalRecords,
            RecordValidator<R> validator) {
        Assert.assertFalse("Reader is open before open()", reader.isOpen());

        try {
            reader.open();

            Assert.assertTrue("Reader is not open after open()", reader.isOpen());

            checkReaderIteration(reader, totalRecords, validator);

        } finally {
            reader.close();
        }

        Assert.assertFalse("Reader is open after close()", reader.isOpen());
    }

    public static <R> void checkReaderIteration(DatasetReader<R> reader, int expectedRecordCount,
            RecordValidator<R> validator) {
        int recordCount = 0;

        Assert.assertTrue("Reader is not open", reader.isOpen());
        Assert.assertTrue("Reader has no records, expected " + expectedRecordCount,
                (expectedRecordCount == 0) || reader.hasNext());

        for (R record : reader) {
            // add calls to hasNext, which should not affect the iteration
            reader.hasNext();
            Assert.assertNotNull(record);
            validator.validate(record, recordCount);
            recordCount++;
        }

        Assert.assertFalse("Reader is empty, but hasNext is true", reader.hasNext());

        // verify that NoSuchElementException is thrown when hasNext returns false
        try {
            reader.next();
            Assert.fail("Reader did not throw NoSuchElementException");
        } catch (NoSuchElementException ex) {
            // this is the correct behavior
        }

        Assert.assertTrue("Reader is empty, but should be open", reader.isOpen());

        // verify the correct number of records were produced
        // if hasNext advances the reader, then this will be wrong
        Assert.assertEquals("Incorrect number of records", expectedRecordCount, recordCount);
    }

}