parquet.scrooge.ParquetScroogeSchemeTest.java — source code listing

Java tutorial

Introduction

Here is the source code for parquet.scrooge.ParquetScroogeSchemeTest.java.

Source

/**
 * Copyright 2012 Twitter, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package parquet.scrooge;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.mapreduce.*;
import org.apache.thrift.TBase;
import org.apache.thrift.protocol.TCompactProtocol;
import org.apache.thrift.protocol.TProtocol;
import org.apache.thrift.protocol.TProtocolFactory;
import org.apache.thrift.transport.TIOStreamTransport;
import org.junit.Test;
import parquet.hadoop.ParquetInputFormat;
import parquet.hadoop.thrift.ParquetThriftInputFormat;
import parquet.hadoop.thrift.ThriftReadSupport;
import parquet.hadoop.thrift.ThriftToParquetFileWriter;
import parquet.hadoop.util.ContextUtil;
import parquet.scrooge.test.TestPersonWithAllInformation;
import parquet.thrift.test.Address;
import parquet.thrift.test.Phone;
import parquet.thrift.test.RequiredPrimitiveFixture;

import java.io.ByteArrayOutputStream;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;

/**
 * Write data in thrift, read in scrooge
 *
 * @author Tianshuo Deng
 */
/**
 * Round-trip test for the Scrooge read path: records are serialized with plain
 * Thrift, written to a Parquet file, and read back as Scrooge-generated
 * classes, including column-projection cases.
 *
 * @author Tianshuo Deng
 */
public class ParquetScroogeSchemeTest {
    @Test
    public void testWritePrimitveThriftReadScrooge() throws Exception {
        // An all-required-primitive record plus one optional string field.
        RequiredPrimitiveFixture toWrite = new RequiredPrimitiveFixture(true, (byte) 2, (short) 3, 4, (long) 5,
                (double) 6.0, "7");
        toWrite.setInfo_string("it's info");
        // "**" projects every column; the optional field appears as Some(...) in Scrooge.
        verifyScroogeRead(toWrite, parquet.scrooge.test.RequiredPrimitiveFixture.class,
                "RequiredPrimitiveFixture(true,2,3,4,5,6.0,7,Some(it's info))", "**");
    }

    @Test
    public void testNestedReadingInScrooge() throws Exception {
        // Nested structs plus a map field, read back both fully and projected.
        Map<String, parquet.thrift.test.Phone> phoneMap = new HashMap<String, Phone>();
        phoneMap.put("key1", new parquet.thrift.test.Phone("111", "222"));
        parquet.thrift.test.TestPersonWithAllInformation toWrite = new parquet.thrift.test.TestPersonWithAllInformation(
                new parquet.thrift.test.Name("first"), new Address("my_street", "my_zip"), phoneMap);
        toWrite.setInfo("my_info");
        String expected = "TestPersonWithAllInformation(Name(first,None),None,Address(my_street,my_zip),None,Some(my_info),Map(key1 -> Phone(111,222)),None,None)";
        verifyScroogeRead(toWrite, TestPersonWithAllInformation.class, expected, "**");
        // Projecting away the phone map should leave it empty while keeping projected fields.
        String expectedProjected = "TestPersonWithAllInformation(Name(first,None),None,Address(my_street,my_zip),None,Some(my_info),Map(),None,None)";
        verifyScroogeRead(toWrite, TestPersonWithAllInformation.class, expectedProjected,
                "address/*;info;name/first_name");
    }

    /**
     * Writes {@code recordToWrite} through the Thrift write path into a Parquet
     * file, reads it back with {@link ParquetScroogeInputFormat} materialized as
     * {@code readClass}, and asserts the value's {@code toString()} equals
     * {@code expectedStr}.
     *
     * @param recordToWrite    the Thrift record to serialize and write
     * @param readClass        the Scrooge-generated class to read the record back as
     * @param expectedStr      expected {@code toString()} of the record read back
     * @param projectionFilter column projection pattern ("**" selects all columns)
     * @throws Exception on any configuration, write, or read failure
     */
    public <T> void verifyScroogeRead(TBase recordToWrite, Class<T> readClass, String expectedStr,
            String projectionFilter) throws Exception {
        Configuration conf = new Configuration();
        conf.set("parquet.thrift.converter.class", ScroogeRecordConverter.class.getName());
        conf.set(ThriftReadSupport.THRIFT_READ_CLASS_KEY, readClass.getName());
        conf.set(ThriftReadSupport.THRIFT_COLUMN_FILTER_KEY, projectionFilter);

        // Start from a clean output file so stale data can't satisfy the assertions.
        final Path parquetFile = new Path("target/test/TestParquetToThriftReadProjection/file.parquet");
        final FileSystem fs = parquetFile.getFileSystem(conf);
        if (fs.exists(parquetFile)) {
            fs.delete(parquetFile, true);
        }

        // Serialize the record with the compact protocol and write it as Parquet.
        final TProtocolFactory protocolFactory = new TCompactProtocol.Factory();
        final TaskAttemptID taskId = new TaskAttemptID("local", 0, true, 0, 0);
        Class writeClass = recordToWrite.getClass();
        final ThriftToParquetFileWriter w = new ThriftToParquetFileWriter(parquetFile,
                ContextUtil.newTaskAttemptContext(conf, taskId), protocolFactory, writeClass);
        try {
            final ByteArrayOutputStream baos = new ByteArrayOutputStream();
            final TProtocol protocol = protocolFactory.getProtocol(new TIOStreamTransport(baos));
            recordToWrite.write(protocol);
            w.write(new BytesWritable(baos.toByteArray()));
        } finally {
            // Fix: close the writer even if serialization throws, so the output
            // stream is not leaked and written data is flushed.
            w.close();
        }

        // Read the record back as the Scrooge type through the input format.
        final ParquetScroogeInputFormat<T> parquetScroogeInputFormat = new ParquetScroogeInputFormat<T>();
        final Job job = new Job(conf, "read");
        job.setInputFormatClass(ParquetThriftInputFormat.class);
        ParquetThriftInputFormat.setInputPaths(job, parquetFile);
        final JobID jobID = new JobID("local", 1);
        List<InputSplit> splits = parquetScroogeInputFormat
                .getSplits(new JobContext(ContextUtil.getConfiguration(job), jobID));
        T readValue = null;
        for (InputSplit split : splits) {
            TaskAttemptContext taskAttemptContext = new TaskAttemptContext(ContextUtil.getConfiguration(job),
                    new TaskAttemptID(new TaskID(jobID, true, 1), 0));
            final RecordReader<Void, T> reader = parquetScroogeInputFormat.createRecordReader(split,
                    taskAttemptContext);
            try {
                reader.initialize(split, taskAttemptContext);
                if (reader.nextKeyValue()) {
                    readValue = reader.getCurrentValue();
                }
            } finally {
                // Fix: the reader was previously never closed, leaking its resources.
                reader.close();
            }
        }
        // Fix: fail with a clear message instead of an NPE when nothing was read back.
        assertNotNull("no record was read back from " + parquetFile, readValue);
        assertEquals(expectedStr, readValue.toString());
    }

}