parquet.hadoop.thrift.TestParquetToThriftReadProjection.java Source code

Introduction

Here is the source code for parquet.hadoop.thrift.TestParquetToThriftReadProjection.java, a JUnit test that verifies read-time projection of Thrift records stored in Parquet. Fields can be selected either through an explicit Parquet read schema or through a Thrift column filter string; fields that are not selected read back as null, empty containers, or default values.

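Before the full listing, here is a minimal sketch of the two projection mechanisms the test exercises: setting a Parquet read schema under ReadSupport.PARQUET_READ_SCHEMA, and setting a Thrift column filter under ThriftReadSupport.THRIFT_COLUMN_FILTER_KEY. The class name ProjectionConfigSketch is hypothetical and not part of the test; the schema and filter strings mirror the values used in the tests below.

package parquet.hadoop.thrift;

import org.apache.hadoop.conf.Configuration;

import parquet.hadoop.api.ReadSupport;

public class ProjectionConfigSketch {

    // Projection through an explicit Parquet read schema (the first test below
    // sets ReadSupport.PARQUET_READ_SCHEMA the same way).
    static Configuration withReadSchema() {
        Configuration conf = new Configuration();
        conf.set(ReadSupport.PARQUET_READ_SCHEMA,
                "message AddressBook {\n"
                + "  optional group persons {\n"
                + "    repeated group persons_tuple {\n"
                + "      optional int32 id;\n"
                + "    }\n"
                + "  }\n"
                + "}");
        return conf;
    }

    // Projection through a Thrift column filter (the filter-based tests below pass
    // such strings to shouldDoProjectionWithThriftColumnFilter).
    static Configuration withColumnFilter() {
        Configuration conf = new Configuration();
        conf.set(ThriftReadSupport.THRIFT_COLUMN_FILTER_KEY, "persons/{id};persons/email");
        return conf;
    }
}
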
Source

/**
 * Copyright 2012 Twitter, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package parquet.hadoop.thrift;

import static org.junit.Assert.assertEquals;

import java.io.ByteArrayOutputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.JobID;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.TaskID;
import org.apache.thrift.TBase;
import org.apache.thrift.protocol.TCompactProtocol;
import org.apache.thrift.protocol.TProtocol;
import org.apache.thrift.protocol.TProtocolFactory;
import org.apache.thrift.transport.TIOStreamTransport;
import org.junit.Test;

import parquet.Log;
import parquet.hadoop.api.ReadSupport;
import parquet.hadoop.util.ContextUtil;
import parquet.thrift.test.RequiredListFixture;
import parquet.thrift.test.RequiredMapFixture;
import parquet.thrift.test.RequiredPrimitiveFixture;
import parquet.thrift.test.RequiredSetFixture;

import com.twitter.data.proto.tutorial.thrift.AddressBook;
import com.twitter.data.proto.tutorial.thrift.Name;
import com.twitter.data.proto.tutorial.thrift.Person;
import com.twitter.data.proto.tutorial.thrift.PhoneNumber;

public class TestParquetToThriftReadProjection {

    private static final Log LOG = Log.getLog(TestParquetToThriftReadProjection.class);

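    // Projects with an explicit Parquet read schema: only name and id are selected,
    // so the email and phones fields of the read Person come back null.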
    @Test
    public void testThriftOptionalFieldsWithReadProjectionUsingParquetSchema() throws Exception {
        // test with projection
        Configuration conf = new Configuration();
        final String readProjectionSchema = "message AddressBook {\n"
                + "  optional group persons {\n"
                + "    repeated group persons_tuple {\n"
                + "      required group name {\n"
                + "        optional binary first_name;\n"
                + "        optional binary last_name;\n"
                + "      }\n"
                + "      optional int32 id;\n"
                + "    }\n"
                + "  }\n"
                + "}";
        conf.set(ReadSupport.PARQUET_READ_SCHEMA, readProjectionSchema);
        TBase toWrite = new AddressBook(Arrays.asList(new Person(new Name("Bob", "Roberts"), 0,
                "bob.roberts@example.com", Arrays.asList(new PhoneNumber("1234567890")))));

        TBase toRead = new AddressBook(Arrays.asList(new Person(new Name("Bob", "Roberts"), 0, null, null)));
        shouldDoProjection(conf, toWrite, toRead, AddressBook.class);
    }

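    // Uses a Thrift column filter that matches only id and email; the required
    // name struct is still pulled in, but reads back with empty strings.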
    @Test
    public void testPullingInRequiredStructWithFilter() throws Exception {
        final String projectionFilterDesc = "persons/{id};persons/email";
        TBase toWrite = new AddressBook(Arrays.asList(new Person(new Name("Bob", "Roberts"), 0,
                "bob.roberts@example.com", Arrays.asList(new PhoneNumber("1234567890")))));

        TBase toRead = new AddressBook(
                Arrays.asList(new Person(new Name("", ""), 0, "bob.roberts@example.com", null)));
        shouldDoProjectionWithThriftColumnFilter(projectionFilterDesc, toWrite, toRead, AddressBook.class);
    }

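    // A filter that matches nothing: optional fields are not pulled in,
    // so the record reads back as an empty AddressBook.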
    @Test
    public void testNotPullInOptionalFields() throws Exception {
        final String projectionFilterDesc = "nomatch";
        TBase toWrite = new AddressBook(Arrays.asList(new Person(new Name("Bob", "Roberts"), 0,
                "bob.roberts@example.com", Arrays.asList(new PhoneNumber("1234567890")))));

        TBase toRead = new AddressBook();
        shouldDoProjectionWithThriftColumnFilter(projectionFilterDesc, toWrite, toRead, AddressBook.class);
    }

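    // Projects on "name" only; the required map field is still present
    // in the read record, but comes back empty.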
    @Test
    public void testPullInRequiredMaps() throws Exception {
        String filter = "name";

        Map<String, String> mapValue = new HashMap<String, String>();
        mapValue.put("a", "1");
        mapValue.put("b", "2");
        RequiredMapFixture toWrite = new RequiredMapFixture(mapValue);
        toWrite.setName("testName");

        RequiredMapFixture toRead = new RequiredMapFixture(new HashMap<String, String>());
        toRead.setName("testName");

        shouldDoProjectionWithThriftColumnFilter(filter, toWrite, toRead, RequiredMapFixture.class);
    }

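    // Projects on "info" only; the required list field reads back empty.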
    @Test
    public void testPullInRequiredLists() throws Exception {
        String filter = "info";

        RequiredListFixture toWrite = new RequiredListFixture(
                Arrays.asList(new parquet.thrift.test.Name("first_name")));
        toWrite.setInfo("test_info");

        RequiredListFixture toRead = new RequiredListFixture(new ArrayList<parquet.thrift.test.Name>());
        toRead.setInfo("test_info");

        shouldDoProjectionWithThriftColumnFilter(filter, toWrite, toRead, RequiredListFixture.class);
    }

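    // Projects on "info" only; the required set field reads back empty.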
    @Test
    public void testPullInRequiredSets() throws Exception {
        String filter = "info";

        RequiredSetFixture toWrite = new RequiredSetFixture(
                new HashSet<parquet.thrift.test.Name>(Arrays.asList(new parquet.thrift.test.Name("first_name"))));
        toWrite.setInfo("test_info");

        RequiredSetFixture toRead = new RequiredSetFixture(new HashSet<parquet.thrift.test.Name>());
        toRead.setInfo("test_info");

        shouldDoProjectionWithThriftColumnFilter(filter, toWrite, toRead, RequiredSetFixture.class);
    }

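    // Projects on "info_string" only; the required primitive fields read back
    // as their default values (false, zeros, empty string).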
    @Test
    public void testPullInPrimitiveValues() throws Exception {
        String filter = "info_string";

        RequiredPrimitiveFixture toWrite = new RequiredPrimitiveFixture(true, (byte) 2, (short) 3, 4, 5L, 6.0, "7");
        toWrite.setInfo_string("it's info");

        RequiredPrimitiveFixture toRead = new RequiredPrimitiveFixture(false, (byte) 0, (short) 0, 0, 0L, 0.0, "");
        toRead.setInfo_string("it's info");

        shouldDoProjectionWithThriftColumnFilter(filter, toWrite, toRead, RequiredPrimitiveFixture.class);
    }

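    // Runs the write/read round trip with the given Thrift column filter applied
    // through the read configuration.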
    private void shouldDoProjectionWithThriftColumnFilter(String filterDesc, TBase toWrite, TBase toRead,
            Class<? extends TBase<?, ?>> thriftClass) throws Exception {
        Configuration conf = new Configuration();
        conf.set(ThriftReadSupport.THRIFT_COLUMN_FILTER_KEY, filterDesc);
        shouldDoProjection(conf, toWrite, toRead, thriftClass);
    }

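    // Writes the record to a Parquet file via ThriftToParquetFileWriter, reads it
    // back with ParquetThriftInputFormat under the projection carried in conf,
    // and asserts that the projected result matches the expected record.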
    private <T extends TBase<?, ?>> void shouldDoProjection(Configuration conf, T recordToWrite,
            T expectedReadResult, Class<? extends TBase<?, ?>> thriftClass) throws Exception {
        final Path parquetFile = new Path("target/test/TestParquetToThriftReadProjection/file.parquet");
        final FileSystem fs = parquetFile.getFileSystem(conf);
        if (fs.exists(parquetFile)) {
            fs.delete(parquetFile, true);
        }

        //create a test file
        final TProtocolFactory protocolFactory = new TCompactProtocol.Factory();
        final TaskAttemptID taskId = new TaskAttemptID("local", 0, true, 0, 0);
        final ThriftToParquetFileWriter w = new ThriftToParquetFileWriter(parquetFile,
                ContextUtil.newTaskAttemptContext(conf, taskId), protocolFactory, thriftClass);
        final ByteArrayOutputStream baos = new ByteArrayOutputStream();
        final TProtocol protocol = protocolFactory.getProtocol(new TIOStreamTransport(baos));

        recordToWrite.write(protocol);
        w.write(new BytesWritable(baos.toByteArray()));
        w.close();

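        // read the file back through ParquetThriftInputFormat with the projection applied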
        final ParquetThriftInputFormat<T> parquetThriftInputFormat = new ParquetThriftInputFormat<T>();
        final Job job = new Job(conf, "read");
        job.setInputFormatClass(ParquetThriftInputFormat.class);
        ParquetThriftInputFormat.setInputPaths(job, parquetFile);
        final JobID jobID = new JobID("local", 1);
        List<InputSplit> splits = parquetThriftInputFormat
                .getSplits(ContextUtil.newJobContext(ContextUtil.getConfiguration(job), jobID));
        T readValue = null;
        for (InputSplit split : splits) {
            TaskAttemptContext taskAttemptContext = ContextUtil.newTaskAttemptContext(
                    ContextUtil.getConfiguration(job), new TaskAttemptID(new TaskID(jobID, true, 1), 0));
            final RecordReader<Void, T> reader = parquetThriftInputFormat.createRecordReader(split,
                    taskAttemptContext);
            reader.initialize(split, taskAttemptContext);
            if (reader.nextKeyValue()) {
                readValue = reader.getCurrentValue();
                LOG.info(readValue);
            }
        }
        assertEquals(expectedReadResult, readValue);
    }

}