// Java tutorial (address-book example) — round-trip/projection tests below
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.parquet.hadoop.thrift;

import com.twitter.data.proto.tutorial.thrift.AddressBook;
import com.twitter.data.proto.tutorial.thrift.Name;
import com.twitter.data.proto.tutorial.thrift.Person;
import com.twitter.data.proto.tutorial.thrift.PhoneNumber;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.mapreduce.*;
import org.apache.parquet.thrift.test.compat.MapWithPrimMapValue;
import org.apache.parquet.thrift.test.compat.MapWithStructMapValue;
import org.apache.parquet.thrift.test.compat.MapWithStructValue;
import org.apache.parquet.thrift.test.compat.StructV3;
import org.apache.parquet.thrift.test.compat.StructV4WithExtracStructField;
import org.apache.thrift.TBase;
import org.apache.thrift.protocol.TCompactProtocol;
import org.apache.thrift.protocol.TProtocol;
import org.apache.thrift.protocol.TProtocolFactory;
import org.apache.thrift.transport.TIOStreamTransport;
import org.junit.Test;
import org.apache.parquet.Log;
import org.apache.parquet.hadoop.api.ReadSupport;
import org.apache.parquet.hadoop.util.ContextUtil;
import org.apache.parquet.thrift.test.*;

import java.io.ByteArrayOutputStream;
import java.util.*;

import static org.junit.Assert.assertEquals;

/**
 * Round-trip tests for Thrift records written to Parquet and read back with a
 * projection applied.
 *
 * <p>Each test writes a Thrift record through {@link ThriftToParquetFileWriter},
 * then reads it back through {@link ParquetThriftInputFormat} with either a
 * Parquet read schema ({@link ReadSupport#PARQUET_READ_SCHEMA}) or a Thrift
 * column filter ({@link ThriftReadSupport#THRIFT_COLUMN_FILTER_KEY}), and
 * asserts that the read record equals the expected (projected) record.
 */
public class TestParquetToThriftReadWriteAndProjection {

  private static final Log LOG = Log.getLog(TestParquetToThriftReadWriteAndProjection.class);

  /** Projection expressed as a Parquet message schema: keep name and id, drop email/phones. */
  @Test
  public void testThriftOptionalFieldsWithReadProjectionUsingParquetSchema() throws Exception {
    // test with projection
    Configuration conf = new Configuration();
    final String readProjectionSchema = "message AddressBook {\n" +
        " optional group persons {\n" +
        " repeated group persons_tuple {\n" +
        " required group name {\n" +
        " optional binary first_name;\n" +
        " optional binary last_name;\n" +
        " }\n" +
        " optional int32 id;\n" +
        " }\n" +
        " }\n" +
        "}";
    conf.set(ReadSupport.PARQUET_READ_SCHEMA, readProjectionSchema);
    TBase toWrite = new AddressBook(
        Arrays.asList(
            new Person(
                new Name("Bob", "Roberts"),
                0,
                "bob.roberts@example.com",
                Arrays.asList(new PhoneNumber("1234567890")))));
    // Projected-out optional fields (email, phones) come back as null.
    TBase toRead = new AddressBook(
        Arrays.asList(new Person(new Name("Bob", "Roberts"), 0, null, null)));
    shouldDoProjection(conf, toWrite, toRead, AddressBook.class);
  }

  /** A required struct (name) that is projected out must still be materialized for validation. */
  @Test
  public void testPullingInRequiredStructWithFilter() throws Exception {
    final String projectionFilterDesc = "persons/{id};persons/email";
    TBase toWrite = new AddressBook(
        Arrays.asList(
            new Person(
                new Name("Bob", "Roberts"),
                0,
                "bob.roberts@example.com",
                Arrays.asList(new PhoneNumber("1234567890")))));
    // Name is a required field, but is projected out. To make the thrift record pass validation,
    // the name field is filled with empty string
    TBase toRead = new AddressBook(
        Arrays.asList(new Person(new Name("", ""), 0, "bob.roberts@example.com", null)));
    shouldDoProjectionWithThriftColumnFilter(projectionFilterDesc, toWrite, toRead, AddressBook.class);
  }

  /** A "match everything" filter must round-trip a struct whose optional fields are declared out of order. */
  @Test
  public void testReorderdOptionalFields() throws Exception {
    final String projectionFilter = "**";
    StructWithReorderedOptionalFields toWrite = new StructWithReorderedOptionalFields();
    toWrite.setFieldOne(1);
    toWrite.setFieldTwo(2);
    toWrite.setFieldThree(3);
    shouldDoProjectionWithThriftColumnFilter(
        projectionFilter, toWrite, toWrite, StructWithReorderedOptionalFields.class);
  }

  /** Optional fields that don't match the filter are dropped (come back null). */
  @Test
  public void testProjectOutOptionalFields() throws Exception {
    final String projectionFilterDesc = "persons/name/*";
    TBase toWrite = new AddressBook(
        Arrays.asList(
            new Person(
                new Name("Bob", "Roberts"),
                0,
                "bob.roberts@example.com",
                Arrays.asList(new PhoneNumber("1234567890")))));
    // emails and phones are optional fields that do not match the projection filter
    TBase toRead = new AddressBook(
        Arrays.asList(new Person(new Name("Bob", "Roberts"), 0, null, null)));
    shouldDoProjectionWithThriftColumnFilter(projectionFilterDesc, toWrite, toRead, AddressBook.class);
  }

  /** A required map that is projected out is materialized as an empty map. */
  @Test
  public void testPullInRequiredMaps() throws Exception {
    String filter = "name";
    Map<String, String> mapValue = new HashMap<String, String>();
    mapValue.put("a", "1");
    mapValue.put("b", "2");
    RequiredMapFixture toWrite = new RequiredMapFixture(mapValue);
    toWrite.setName("testName");
    RequiredMapFixture toRead = new RequiredMapFixture(new HashMap<String, String>());
    toRead.setName("testName");
    shouldDoProjectionWithThriftColumnFilter(filter, toWrite, toRead, RequiredMapFixture.class);
  }

  /** Selecting only a map's keys still pulls in a sentinel value column, so values survive. */
  @Test
  public void testDropMapValuePrimitive() throws Exception {
    // NOTE(review): "mavalue" looks like a typo for the fixture's map field name — confirm
    // against the RequiredMapFixture thrift definition before changing.
    String filter = "mavalue/key";
    Map<String, String> mapValue = new HashMap<String, String>();
    mapValue.put("a", "1");
    mapValue.put("b", "2");
    RequiredMapFixture toWrite = new RequiredMapFixture(mapValue);
    toWrite.setName("testName");
    // for now we expect no value projection to happen
    // because a sentinel value is selected from the value
    Map<String, String> readValue = new HashMap<String, String>();
    readValue.put("a", "1");
    readValue.put("b", "2");
    RequiredMapFixture toRead = new RequiredMapFixture(readValue);
    shouldDoProjectionWithThriftColumnFilter(filter, toWrite, toRead, RequiredMapFixture.class);
  }

  /** Builds a fully-populated StructV4WithExtracStructField whose string fields embed {@code id}. */
  private StructV4WithExtracStructField makeStructV4WithExtracStructField(String id) {
    StructV4WithExtracStructField sv4 = new StructV4WithExtracStructField();
    StructV3 sv3 = new StructV3();
    sv3.setAge("age " + id);
    sv3.setGender("gender" + id);
    sv3.setName("inner name " + id);
    sv4.setAge("outer age " + id);
    sv4.setAddedStruct(sv3);
    sv4.setGender("outer gender " + id);
    sv4.setName("outer name " + id);
    return sv4;
  }

  /** When the map value is a struct, the sentinel keeps only the struct's first field (name). */
  @Test
  public void testDropMapValueStruct() throws Exception {
    String filter = "reqMap/key";
    Map<String, StructV4WithExtracStructField> mapValue =
        new HashMap<String, StructV4WithExtracStructField>();
    StructV4WithExtracStructField v1 = makeStructV4WithExtracStructField("1");
    StructV4WithExtracStructField v2 = makeStructV4WithExtracStructField("2");
    mapValue.put("key 1", v1);
    mapValue.put("key 2", v2);
    MapWithStructValue toWrite = new MapWithStructValue(mapValue);
    // for now we expect a sentinel column to be kept
    HashMap<String, StructV4WithExtracStructField> readValue =
        new HashMap<String, StructV4WithExtracStructField>();
    readValue.put("key 1", new StructV4WithExtracStructField("outer name 1"));
    readValue.put("key 2", new StructV4WithExtracStructField("outer name 2"));
    MapWithStructValue toRead = new MapWithStructValue(readValue);
    shouldDoProjectionWithThriftColumnFilter(filter, toWrite, toRead, MapWithStructValue.class);
  }

  /** A nested primitive map value is kept wholesale when only the outer key is selected. */
  @Test
  public void testDropMapValueNestedPrim() throws Exception {
    String filter = "reqMap/key";
    Map<String, Map<String, String>> mapValue = new HashMap<String, Map<String, String>>();
    Map<String, String> innerValue1 = new HashMap<String, String>();
    innerValue1.put("inner key (1, 1)", "inner (1, 1)");
    innerValue1.put("inner key (1, 2)", "inner (1, 2)");
    Map<String, String> innerValue2 = new HashMap<String, String>();
    innerValue2.put("inner key (2, 1)", "inner (2, 1)");
    innerValue2.put("inner key (2, 2)", "inner (2, 2)");
    mapValue.put("outer key 1", innerValue1);
    mapValue.put("outer key 2", innerValue2);
    MapWithPrimMapValue toWrite = new MapWithPrimMapValue(mapValue);
    Map<String, Map<String, String>> expected = new HashMap<String, Map<String, String>>();
    Map<String, String> expectedInnerValue1 = new HashMap<String, String>();
    expectedInnerValue1.put("inner key (1, 1)", "inner (1, 1)");
    expectedInnerValue1.put("inner key (1, 2)", "inner (1, 2)");
    Map<String, String> expectedInnerValue2 = new HashMap<String, String>();
    expectedInnerValue2.put("inner key (2, 1)", "inner (2, 1)");
    expectedInnerValue2.put("inner key (2, 2)", "inner (2, 2)");
    expected.put("outer key 1", expectedInnerValue1);
    expected.put("outer key 2", expectedInnerValue2);
    MapWithPrimMapValue toRead = new MapWithPrimMapValue(expected);
    shouldDoProjectionWithThriftColumnFilter(filter, toWrite, toRead, MapWithPrimMapValue.class);
  }

  /** A nested struct map value degrades to the sentinel field (name) of each inner struct. */
  @Test
  public void testDropMapValueNestedStruct() throws Exception {
    String filter = "reqMap/key";
    Map<String, Map<String, StructV4WithExtracStructField>> mapValue =
        new HashMap<String, Map<String, StructV4WithExtracStructField>>();
    Map<String, StructV4WithExtracStructField> innerValue1 =
        new HashMap<String, StructV4WithExtracStructField>();
    innerValue1.put("inner key (1, 1)", makeStructV4WithExtracStructField("inner (1, 1)"));
    innerValue1.put("inner key (1, 2)", makeStructV4WithExtracStructField("inner (1, 2)"));
    Map<String, StructV4WithExtracStructField> innerValue2 =
        new HashMap<String, StructV4WithExtracStructField>();
    innerValue2.put("inner key (2, 1)", makeStructV4WithExtracStructField("inner (2, 1)"));
    innerValue2.put("inner key (2, 2)", makeStructV4WithExtracStructField("inner (2, 2)"));
    mapValue.put("outer key 1", innerValue1);
    mapValue.put("outer key 2", innerValue2);
    MapWithStructMapValue toWrite = new MapWithStructMapValue(mapValue);
    Map<String, Map<String, StructV4WithExtracStructField>> expected =
        new HashMap<String, Map<String, StructV4WithExtracStructField>>();
    Map<String, StructV4WithExtracStructField> expectedInnerValue1 =
        new HashMap<String, StructV4WithExtracStructField>();
    expectedInnerValue1.put("inner key (1, 1)", new StructV4WithExtracStructField("outer name inner (1, 1)"));
    expectedInnerValue1.put("inner key (1, 2)", new StructV4WithExtracStructField("outer name inner (1, 2)"));
    Map<String, StructV4WithExtracStructField> expectedInnerValue2 =
        new HashMap<String, StructV4WithExtracStructField>();
    expectedInnerValue2.put("inner key (2, 1)", new StructV4WithExtracStructField("outer name inner (2, 1)"));
    expectedInnerValue2.put("inner key (2, 2)", new StructV4WithExtracStructField("outer name inner (2, 2)"));
    expected.put("outer key 1", expectedInnerValue1);
    expected.put("outer key 2", expectedInnerValue2);
    MapWithStructMapValue toRead = new MapWithStructMapValue(expected);
    shouldDoProjectionWithThriftColumnFilter(filter, toWrite, toRead, MapWithStructMapValue.class);
  }

  /** A required list that is projected out is materialized as an empty list. */
  @Test
  public void testPullInRequiredLists() throws Exception {
    String filter = "info";
    RequiredListFixture toWrite = new RequiredListFixture(
        Arrays.asList(new org.apache.parquet.thrift.test.Name("first_name")));
    toWrite.setInfo("test_info");
    RequiredListFixture toRead =
        new RequiredListFixture(new ArrayList<org.apache.parquet.thrift.test.Name>());
    toRead.setInfo("test_info");
    shouldDoProjectionWithThriftColumnFilter(filter, toWrite, toRead, RequiredListFixture.class);
  }

  /** A required set that is projected out is materialized as an empty set. */
  @Test
  public void testPullInRequiredSets() throws Exception {
    String filter = "info";
    RequiredSetFixture toWrite = new RequiredSetFixture(
        new HashSet<org.apache.parquet.thrift.test.Name>(
            Arrays.asList(new org.apache.parquet.thrift.test.Name("first_name"))));
    toWrite.setInfo("test_info");
    RequiredSetFixture toRead =
        new RequiredSetFixture(new HashSet<org.apache.parquet.thrift.test.Name>());
    toRead.setInfo("test_info");
    shouldDoProjectionWithThriftColumnFilter(filter, toWrite, toRead, RequiredSetFixture.class);
  }

  /** Required primitives that are projected out come back as their type's zero/default value. */
  @Test
  public void testPullInPrimitiveValues() throws Exception {
    String filter = "info_string";
    RequiredPrimitiveFixture toWrite =
        new RequiredPrimitiveFixture(true, (byte) 2, (short) 3, 4, (long) 5, (double) 6.0, "7");
    toWrite.setInfo_string("it's info");
    RequiredPrimitiveFixture toRead =
        new RequiredPrimitiveFixture(false, (byte) 0, (short) 0, 0, (long) 0, (double) 0.0, "");
    toRead.setInfo_string("it's info");
    shouldDoProjectionWithThriftColumnFilter(filter, toWrite, toRead, RequiredPrimitiveFixture.class);
  }

  /**
   * Runs {@link #shouldDoProjection} with the given Thrift column-filter expression
   * installed in a fresh {@link Configuration}.
   */
  private void shouldDoProjectionWithThriftColumnFilter(
      String filterDesc, TBase toWrite, TBase toRead, Class<? extends TBase<?, ?>> thriftClass)
      throws Exception {
    Configuration conf = new Configuration();
    conf.set(ThriftReadSupport.THRIFT_COLUMN_FILTER_KEY, filterDesc);
    shouldDoProjection(conf, toWrite, toRead, thriftClass);
  }

  /**
   * Writes {@code recordToWrite} to a local Parquet file, reads it back through
   * {@link ParquetThriftInputFormat} under the projection configured in {@code conf},
   * and asserts that the single record read equals {@code expectedReadResult}.
   *
   * @param conf               configuration carrying the projection (read schema or column filter)
   * @param recordToWrite      the record serialized and written to Parquet
   * @param expectedReadResult the record expected after projection
   * @param thriftClass        the Thrift class of both records
   */
  private <T extends TBase<?, ?>> void shouldDoProjection(
      Configuration conf, T recordToWrite, T expectedReadResult, Class<? extends TBase<?, ?>> thriftClass)
      throws Exception {
    final Path parquetFile = new Path("target/test/TestParquetToThriftReadWriteAndProjection/file.parquet");
    final FileSystem fs = parquetFile.getFileSystem(conf);
    if (fs.exists(parquetFile)) {
      fs.delete(parquetFile, true);
    }

    // create a test file: serialize the record with the compact protocol and
    // hand the raw bytes to the thrift-to-parquet writer.
    final TProtocolFactory protocolFactory = new TCompactProtocol.Factory();
    final TaskAttemptID taskId = new TaskAttemptID("local", 0, true, 0, 0);
    final ThriftToParquetFileWriter w = new ThriftToParquetFileWriter(
        parquetFile, ContextUtil.newTaskAttemptContext(conf, taskId), protocolFactory, thriftClass);
    final ByteArrayOutputStream baos = new ByteArrayOutputStream();
    final TProtocol protocol = protocolFactory.getProtocol(new TIOStreamTransport(baos));
    recordToWrite.write(protocol);
    try {
      w.write(new BytesWritable(baos.toByteArray()));
    } finally {
      // close in finally so a failed write still releases the underlying file
      w.close();
    }

    final ParquetThriftInputFormat<T> parquetThriftInputFormat = new ParquetThriftInputFormat<T>();
    final Job job = new Job(conf, "read");
    job.setInputFormatClass(ParquetThriftInputFormat.class);
    ParquetThriftInputFormat.setInputPaths(job, parquetFile);
    final JobID jobID = new JobID("local", 1);
    List<InputSplit> splits = parquetThriftInputFormat
        .getSplits(ContextUtil.newJobContext(ContextUtil.getConfiguration(job), jobID));
    T readValue = null;
    for (InputSplit split : splits) {
      TaskAttemptContext taskAttemptContext = ContextUtil.newTaskAttemptContext(
          ContextUtil.getConfiguration(job), new TaskAttemptID(new TaskID(jobID, true, 1), 0));
      final RecordReader<Void, T> reader =
          parquetThriftInputFormat.createRecordReader(split, taskAttemptContext);
      try {
        reader.initialize(split, taskAttemptContext);
        if (reader.nextKeyValue()) {
          readValue = reader.getCurrentValue();
          LOG.info(readValue);
        }
      } finally {
        // RecordReader holds file handles; close it even if reading throws
        reader.close();
      }
    }
    assertEquals(expectedReadResult, readValue);
  }
}