org.apache.gobblin.writer.ParquetHdfsDataWriterTest.java Source code

Introduction

Here is the source code for org.apache.gobblin.writer.ParquetHdfsDataWriterTest.java, a TestNG test that verifies ParquetHdfsDataWriter writes Parquet records to a staging directory and publishes the resulting file to the output directory on commit.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.gobblin.writer;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import org.apache.gobblin.configuration.ConfigurationKeys;
import org.apache.gobblin.configuration.State;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.testng.Assert;
import org.testng.annotations.AfterClass;
import org.testng.annotations.BeforeMethod;
import org.testng.annotations.Test;

import parquet.example.data.Group;
import parquet.example.data.simple.convert.GroupRecordConverter;
import parquet.hadoop.ParquetReader;
import parquet.hadoop.api.InitContext;
import parquet.hadoop.api.ReadSupport;
import parquet.io.api.RecordMaterializer;
import parquet.schema.MessageType;

import static org.apache.gobblin.writer.ParquetDataWriterBuilder.WRITER_PARQUET_DICTIONARY;
import static org.apache.gobblin.writer.ParquetDataWriterBuilder.WRITER_PARQUET_DICTIONARY_PAGE_SIZE;
import static org.apache.gobblin.writer.ParquetDataWriterBuilder.WRITER_PARQUET_PAGE_SIZE;
import static org.apache.gobblin.writer.ParquetDataWriterBuilder.WRITER_PARQUET_VALIDATE;

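/**
 * Unit tests for {@link ParquetHdfsDataWriter}: records are written to a staging
 * directory and the finished file is published to the output directory on commit.
 */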
@Test(groups = { "gobblin.writer" })
public class ParquetHdfsDataWriterTest {

    private MessageType schema;
    private String filePath;
    private ParquetHdfsDataWriter writer;
    private State properties;

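    /**
     * Runs before each test method: creates the staging and output directories,
     * builds the writer configuration, and constructs a fresh ParquetHdfsDataWriter.
     */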
    @BeforeMethod
    public void setUp() throws Exception {
        // Create the staging and output directories if they do not already exist
        File stagingDir = new File(TestConstants.TEST_STAGING_DIR);
        File outputDir = new File(TestConstants.TEST_OUTPUT_DIR);
        if (!stagingDir.exists()) {
            // Use TestNG assertions instead of the assert keyword, which is a no-op unless the JVM runs with -ea
            Assert.assertTrue(stagingDir.mkdirs(), "Failed to create staging dir: " + stagingDir);
        }
        if (!outputDir.exists()) {
            Assert.assertTrue(outputDir.mkdirs(), "Failed to create output dir: " + outputDir);
        }
        this.schema = TestConstants.PARQUET_SCHEMA;
        this.filePath = getFilePath();
        this.properties = createStateWithConfig();
        this.writer = (ParquetHdfsDataWriter) getParquetDataWriterBuilder().build();
    }

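    /**
     * Builds the writer-relative file path from the extract metadata:
     * namespace (with dots replaced by slashes) / table / extractId_pullType.
     */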
    private String getFilePath() {
        return TestConstants.TEST_EXTRACT_NAMESPACE.replaceAll("\\.", "/") + "/" + TestConstants.TEST_EXTRACT_TABLE
                + "/" + TestConstants.TEST_EXTRACT_ID + "_" + TestConstants.TEST_EXTRACT_PULL_TYPE;
    }

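    /**
     * Builds the writer configuration: buffer size, filesystem URI, staging and
     * output locations, and the Parquet-specific settings (dictionary encoding,
     * page sizes, and write-time validation) read by ParquetDataWriterBuilder.
     */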
    private State createStateWithConfig() {
        State properties = new State();
        properties.setProp(ConfigurationKeys.WRITER_BUFFER_SIZE, ConfigurationKeys.DEFAULT_BUFFER_SIZE);
        properties.setProp(ConfigurationKeys.WRITER_FILE_SYSTEM_URI, TestConstants.TEST_FS_URI);
        properties.setProp(ConfigurationKeys.WRITER_STAGING_DIR, TestConstants.TEST_STAGING_DIR);
        properties.setProp(ConfigurationKeys.WRITER_OUTPUT_DIR, TestConstants.TEST_OUTPUT_DIR);
        properties.setProp(ConfigurationKeys.WRITER_FILE_PATH, this.filePath);
        properties.setProp(ConfigurationKeys.WRITER_FILE_NAME, TestConstants.PARQUET_TEST_FILENAME);
        properties.setProp(WRITER_PARQUET_DICTIONARY, true);
        properties.setProp(WRITER_PARQUET_DICTIONARY_PAGE_SIZE, 1024);
        properties.setProp(WRITER_PARQUET_PAGE_SIZE, 1024);
        properties.setProp(WRITER_PARQUET_VALIDATE, true);
        return properties;
    }

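    /**
     * Wires up the builder much as the Gobblin task runtime would: an HDFS
     * destination backed by the test State, a writer id, the Parquet schema,
     * and the PARQUET output format.
     */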
    private ParquetDataWriterBuilder getParquetDataWriterBuilder() {
        ParquetDataWriterBuilder writerBuilder = new ParquetDataWriterBuilder();
        writerBuilder.destination = Destination.of(Destination.DestinationType.HDFS, properties);
        writerBuilder.writerId = TestConstants.TEST_WRITER_ID;
        writerBuilder.schema = this.schema;
        writerBuilder.format = WriterOutputFormat.PARQUET;
        return writerBuilder;
    }

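    /**
     * Reads all records back from the committed Parquet file so the test can
     * assert on the round-tripped values. Failures while closing the reader are
     * printed rather than rethrown so they cannot mask a test failure.
     */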
    private List<Group> readParquetFiles(File outputFile) throws IOException {
        ParquetReader<Group> reader = null;
        List<Group> records = new ArrayList<>();
        try {
            reader = new ParquetReader<>(new Path(outputFile.toString()), new SimpleReadSupport());
            for (Group value = reader.read(); value != null; value = reader.read()) {
                records.add(value);
            }
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (Exception ex) {
                    System.out.println(ex.getMessage());
                }
            }
        }
        return records;
    }

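    /**
     * Writes two records, checking the running record count after each write,
     * then closes and commits the writer and verifies that both records can be
     * read back from the published file.
     */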
    @Test
    public void testWrite() throws Exception {
        long firstWrite;
        long secondWrite;
        List<Group> records;
        Group record1 = TestConstants.PARQUET_RECORD_1;
        Group record2 = TestConstants.PARQUET_RECORD_2;
        String filePath = TestConstants.TEST_OUTPUT_DIR + Path.SEPARATOR + this.filePath;
        File outputFile = new File(filePath, TestConstants.PARQUET_TEST_FILENAME);

        this.writer.write(record1);
        firstWrite = this.writer.recordsWritten();
        this.writer.write(record2);
        secondWrite = this.writer.recordsWritten();
        this.writer.close();
        this.writer.commit();
        records = readParquetFiles(outputFile);
        Group resultRecord1 = records.get(0);
        Group resultRecord2 = records.get(1);

        Assert.assertEquals(firstWrite, 1);
        Assert.assertEquals(secondWrite, 2);
        Assert.assertEquals(resultRecord1.getString("name", 0), "tilak");
        Assert.assertEquals(resultRecord1.getInteger("age", 0), 22);
        Assert.assertEquals(resultRecord2.getString("name", 0), "other");
        Assert.assertEquals(resultRecord2.getInteger("age", 0), 22);
    }

    @AfterClass
    public void tearDown() throws IOException {
        // Clean up the staging and/or output directories if necessary
        File testRootDir = new File(TestConstants.TEST_ROOT_DIR);
        if (testRootDir.exists()) {
            FileUtil.fullyDelete(testRootDir);
        }
    }

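    /**
     * Minimal ReadSupport for reading back test data: init() requests the file's
     * own schema, and prepareForRead() materializes each record into a Group.
     */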
    class SimpleReadSupport extends ReadSupport<Group> {
        @Override
        public RecordMaterializer<Group> prepareForRead(Configuration conf, Map<String, String> metaData,
                MessageType schema, ReadContext context) {
            return new GroupRecordConverter(schema);
        }

        @Override
        public ReadContext init(InitContext context) {
            return new ReadContext(context.getFileSchema());
        }
    }
}
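
The listing depends on a TestConstants helper class that is not shown on this page. The sketch below illustrates the Parquet-related pieces such a class would need. The schema and the two records are inferred from the assertions in testWrite() (a UTF8 name field and an int32 age field, holding "tilak"/22 and "other"/22); the directory, extract, and file-name values are illustrative placeholders, and the actual class in the Gobblin sources may differ.

package org.apache.gobblin.writer;

import parquet.example.data.Group;
import parquet.example.data.simple.SimpleGroup;
import parquet.schema.MessageType;
import parquet.schema.MessageTypeParser;

public class TestConstants {

    // Schema inferred from the assertions in testWrite(): a required UTF8 name and an int32 age
    public static final MessageType PARQUET_SCHEMA = MessageTypeParser.parseMessageType(
            "message User { required binary name (UTF8); required int32 age; }");

    public static final Group PARQUET_RECORD_1 = buildRecord("tilak", 22);
    public static final Group PARQUET_RECORD_2 = buildRecord("other", 22);

    // Illustrative placeholder values; the real constants live in the Gobblin test sources
    public static final String TEST_ROOT_DIR = "test-parquet";
    public static final String TEST_STAGING_DIR = TEST_ROOT_DIR + "/staging";
    public static final String TEST_OUTPUT_DIR = TEST_ROOT_DIR + "/output";
    public static final String TEST_FS_URI = "file:///";
    public static final String TEST_EXTRACT_NAMESPACE = "com.example.namespace";
    public static final String TEST_EXTRACT_TABLE = "TestTable";
    public static final String TEST_EXTRACT_ID = "123456";
    public static final String TEST_EXTRACT_PULL_TYPE = "FULL";
    public static final String TEST_WRITER_ID = "writer-1";
    public static final String PARQUET_TEST_FILENAME = "test.parquet";

    private static Group buildRecord(String name, int age) {
        Group record = new SimpleGroup(PARQUET_SCHEMA);
        record.add("name", name);
        record.add("age", age);
        return record;
    }
}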