org.talend.components.simplefileio.runtime.sinks.ParquetHdfsFileSink.java Source code

Java tutorial

Introduction

Here is the source code for org.talend.components.simplefileio.runtime.sinks.ParquetHdfsFileSink.java

Source

// ============================================================================
//
// Copyright (C) 2006-2017 Talend Inc. - www.talend.com
//
// This source code is available under agreement available at
// %InstallDIR%\features\org.talend.rcp.branding.%PRODUCTNAME%\%PRODUCTNAME%license.txt
//
// You should have received a copy of the agreement
// along with this program; if not, write to Talend SA
// 9 rue Pages 92150 Suresnes, France
//
// ============================================================================
package org.talend.components.simplefileio.runtime.sinks;

import java.util.ArrayList;
import java.util.List;

import org.apache.avro.generic.IndexedRecord;
import org.apache.beam.sdk.values.KV;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.parquet.avro.AvroParquetOutputFormat;
import org.apache.parquet.avro.AvroWriteSupport;
import org.apache.parquet.hadoop.ParquetFileWriter;
import org.apache.parquet.hadoop.ParquetOutputFormat;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.hadoop.metadata.FileMetaData;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.talend.components.simplefileio.runtime.ugi.UgiDoAs;
import org.talend.components.simplefileio.runtime.utils.FileSystemUtil;

/**
 * Sink for Parquet files.
 *
 * <p>
 * Records are written as Avro {@link IndexedRecord}s through {@link AvroParquetOutputFormat} with Snappy
 * compression. The part files found in a folder can also be merged into a single Parquet file, see
 * {@link #mergeOutput(FileSystem, String, String)}.
 */
public class ParquetHdfsFileSink extends UgiFileSinkBase<Void, IndexedRecord> {

    private static final Logger LOG = LoggerFactory.getLogger(ParquetHdfsFileSink.class);

    /**
     * Creates a sink that uses {@link AvroParquetOutputFormat} as its output format.
     *
     * <p>
     * All arguments are forwarded unchanged to the {@code UgiFileSinkBase} constructor; their exact semantics are
     * defined there.
     *
     * @param doAs forwarded to the superclass (presumably the user context used for file system access).
     * @param path forwarded to the superclass (presumably the output location).
     * @param overwrite forwarded to the superclass.
     * @param mergeOutput forwarded to the superclass.
     */
    public ParquetHdfsFileSink(UgiDoAs doAs, String path, boolean overwrite, boolean mergeOutput) {
        // The raw cast erases the generic parameters of the format class so that it can be handed to the
        // superclass constructor.
        super(doAs, path, overwrite, mergeOutput, (Class) AvroParquetOutputFormat.class);
    }

    /**
     * Configures the job for Parquet output, on top of whatever the superclass configures.
     *
     * <p>
     * The Avro schema is taken from the value of the sample element, and the compression codec is set to Snappy.
     *
     * @param job the job whose configuration is updated.
     * @param sample an element whose value provides the Avro schema; its value must not be {@code null}.
     */
    @Override
    protected void configure(Job job, KV<Void, IndexedRecord> sample) {
        super.configure(job, sample);
        // The KV is already typed, so no cast is needed to reach the record.
        IndexedRecord record = sample.getValue();
        AvroWriteSupport.setSchema(job.getConfiguration(), record.getSchema());
        ParquetOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
    }

    /**
     * Merges the files listed under {@code sourceFolder} into the single Parquet file {@code targetFile}.
     *
     * <p>
     * The metadata of all source files is merged first to obtain the schema and key/value metadata of the target,
     * then the content of every source file is appended to the target in listing order.
     *
     * @param fs the file system used to list the sources; its configuration is used for all Parquet operations.
     * @param sourceFolder the folder whose sub files are merged.
     * @param targetFile the path of the merged file, opened with {@link ParquetFileWriter.Mode#CREATE}.
     * @return {@code true} if the merge completed, {@code false} if any exception was raised (the exception is
     * logged, not rethrown).
     */
    @Override
    protected boolean mergeOutput(FileSystem fs, String sourceFolder, String targetFile) {
        try {
            FileStatus[] sourceStatuses = FileSystemUtil.listSubFiles(fs, sourceFolder);
            List<Path> sourceFiles = new ArrayList<>(sourceStatuses.length);
            for (FileStatus sourceStatus : sourceStatuses) {
                sourceFiles.add(sourceStatus.getPath());
            }
            FileMetaData mergedMeta = ParquetFileWriter.mergeMetadataFiles(sourceFiles, fs.getConf())
                    .getFileMetaData();
            // NOTE(review): if start() or appendFile() throws, the writer is not explicitly released here --
            // confirm whether the Parquet version in use offers a way to close it.
            ParquetFileWriter writer = new ParquetFileWriter(fs.getConf(), mergedMeta.getSchema(),
                    new Path(targetFile), ParquetFileWriter.Mode.CREATE);
            writer.start();
            for (Path input : sourceFiles) {
                writer.appendFile(fs.getConf(), input);
            }
            writer.end(mergedMeta.getKeyValueMetaData());
        } catch (Exception e) {
            // The exception is passed as the trailing argument so that SLF4J also logs its type and stack
            // trace; the message alone may be null and hides the failing call.
            LOG.error("Error when merging files in {}.\n{}", sourceFolder, e.getMessage(), e);
            return false;
        }
        return true;
    }

}