org.pentaho.big.data.kettle.plugins.formats.impl.parquet.input.ParquetInput.java Source code

Introduction

Here is the source code for org.pentaho.big.data.kettle.plugins.formats.impl.parquet.input.ParquetInput.java, the Pentaho Data Integration step that reads rows from Parquet files: it computes input splits, opens a record reader per split, and streams each split's rows to the next step.

Source

/*! ******************************************************************************
 *
 * Pentaho Data Integration
 *
 * Copyright (C) 2018 by Hitachi Vantara : http://www.pentaho.com
 *
 *******************************************************************************
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ******************************************************************************/

package org.pentaho.big.data.kettle.plugins.formats.impl.parquet.input;

import org.apache.commons.vfs2.FileObject;
import org.pentaho.big.data.api.cluster.NamedCluster;
import org.pentaho.big.data.api.cluster.service.locator.NamedClusterServiceLocator;
import org.pentaho.big.data.kettle.plugins.formats.parquet.input.ParquetInputField;
import org.pentaho.big.data.kettle.plugins.formats.parquet.input.ParquetInputMetaBase;
import org.pentaho.bigdata.api.format.FormatService;
import org.pentaho.di.core.RowMetaAndData;
import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.core.vfs.AliasedFileObject;
import org.pentaho.di.core.vfs.KettleVFS;
import org.pentaho.di.trans.Trans;
import org.pentaho.di.trans.TransMeta;
import org.pentaho.di.trans.step.StepDataInterface;
import org.pentaho.di.trans.step.StepMeta;
import org.pentaho.di.trans.step.StepMetaInterface;
import org.pentaho.di.trans.steps.file.BaseFileInputStep;
import org.pentaho.di.trans.steps.file.IBaseFileInputReader;
import org.pentaho.hadoop.shim.api.format.IParquetInputField;
import org.pentaho.hadoop.shim.api.format.IPentahoInputFormat.IPentahoInputSplit;
import org.pentaho.hadoop.shim.api.format.IPentahoParquetInputFormat;

import java.nio.file.NoSuchFileException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.function.Function;
import java.util.stream.Collectors;

public class ParquetInput extends BaseFileInputStep<ParquetInputMeta, ParquetInputData> {
    public static long SPLIT_SIZE = 128 * 1024 * 1024; // default split size: 128 MB

    private final NamedClusterServiceLocator namedClusterServiceLocator;

    public ParquetInput(StepMeta stepMeta, StepDataInterface stepDataInterface, int copyNr, TransMeta transMeta,
            Trans trans, NamedClusterServiceLocator namedClusterServiceLocator) {
        super(stepMeta, stepDataInterface, copyNr, transMeta, trans);
        this.namedClusterServiceLocator = namedClusterServiceLocator;
    }

    /**
     * Emits one row per call, driving a small state machine over the input splits:
     * compute the splits on the first call, open a reader for the current split, pass
     * rows from its iterator downstream, and advance to the next split when the
     * iterator is exhausted. Returns false once every split has been consumed.
     */
    @Override
    public boolean processRow(StepMetaInterface smi, StepDataInterface sdi) throws KettleException {
        meta = (ParquetInputMeta) smi;
        data = (ParquetInputData) sdi;

        try {
            if (data.splits == null) {
                initSplits();
            }

            if (data.currentSplit >= data.splits.size()) {
                setOutputDone();
                return false;
            }

            if (data.reader == null) {
                openReader(data);
            }

            if (data.rowIterator.hasNext()) {
                RowMetaAndData row = data.rowIterator.next();
                putRow(row.getRowMeta(), row.getData());
                return true;
            } else {
                data.reader.close();
                data.reader = null;
                logDebug("Close split {0}", data.currentSplit);
                data.currentSplit++;
                return true;
            }
        } catch (NoSuchFileException ex) {
            throw new KettleException("No input file");
        } catch (KettleException ex) {
            throw ex;
        } catch (Exception ex) {
            throw new KettleException(ex);
        }
    }

    /**
     * Resolves the first configured input file (applying variable substitution and VFS
     * alias resolution), reads the file's actual schema to back-fill legacy field
     * metadata, and asks the input format for the list of splits to read.
     */
    void initSplits() throws Exception {
        FormatService formatService = namedClusterServiceLocator.getService(meta.getNamedCluster(),
                FormatService.class);
        if (meta.inputFiles == null || meta.inputFiles.fileName == null || meta.inputFiles.fileName.length == 0) {
            throw new KettleException("No input files defined");
        }

        String inputFileName = environmentSubstitute(meta.inputFiles.fileName[0]);
        FileObject inputFileObject = KettleVFS.getFileObject(inputFileName, getTransMeta());
        if (AliasedFileObject.isAliasedFile(inputFileObject)) {
            inputFileName = ((AliasedFileObject) inputFileObject).getOriginalURIString();
        }

        data.input = formatService.createInputFormat(IPentahoParquetInputFormat.class);

        // Transformations created with Pentaho 8.0 persist input fields with a formatType of 0.
        // Read the actual field definitions from the file's schema so that the real format type,
        // precision and scale can be back-filled for those fields.
        List<? extends IParquetInputField> actualFileFields = ParquetInput
                .retrieveSchema(meta.getNamedClusterServiceLocator(), meta.getNamedCluster(), inputFileName);

        if (meta.isIgnoreEmptyFolder() && actualFileFields.isEmpty()) {
            data.splits = new ArrayList<>();
            logBasic("No Parquet input files found.");
        } else {
            Map<String, IParquetInputField> fieldNamesToTypes = actualFileFields.stream()
                    .collect(Collectors.toMap(IParquetInputField::getFormatFieldName, Function.identity()));
            for (ParquetInputField f : meta.getInputFields()) {
                IParquetInputField actualField = fieldNamesToTypes.get(f.getFormatFieldName());
                if (actualField != null) {
                    if (f.getFormatType() == 0) {
                        f.setFormatType(actualField.getFormatType());
                    }
                    f.setPrecision(actualField.getPrecision());
                    f.setScale(actualField.getScale());
                }
            }

            data.input.setSchema(createSchemaFromMeta(meta));
            data.input.setInputFile(inputFileName);
            data.input.setSplitSize(SPLIT_SIZE);

            data.splits = data.input.getSplits();
            logDebug("Input split count: {0}", data.splits.size());
        }
        data.currentSplit = 0;
    }

    /** Creates a record reader and row iterator for the current split. */
    void openReader(ParquetInputData data) throws Exception {
        logDebug("Open split {0}", data.currentSplit);
        IPentahoInputSplit sp = data.splits.get(data.currentSplit);
        data.reader = data.input.createRecordReader(sp);
        data.rowIterator = data.reader.iterator();
    }

    @Override
    protected boolean init() {
        // All initialization happens lazily in processRow(), so there is nothing to set up here.
        return true;
    }

    @Override
    protected IBaseFileInputReader createReader(ParquetInputMeta meta, ParquetInputData data, FileObject file)
            throws Exception {
        // Not used: this step reads through the IPentahoInputFormat record readers created in
        // openReader() instead of the BaseFileInputStep reader mechanism.
        return null;
    }

    /**
     * Reads the schema of the Parquet file at the given path, resolving any VFS alias
     * back to the original URI before handing the path to the input format.
     */
    public static List<? extends IParquetInputField> retrieveSchema(
            NamedClusterServiceLocator namedClusterServiceLocator, NamedCluster namedCluster, String path)
            throws Exception {
        FormatService formatService = namedClusterServiceLocator.getService(namedCluster, FormatService.class);
        IPentahoParquetInputFormat in = formatService.createInputFormat(IPentahoParquetInputFormat.class);
        FileObject inputFileObject = KettleVFS.getFileObject(path);
        if (AliasedFileObject.isAliasedFile(inputFileObject)) {
            path = ((AliasedFileObject) inputFileObject).getOriginalURIString();
        }
        return in.readSchema(path);
    }

    /** Copies the step's configured input fields into the field list expected by the input format. */
    public static List<IParquetInputField> createSchemaFromMeta(ParquetInputMetaBase meta) {
        List<IParquetInputField> fields = new ArrayList<>();
        for (ParquetInputField f : meta.getInputFields()) {
            fields.add(f);
        }
        return fields;
    }
}
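
As a rough illustration of how the static helper above might be used, here is a minimal sketch that is not part of the original listing: the locator and cluster are assumed to be supplied by the PDI/OSGi runtime, and the file path is an invented example.

// Hypothetical sketch: print a Parquet file's schema via ParquetInput.retrieveSchema.
// "locator" and "cluster" are assumed to come from the PDI/OSGi environment;
// "hdfs://example/sales.parquet" is an invented path, not from the listing.
static void printParquetSchema(NamedClusterServiceLocator locator, NamedCluster cluster)
        throws Exception {
    List<? extends IParquetInputField> schema =
            ParquetInput.retrieveSchema(locator, cluster, "hdfs://example/sales.parquet");
    for (IParquetInputField field : schema) {
        System.out.println(field.getFormatFieldName() + " -> type " + field.getFormatType()
                + ", precision " + field.getPrecision() + ", scale " + field.getScale());
    }
}

Presumably retrieveSchema is static so that callers such as the step's configuration dialog can inspect a file's schema without instantiating the step or running a transformation.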