org.openbi.kettle.plugins.parquetoutput.ParquetOutput.java Source code

Introduction

Here is the source code for org.openbi.kettle.plugins.parquetoutput.ParquetOutput.java

Source

/*! ******************************************************************************
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ******************************************************************************/

package org.openbi.kettle.plugins.parquetoutput;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.commons.vfs.FileObject;
import org.apache.hadoop.fs.Path;
import org.pentaho.di.core.Const;
import org.pentaho.di.core.ResultFile;
import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.core.exception.KettleFileException;
import org.pentaho.di.core.exception.KettleStepException;
import org.pentaho.di.core.row.ValueMetaInterface;
import org.pentaho.di.core.variables.VariableSpace;
import org.pentaho.di.core.vfs.KettleVFS;
import org.pentaho.di.i18n.BaseMessages;
import org.pentaho.di.trans.Trans;
import org.pentaho.di.trans.TransMeta;
import org.pentaho.di.trans.step.BaseStep;
import org.pentaho.di.trans.step.StepDataInterface;
import org.pentaho.di.trans.step.StepInterface;
import org.pentaho.di.trans.step.StepMeta;
import org.pentaho.di.trans.step.StepMetaInterface;
import parquet.avro.AvroParquetWriter;
import parquet.hadoop.ParquetWriter;
import parquet.hadoop.metadata.CompressionCodecName;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

/**
 * Converts input rows to Avro GenericRecords and writes them to one or more Parquet files.
 *
 * @author Inquidia Consulting
 */
public class ParquetOutput extends BaseStep implements StepInterface {
    private static Class<?> PKG = ParquetOutputMeta.class; // for i18n purposes, needed by Translator2!!

    public ParquetOutputMeta meta;

    public ParquetOutputData data;

    private ParquetOutputField[] avroOutputFields;
    private int outputFieldIndex;

    public ParquetOutput(StepMeta stepMeta, StepDataInterface stepDataInterface, int copyNr, TransMeta transMeta,
            Trans trans) {
        super(stepMeta, stepDataInterface, copyNr, transMeta, trans);
    }
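
    /**
     * Recursively converts one Kettle row into an Avro GenericRecord.
     * Dotted output paths (e.g. "a.b.c") become nested records: the method
     * walks the sorted avroOutputFields array, recursing into a child record
     * while the current path has further segments under the parent, and
     * returning to the caller once a field no longer shares the parent path.
     */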

    private GenericRecord getRecord(Object[] r, String parentPath, Schema inputSchema) throws KettleException {
        String parentName = "";
        if (parentPath != null) {
            parentName = parentPath; // String is immutable; no defensive copy needed
        }

        Schema recordSchema = inputSchema;

        List<Schema> unionSchemas = null;
        if (inputSchema.getType() == Schema.Type.UNION) {
            unionSchemas = inputSchema.getTypes();
            if (unionSchemas != null) {
                for (int i = 0; i < unionSchemas.size(); i++) {
                    if (unionSchemas.get(i).getType() == Schema.Type.RECORD) {
                        recordSchema = unionSchemas.get(i);
                        break;
                    }
                }
            }
        }

        GenericRecord result = new GenericData.Record(recordSchema);
        while (outputFieldIndex < avroOutputFields.length) {

            ParquetOutputField aof = avroOutputFields[outputFieldIndex];
            String avroName = aof.getPath();

            if (avroName.startsWith("$.")) {
                avroName = avroName.substring(2);
            }
            if (parentName == null || parentName.length() == 0 || avroName.startsWith(parentName + ".")) {
                if (parentName != null && parentName.length() > 0) {
                    avroName = avroName.substring(parentName.length() + 1);
                }
                if (avroName.contains(".")) {
                    String currentAvroPath = avroName.substring(0, avroName.indexOf("."));
                    Schema childSchema = recordSchema.getField(currentAvroPath).schema();
                    String childPath = parentName + "." + currentAvroPath;
                    if (parentName == null || parentName.length() == 0) {
                        childPath = currentAvroPath;
                    }

                    GenericRecord fieldRecord = getRecord(r, childPath, childSchema);
                    result.put(currentAvroPath, fieldRecord);
                } else {
                    Object value = getValue(r, meta.getOutputFields()[outputFieldIndex],
                            data.fieldnrs[outputFieldIndex]);
                    if (value != null) {
                        result.put(avroName, value);
                    }
                    outputFieldIndex++;
                }
            } else {
                break;
            }
        }
        return result;
    }
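
    /**
     * Builds the Avro schema the Parquet files are written with. Fields whose
     * paths contain dots are grouped by their first path segment and turned
     * into nested record schemas recursively; a nullable field (or a nested
     * record all of whose members are nullable) becomes a union of null and
     * its base type.
     */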

    public Schema createAvroSchema(List<ParquetOutputField> avroFields, String parentPath) throws KettleException {
        //Get standard schema stuff
        String doc = "Generated by Parquet Output Step";
        String recordName = "parquet";
        String namespace = "pentaho";

        // Strip the leading "$." prefix so we do not have to deal with it below.
        if (parentPath.startsWith("$.")) {
            parentPath = parentPath.substring(2);
        }

        if (parentPath.endsWith(".")) {
            parentPath = parentPath.substring(0, parentPath.length() - 1);
        }

        // If the parent path is not empty, the doc and record name should not be the defaults.
        if (!parentPath.isEmpty()) {
            doc = "Auto generated for path " + parentPath;
            recordName = parentPath.replaceAll("[^A-Za-z0-9\\_]", "_");
        }

        //Create the result schema
        Schema result = Schema.createRecord(recordName, doc, namespace, false);

        List<Schema.Field> resultFields = new ArrayList<Schema.Field>();

        // Cannot use an iterator because the list is modified inside this loop.
        for (int i = 0; i < avroFields.size(); i++) {
            if (avroFields.get(i) != null) {
                ParquetOutputField field = avroFields.get(i);

                String avroName = field.getPath();

                //Get rid of the $. stuff
                if (avroName.startsWith("$.")) {
                    avroName = avroName.substring(2);
                }

                //The avroName includes the parent path.  We do not want the parent path for our evaluation.
                String finalName = avroName;
                if (!parentPath.isEmpty()) {
                    finalName = avroName.substring(parentPath.length() + 1);
                }

                if (finalName.contains(".")) //It has children, perform children processing.
                {
                    StringBuilder builder = new StringBuilder();
                    if (!parentPath.isEmpty()) {
                        builder.append(parentPath).append(".");
                    }
                    builder.append(finalName.substring(0, finalName.indexOf("."))).append(".");
                    String subPath = builder.toString();
                    List<ParquetOutputField> subFields = new ArrayList<ParquetOutputField>();
                    subFields.add(field);
                    boolean nullable = field.getNullable();
                    for (int e = i + 1; e < avroFields.size(); e++) {
                        if (avroFields.get(e) != null) {
                            ParquetOutputField subFieldCandidate = avroFields.get(e);

                            String candidateName = subFieldCandidate.getPath();
                            if (candidateName.startsWith("$.")) {
                                candidateName = candidateName.substring(2);
                            }

                            if (candidateName.startsWith(subPath)) {
                                if (nullable) {
                                    nullable = subFieldCandidate.getNullable();
                                }

                                subFields.add(subFieldCandidate);
                                avroFields.remove(e);
                                e--;
                            }
                        }
                    }
                    subPath = subPath.substring(0, subPath.length() - 1);

                    Schema subSchema = createAvroSchema(subFields, subPath);
                    Schema outSchema = subSchema;
                    if (nullable) {
                        Schema nullSchema = Schema.create(Schema.Type.NULL);
                        List<Schema> unionList = new ArrayList<Schema>();
                        unionList.add(nullSchema);
                        unionList.add(subSchema);
                        Schema unionSchema = Schema.createUnion(unionList);
                        outSchema = unionSchema;
                    }
                    Schema.Field schemaField = new Schema.Field(finalName.substring(0, finalName.indexOf(".")),
                            outSchema, null, null);
                    resultFields.add(schemaField);
                } else { // Not a sub-field; create the field directly.
                    int fieldIndex = data.outputRowMeta.indexOfValue(field.getName());
                    Schema.Type fieldType = ParquetOutputField
                            .getDefaultAvroType(data.outputRowMeta.getValueMeta(fieldIndex).getType());
                    Schema fieldSchema = Schema.create(fieldType);
                    Schema outSchema;
                    if (field.getNullable()) {
                        Schema nullSchema = Schema.create(Schema.Type.NULL);

                        List<Schema> unionSchema = new ArrayList<Schema>();
                        unionSchema.add(nullSchema);
                        unionSchema.add(fieldSchema);
                        outSchema = Schema.createUnion(unionSchema);
                    } else {
                        outSchema = fieldSchema;
                    }
                    Schema.Field outField = new Schema.Field(finalName, outSchema, null, null);
                    resultFields.add(outField);
                }
            }
        }

        result.setFields(resultFields);
        return result;
    }
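
    /**
     * On the first row this builds the Avro schema, sorts the output fields
     * and resolves their indexes in the input row. For every row it looks up
     * (or opens) the writer for the target filename, converts the row to a
     * GenericRecord and writes it, then passes the row downstream. Returns
     * false once the input is exhausted and the writers have been closed.
     */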

    public synchronized boolean processRow(StepMetaInterface smi, StepDataInterface sdi) throws KettleException {
        meta = (ParquetOutputMeta) smi;
        data = (ParquetOutputData) sdi;

        boolean result = true;
        Object[] r = getRow(); // This also waits for a row to be finished.

        if (r != null && first) {
            first = false;

            data.outputRowMeta = getInputRowMeta().clone();
            meta.getFields(data.outputRowMeta, getStepname(), null, null, this, repository, metaStore);

            avroOutputFields = meta.getOutputFields();

            List<ParquetOutputField> fields = new ArrayList<ParquetOutputField>();
            for (ParquetOutputField avroField : meta.getOutputFields()) {
                fields.add(avroField);
            }
            data.avroSchema = createAvroSchema(fields, "");

            Arrays.sort(avroOutputFields);

            data.fieldnrs = new int[avroOutputFields.length];
            for (int i = 0; i < avroOutputFields.length; i++) {
                if (avroOutputFields[i].validate()) {
                    data.fieldnrs[i] = data.outputRowMeta.indexOfValue(avroOutputFields[i].getName());
                    if (data.fieldnrs[i] < 0) {
                        throw new KettleStepException("Field [" + avroOutputFields[i].getName()
                                + "] couldn't be found in the input stream!");
                    }
                }
            }

        }

        if (r == null) {
            // no more input to be expected...
            closeFile();
            setOutputDone();
            data.avroSchema = null;
            return false;
        }

        String filename = meta.getFilename();

        if (meta.isAcceptFilenameFromField()) {
            filename = getInputRowMeta().getString(r, environmentSubstitute(meta.getFilenameField()), "");
        }

        if (Const.isEmpty(filename)) {
            throw new KettleException("Filename is empty!");
        }

        int fileIndex = data.openFiles.indexOf(filename);

        if (fileIndex < 0) {

            try {
                openNewFile(filename);
            } catch (Exception e) {
                logError("Couldn't open file " + filename, e);
                setErrors(1L);
                stopAll();
                return false;
            }
            fileIndex = data.openFiles.indexOf(filename);
        }

        if (fileIndex < 0) {
            logError("Failed to open file.");
            return false;
        }

        outputFieldIndex = 0;
        GenericRecord row = getRecord(r, null, data.avroSchema);

        try {
            data.parquetWriters.get(fileIndex).write(row);
        } catch (IOException ex) {
            throw new KettleException("Error writing row", ex);
        }

        // Pass the row on to the next step(s) unchanged.
        putRow(data.outputRowMeta, r);

        if (checkFeedback(getLinesOutput())) {
            logBasic("linenr " + getLinesOutput());
        }

        return result;
    }
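
    /**
     * Extracts the value for one output field from the Kettle row: integers
     * become Long, numbers and big numbers become Double, booleans become
     * Boolean, and everything else is rendered as String. This is expected to
     * line up with the default Avro types chosen in createAvroSchema().
     */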

    public Object getValue(Object[] r, ParquetOutputField outputField, int inputFieldIndex) throws KettleException {
        Object value;

        switch (data.outputRowMeta.getValueMeta(inputFieldIndex).getType()) {
        case ValueMetaInterface.TYPE_INTEGER:
            value = data.outputRowMeta.getInteger(r, inputFieldIndex);
            break;
        case ValueMetaInterface.TYPE_BIGNUMBER:
        case ValueMetaInterface.TYPE_NUMBER:
            value = data.outputRowMeta.getNumber(r, inputFieldIndex);
            break;
        case ValueMetaInterface.TYPE_BOOLEAN:
            value = data.outputRowMeta.getBoolean(r, inputFieldIndex);
            break;
        default:
            value = data.outputRowMeta.getString(r, inputFieldIndex);
            break;
        }

        return value;
    }

    public String buildFilename(String filename) {
        return meta.buildFilename(filename, this, getCopy(), getPartitionID(), meta);
    }
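
    /**
     * Resolves variables in the base filename, optionally creates the parent
     * folder, and opens an AvroParquetWriter for the file. The block size is
     * configured in megabytes and the page size in kilobytes; both are
     * converted to bytes before being handed to the writer.
     */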

    public void openNewFile(String baseFilename) throws KettleException {
        if (baseFilename == null) {
            throw new KettleFileException(BaseMessages.getString(PKG, "ParquetOutput.Exception.FileNameNotSet"));
        }

        String filename = buildFilename(environmentSubstitute(baseFilename));

        try {
            // Check for parent folder creation only if the user asks for it
            //
            if (meta.isCreateParentFolder()) {
                createParentFolder(filename);
            }

            if (log.isDetailed()) {
                logDetailed("Opening output file in default encoding");
            }

            String compressionCodec = environmentSubstitute(meta.getCompressionCodec());

            if (Const.isEmpty(compressionCodec) || compressionCodec.equalsIgnoreCase("none")) {
                compressionCodec = "uncompressed";
            }

            CompressionCodecName compressionCodecName = CompressionCodecName.fromConf(compressionCodec);

            // Block size is configured in megabytes; convert it to bytes.
            int blockSize = Const.toInt(environmentSubstitute(meta.getBlockSize()), -1) * 1024 * 1024;

            if (blockSize <= 0) {
                throw new KettleException("Error setting block size.  Must be greater than 0.");
            }

            int pageSize = Const.toInt(environmentSubstitute(meta.getPageSize()), -1) * 1024;

            if (pageSize <= 0) {
                throw new KettleException("Error setting page size.  Must be greater than 0.");
            }

            /* HadoopConfiguration hadoopConfiguration =
               HadoopConfigurationBootstrap.getHadoopConfigurationProvider().getActiveConfiguration();
             HadoopShim shim = hadoopConfiguration.getHadoopShim();
             Configuration conf = shim.createConfiguration();
                
             FileSystem fs = shim.getFileSystem( conf );
             Path path = fs.asPath( file.getName().getURI() );
             */

            FileObject file = KettleVFS.getFileObject(filename);

            //Path path = shim.getFileSystem( conf ).asPath( file.getName().getURI() );
            Path path = new Path(file.getName().getURI());

            if (meta.isCleanOutput() && file.exists()) {
                file.delete();
            }

            data.parquetWriters
                    .add(new AvroParquetWriter(path, data.avroSchema, compressionCodecName, blockSize, pageSize));
            data.openFiles.add(baseFilename);

            if (log.isDetailed()) {
                logDetailed("Opened new file with name [" + filename + "]");
            }
        } catch (Exception e) {
            throw new KettleException("Error opening new file : " + e.toString(), e);
        }

        data.splitnr++;

        if (meta.isAddToResult()) {
            // Add this to the result file names...
            ResultFile resultFile = new ResultFile(ResultFile.FILE_TYPE_GENERAL,
                    getFileObject(filename, getTransMeta()), getTransMeta().getName(), getStepname());
            resultFile.setComment(BaseMessages.getString(PKG, "AvroOutput.AddResultFile"));
            addResultFile(resultFile);
        }
    }
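
    /**
     * Closes every open ParquetWriter, which flushes buffered row groups and
     * writes the Parquet footer. Close errors are logged but not treated as
     * fatal.
     */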

    private boolean closeFile() {
        boolean retval = true;

        if (data.parquetWriters != null) {
            Iterator<ParquetWriter> openFiles = data.parquetWriters.iterator();
            while (openFiles.hasNext()) {
                ParquetWriter writer = openFiles.next();
                if (writer != null) {
                    try {
                        writer.close();
                    } catch (Exception e) {
                        logBasic("Error trying to close file. This may not be a problem.");
                        logDetailed("Stack trace from error trying to close file:", e);
                        retval = false;
                    }
                }
            }

            if (log.isDebug()) {
                logDebug("Closed all open parquet writers.");
            }
        }

        return retval;
    }

    public boolean init(StepMetaInterface smi, StepDataInterface sdi) {
        meta = (ParquetOutputMeta) smi;
        data = (ParquetOutputData) sdi;

        if (super.init(smi, sdi)) {
            data.splitnr = 0;

            return true;
        }

        return false;
    }

    public void dispose(StepMetaInterface smi, StepDataInterface sdi) {
        meta = (ParquetOutputMeta) smi;
        data = (ParquetOutputData) sdi;

        if (data.parquetWriters.size() > 0) {
            closeFile();
        }
        data.parquetWriters.clear();
        data.avroSchema = null;

        super.dispose(smi, sdi);
    }

    private void createParentFolder(String filename) throws Exception {
        // Check for parent folder
        FileObject parentfolder = null;
        try {
            // Get parent folder
            parentfolder = getFileObject(filename).getParent();
            if (parentfolder.exists()) {
                if (isDetailed()) {
                    logDetailed(BaseMessages.getString(PKG, "AvroOutput.Log.ParentFolderExist",
                            parentfolder.getName()));
                }
            } else {
                if (isDetailed()) {
                    logDetailed(BaseMessages.getString(PKG, "AvroOutput.Log.ParentFolderNotExist",
                            parentfolder.getName()));
                }
                if (meta.isCreateParentFolder()) {
                    parentfolder.createFolder();
                    if (isDetailed()) {
                        logDetailed(BaseMessages.getString(PKG, "AvroOutput.Log.ParentFolderCreated",
                                parentfolder.getName()));
                    }
                } else {
                    throw new KettleException(BaseMessages.getString(PKG,
                            "AvroOutput.Log.ParentFolderNotExistCreateIt", parentfolder.getName(), filename));
                }
            }

        } finally {
            if (parentfolder != null) {
                try {
                    parentfolder.close();
                } catch (Exception ex) {
                    // Ignore
                }
            }

        }
    }

    protected FileObject getFileObject(String vfsFilename) throws KettleFileException {
        return KettleVFS.getFileObject(vfsFilename);
    }

    protected FileObject getFileObject(String vfsFilename, VariableSpace space) throws KettleFileException {
        return KettleVFS.getFileObject(vfsFilename, space);
    }

}
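
Usage sketch

For reference, below is a minimal, self-contained sketch (not part of the plugin; the class name, file path, field names, and sizes are illustrative assumptions) that exercises the same parquet.avro API the step uses above, assuming the parquet-avro 1.x and Avro artifacts imported by the class are on the classpath. It builds an Avro record schema the way createAvroSchema() does, fills a GenericData.Record the way getRecord() does, and writes it through the five-argument AvroParquetWriter constructor seen in openNewFile().

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.Path;
import parquet.avro.AvroParquetWriter;
import parquet.hadoop.metadata.CompressionCodecName;

import java.util.ArrayList;
import java.util.List;

public class ParquetWriteSketch {
    public static void main(String[] args) throws Exception {
        // Build a flat two-field record schema, mirroring createAvroSchema().
        Schema schema = Schema.createRecord("parquet", "example", "pentaho", false);
        List<Schema.Field> fields = new ArrayList<Schema.Field>();
        fields.add(new Schema.Field("id", Schema.create(Schema.Type.LONG), null, null));
        fields.add(new Schema.Field("name", Schema.create(Schema.Type.STRING), null, null));
        schema.setFields(fields);

        // Same five-argument constructor the step uses in openNewFile();
        // block size and page size are already expressed in bytes here.
        AvroParquetWriter<GenericRecord> writer = new AvroParquetWriter<GenericRecord>(
                new Path("/tmp/parquet-output-sketch.parquet"), schema,
                CompressionCodecName.SNAPPY, 128 * 1024 * 1024, 1024 * 1024);
        try {
            // Fill one record the way getRecord() does for each Kettle row.
            GenericRecord record = new GenericData.Record(schema);
            record.put("id", 1L);
            record.put("name", "first row");
            writer.write(record);
        } finally {
            writer.close(); // flushes buffered row groups and writes the Parquet footer
        }
    }
}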