Java tutorial
/*! ****************************************************************************** * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ******************************************************************************/ package org.openbi.kettle.plugins.avrooutput; import org.apache.avro.Schema; import org.apache.avro.file.CodecFactory; import org.apache.avro.file.DataFileWriter; import org.apache.avro.generic.GenericData; import org.apache.avro.generic.GenericDatumWriter; import org.apache.avro.generic.GenericRecord; import org.apache.commons.vfs.FileObject; import org.pentaho.di.core.Const; import org.pentaho.di.core.ResultFile; import org.pentaho.di.core.exception.KettleException; import org.pentaho.di.core.exception.KettleFileException; import org.pentaho.di.core.exception.KettleStepException; import org.pentaho.di.core.variables.VariableSpace; import org.pentaho.di.core.vfs.KettleVFS; import org.pentaho.di.i18n.BaseMessages; import org.pentaho.di.trans.Trans; import org.pentaho.di.trans.TransMeta; import org.pentaho.di.trans.step.BaseStep; import org.pentaho.di.trans.step.StepDataInterface; import org.pentaho.di.trans.step.StepInterface; import org.pentaho.di.trans.step.StepMeta; import org.pentaho.di.trans.step.StepMetaInterface; import java.io.BufferedOutputStream; import java.io.File; import java.io.IOException; import java.io.OutputStream; import java.util.ArrayList; import java.util.Arrays; import java.util.List; /** * Converts input rows to avro and then writes this avro to one or more files. * * @author Inquidia Consulting */ public class AvroOutput extends BaseStep implements StepInterface { private static Class<?> PKG = AvroOutputMeta.class; // for i18n purposes, needed by Translator2!! public AvroOutputMeta meta; public AvroOutputData data; private AvroOutputField[] avroOutputFields; private int outputFieldIndex; public AvroOutput(StepMeta stepMeta, StepDataInterface stepDataInterface, int copyNr, TransMeta transMeta, Trans trans) { super(stepMeta, stepDataInterface, copyNr, transMeta, trans); } private GenericRecord getRecord(Object[] r, String parentPath, Schema inputSchema) throws KettleException { String parentName = ""; if (parentPath != null) { parentName = new String(parentPath); } Schema recordSchema = inputSchema; List<Schema> unionSchemas = null; if (inputSchema.getType() == Schema.Type.UNION) { unionSchemas = inputSchema.getTypes(); if (unionSchemas != null) { for (int i = 0; i < unionSchemas.size(); i++) { if (unionSchemas.get(i).getType() == Schema.Type.RECORD) { recordSchema = unionSchemas.get(i); break; } } } } GenericRecord result = new GenericData.Record(recordSchema); while (outputFieldIndex < avroOutputFields.length) { AvroOutputField aof = avroOutputFields[outputFieldIndex]; String avroName = aof.getAvroName(); if (avroName.startsWith("$.")) { avroName = avroName.substring(2); } if (parentName == null || parentName.length() == 0 || avroName.startsWith(parentName + ".")) { if (parentName != null && parentName.length() > 0) { avroName = avroName.substring(parentName.length() + 1); } if (avroName.contains(".")) { String currentAvroPath = avroName.substring(0, avroName.indexOf(".")); Schema childSchema = recordSchema.getField(currentAvroPath).schema(); String childPath = parentName + "." + currentAvroPath; if (parentName == null || parentName.length() == 0) { childPath = currentAvroPath; } GenericRecord fieldRecord = getRecord(r, childPath, childSchema); result.put(currentAvroPath, fieldRecord); } else { Object value = getValue(r, meta.getOutputFields()[outputFieldIndex], data.fieldnrs[outputFieldIndex]); if (value != null) { result.put(avroName, value); } outputFieldIndex++; } } else { break; } } return result; } public Schema createAvroSchema(List<AvroOutputField> avroFields, String parentPath) throws KettleException { //Get standard schema stuff String doc = meta.getDoc(); String recordName = meta.getRecordName(); String namespace = meta.getNamespace(); //do not want to have to deal with $. paths. if (parentPath.startsWith("$.")) { parentPath = parentPath.substring(2); } if (parentPath.endsWith(".")) { parentPath = parentPath.substring(0, parentPath.length() - 1); } // If the parent path is not empty the doc and recordname should not be the default if (!parentPath.isEmpty()) { doc = "Auto generated for path " + parentPath; recordName = parentPath.replaceAll("[^A-Za-z0-9\\_]", "_"); } //Create the result schema Schema result = Schema.createRecord(recordName, doc, namespace, false); List<Schema.Field> resultFields = new ArrayList<Schema.Field>(); //Can not use an iterator because we will change the list in the middle of this loop for (int i = 0; i < avroFields.size(); i++) { if (avroFields.get(i) != null) { AvroOutputField field = avroFields.get(i); String avroName = field.getAvroName(); //Get rid of the $. stuff if (avroName.startsWith("$.")) { avroName = avroName.substring(2); } //The avroName includes the parent path. We do not want the parent path for our evaluation. String finalName = avroName; if (!parentPath.isEmpty()) { finalName = avroName.substring(parentPath.length() + 1); } if (finalName.contains(".")) //It has children, perform children processing. { StringBuilder builder = new StringBuilder(); if (!parentPath.isEmpty()) { builder.append(parentPath).append("."); } builder.append(finalName.substring(0, finalName.indexOf("."))).append("."); String subPath = builder.toString(); List<AvroOutputField> subFields = new ArrayList<AvroOutputField>(); subFields.add(field); boolean nullable = field.getNullable(); for (int e = i + 1; e < avroFields.size(); e++) { if (avroFields.get(e) != null) { AvroOutputField subFieldCandidate = avroFields.get(e); String candidateName = subFieldCandidate.getAvroName(); if (candidateName.startsWith("$.")) { candidateName = candidateName.substring(2); } if (candidateName.startsWith(subPath)) { if (nullable) { nullable = subFieldCandidate.getNullable(); } subFields.add(subFieldCandidate); avroFields.remove(e); e--; } } } subPath = subPath.substring(0, subPath.length() - 1); Schema subSchema = createAvroSchema(subFields, subPath); Schema outSchema = subSchema; if (nullable) { Schema nullSchema = Schema.create(Schema.Type.NULL); List<Schema> unionList = new ArrayList<Schema>(); unionList.add(nullSchema); unionList.add(subSchema); Schema unionSchema = Schema.createUnion(unionList); outSchema = unionSchema; } Schema.Field schemaField = new Schema.Field(finalName.substring(0, finalName.indexOf(".")), outSchema, null, null); resultFields.add(schemaField); } else { //Is not a sub field create the field. Schema fieldSchema = Schema.create(field.getAvroSchemaType()); Schema outSchema; if (field.getNullable()) { Schema nullSchema = Schema.create(Schema.Type.NULL); List<Schema> unionSchema = new ArrayList<Schema>(); unionSchema.add(nullSchema); unionSchema.add(fieldSchema); outSchema = Schema.createUnion(unionSchema); } else { outSchema = fieldSchema; } Schema.Field outField = new Schema.Field(finalName, outSchema, null, null); resultFields.add(outField); } } } result.setFields(resultFields); return result; } public void writeSchemaFile() throws KettleException { List<AvroOutputField> fields = new ArrayList<AvroOutputField>(); for (AvroOutputField avroField : meta.getOutputFields()) { fields.add(avroField); } data.avroSchema = createAvroSchema(fields, ""); if (log.isDetailed()) { logDetailed("Automatically generated Avro schema."); } if (meta.getWriteSchemaFile()) { if (log.isDetailed()) { logDetailed("Writing schema file."); } try { String schemaFileName = buildFilename(environmentSubstitute(meta.getSchemaFileName()), true); if (meta.getCreateParentFolder()) { logDetailed("Creating parent folder for schema file"); createParentFolder(schemaFileName); } OutputStream outputStream = getOutputStream(schemaFileName, getTransMeta(), false); if (log.isDetailed()) { logDetailed("Opening output stream in default encoding"); } OutputStream schemaWriter = new BufferedOutputStream(outputStream, 5000); if (log.isDetailed()) { logDetailed("Opened new file with name [" + schemaFileName + "]"); } schemaWriter.write(data.avroSchema.toString(true).getBytes()); schemaWriter.close(); schemaWriter = null; if (log.isDetailed()) { logDetailed("Closed schema file with name [" + schemaFileName + "]"); } } catch (Exception e) { throw new KettleException("Error opening new file : " + e.toString()); } } } public synchronized boolean processRow(StepMetaInterface smi, StepDataInterface sdi) throws KettleException { meta = (AvroOutputMeta) smi; data = (AvroOutputData) sdi; boolean result = true; Object[] r = getRow(); // This also waits for a row to be finished. if (r != null && first) { first = false; avroOutputFields = meta.getOutputFields(); try { if (meta.getCreateSchemaFile()) { logDetailed("Generating Avro schema."); writeSchemaFile(); } else { logDetailed("Reading Avro schema from file."); data.avroSchema = new Schema.Parser().parse(new File(meta.getSchemaFileName())); } data.datumWriter = new GenericDatumWriter<GenericRecord>(data.avroSchema); data.dataFileWriter = new DataFileWriter<GenericRecord>(data.datumWriter); if (!Const.isEmpty(meta.getCompressionType()) && !meta.getCompressionType().equalsIgnoreCase("none")) { data.dataFileWriter.setCodec(CodecFactory.fromString(meta.getCompressionType())); } data.dataFileWriter.create(data.avroSchema, data.writer); } catch (IOException ex) { logError("Could not open or create file " + meta.getSchemaFileName(), ex); setErrors(1L); stopAll(); return false; } Arrays.sort(avroOutputFields); data.outputRowMeta = getInputRowMeta().clone(); meta.getFields(data.outputRowMeta, getStepname(), null, null, this, repository, metaStore); data.fieldnrs = new int[avroOutputFields.length]; for (int i = 0; i < avroOutputFields.length; i++) { if (avroOutputFields[i].validate()) { data.fieldnrs[i] = data.outputRowMeta.indexOfValue(avroOutputFields[i].getName()); if (data.fieldnrs[i] < 0) { throw new KettleStepException("Field [" + avroOutputFields[i].getName() + "] couldn't be found in the input stream!"); } } } } if (r == null) { // no more input to be expected... closeFile(); setOutputDone(); data.datumWriter = null; data.avroSchema = null; return false; } outputFieldIndex = 0; GenericRecord row = getRecord(r, null, data.avroSchema); try { data.dataFileWriter.append(row); } catch (IOException i) { throw new KettleException(i); } // First handle the file name in field // Write a header line as well if needed // putRow(data.outputRowMeta, r); // in case we want it to go further... if (checkFeedback(getLinesOutput())) { logBasic("linenr " + getLinesOutput()); } return result; } public Object getValue(Object[] r, AvroOutputField outputField, int inputFieldIndex) throws KettleException { Object value; switch (outputField.getAvroType()) { case AvroOutputField.AVRO_TYPE_INT: value = data.outputRowMeta.getInteger(r, inputFieldIndex).intValue(); break; case AvroOutputField.AVRO_TYPE_STRING: value = data.outputRowMeta.getString(r, inputFieldIndex); break; case AvroOutputField.AVRO_TYPE_LONG: value = data.outputRowMeta.getInteger(r, inputFieldIndex); break; case AvroOutputField.AVRO_TYPE_FLOAT: value = data.outputRowMeta.getNumber(r, inputFieldIndex).floatValue(); break; case AvroOutputField.AVRO_TYPE_DOUBLE: value = data.outputRowMeta.getNumber(r, inputFieldIndex); break; case AvroOutputField.AVRO_TYPE_BOOLEAN: value = data.outputRowMeta.getBoolean(r, inputFieldIndex); break; default: throw new KettleException("Avro type " + outputField.getAvroTypeDesc() + " is not supported for field " + outputField.getAvroName() + "."); } return value; } public String buildFilename(String filename, boolean ziparchive) { return meta.buildFilename(filename, this, getCopy(), getPartitionID(), data.splitnr, ziparchive, meta); } public void openNewFile(String baseFilename) throws KettleException { if (baseFilename == null) { throw new KettleFileException(BaseMessages.getString(PKG, "AvroOutput.Exception.FileNameNotSet")); } data.writer = null; String filename = buildFilename(environmentSubstitute(baseFilename), true); try { // Check for parent folder creation only if the user asks for it // if (meta.getCreateParentFolder()) { createParentFolder(filename); } OutputStream outputStream = getOutputStream(filename, getTransMeta(), false); if (log.isDetailed()) { logDetailed("Opening output stream in default encoding"); } data.writer = new BufferedOutputStream(outputStream, 5000); if (log.isDetailed()) { logDetailed("Opened new file with name [" + filename + "]"); } } catch (Exception e) { throw new KettleException("Error opening new file : " + e.toString()); } data.splitnr++; if (meta.getAddToResultFiles()) { // Add this to the result file names... ResultFile resultFile = new ResultFile(ResultFile.FILE_TYPE_GENERAL, getFileObject(filename, getTransMeta()), getTransMeta().getName(), getStepname()); resultFile.setComment(BaseMessages.getString(PKG, "AvroOutput.AddResultFile")); addResultFile(resultFile); } } private boolean closeFile() { boolean retval = false; try { if (data.writer != null) { data.writer.flush(); if (log.isDebug()) { logDebug("Closing output stream"); } if (data.dataFileWriter != null) { data.dataFileWriter.close(); } data.writer.close(); data.writer = null; data.dataFileWriter = null; if (log.isDebug()) { logDebug("Closed output stream"); } } data.datumWriter = null; data.avroSchema = null; retval = true; } catch (Exception e) { logError("Exception trying to close file: " + e.toString()); setErrors(1); retval = false; } return retval; } public boolean init(StepMetaInterface smi, StepDataInterface sdi) { meta = (AvroOutputMeta) smi; data = (AvroOutputData) sdi; if (super.init(smi, sdi)) { data.splitnr = 0; try { openNewFile(meta.getFileName()); } catch (Exception e) { logError("Couldn't open file " + meta.getFileName(), e); setErrors(1L); stopAll(); } return true; } return false; } public void dispose(StepMetaInterface smi, StepDataInterface sdi) { meta = (AvroOutputMeta) smi; data = (AvroOutputData) sdi; if (data.writer != null) { closeFile(); } data.datumWriter = null; data.avroSchema = null; super.dispose(smi, sdi); } private void createParentFolder(String filename) throws Exception { // Check for parent folder FileObject parentfolder = null; try { // Get parent folder parentfolder = getFileObject(filename).getParent(); if (parentfolder.exists()) { if (isDetailed()) { logDetailed(BaseMessages.getString(PKG, "AvroOutput.Log.ParentFolderExist", parentfolder.getName())); } } else { if (isDetailed()) { logDetailed(BaseMessages.getString(PKG, "AvroOutput.Log.ParentFolderNotExist", parentfolder.getName())); } if (meta.getCreateParentFolder()) { parentfolder.createFolder(); if (isDetailed()) { logDetailed(BaseMessages.getString(PKG, "AvroOutput.Log.ParentFolderCreated", parentfolder.getName())); } } else { throw new KettleException(BaseMessages.getString(PKG, "AvroOutput.Log.ParentFolderNotExistCreateIt", parentfolder.getName(), filename)); } } } finally { if (parentfolder != null) { try { parentfolder.close(); } catch (Exception ex) { // Ignore } } } } protected FileObject getFileObject(String vfsFilename) throws KettleFileException { return KettleVFS.getFileObject(vfsFilename); } protected FileObject getFileObject(String vfsFilename, VariableSpace space) throws KettleFileException { return KettleVFS.getFileObject(vfsFilename, space); } protected OutputStream getOutputStream(String vfsFilename, VariableSpace space, boolean append) throws KettleFileException { return KettleVFS.getOutputStream(vfsFilename, space, append); } }