Java tutorial: ExcelInputReader, a CDAP batch source plugin that reads Excel file rows into structured records
/*
 * Copyright 2016 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.hydrator.plugin.batch.source;

import co.cask.cdap.api.annotation.Description;
import co.cask.cdap.api.annotation.Name;
import co.cask.cdap.api.annotation.Plugin;
import co.cask.cdap.api.data.batch.Input;
import co.cask.cdap.api.data.format.StructuredRecord;
import co.cask.cdap.api.data.schema.Schema;
import co.cask.cdap.api.dataset.DatasetProperties;
import co.cask.cdap.api.dataset.lib.CloseableIterator;
import co.cask.cdap.api.dataset.lib.KeyValue;
import co.cask.cdap.api.dataset.lib.KeyValueTable;
import co.cask.cdap.api.dataset.table.Table;
import co.cask.cdap.etl.api.Emitter;
import co.cask.cdap.etl.api.PipelineConfigurer;
import co.cask.cdap.etl.api.batch.BatchRuntimeContext;
import co.cask.cdap.etl.api.batch.BatchSource;
import co.cask.cdap.etl.api.batch.BatchSourceContext;
import co.cask.hydrator.common.ReferencePluginConfig;
import co.cask.hydrator.common.SourceInputFormatProvider;
import co.cask.hydrator.common.batch.JobUtils;
import co.cask.hydrator.plugin.common.Properties;
import com.google.common.base.Charsets;
import com.google.common.base.Strings;
import com.google.common.collect.Lists;
import com.google.common.reflect.TypeToken;
import com.google.gson.Gson;
import org.apache.commons.collections.CollectionUtils;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Job;

import java.io.IOException;
import java.lang.reflect.Type;
import java.net.URI;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ExecutionException;
import javax.annotation.Nullable;

/**
 * Reads rows from Excel file(s) and converts them to structured records.
 */
@Plugin(type = BatchSource.PLUGIN_TYPE)
@Name("ExcelInputReader")
@Description("Batch source to read one or more Excel files and convert rows to structured records using " +
  "input columns, column-mapping and field-type mapping.")
public class ExcelInputReader extends BatchSource<LongWritable, Object, StructuredRecord> {

  public static final String KEY = "key";
  public static final String FILE = "file";
  public static final String SHEET = "sheet";
  public static final String RECORD = "record";
  public static final String EXIT_ON_ERROR = "Exit on error";
  public static final String WRITE_ERROR_DATASET = "Write to error dataset";
  public static final String NULL = "NULL";
  public static final String END = "END";

  private static final Gson GSON = new Gson();
  private static final Type ARRAYLIST_PREPROCESSED_FILES = new TypeToken<ArrayList<String>>() { }.getType();

  private final ExcelInputReaderConfig excelInputreaderConfig;
  private Schema outputSchema;
  private Map<String, String> columnMapping = new HashMap<>();
  private Map<String, String> outputSchemaMapping = new HashMap<>();
  private List<String> inputColumns;
  private Map<String, String> outputFieldsMapping = new HashMap<>();
  private BatchRuntimeContext batchRuntimeContext;
  private Map<String, Integer> filePrevRowNumMap = new HashMap<>();

  public ExcelInputReader(ExcelInputReaderConfig excelReaderConfig) {
    this.excelInputreaderConfig = excelReaderConfig;
  }

  private Schema errorRecordSchema =
    Schema.recordOf("schema",
                    Schema.Field.of(KEY, Schema.of(Schema.Type.STRING)),
                    Schema.Field.of(FILE, Schema.of(Schema.Type.STRING)),
                    Schema.Field.of(SHEET, Schema.of(Schema.Type.STRING)),
                    Schema.Field.of(RECORD, Schema.of(Schema.Type.STRING)));

  @Override
  public void initialize(BatchRuntimeContext context) throws Exception {
    batchRuntimeContext = context;
    init();
  }

  /**
   * Initializes the column list, column mapping, and output-schema mapping from the input config object.
   */
  private void init() {
    if (!Strings.isNullOrEmpty(excelInputreaderConfig.columnList)) {
      String[] columnsList = excelInputreaderConfig.columnList.split(",");
      inputColumns = Arrays.asList(columnsList);
    }

    if (!Strings.isNullOrEmpty(excelInputreaderConfig.columnMapping)) {
      String[] mappings = excelInputreaderConfig.columnMapping.split(",");
      for (String map : mappings) {
        String[] columns = map.split(":");
        if (CollectionUtils.isNotEmpty(inputColumns) && !inputColumns.contains(columns[0])) {
          throw new IllegalArgumentException("Column name: " + columns[0] + " in 'Column-Label Mapping' does not " +
                                               "match the columns in the 'Columns To Be Extracted' input text box; " +
                                               "it has to be one of the columns listed there.");
        }
        columnMapping.put(columns[0], columns[1]);
      }
    }

    if (!Strings.isNullOrEmpty(excelInputreaderConfig.outputSchema)) {
      String[] schemaList = excelInputreaderConfig.outputSchema.split(",");
      for (String schema : schemaList) {
        String[] columns = schema.split(":");
        if (CollectionUtils.isNotEmpty(inputColumns) && !inputColumns.contains(columns[0])) {
          throw new IllegalArgumentException("Column name: " + columns[0] + " in 'Field Name Schema Type Mapping' " +
                                               "does not match the columns in the 'Columns To Be Extracted' input " +
                                               "text box; it has to be one of the columns listed there.");
        }
        outputSchemaMapping.put(columns[0], columns[1]);
      }
    }
  }
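  /*
   * Record layout consumed by transform() below: each value emitted by the
   * companion ExcelInputFormat is a single tab-separated string. Field 0 is
   * the row number, field 1 the file name, field 2 the sheet name, field 3 an
   * end-of-file marker ("END" on the last row of a file), and every remaining
   * field is a "columnName\rcolumnValue" pair.
   */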
  @Override
  public void transform(KeyValue<LongWritable, Object> input, Emitter<StructuredRecord> emitter) throws Exception {
    getOutputSchema();
    StructuredRecord.Builder builder = StructuredRecord.builder(outputSchema);

    String inputValue = input.getValue().toString();
    String[] excelRecords = inputValue.split("\t");

    String fileName = excelRecords[1];
    String sheetName = excelRecords[2];
    String ifEndRow = excelRecords[3];

    int prevRowNum = Integer.parseInt(excelRecords[0]);
    if (filePrevRowNumMap.containsKey(fileName)) {
      // A gap in row numbers means an empty row was encountered.
      if (prevRowNum - filePrevRowNumMap.get(fileName) > 1 &&
        excelInputreaderConfig.terminateIfEmptyRow.equalsIgnoreCase("true")) {
        throw new ExecutionException("Encountered empty row while reading Excel file: " + fileName +
                                       ". Terminating processing.", new Throwable());
      }
    }
    filePrevRowNumMap.put(fileName, prevRowNum);

    Map<String, String> excelColumnValueMap = new HashMap<>();
    for (String columns : excelRecords) {
      String[] columnValueArray = columns.split("\r");
      if (columnValueArray.length > 1) {
        String columnName = columnValueArray[0];
        String columnValue = columnValueArray[1];
        if (columnMapping.containsKey(columnName)) {
          excelColumnValueMap.put(columnMapping.get(columnName), columnValue);
        } else {
          excelColumnValueMap.put(columnName, columnValue);
        }
      }
    }

    try {
      for (Schema.Field field : outputSchema.getFields()) {
        String fieldName = field.getName();
        if (excelColumnValueMap.containsKey(fieldName)) {
          builder.convertAndSet(fieldName, excelColumnValueMap.get(fieldName));
        } else {
          // No value for this field in the row; set the literal string "NULL".
          builder.set(fieldName, NULL);
        }
      }

      builder.set(FILE, new Path(fileName).getName());
      builder.set(SHEET, sheetName);

      if (ifEndRow.equalsIgnoreCase(END)) {
        trackProcessedFiles(fileName);
      }

      emitter.emit(builder.build());
    } catch (Exception e) {
      switch (excelInputreaderConfig.ifErrorRecord) {
        case EXIT_ON_ERROR:
          throw new IllegalStateException("Terminating processing on error: " + e.getMessage());
        case WRITE_ERROR_DATASET:
          StructuredRecord.Builder errorRecordBuilder = StructuredRecord.builder(errorRecordSchema);
          errorRecordBuilder.set(KEY, fileName + "_" + sheetName + "_" + excelRecords[0]);
          errorRecordBuilder.set(FILE, fileName);
          errorRecordBuilder.set(SHEET, sheetName);
          errorRecordBuilder.set(RECORD, inputValue);
          writeErrorRecordToDataset(errorRecordBuilder.build());
          break;
        default:
          // Ignore the error and continue processing.
          break;
      }
    }
  }

  /**
   * Writes the processed file name to the memory table.
   * @param fileName name of the file that has been fully processed
   */
  private void trackProcessedFiles(String fileName) {
    KeyValueTable processedFileMemoryTable = batchRuntimeContext.getDataset(excelInputreaderConfig.memoryTableName);
    processedFileMemoryTable.write(fileName, String.valueOf(System.currentTimeMillis()));
  }

  /**
   * Writes an error record to the error dataset.
   * @param errorRecord error record in the form of {@link StructuredRecord}
   * @throws IOException if the write to the error dataset fails
   */
  private void writeErrorRecordToDataset(StructuredRecord errorRecord) throws IOException {
    Table errorTable = batchRuntimeContext.getDataset(excelInputreaderConfig.errorDatasetName);
    errorTable.write(errorRecord);
  }
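  /*
   * File-tracking note: trackProcessedFiles() above records each fully-read
   * file in the memory KeyValueTable, and getAllProcessedFiles() below reads
   * that table back in prepareRun(), passing the list to ExcelInputFormat so
   * that already-processed files can be skipped unless 'reprocess' is enabled.
   */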
  /**
   * Returns the list of all processed file names kept in the memory table.
   * @param batchSourceContext context of the batch source
   * @return list of processed file names
   */
  private List<String> getAllProcessedFiles(BatchSourceContext batchSourceContext) {
    KeyValueTable table = batchSourceContext.getDataset(excelInputreaderConfig.memoryTableName);
    List<String> processedFiles = new ArrayList<>();
    try (CloseableIterator<KeyValue<byte[], byte[]>> filesIterable = table.scan(null, null)) {
      while (filesIterable.hasNext()) {
        KeyValue<byte[], byte[]> file = filesIterable.next();
        processedFiles.add(new String(file.getKey(), Charsets.UTF_8));
      }
    }
    return processedFiles;
  }

  @Override
  public void configurePipeline(PipelineConfigurer pipelineConfigurer) {
    super.configurePipeline(pipelineConfigurer);

    if (excelInputreaderConfig.sheetNo < 0 && Strings.isNullOrEmpty(excelInputreaderConfig.sheetName)) {
      throw new IllegalArgumentException("Sheet name cannot be empty when the value of 'Sheet Number' input is " +
                                           "less than 0.");
    }

    if (Strings.isNullOrEmpty(excelInputreaderConfig.columnList) &&
      Strings.isNullOrEmpty(excelInputreaderConfig.outputSchema)) {
      throw new IllegalArgumentException("'Field Name Schema Type Mapping' input cannot be empty when no input " +
                                           "for 'Columns To Be Extracted' is provided.");
    }

    try {
      if (!Strings.isNullOrEmpty(excelInputreaderConfig.errorDatasetName)) {
        Map<String, String> properties = new HashMap<>();
        properties.put(Properties.Table.PROPERTY_SCHEMA, errorRecordSchema.toString());
        properties.put(Properties.Table.PROPERTY_SCHEMA_ROW_FIELD, KEY);
        DatasetProperties datasetProperties = DatasetProperties.builder().addAll(properties).build();

        pipelineConfigurer.createDataset(excelInputreaderConfig.errorDatasetName, Table.class, datasetProperties);
      } else if (excelInputreaderConfig.ifErrorRecord.equalsIgnoreCase(WRITE_ERROR_DATASET)) {
        throw new IllegalArgumentException("Error dataset name must not be empty when 'Write to error dataset' is " +
                                             "chosen for the 'On Error' input.");
      }

      pipelineConfigurer.createDataset(excelInputreaderConfig.memoryTableName, KeyValueTable.class);
    } catch (Exception e) {
      throw new IllegalStateException("Exception while creating dataset.", e);
    }

    init();
    getOutputSchema();
    pipelineConfigurer.getStageConfigurer().setOutputSchema(outputSchema);
  }

  @Override
  public void prepareRun(BatchSourceContext batchSourceContext) throws Exception {
    Job job = JobUtils.createInstance();

    String processFiles = GSON.toJson(getAllProcessedFiles(batchSourceContext), ARRAYLIST_PREPROCESSED_FILES);

    ExcelInputFormat.setConfigurations(job, excelInputreaderConfig.filePattern, excelInputreaderConfig.sheetName,
                                       excelInputreaderConfig.reprocess, excelInputreaderConfig.sheetNo,
                                       excelInputreaderConfig.columnList, excelInputreaderConfig.skipFirstRow,
                                       excelInputreaderConfig.terminateIfEmptyRow, excelInputreaderConfig.rowsLimit,
                                       excelInputreaderConfig.ifErrorRecord, processFiles);

    // Sets the input path(s).
    ExcelInputFormat.addInputPaths(job, excelInputreaderConfig.filePath);

    // Sets the filter based on the extended class implementation.
    ExcelInputFormat.setInputPathFilter(job, ExcelReaderRegexFilter.class);

    SourceInputFormatProvider inputFormatProvider = new SourceInputFormatProvider(ExcelInputFormat.class,
                                                                                  job.getConfiguration());
    batchSourceContext.setInput(Input.of(excelInputreaderConfig.referenceName, inputFormatProvider));
  }
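  /*
   * Schema derivation used by getOutputSchema() below: the 'file' and 'sheet'
   * fields are always present. When no column list is configured, the fields
   * come from the 'outputSchema' (field:type) mapping; otherwise each listed
   * column becomes a field, typed by that mapping when present and STRING
   * otherwise. Renames from 'columnMapping' are applied in both cases.
   */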
  /**
   * Builds the output schema from the user-specified Excel Input Reader configuration, if it has not been built
   * already, and caches it in {@code outputSchema}.
   */
  private void getOutputSchema() {
    if (outputSchema == null) {
      List<Schema.Field> outputFields = Lists.newArrayList();
      outputFields.add(Schema.Field.of(FILE, Schema.of(Schema.Type.STRING)));
      outputFields.add(Schema.Field.of(SHEET, Schema.of(Schema.Type.STRING)));
      try {
        // If processing of all the columns is required
        if (inputColumns == null || inputColumns.isEmpty()) {
          for (String fieldName : outputSchemaMapping.keySet()) {
            String columnName = fieldName;
            if (columnMapping.containsKey(fieldName)) {
              columnName = columnMapping.get(fieldName);
            }

            Schema fieldType = Schema.of(Schema.Type.valueOf(outputSchemaMapping.get(fieldName).toUpperCase()));
            outputFields.add(Schema.Field.of(columnName, fieldType));
          }
        } else {
          for (String column : inputColumns) {
            String columnName = column;
            if (columnMapping.containsKey(column)) {
              columnName = columnMapping.get(column);
            }

            if (outputSchemaMapping.containsKey(column)) {
              Schema fieldType = Schema.of(Schema.Type.valueOf(outputSchemaMapping.get(column).toUpperCase()));
              outputFields.add(Schema.Field.of(columnName, fieldType));
            } else {
              outputFields.add(Schema.Field.of(columnName, Schema.of(Schema.Type.STRING)));
            }

            outputFieldsMapping.put(column, columnName);
          }
        }
      } catch (Exception e) {
        throw new IllegalArgumentException("Exception while creating output schema for Excel input reader. " +
                                             "Invalid output schema: " + e.getMessage(), e);
      }
      outputSchema = Schema.recordOf("outputSchema", outputFields);
    }
  }

  /**
   * Config class for ExcelInputReader.
   */
  public static class ExcelInputReaderConfig extends ReferencePluginConfig {

    @Name("filePath")
    @Description("Path of the Excel file(s) to be read; for example: 'file:///home/cdap' for a " +
      "local file and 'hdfs://10.222.73.37:9000/tmp' for a file in HDFS.")
    private String filePath;

    @Name("filePattern")
    @Description("Regex pattern to select specific file(s); for example: '.*'")
    private String filePattern;

    @Name("memoryTableName")
    @Description("KeyValue table name used to keep track of processed files. This can be a new table or an " +
      "existing one; for example: 'inventory-memory-table'")
    private String memoryTableName;

    @Name("reprocess")
    @Description("Specifies whether the file(s) should be reprocessed. Options are 'true' or 'false'.")
    private boolean reprocess;

    @Nullable
    @Name("sheetName")
    @Description("Name of the sheet in the Excel file(s) to be processed; for example: 'Sheet1'.")
    private String sheetName;

    @Name("sheetNo")
    @Description("Number of the sheet to be processed in the Excel file; for example: '1'.")
    private int sheetNo;

    @Nullable
    @Name("columnList")
    @Description("The Excel column names which need to be extracted from the Excel sheet; for example: 'A,B'.")
    private String columnList;

    @Nullable
    @Name("columnMapping")
    @Description("List of Excel columns to be renamed. Each key specifies the name of a column to rename, with " +
      "its corresponding value specifying the new name for that column; for example: 'A:id,B:name'.")
    private String columnMapping;

    @Name("skipFirstRow")
    @Description("Specifies whether the first row in the Excel sheet is to be skipped or not. " +
      "Options are 'true' or 'false'.")
    private boolean skipFirstRow;

    @Name("terminateIfEmptyRow")
    @Description("Specifies whether processing needs to be terminated if an empty row is encountered " +
      "while processing Excel files. Options are 'true' or 'false'.")
    private String terminateIfEmptyRow;
    @Nullable
    @Name("rowsLimit")
    @Description("Maximum number of rows to be processed for each sheet; for example: '100'.")
    private String rowsLimit;

    @Nullable
    @Name("outputSchema")
    @Description("Comma-separated mapping of column names in the output schema to their data types; " +
      "for example: 'A:string,B:int'. This input is required if no input for 'columnList' has been provided.")
    private String outputSchema;

    @Name("ifErrorRecord")
    @Description("Specifies the action to be taken if an error occurs. Options are: " +
      "'Ignore error and continue': skips the error record and continues processing; " +
      "'Exit on error': stops processing upon encountering an error; " +
      "'Write to error dataset': writes the error record to an error dataset and continues processing.")
    private String ifErrorRecord;

    @Nullable
    @Name("errorDatasetName")
    @Description("Name of the table to store error records; for example: 'error-table-name'.")
    private String errorDatasetName;

    public ExcelInputReaderConfig() {
      super("ExcelInputReader");
    }
  }
}
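For reference, the standalone sketch below illustrates the intermediate row encoding that transform() above parses. The encoding itself is produced by the companion ExcelInputFormat class, which is not part of this file, so the sample string here is hand-built to match what transform() expects; the file name, sheet name, and cell values are hypothetical.

import java.util.LinkedHashMap;
import java.util.Map;

/**
 * Demonstrates the tab/carriage-return row encoding consumed by
 * ExcelInputReader.transform(): field 0 is the row number, field 1 the file
 * name, field 2 the sheet name, field 3 an end-of-file marker, and each
 * remaining field is a "columnName\rcolumnValue" pair.
 */
public class ExcelRowEncodingDemo {

  public static void main(String[] args) {
    // Hand-built sample row (hypothetical values) in the encoding described above.
    String inputValue = "4\t/data/inventory.xlsx\tSheet1\tEND\tA\r101\tB\rwidget";

    String[] excelRecords = inputValue.split("\t");
    int rowNum = Integer.parseInt(excelRecords[0]);
    String fileName = excelRecords[1];
    String sheetName = excelRecords[2];
    boolean lastRowOfFile = "END".equalsIgnoreCase(excelRecords[3]);

    // Collect the "columnName\rcolumnValue" pairs, exactly as transform() does;
    // fields without a carriage return (the first four) are skipped.
    Map<String, String> columnValues = new LinkedHashMap<>();
    for (String record : excelRecords) {
      String[] pair = record.split("\r");
      if (pair.length > 1) {
        columnValues.put(pair[0], pair[1]);
      }
    }

    System.out.println("row " + rowNum + " of " + fileName + " (" + sheetName +
                         ", last=" + lastRowOfFile + "): " + columnValues);
    // Prints: row 4 of /data/inventory.xlsx (Sheet1, last=true): {A=101, B=widget}
  }
}

Running this prints the parsed column/value map; the column-renaming step of transform() (via 'columnMapping') is omitted here to keep the sketch minimal.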