org.apache.streams.plugins.hive.StreamsHiveResourceGenerator.java Source code

Introduction

Here is the source code for org.apache.streams.plugins.hive.StreamsHiveResourceGenerator.java, a plugin class that resolves a directory of JSON schemas and writes a Hive CREATE TABLE statement (an .hql file) for each one, so that new-line delimited JSON documents can be queried through org.openx.data.jsonserde.JsonSerDe.
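
Before the full listing, here is a minimal sketch of how the generator is driven programmatically; the directory paths are the defaults used by main() below and are placeholders you would replace with your own:

StreamsHiveGenerationConfig config = new StreamsHiveGenerationConfig();
config.setSourceDirectory("src/main/jsonschema");             // where the JSON schemas are read from
config.setTargetDirectory("target/generated-resources/hive"); // where the generated .hql files are written
new StreamsHiveResourceGenerator(config).run();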

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.streams.plugins.hive;

import org.apache.streams.util.schema.FieldType;
import org.apache.streams.util.schema.FieldUtil;
import org.apache.streams.util.schema.FileUtil;
import org.apache.streams.util.schema.Schema;
import org.apache.streams.util.schema.SchemaStore;
import org.apache.streams.util.schema.SchemaStoreImpl;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
import org.apache.commons.lang3.StringUtils;
import org.jsonschema2pojo.util.URLUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.net.URL;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Objects;

import static org.apache.streams.util.schema.FileUtil.dropExtension;
import static org.apache.streams.util.schema.FileUtil.dropSourcePathPrefix;
import static org.apache.streams.util.schema.FileUtil.swapExtension;
import static org.apache.streams.util.schema.FileUtil.writeFile;

/**
 * Generates Hive table definitions for querying new-line delimited JSON documents
 * with org.openx.data.jsonserde.JsonSerDe.
 */
public class StreamsHiveResourceGenerator implements Runnable {

    private static final Logger LOGGER = LoggerFactory.getLogger(StreamsHiveResourceGenerator.class);

    private static final String LS = System.getProperty("line.separator");

    private StreamsHiveGenerationConfig config;

    private SchemaStore schemaStore = new SchemaStoreImpl();

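    // Tracks how deeply nested the field currently being emitted is; compared
    // against config.getMaxDepth() to bound STRUCT/ARRAY expansion.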
    private int currentDepth = 0;

    /**
     * Run from the command line without Maven.
     *
     * <p>java -jar streams-plugin-hive-jar-with-dependencies.jar StreamsHiveResourceGenerator src/main/jsonschema target/generated-resources
     *
     * @param args [sourceDirectory, targetDirectory]
     */
    public static void main(String[] args) {
        StreamsHiveGenerationConfig config = new StreamsHiveGenerationConfig();

        String sourceDirectory = "src/main/jsonschema";
        String targetDirectory = "target/generated-resources/hive";

        if (args.length > 0) {
            sourceDirectory = args[0];
        }
        if (args.length > 1) {
            targetDirectory = args[1];
        }

        config.setSourceDirectory(sourceDirectory);
        config.setTargetDirectory(targetDirectory);

        StreamsHiveResourceGenerator streamsHiveResourceGenerator = new StreamsHiveResourceGenerator(config);
        streamsHiveResourceGenerator.run();
    }

    public StreamsHiveResourceGenerator(StreamsHiveGenerationConfig config) {
        this.config = config;
    }

    @Override
    public void run() {

        Objects.requireNonNull(config);

        generate(config);

    }

    /**
     * run generate using supplied StreamsHiveGenerationConfig.
     * @param config StreamsHiveGenerationConfig
     */
    public void generate(StreamsHiveGenerationConfig config) {

        LinkedList<File> sourceFiles = new LinkedList<>();

        for (Iterator<URL> sources = config.getSource(); sources.hasNext();) {
            URL source = sources.next();
            sourceFiles.add(URLUtil.getFileFromURL(source));
        }

        LOGGER.info("Seeded with {} source paths:", sourceFiles.size());

        FileUtil.resolveRecursive(config, sourceFiles);

        LOGGER.info("Resolved {} schema files:", sourceFiles.size());

        for (File item : sourceFiles) {
            schemaStore.create(item.toURI());
        }

        LOGGER.info("Identified {} objects:", schemaStore.getSize());

        for (Iterator<Schema> schemaIterator = schemaStore.getSchemaIterator(); schemaIterator.hasNext();) {
            Schema schema = schemaIterator.next();
            currentDepth = 0;
            if (schema.getUri().getScheme().equals("file")) {
                String inputFile = schema.getUri().getPath();
                String resourcePath = dropSourcePathPrefix(inputFile, config.getSourceDirectory());
                for (String sourcePath : config.getSourcePaths()) {
                    resourcePath = dropSourcePathPrefix(resourcePath, sourcePath);
                }
                String outputFile = config.getTargetDirectory() + "/" + swapExtension(resourcePath, "json", "hql");

                LOGGER.info("Processing {}:", resourcePath);

                String resourceId = dropExtension(resourcePath).replace("/", "_");

                String resourceContent = generateResource(schema, resourceId);

                writeFile(outputFile, resourceContent);

                LOGGER.info("Wrote {}:", outputFile);
            }
        }
    }

    /**
     * generateResource String from schema and resourceId.
     * @param schema Schema
     * @param resourceId String
     * @return CREATE TABLE ...
     */
    public String generateResource(Schema schema, String resourceId) {
        StringBuilder resourceBuilder = new StringBuilder();
        resourceBuilder.append("CREATE TABLE ");
        resourceBuilder.append(hqlEscape(resourceId));
        resourceBuilder.append(LS);
        resourceBuilder.append("(");
        resourceBuilder.append(LS);
        resourceBuilder = appendRootObject(resourceBuilder, schema, resourceId, ' ');
        resourceBuilder.append(")");
        resourceBuilder.append(LS);
        resourceBuilder.append("ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'");
        resourceBuilder.append(LS);
        resourceBuilder.append("WITH SERDEPROPERTIES (\"ignore.malformed.json\" = \"true\"");
        resourceBuilder.append(LS);
        resourceBuilder.append("STORED AS TEXTFILE");
        resourceBuilder.append(LS);
        resourceBuilder.append("LOCATION '${hiveconf:path}';");
        resourceBuilder.append(LS);
        return resourceBuilder.toString();
    }
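
    // Example: for a hypothetical schema exposing a string field "id" and an
    // integer field "count", generateResource(schema, "example") above would
    // return:
    //
    //   CREATE TABLE `example`
    //   (
    //   `id` STRING,
    //   `count` INT
    //   )
    //   ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
    //   WITH SERDEPROPERTIES ("ignore.malformed.json" = "true")
    //   STORED AS TEXTFILE
    //   LOCATION '${hiveconf:path}';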

    protected StringBuilder appendRootObject(StringBuilder builder, Schema schema, String resourceId,
            Character seperator) {
        ObjectNode propertiesNode = schemaStore.resolveProperties(schema, null, resourceId);
        if (propertiesNode != null && propertiesNode.isObject() && propertiesNode.size() > 0) {
            builder = appendPropertiesNode(builder, schema, propertiesNode, seperator);
        }
        return builder;
    }

    private StringBuilder appendValueField(StringBuilder builder, Schema schema, String fieldId,
            FieldType fieldType, Character seperator) {
        // safe to append nothing
        Objects.requireNonNull(builder);
        builder.append(hqlEscape(fieldId));
        builder.append(seperator);
        builder.append(hqlType(fieldType));
        return builder;
    }

    protected StringBuilder appendArrayItems(StringBuilder builder, Schema schema, String fieldId,
            ObjectNode itemsNode, Character seperator) {
        // not safe to append nothing
        Objects.requireNonNull(builder);
        if (itemsNode == null) {
            return builder;
        }
        if (itemsNode.has("type")) {
            try {
                FieldType itemType = FieldUtil.determineFieldType(itemsNode);
                switch (itemType) {
                case OBJECT:
                    builder = appendArrayObject(builder, schema, fieldId, itemsNode, seperator);
                    break;
                case ARRAY:
                    ObjectNode subArrayItems = (ObjectNode) itemsNode.get("items");
                    builder = appendArrayItems(builder, schema, fieldId, subArrayItems, seperator);
                    break;
                default:
                    builder = appendArrayField(builder, schema, fieldId, itemType, seperator);
                }
            } catch (Exception ex) {
                LOGGER.warn("No item type resolvable for {}", fieldId);
            }
        }
        Objects.requireNonNull(builder);
        return builder;
    }

    private StringBuilder appendArrayField(StringBuilder builder, Schema schema, String fieldId,
            FieldType fieldType, Character seperator) {
        // safe to append nothing
        Objects.requireNonNull(builder);
        Objects.requireNonNull(fieldId);
        builder.append(hqlEscape(fieldId));
        builder.append(seperator);
        builder.append("ARRAY<" + hqlType(fieldType) + ">");
        Objects.requireNonNull(builder);
        return builder;
    }

    private StringBuilder appendArrayObject(StringBuilder builder, Schema schema, String fieldId,
            ObjectNode fieldNode, Character seperator) {
        // safe to append nothing
        Objects.requireNonNull(builder);
        Objects.requireNonNull(fieldNode);
        if (StringUtils.isNotBlank(fieldId)) {
            builder.append(hqlEscape(fieldId));
            builder.append(seperator);
        }
        builder.append("ARRAY");
        builder.append(LS);
        builder.append("<");
        builder.append(LS);
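        // Hive complex types separate name and type with ':' (e.g. ARRAY<STRUCT<name:STRING>>),
        // hence the explicit ':' separator passed to the nested struct below.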
        ObjectNode propertiesNode = schemaStore.resolveProperties(schema, fieldNode, fieldId);
        builder = appendStructField(builder, schema, "", propertiesNode, ':');
        builder.append(">");
        Objects.requireNonNull(builder);
        return builder;
    }

    private StringBuilder appendStructField(StringBuilder builder, Schema schema, String fieldId,
            ObjectNode propertiesNode, Character seperator) {
        // safe to append nothing
        Objects.requireNonNull(builder);
        Objects.requireNonNull(propertiesNode);

        if (propertiesNode.isObject() && propertiesNode.size() > 0) {

            currentDepth += 1;

            if (StringUtils.isNotBlank(fieldId)) {
                builder.append(hqlEscape(fieldId));
                builder.append(seperator);
            }
            builder.append("STRUCT");
            builder.append(LS);
            builder.append("<");
            builder.append(LS);

            builder = appendPropertiesNode(builder, schema, propertiesNode, ':');

            builder.append(">");
            builder.append(LS);

            currentDepth -= 1;

        }
        Objects.requireNonNull(builder);
        return builder;
    }

    private StringBuilder appendPropertiesNode(StringBuilder builder, Schema schema, ObjectNode propertiesNode,
            Character seperator) {
        Objects.requireNonNull(builder);
        Objects.requireNonNull(propertiesNode);
        Iterator<Map.Entry<String, JsonNode>> fields = propertiesNode.fields();
        List<String> fieldStrings = new ArrayList<>();
        while (fields.hasNext()) {
            Map.Entry<String, JsonNode> field = fields.next();
            String fieldId = field.getKey();
            if (!config.getExclusions().contains(fieldId) && field.getValue().isObject()) {
                ObjectNode fieldNode = (ObjectNode) field.getValue();
                FieldType fieldType = FieldUtil.determineFieldType(fieldNode);
                if (fieldType != null) {
                    switch (fieldType) {
                    case ARRAY:
                        ObjectNode itemsNode = (ObjectNode) fieldNode.get("items");
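                        // Note the asymmetry: arrays are still expanded when currentDepth
                        // equals maxDepth (<=), while nested objects require strictly
                        // less (see the OBJECT case below).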
                        if (currentDepth <= config.getMaxDepth()) {
                            StringBuilder arrayItemsBuilder = appendArrayItems(new StringBuilder(), schema, fieldId,
                                    itemsNode, seperator);
                            if (StringUtils.isNotBlank(arrayItemsBuilder.toString())) {
                                fieldStrings.add(arrayItemsBuilder.toString());
                            }
                        }
                        break;
                    case OBJECT:
                        ObjectNode childProperties = schemaStore.resolveProperties(schema, fieldNode, fieldId);
                        if (currentDepth < config.getMaxDepth()) {
                            StringBuilder structFieldBuilder = appendStructField(new StringBuilder(), schema,
                                    fieldId, childProperties, seperator);
                            if (StringUtils.isNotBlank(structFieldBuilder.toString())) {
                                fieldStrings.add(structFieldBuilder.toString());
                            }
                        }
                        break;
                    default:
                        StringBuilder valueFieldBuilder = appendValueField(new StringBuilder(), schema, fieldId,
                                fieldType, seperator);
                        if (StringUtils.isNotBlank(valueFieldBuilder.toString())) {
                            fieldStrings.add(valueFieldBuilder.toString());
                        }
                    }
                }
            }
        }
        builder.append(String.join("," + LS, fieldStrings)).append(LS);
        Objects.requireNonNull(builder);
        return builder;
    }

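    // Backtick-quote identifiers so reserved words and unusual characters
    // remain valid in HQL.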
    private static String hqlEscape(String fieldId) {
        return "`" + fieldId + "`";
    }

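    // Map JSON-schema field types onto Hive types; types without a special
    // case (e.g. STRING, BOOLEAN) pass through under their own names.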
    private static String hqlType(FieldType fieldType) {
        switch (fieldType) {
        case INTEGER:
            return "INT";
        case NUMBER:
            return "FLOAT";
        case OBJECT:
            return "STRUCT";
        default:
            return fieldType.name().toUpperCase();
        }
    }

}
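
Note that the generated script takes its LOCATION from the hiveconf variable path, so an illustrative invocation of a generated file (path and file name hypothetical) would be:

hive --hiveconf path=/data/example -f target/generated-resources/hive/example.hql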