Java tutorial: Apache Impala's table flattener (Main.java)
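This tutorial walks through Main.java from Apache Impala's nested-data test infrastructure (package org.apache.impala.infra.tableflattener). The class is a small command-line tool: it parses its options with Apache Commons CLI, describes the input file as a Kite SDK DatasetDescriptor (inferring the format from the file extension, and the schema from Avro or Parquet metadata, when they are not given explicitly), and then uses SchemaFlattener and FileMigrator to rewrite the nested input as a set of flat datasets under the output directory. The full listing follows, with the original formatting restored.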
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

package org.apache.impala.infra.tableflattener;

import org.apache.avro.generic.GenericRecord;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.kitesdk.data.CompressionType;
import org.kitesdk.data.Dataset;
import org.kitesdk.data.DatasetDescriptor;
import org.kitesdk.data.Datasets;
import org.kitesdk.data.Format;
import org.kitesdk.data.Formats;
import parquet.avro.AvroSchemaConverter;
import parquet.hadoop.ParquetFileReader;
import parquet.hadoop.metadata.ParquetMetadata;

import java.io.IOException;
import java.io.PrintWriter;
import java.net.URI;
import java.nio.file.Files;
import java.nio.file.Path;

public class Main {

  Options cliOptions_;

  DatasetDescriptor datasetDescr_;

  // The dir to write the flat datasets to. The dir should either not exist or be
  // empty. The URI can either point to a local dir or an HDFS dir.
  URI outputDir_;

  CommandLine commandLine_;

  @SuppressWarnings("static-access")
  void parseArgs(String[] args) throws ParseException, IOException {
    cliOptions_ = new Options();
    cliOptions_.addOption(OptionBuilder.withLongOpt("help").create("h"));
    cliOptions_.addOption(OptionBuilder.hasArg().withLongOpt("input-data-format")
        .withDescription("The format of the input file. Ex, avro").create("f"));
    cliOptions_.addOption(OptionBuilder.hasArg().withLongOpt("input-data-compression")
        .withDescription("The compression type of the input file. Ex, snappy")
        .create("c"));
    cliOptions_.addOption(OptionBuilder.hasArg().withLongOpt("input-schema-uri")
        .withDescription("The URI of the input file's schema. Ex, file://foo.avsc")
        .create("s"));
    CommandLineParser parser = new PosixParser();
    commandLine_ = parser.parse(cliOptions_, args);
    if (commandLine_.hasOption("h")) printHelp();

    DatasetDescriptor.Builder datasetDescrBuilder = new DatasetDescriptor.Builder();
    String[] dataArgs = commandLine_.getArgs();
    if (dataArgs.length != 2) {
      printHelp("Exactly two arguments are required");
    }
    URI dataFile = URI.create(dataArgs[0]);
    outputDir_ = URI.create(dataArgs[1]);
    datasetDescrBuilder.location(dataFile);

    // Determine the input format, falling back to the file extension when the
    // -f option is not given.
    Format inputFormat;
    if (commandLine_.hasOption("f")) {
      inputFormat = Formats.fromString(commandLine_.getOptionValue("f"));
    } else {
      String dataFilePath = dataFile.getPath();
      if (dataFilePath == null || dataFilePath.isEmpty()) {
        printHelp("Data file URI is missing a path component: " + dataFile.toString());
      }
      String ext = FilenameUtils.getExtension(dataFilePath);
      if (ext.isEmpty()) {
        printHelp("The file format (-f) must be specified");
      }
      inputFormat = Formats.fromString(ext);
    }
    datasetDescrBuilder.format(inputFormat);

    if (commandLine_.hasOption("c")) {
      datasetDescrBuilder.compressionType(
          CompressionType.forName(commandLine_.getOptionValue("c")));
    }

    // Determine the schema: use the -s option if given, otherwise read it from
    // the file itself (possible for Avro data files and Parquet footers).
    if (commandLine_.hasOption("s")) {
      datasetDescrBuilder.schemaUri(commandLine_.getOptionValue("s"));
    } else if (inputFormat == Formats.AVRO) {
      datasetDescrBuilder.schemaFromAvroDataFile(dataFile);
    } else if (inputFormat == Formats.PARQUET) {
      ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(
          new Configuration(), new org.apache.hadoop.fs.Path(dataFile));
      datasetDescrBuilder.schema(new AvroSchemaConverter().convert(
          parquetMetadata.getFileMetaData().getSchema()));
    } else {
      printHelp("A schema (-s) is required for data format " + inputFormat.getName());
    }
    datasetDescr_ = datasetDescrBuilder.build();
  }

  void printHelp() { printHelp(""); }

  void printHelp(String errorMessage) {
    PrintWriter printer =
        new PrintWriter(errorMessage.isEmpty() ? System.out : System.err);
    if (!errorMessage.isEmpty()) printer.println("Error: " + errorMessage + "\n");
    printer.println(
        "Usage: [options] <input uri> <output uri>\n\n"
        + "input uri    The URI to the input file.\n"
        + "             Ex, file:///foo.avro or hdfs://localhost:20500/foo.avro\n"
        + "output uri   The URI to the output directory. The dir must either not\n"
        + "             exist or it must be empty.\n"
        + "             Ex, file:///bar or hdfs://localhost:20500/bar\n\n"
        + "Options:");
    new HelpFormatter().printOptions(printer, 80, cliOptions_, 1, 3);
    printer.close();
    System.exit(errorMessage.isEmpty() ? 0 : 1);
  }

  void exec(String[] args) throws ParseException, IOException {
    Logger.getRootLogger().setLevel(Level.OFF);
    parseArgs(args);
    SchemaFlattener schemaFlattener = new SchemaFlattener(outputDir_);
    FlattenedSchema rootDataset = schemaFlattener.flatten(datasetDescr_.getSchema());
    // Expose the input file as a Kite dataset rooted at a temp dir, migrate its
    // records into the flattened datasets, then remove the temp dir.
    Path tempDatasetPath = Files.createTempDirectory(null);
    try {
      Dataset<GenericRecord> srcDataset = Datasets.create(
          "dataset:file:" + tempDatasetPath.toString(), datasetDescr_);
      FileMigrator migrator = new FileMigrator();
      migrator.migrate(srcDataset, rootDataset);
    } finally {
      FileUtils.deleteDirectory(tempDatasetPath.toFile());
    }
  }

  public static void main(String[] args) throws Exception {
    new Main().exec(args);
  }
}
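As a usage sketch, the tool can also be driven programmatically through Main.main. The input and output paths below are hypothetical, and the snippet assumes the table-flattener classes and their dependencies are on the classpath:

import org.apache.impala.infra.tableflattener.Main;

public class FlattenExample {
  public static void main(String[] args) throws Exception {
    // Flatten a local Avro file into file:///tmp/flat. The output URI must
    // point to a directory that either does not exist or is empty. Both
    // options are redundant here: -f would be inferred from the .avro
    // extension, and the schema is read from the Avro data file itself.
    Main.main(new String[] {
        "-f", "avro",               // input data format
        "-c", "snappy",             // input compression codec
        "file:///tmp/nested.avro",  // input file URI (hypothetical)
        "file:///tmp/flat"          // output directory URI (hypothetical)
    });
  }
}

The same four arguments work when launching the class from a shell, and passing -h prints the usage text assembled in printHelp().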