Java tutorial
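The listing below is the full source of org.apache.sqoop.mapreduce.ParquetJob, a helper class for setting up a Parquet MapReduce import job on top of the Kite SDK. A short, hypothetical usage sketch follows the class.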
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.sqoop.mapreduce;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.security.token.delegation.DelegationTokenIdentifier;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.security.token.Token;
import org.apache.sqoop.hive.HiveConfig;
import org.kitesdk.data.CompressionType;
import org.kitesdk.data.Dataset;
import org.kitesdk.data.DatasetDescriptor;
import org.kitesdk.data.Datasets;
import org.kitesdk.data.Formats;
import org.kitesdk.data.mapreduce.DatasetKeyOutputFormat;
import org.kitesdk.data.spi.SchemaValidationUtil;

import java.io.IOException;
import java.lang.reflect.Method;

/**
 * Helper class for setting up a Parquet MapReduce job.
 */
public final class ParquetJob {

  public static final Log LOG = LogFactory.getLog(ParquetJob.class.getName());

  public static final String HIVE_METASTORE_CLIENT_CLASS =
      "org.apache.hadoop.hive.metastore.HiveMetaStoreClient";
  public static final String HIVE_METASTORE_SASL_ENABLED = "hive.metastore.sasl.enabled";
  // Purposefully choosing the same token alias as the one Oozie chooses.
  // Make sure we don't generate a new delegation token if oozie
  // has already generated one.
  public static final String HIVE_METASTORE_TOKEN_ALIAS = "HCat Token";

  private ParquetJob() {
  }

  private static final String CONF_AVRO_SCHEMA = "parquetjob.avro.schema";
  static final String CONF_OUTPUT_CODEC = "parquetjob.output.codec";

  enum WriteMode { DEFAULT, APPEND, OVERWRITE };

  public static Schema getAvroSchema(Configuration conf) {
    return new Schema.Parser().parse(conf.get(CONF_AVRO_SCHEMA));
  }

  public static CompressionType getCompressionType(Configuration conf) {
    CompressionType defaults = Formats.PARQUET.getDefaultCompressionType();
    String codec = conf.get(CONF_OUTPUT_CODEC, defaults.getName());
    try {
      return CompressionType.forName(codec);
    } catch (IllegalArgumentException ex) {
      LOG.warn(String.format(
          "Unsupported compression type '%s'. Fallback to '%s'.", codec, defaults));
    }
    return defaults;
  }

  /**
   * Configure the import job. The import process will use a Kite dataset to
   * write data records into Parquet format internally. The input key class is
   * {@link org.apache.sqoop.lib.SqoopRecord}. The output key is
   * {@link org.apache.avro.generic.GenericRecord}.
   */
  public static void configureImportJob(JobConf conf, Schema schema, String uri,
      WriteMode writeMode) throws IOException {
    Dataset dataset;

    // Add hive delegation token only if we don't already have one.
    if (uri.startsWith("dataset:hive")) {
      Configuration hiveConf = HiveConfig.getHiveConf(conf);
      if (isSecureMetastore(hiveConf)) {
        // Copy hive configs to job config
        HiveConfig.addHiveConfigs(hiveConf, conf);

        if (conf.getCredentials().getToken(new Text(HIVE_METASTORE_TOKEN_ALIAS)) == null) {
          addHiveDelegationToken(conf);
        }
      }
    }

    if (Datasets.exists(uri)) {
      if (WriteMode.DEFAULT.equals(writeMode)) {
        throw new IOException("Destination exists! " + uri);
      }

      dataset = Datasets.load(uri);
      Schema writtenWith = dataset.getDescriptor().getSchema();
      if (!SchemaValidationUtil.canRead(writtenWith, schema)) {
        throw new IOException(
            String.format("Expected schema: %s%nActual schema: %s", writtenWith, schema));
      }
    } else {
      dataset = createDataset(schema, getCompressionType(conf), uri);
    }
    conf.set(CONF_AVRO_SCHEMA, schema.toString());

    DatasetKeyOutputFormat.ConfigBuilder builder =
        DatasetKeyOutputFormat.configure(conf);
    if (WriteMode.OVERWRITE.equals(writeMode)) {
      builder.overwrite(dataset);
    } else if (WriteMode.APPEND.equals(writeMode)) {
      builder.appendTo(dataset);
    } else {
      builder.writeTo(dataset);
    }
  }

  private static Dataset createDataset(Schema schema,
      CompressionType compressionType, String uri) {
    DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
        .schema(schema)
        .format(Formats.PARQUET)
        .compressionType(compressionType)
        .build();
    return Datasets.create(uri, descriptor, GenericRecord.class);
  }

  private static boolean isSecureMetastore(Configuration conf) {
    return conf != null && conf.getBoolean(HIVE_METASTORE_SASL_ENABLED, false);
  }

  /**
   * Add hive delegation token to credentials store.
   * @param conf the job configuration that receives the token
   */
  private static void addHiveDelegationToken(JobConf conf) {
    // Need to use reflection since there's no compile time dependency on the client libs.
    Class<?> HiveConfClass;
    Class<?> HiveMetaStoreClientClass;

    try {
      HiveMetaStoreClientClass = Class.forName(HIVE_METASTORE_CLIENT_CLASS);
    } catch (ClassNotFoundException ex) {
      LOG.error("Could not load " + HIVE_METASTORE_CLIENT_CLASS
          + " when adding hive delegation token. "
          + "Make sure HIVE_CONF_DIR is set correctly.", ex);
      throw new RuntimeException("Couldn't fetch delegation token.", ex);
    }

    try {
      HiveConfClass = Class.forName(HiveConfig.HIVE_CONF_CLASS);
    } catch (ClassNotFoundException ex) {
      LOG.error("Could not load " + HiveConfig.HIVE_CONF_CLASS
          + " when adding hive delegation token."
          + " Make sure HIVE_CONF_DIR is set correctly.", ex);
      throw new RuntimeException("Couldn't fetch delegation token.", ex);
    }

    try {
      Object client = HiveMetaStoreClientClass.getConstructor(HiveConfClass).newInstance(
          HiveConfClass.getConstructor(Configuration.class, Class.class)
              .newInstance(conf, Configuration.class));
      // getDelegationToken(String kerberosPrincipal)
      Method getDelegationTokenMethod =
          HiveMetaStoreClientClass.getMethod("getDelegationToken", String.class);
      Object tokenStringForm = getDelegationTokenMethod.invoke(client,
          UserGroupInformation.getLoginUser().getShortUserName());

      // Load token
      Token<DelegationTokenIdentifier> metastoreToken = new Token<DelegationTokenIdentifier>();
      metastoreToken.decodeFromUrlString(tokenStringForm.toString());
      conf.getCredentials().addToken(new Text(HIVE_METASTORE_TOKEN_ALIAS), metastoreToken);

      LOG.debug("Successfully fetched hive metastore delegation token. " + metastoreToken);
    } catch (Exception ex) {
      LOG.error("Couldn't fetch delegation token.", ex);
      throw new RuntimeException("Couldn't fetch delegation token.", ex);
    }
  }
}
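As a rough illustration of how the helper is driven, here is a minimal, hypothetical driver. It assumes the caller lives in the same org.apache.sqoop.mapreduce package (the WriteMode enum and the CONF_OUTPUT_CODEC constant are package-private); the class name ParquetJobExample, the Avro schema, and the dataset URI are invented for the example. In Sqoop itself the schema is derived from the source table and the call is issued by the import job machinery, not by a standalone main method.

package org.apache.sqoop.mapreduce;

import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.hadoop.mapred.JobConf;

/**
 * Hypothetical driver for ParquetJob; the names, schema, and dataset URI
 * below are illustrative only and are not part of the original class.
 */
public class ParquetJobExample {

  public static void main(String[] args) throws Exception {
    JobConf conf = new JobConf();

    // Illustrative Avro schema; a real import derives this from the source table.
    Schema schema = SchemaBuilder.record("employee").fields()
        .requiredInt("id")
        .requiredString("name")
        .endRecord();

    // Request Snappy-compressed Parquet output; configureImportJob reads this
    // back through getCompressionType() when it creates the dataset.
    conf.set(ParquetJob.CONF_OUTPUT_CODEC, "snappy");

    // Create the target Kite dataset and record the Avro schema in the job
    // configuration. WriteMode.DEFAULT means the call fails if the dataset
    // already exists; APPEND and OVERWRITE reuse an existing dataset instead.
    ParquetJob.configureImportJob(conf, schema,
        "dataset:hdfs:/tmp/datasets/employee", ParquetJob.WriteMode.DEFAULT);
  }
}

Note that the sketch only configures the output side of the job; in a real import the configured JobConf would still be handed to the MapReduce framework with DatasetKeyOutputFormat as the output format.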