co.cask.cdap.explore.guice.ExploreRuntimeModule.java Source code

Introduction

Here is the source code for co.cask.cdap.explore.guice.ExploreRuntimeModule.java
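For orientation, here is a minimal sketch (not part of the file below) of how this module might be wired into a Guice injector. The class name ExploreWiringSketch is hypothetical, and the other CDAP runtime modules that bind CConfiguration, the Hadoop Configuration, discovery, datasets and so on are elided, so the snippet would only inject successfully once those are added.

import co.cask.cdap.explore.executor.ExploreExecutorService;
import co.cask.cdap.explore.guice.ExploreRuntimeModule;
import com.google.inject.Guice;
import com.google.inject.Injector;

public final class ExploreWiringSketch {
    public static void main(String[] args) {
        // Pick the module set for the desired runtime mode; getInMemoryModules(),
        // getStandaloneModules() and getDistributedModules() mirror the three CDAP runtimes.
        Injector injector = Guice.createInjector(
                new ExploreRuntimeModule().getInMemoryModules()
                // ... plus the other CDAP runtime modules that bind CConfiguration,
                //     Hadoop Configuration, discovery, datasets, etc. (elided here) ...
        );

        // ExploreExecutorService is the binding exposed by the private ExploreExecutorModule.
        ExploreExecutorService exploreExecutor = injector.getInstance(ExploreExecutorService.class);
        exploreExecutor.startAndWait();  // assumes the usual Guava Service lifecycle used in CDAP
    }
}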

Source

/*
 * Copyright © 2014-2015 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.explore.guice;

import co.cask.cdap.common.conf.CConfiguration;
import co.cask.cdap.common.conf.Constants;
import co.cask.cdap.common.runtime.RuntimeModule;
import co.cask.cdap.data2.datafabric.dataset.RemoteDatasetFramework;
import co.cask.cdap.data2.util.hbase.HBaseTableUtilFactory;
import co.cask.cdap.explore.executor.ExploreExecutorHttpHandler;
import co.cask.cdap.explore.executor.ExploreExecutorService;
import co.cask.cdap.explore.executor.ExploreMetadataHttpHandler;
import co.cask.cdap.explore.executor.ExploreStatusHandler;
import co.cask.cdap.explore.executor.NamespacedExploreMetadataHttpHandler;
import co.cask.cdap.explore.executor.NamespacedQueryExecutorHttpHandler;
import co.cask.cdap.explore.executor.QueryExecutorHttpHandler;
import co.cask.cdap.explore.service.ExploreService;
import co.cask.cdap.explore.service.ExploreServiceUtils;
import co.cask.cdap.explore.service.hive.BaseHiveExploreService;
import co.cask.cdap.explore.service.hive.Hive14ExploreService;
import co.cask.cdap.format.RecordFormats;
import co.cask.cdap.gateway.handlers.CommonHandlers;
import co.cask.cdap.hive.datasets.DatasetStorageHandler;
import co.cask.http.HttpHandler;
import com.google.common.base.Function;
import com.google.common.base.Joiner;
import com.google.common.base.Preconditions;
import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.io.Files;
import com.google.inject.Exposed;
import com.google.inject.Inject;
import com.google.inject.Injector;
import com.google.inject.Module;
import com.google.inject.PrivateModule;
import com.google.inject.Provider;
import com.google.inject.Provides;
import com.google.inject.Scopes;
import com.google.inject.Singleton;
import com.google.inject.multibindings.Multibinder;
import com.google.inject.name.Named;
import com.google.inject.name.Names;
import com.google.inject.util.Modules;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.MRConfig;
import org.apache.twill.api.ClassAcceptor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;

/**
 * Guice runtime module for the explore functionality.
 */
public class ExploreRuntimeModule extends RuntimeModule {
    private static final Logger LOG = LoggerFactory.getLogger(ExploreRuntimeModule.class);

    @Override
    public Module getInMemoryModules() {
        // Turning off assertions for Hive packages, since some assertions in StandardStructObjectInspector do not work
        // when outer joins are run. It is okay to turn off Hive assertions since we treat Hive as a black box that does
        // the right thing, and we only want to test our code and our users' code.
        getClass().getClassLoader().setPackageAssertionStatus("org.apache.hadoop.hive", false);
        getClass().getClassLoader().setPackageAssertionStatus("org.apache.hive", false);
        return Modules.combine(new ExploreExecutorModule(), new ExploreLocalModule(true));
    }

    @Override
    public Module getStandaloneModules() {
        return Modules.combine(new ExploreExecutorModule(), new ExploreLocalModule(false));
    }

    @Override
    public Module getDistributedModules() {
        return Modules.combine(new ExploreExecutorModule(), new ExploreDistributedModule());
    }

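    /**
     * Binds the HTTP handlers of the Explore HTTP user service and exposes the
     * ExploreExecutorService singleton that serves them.
     */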
    private static final class ExploreExecutorModule extends PrivateModule {

        @Override
        protected void configure() {
            Named exploreServiceName = Names.named(Constants.Service.EXPLORE_HTTP_USER_SERVICE);
            Multibinder<HttpHandler> handlerBinder = Multibinder.newSetBinder(binder(), HttpHandler.class,
                    exploreServiceName);
            handlerBinder.addBinding().to(NamespacedQueryExecutorHttpHandler.class);
            handlerBinder.addBinding().to(QueryExecutorHttpHandler.class);
            handlerBinder.addBinding().to(NamespacedExploreMetadataHttpHandler.class);
            handlerBinder.addBinding().to(ExploreMetadataHttpHandler.class);
            handlerBinder.addBinding().to(ExploreExecutorHttpHandler.class);
            handlerBinder.addBinding().to(ExploreStatusHandler.class);
            CommonHandlers.add(handlerBinder);

            bind(ExploreExecutorService.class).in(Scopes.SINGLETON);
            expose(ExploreExecutorService.class);
        }
    }

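    /**
     * Module used by the in-memory and standalone runtimes: binds an embedded Hive 14 Explore service
     * whose provider configures a local Derby metastore and local-mode Hive settings through system properties.
     */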
    private static final class ExploreLocalModule extends PrivateModule {
        private final boolean isInMemory;

        public ExploreLocalModule(boolean isInMemory) {
            this.isInMemory = isInMemory;
        }

        @Override
        protected void configure() {
            // The version of Hive currently used in standalone is Hive 14
            bind(ExploreService.class).annotatedWith(Names.named("explore.service.impl"))
                    .to(Hive14ExploreService.class);
            bind(ExploreService.class).toProvider(ExploreServiceProvider.class).in(Scopes.SINGLETON);
            expose(ExploreService.class);
            bind(boolean.class).annotatedWith(Names.named("explore.inmemory")).toInstance(isInMemory);

            bind(File.class).annotatedWith(Names.named(Constants.Explore.PREVIEWS_DIR_NAME))
                    .toProvider(PreviewsDirProvider.class);
        }

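        /**
         * Provides the directory used to store query result previews, creating it under the
         * local Explore data directory if it does not exist yet.
         */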
        private static final class PreviewsDirProvider implements Provider<File> {
            private final CConfiguration cConf;

            @Inject
            public PreviewsDirProvider(CConfiguration cConf) {
                this.cConf = cConf;
            }

            @Override
            public File get() {
                String localDirStr = cConf.get(Constants.Explore.LOCAL_DATA_DIR);
                File previewsDir = new File(localDirStr, "previewsDir");
                previewsDir.mkdirs();
                return previewsDir;
            }
        }

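        /**
         * Configures local-mode Hive and Derby system properties (scratch, warehouse and metastore
         * locations, local map-reduce, security disabled) before returning the ExploreService bound
         * under the "explore.service.impl" annotation.
         */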
        @Singleton
        private static final class ExploreServiceProvider implements Provider<ExploreService> {
            private final CConfiguration cConf;
            private final Configuration hConf;
            private final ExploreService exploreService;
            private final boolean isInMemory;

            @Inject
            public ExploreServiceProvider(CConfiguration cConf, Configuration hConf,
                    @Named("explore.service.impl") ExploreService exploreService,
                    @Named("explore.inmemory") boolean isInMemory) {
                this.exploreService = exploreService;
                this.cConf = cConf;
                this.hConf = hConf;
                this.isInMemory = isInMemory;
            }

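            // Captured once per JVM; used below to give in-memory runs their own warehouse/database subdirectories.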
            private static final long seed = System.currentTimeMillis();

            @Override
            public ExploreService get() {
                File hiveDataDir = new File(cConf.get(Constants.Explore.LOCAL_DATA_DIR));

                // Properties set via setProperty are picked up by any new HiveConf object created,
                // on the condition that the property is known to Hive, i.e. it is one of the
                // HiveConf.ConfVars variables.

                System.setProperty(HiveConf.ConfVars.SCRATCHDIR.toString(),
                        new File(hiveDataDir, cConf.get(Constants.AppFabric.TEMP_DIR)).getAbsolutePath());

                // Reset hadoop tmp dir because Hive does not pick it up from hConf
                System.setProperty("hadoop.tmp.dir", hConf.get("hadoop.tmp.dir"));

                File warehouseDir = new File(cConf.get(Constants.Explore.LOCAL_DATA_DIR), "warehouse");
                File databaseDir = new File(cConf.get(Constants.Explore.LOCAL_DATA_DIR), "database");

                if (isInMemory) {
                    // This seed is required to make all tests pass when they are launched together
                    // and several of them start a Hive metastore / Hive server.
                    warehouseDir = new File(warehouseDir, Long.toString(seed));
                    databaseDir = new File(databaseDir, Long.toString(seed));
                }

                LOG.debug("Setting {} to {}", HiveConf.ConfVars.METASTOREWAREHOUSE.toString(),
                        warehouseDir.getAbsoluteFile());
                System.setProperty(HiveConf.ConfVars.METASTOREWAREHOUSE.toString(), warehouseDir.getAbsolutePath());

                // Set derby log location
                System.setProperty("derby.stream.error.file",
                        cConf.get(Constants.Explore.LOCAL_DATA_DIR) + File.separator + "derby.log");

                String connectUrl = String.format("jdbc:derby:;databaseName=%s;create=true",
                        databaseDir.getAbsoluteFile());
                LOG.debug("Setting {} to {}", HiveConf.ConfVars.METASTORECONNECTURLKEY.toString(), connectUrl);
                System.setProperty(HiveConf.ConfVars.METASTORECONNECTURLKEY.toString(), connectUrl);

                // Some more local mode settings
                System.setProperty(HiveConf.ConfVars.LOCALMODEAUTO.toString(), "true");
                System.setProperty(HiveConf.ConfVars.SUBMITVIACHILD.toString(), "false");
                System.setProperty(MRConfig.FRAMEWORK_NAME, "local");

                // Disable security
                // Also need to disable security by making HiveAuthFactory.loginFromKeytab a no-op, since Hive >=0.14
                // ignores the HIVE_SERVER2_AUTHENTICATION property and instead uses UserGroupInformation.isSecurityEnabled()
                // (the bytecode rewriting of HiveAuthFactory.loginFromKeytab is done in ExploreServiceUtils.traceDependencies)
                System.setProperty(HiveConf.ConfVars.HIVE_SERVER2_AUTHENTICATION.toString(), "NONE");
                System.setProperty(HiveConf.ConfVars.HIVE_SERVER2_ENABLE_DOAS.toString(), "false");
                System.setProperty(HiveConf.ConfVars.METASTORE_USE_THRIFT_SASL.toString(), "false");

                return exploreService;
            }
        }
    }

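    /**
     * Module used in distributed mode: prepares the container classpath and scratch/preview directories
     * at configure time, and provides the Hive-version-specific ExploreService chosen by
     * ExploreServiceUtils.getHiveService().
     */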
    private static final class ExploreDistributedModule extends PrivateModule {
        private static final Logger LOG = LoggerFactory.getLogger(ExploreDistributedModule.class);

        @Override
        protected void configure() {
            try {
                CConfiguration cConf = CConfiguration.create();
                File tmpDir = new File(new File(cConf.get(Constants.CFG_LOCAL_DATA_DIR)),
                        cConf.get(Constants.AppFabric.TEMP_DIR)).getAbsoluteFile();
                tmpDir.mkdirs();
                setupClasspath(tmpDir);

                // Set the local tmp dir to an absolute location in the Twill runnable, otherwise Hive complains
                String localScratchPath = System.getProperty("java.io.tmpdir") + File.separator + "hive-"
                        + System.getProperty("user.name");
                System.setProperty(HiveConf.ConfVars.LOCALSCRATCHDIR.toString(),
                        new File(localScratchPath).getAbsolutePath());
                LOG.info("Setting {} to {}", HiveConf.ConfVars.LOCALSCRATCHDIR.toString(),
                        System.getProperty(HiveConf.ConfVars.LOCALSCRATCHDIR.toString()));

                File previewDir = Files.createTempDir();
                LOG.info("Storing preview files in {}", previewDir.getAbsolutePath());
                bind(File.class).annotatedWith(Names.named(Constants.Explore.PREVIEWS_DIR_NAME))
                        .toInstance(previewDir);
            } catch (Throwable e) {
                throw Throwables.propagate(e);
            }
        }

        @Provides
        @Singleton
        @Exposed
        public ExploreService providesExploreService(Injector injector) {
            // Figure out which HiveExploreService class to load
            Class<? extends ExploreService> hiveExploreServiceCl = ExploreServiceUtils.getHiveService();
            LOG.info("Using Explore service class {}", hiveExploreServiceCl.getName());
            return injector.getInstance(hiveExploreServiceCl);
        }
    }

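    /**
     * Traces the dataset and HBase dependencies needed by the Explore container, registers them with
     * Spark (BaseHiveExploreService.SPARK_YARN_DIST_FILES) and Hive (HiveConf.ConfVars.HIVEAUXJARS)
     * through system properties, and generates the HADOOP_CLASSPATH setup script used by the local
     * map-reduce tasks launched by Hive.
     */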
    private static void setupClasspath(File tmpDir) throws IOException {
        // Here we find the transitive dependencies and remove all paths that come from the boot class path -
        // those paths are not needed because the new JVM will have them in its boot class path.
        // It could even be wrong to keep them because in the target container, the boot class path may be different
        // (for example, if Hadoop uses a different Java version than CDAP).

        final Set<String> bootstrapClassPaths = ExploreServiceUtils.getBoostrapClasses();

        ClassAcceptor classAcceptor = new ClassAcceptor() {
            /* Exclude any class contained in the bootstrapClassPaths, Kryo classes, and Hive jars such as hive-exec.jar.
             * We need to remove the Kryo dependency in the Explore container: Spark introduced Kryo version 2.21,
             * which would normally be shipped to the Explore container, yet Hive requires Kryo 2.22
             * and gets it from the Hive jars - hive-exec.jar to be precise.
             * We also exclude the Hive jars because the Hive dependencies are found in job.jar.
             */
            @Override
            public boolean accept(String className, URL classUrl, URL classPathUrl) {
                if (bootstrapClassPaths.contains(classPathUrl.getFile())
                        || className.startsWith("com.esotericsoftware.kryo")
                        || classPathUrl.getFile().contains("hive")) {
                    return false;
                }
                return true;
            }
        };

        Set<File> hBaseTableDeps = ExploreServiceUtils.traceDependencies(null, classAcceptor, tmpDir,
                HBaseTableUtilFactory.getHBaseTableUtilClass().getName());

        // Note the order of dependency jars is important so that HBase jars come first in the classpath order
        // LinkedHashSet maintains insertion order while removing duplicate entries.
        Set<File> orderedDependencies = new LinkedHashSet<>();
        orderedDependencies.addAll(hBaseTableDeps);
        orderedDependencies.addAll(ExploreServiceUtils.traceDependencies(null, classAcceptor, tmpDir,
                RemoteDatasetFramework.class.getName(), DatasetStorageHandler.class.getName(),
                RecordFormats.class.getName()));

        // Note: the class path entries need to be prefixed with "file://" for the jars to work when
        // Hive starts local map-reduce job.
        ImmutableList.Builder<String> builder = ImmutableList.builder();
        for (File dep : orderedDependencies) {
            builder.add("file://" + dep.getAbsolutePath());
        }
        List<String> orderedDependenciesStr = builder.build();

        // These dependency files need to be copied over to the Spark container
        System.setProperty(BaseHiveExploreService.SPARK_YARN_DIST_FILES,
                Joiner.on(',').join(Iterables.transform(orderedDependencies, new Function<File, String>() {
                    @Override
                    public String apply(File input) {
                        return input.getAbsolutePath();
                    }
                })));
        LOG.debug("Setting {} to {}", BaseHiveExploreService.SPARK_YARN_DIST_FILES,
                System.getProperty(BaseHiveExploreService.SPARK_YARN_DIST_FILES));

        // These dependency files need to be copied over to the Hive job container
        System.setProperty(HiveConf.ConfVars.HIVEAUXJARS.toString(), Joiner.on(',').join(orderedDependenciesStr));
        LOG.debug("Setting {} to {}", HiveConf.ConfVars.HIVEAUXJARS.toString(),
                System.getProperty(HiveConf.ConfVars.HIVEAUXJARS.toString()));

        // Add hive-exec.jar to the HADOOP_CLASSPATH, which is used by the local mapreduce jobs launched by Hive.
        // We need to add this, otherwise when Hive runs a MapRedLocalTask it cannot find the
        // "org.apache.hadoop.hive.serde2.SerDe" class on its classpath.
        List<String> orderedDependenciesWithHiveJar = Lists.newArrayList(orderedDependenciesStr);
        String hiveExecJar = new JobConf(org.apache.hadoop.hive.ql.exec.Task.class).getJar();
        Preconditions.checkNotNull(hiveExecJar, "Couldn't locate hive-exec.jar to be included in HADOOP_CLASSPATH "
                + "for MapReduce jobs launched by Hive");
        orderedDependenciesWithHiveJar.add(hiveExecJar);
        LOG.debug("Added hive-exec.jar {} to HADOOP_CLASSPATH to be included for MapReduce jobs launched by Hive",
                hiveExecJar);

        //TODO: Setup HADOOP_CLASSPATH hack, more info on why this is needed, see CDAP-9
        LocalMapreduceClasspathSetter classpathSetter = new LocalMapreduceClasspathSetter(new HiveConf(),
                tmpDir.getAbsolutePath(), orderedDependenciesWithHiveJar);
        for (File jar : hBaseTableDeps) {
            classpathSetter.accept(jar.getAbsolutePath());
        }
        classpathSetter.setupClasspathScript();
    }
}