Java tutorial
/** * Copyright [2012-2014] PayPal Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package ml.shifu.shifu.core.processor; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.Scanner; import ml.shifu.guagua.GuaguaConstants; import ml.shifu.guagua.mapreduce.GuaguaMapReduceClient; import ml.shifu.guagua.mapreduce.GuaguaMapReduceConstants; import ml.shifu.shifu.container.obj.ColumnConfig; import ml.shifu.shifu.container.obj.ModelBasicConf.RunMode; import ml.shifu.shifu.container.obj.RawSourceData.SourceType; import ml.shifu.shifu.core.AbstractTrainer; import ml.shifu.shifu.core.VariableSelector; import ml.shifu.shifu.core.alg.NNTrainer; import ml.shifu.shifu.core.dtrain.NNConstants; import ml.shifu.shifu.core.dvarsel.VarSelMaster; import ml.shifu.shifu.core.dvarsel.VarSelMasterResult; import ml.shifu.shifu.core.dvarsel.VarSelOutput; import ml.shifu.shifu.core.dvarsel.VarSelWorker; import ml.shifu.shifu.core.dvarsel.VarSelWorkerResult; import ml.shifu.shifu.core.dvarsel.wrapper.CandidateGenerator; import ml.shifu.shifu.core.dvarsel.wrapper.WrapperMasterConductor; import ml.shifu.shifu.core.dvarsel.wrapper.WrapperWorkerConductor; import ml.shifu.shifu.core.mr.input.CombineInputFormat; import ml.shifu.shifu.core.validator.ModelInspector.ModelStep; import ml.shifu.shifu.core.varselect.ColumnInfo; import ml.shifu.shifu.core.varselect.VarSelectMapper; import ml.shifu.shifu.core.varselect.VarSelectReducer; import ml.shifu.shifu.exception.ShifuErrorCode; import ml.shifu.shifu.exception.ShifuException; import ml.shifu.shifu.fs.PathFinder; import ml.shifu.shifu.fs.ShifuFileUtils; import ml.shifu.shifu.util.CommonUtils; import ml.shifu.shifu.util.Constants; import ml.shifu.shifu.util.Environment; import ml.shifu.shifu.util.HDPUtils; import org.apache.commons.collections.ListUtils; import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; import org.apache.commons.jexl2.JexlException; import org.apache.commons.lang.StringUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; import org.apache.hadoop.mapreduce.lib.map.MultithreadedMapper; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs; import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; import org.apache.hadoop.util.GenericOptionsParser; import org.apache.pig.impl.util.JarManager; import org.apache.zookeeper.ZooKeeper; import org.encog.ml.data.MLDataSet; import org.jboss.netty.bootstrap.ServerBootstrap; import org.mortbay.log.Log; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.fasterxml.jackson.annotation.JsonIgnore; import com.fasterxml.jackson.core.JsonParser; import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.base.Splitter; /** * Variable selection processor, select the variable based on KS/IV value, or </p> * * <p> * Selection variable based on the wrapper training processor. * * <p> * For sensitive variable selection, each time wrapperRatio percent of variables will be removed. If continue do * variable selection, continue to run varselect command. Current design will do variable selection continuously. */ public class VarSelectModelProcessor extends BasicModelProcessor implements Processor { private final static Logger log = LoggerFactory.getLogger(VarSelectModelProcessor.class); /** * Run for the variable selection */ @Override public int run() throws Exception { log.info("Step Start: varselect"); long start = System.currentTimeMillis(); setUp(ModelStep.VARSELECT); validateNormalize(); syncDataToHdfs(super.modelConfig.getDataSet().getSource()); VariableSelector selector = new VariableSelector(this.modelConfig, this.columnConfigList); if (!modelConfig.getVarSelectWrapperEnabled()) { // Select by local KS, IV CommonUtils.updateColumnConfigFlags(modelConfig, columnConfigList); this.columnConfigList = selector.selectByFilter(); try { this.saveColumnConfigListAndColumnStats(); } catch (ShifuException e) { throw new ShifuException(ShifuErrorCode.ERROR_WRITE_COLCONFIG, e); } } else { // wrapper method if (super.getModelConfig().getDataSet().getSource() == SourceType.HDFS && super.getModelConfig().getBasic().getRunMode() == RunMode.mapred) { if (Constants.WRAPPER_BY_SE.equalsIgnoreCase(modelConfig.getVarSelect().getWrapperBy()) || Constants.WRAPPER_BY_REMOVE .equalsIgnoreCase(modelConfig.getVarSelect().getWrapperBy())) { // SE method supports remove and sensitivity se so far validateDistributedWrapperVarSelect(); syncDataToHdfs(super.modelConfig.getDataSet().getSource()); distributedSEWrapper(); } else if (Constants.WRAPPER_BY_VOTED.equalsIgnoreCase(modelConfig.getVarSelect().getWrapperBy())) { votedVariablesSelection(); } } else { // local wrapper mode: old wrapper(selector); } } clearUp(ModelStep.VARSELECT); log.info("Step Finished: varselect with {} ms", (System.currentTimeMillis() - start)); return 0; } private void validateNormalize() throws IOException { if (!ShifuFileUtils.isFileExists( new PathFinder(modelConfig).getNormalizedDataPath(this.modelConfig.getDataSet().getSource()), this.modelConfig.getDataSet().getSource())) { throw new IllegalStateException("Cannot find normalized data, please do 'Shifu normalize' firstly."); } } private void validateDistributedWrapperVarSelect() { if (!(Constants.WRAPPER_BY_REMOVE.equalsIgnoreCase(this.modelConfig.getVarSelectWrapperBy()) || Constants.WRAPPER_BY_SE.equalsIgnoreCase(this.modelConfig.getVarSelectWrapperBy()))) { throw new IllegalArgumentException( "Only R(Remove) and SE(Sensitivity Selection) wrapperBy methods are supported so far in distributed variable selection."); } if (!NNConstants.NN_ALG_NAME.equalsIgnoreCase(super.getModelConfig().getTrain().getAlgorithm())) { throw new IllegalArgumentException( "Currently we only support NN distributed training to do wrapper by analyzing variable selection."); } if (super.getModelConfig().getDataSet().getSource() != SourceType.HDFS) { throw new IllegalArgumentException( "Currently we only support distributed wrapper by analyzing on HDFS source type."); } if (super.getModelConfig().getBasic().getRunMode() != RunMode.mapred) { throw new IllegalArgumentException( "Currently we only support distributed wrapper by analyzing on HDFS source type."); } } private void votedVariablesSelection() throws ClassNotFoundException, IOException, InterruptedException { log.info("Start voted variables selection "); // sync data back to hdfs super.syncDataToHdfs(modelConfig.getDataSet().getSource()); SourceType sourceType = super.getModelConfig().getDataSet().getSource(); final List<String> args = new ArrayList<String>(); // prepare parameter prepareVarSelParams(args, sourceType); Path columnIdsPath = getVotedSelectionPath(sourceType); args.add(String.format(NNConstants.MAPREDUCE_PARAM_FORMAT, ml.shifu.shifu.util.Constants.VAR_SEL_COLUMN_IDS_OUPUT, columnIdsPath.toString())); long start = System.currentTimeMillis(); GuaguaMapReduceClient guaguaClient = new GuaguaMapReduceClient(); guaguaClient.createJob(args.toArray(new String[0])).waitForCompletion(true); log.info("Voted variables selection finished in {}ms.", System.currentTimeMillis() - start); persistColumnIds(columnIdsPath); super.syncDataToHdfs(sourceType); } private int persistColumnIds(Path path) { try { List<Scanner> scanners = ShifuFileUtils.getDataScanners(path.toString(), modelConfig.getDataSet().getSource()); List<Integer> ids = null; for (Scanner scanner : scanners) { while (scanner.hasNextLine()) { String[] raw = scanner.nextLine().trim().split("\\|"); @SuppressWarnings("unused") int idSize = Integer.parseInt(raw[0]); ids = CommonUtils.stringToIntegerList(raw[1]); } } // prevent multiply running setting for (ColumnConfig config : columnConfigList) { if (!config.isForceSelect()) { config.setFinalSelect(Boolean.FALSE); } } for (Integer id : ids) { this.columnConfigList.get(id).setFinalSelect(Boolean.TRUE); } super.saveColumnConfigListAndColumnStats(); } catch (IOException e) { e.printStackTrace(); return -1; } catch (IllegalArgumentException e) { e.printStackTrace(); return -1; } return 0; } private Path getVotedSelectionPath(SourceType sourceType) { return ShifuFileUtils.getFileSystemBySourceType(sourceType) .makeQualified(new Path(getPathFinder().getVarSelsPath(sourceType), "VarSels")); } @SuppressWarnings("unused") private void prepareVarSelParams(final List<String> args, final SourceType sourceType) { args.add("-libjars"); args.add(addRuntimeJars()); args.add("-i"); args.add(ShifuFileUtils.getFileSystemBySourceType(sourceType) .makeQualified(new Path(modelConfig.getDataSetRawPath())).toString()); String zkServers = Environment.getProperty(Environment.ZOO_KEEPER_SERVERS); if (StringUtils.isEmpty(zkServers)) { log.warn( "No specified zookeeper settings from zookeeperServers in shifuConfig file, Guagua will set embeded zookeeper server in client process. For big data applications, specified zookeeper servers are strongly recommended."); } else { args.add("-z"); args.add(zkServers); } // setting the class args.add("-w"); args.add(VarSelWorker.class.getName()); args.add("-m"); args.add(VarSelMaster.class.getName()); args.add("-c"); // the reason to add 1 is that the first iteration in D-NN implementation is used for training preparation. // FIXME, how to set iteration number int expectVarCount = this.modelConfig.getVarSelectFilterNum(); int forceSelectCount = 0; int candidateCount = 0; for (ColumnConfig columnConfig : columnConfigList) { if (columnConfig.isForceSelect()) { forceSelectCount++; } if (CommonUtils.isGoodCandidate(columnConfig)) { candidateCount++; } } int iterationCnt = (Integer) this.modelConfig.getVarSelect().getParams() .get(CandidateGenerator.POPULATION_MULTIPLY_CNT) + 1; args.add(Integer.toString(iterationCnt)); args.add("-mr"); args.add(VarSelMasterResult.class.getName()); args.add("-wr"); args.add(VarSelWorkerResult.class.getName()); // setting conductor args.add(String.format(NNConstants.MAPREDUCE_PARAM_FORMAT, ml.shifu.shifu.util.Constants.VAR_SEL_MASTER_CONDUCTOR, Environment.getProperty( Environment.VAR_SEL_MASTER_CONDUCTOR, WrapperMasterConductor.class.getName()))); args.add(String.format(NNConstants.MAPREDUCE_PARAM_FORMAT, ml.shifu.shifu.util.Constants.VAR_SEL_WORKER_CONDUCTOR, Environment.getProperty( Environment.VAR_SEL_MASTER_CONDUCTOR, WrapperWorkerConductor.class.getName()))); // setting queue args.add(String.format(NNConstants.MAPREDUCE_PARAM_FORMAT, NNConstants.MAPRED_JOB_QUEUE_NAME, Environment .getProperty(Environment.HADOOP_JOB_QUEUE, ml.shifu.shifu.util.Constants.DEFAULT_JOB_QUEUE))); // MAPRED timeout args.add(String.format(NNConstants.MAPREDUCE_PARAM_FORMAT, NNConstants.MAPRED_TASK_TIMEOUT, Environment .getInt(NNConstants.MAPRED_TASK_TIMEOUT, ml.shifu.shifu.util.Constants.DEFAULT_MAPRED_TIME_OUT))); args.add(String.format(NNConstants.MAPREDUCE_PARAM_FORMAT, GuaguaConstants.GUAGUA_MASTER_INTERCEPTERS, VarSelOutput.class.getName())); // setting model config column config args.add(String.format(NNConstants.MAPREDUCE_PARAM_FORMAT, NNConstants.SHIFU_NN_MODEL_CONFIG, ShifuFileUtils.getFileSystemBySourceType(sourceType) .makeQualified(new Path(super.getPathFinder().getModelConfigPath(sourceType))))); args.add(String.format(NNConstants.MAPREDUCE_PARAM_FORMAT, NNConstants.SHIFU_NN_COLUMN_CONFIG, ShifuFileUtils.getFileSystemBySourceType(sourceType) .makeQualified(new Path(super.getPathFinder().getColumnConfigPath(sourceType))))); // source type args.add( String.format(NNConstants.MAPREDUCE_PARAM_FORMAT, NNConstants.NN_MODELSET_SOURCE_TYPE, sourceType)); // computation time args.add(String.format(NNConstants.MAPREDUCE_PARAM_FORMAT, GuaguaConstants.GUAGUA_COMPUTATION_TIME_THRESHOLD, 60 * 60 * 1000l)); setHeapSizeAndSplitSize(args); // one can set guagua conf in shifuconfig for (Map.Entry<Object, Object> entry : Environment.getProperties().entrySet()) { if (entry.getKey().toString().startsWith("nn") || entry.getKey().toString().startsWith("guagua") || entry.getKey().toString().startsWith("mapred")) { args.add(String.format(NNConstants.MAPREDUCE_PARAM_FORMAT, entry.getKey().toString(), entry.getValue().toString())); } } } // GuaguaOptionsParser doesn't to support *.jar currently. private String addRuntimeJars() { List<String> jars = new ArrayList<String>(16); // jackson-databind-*.jar jars.add(JarManager.findContainingJar(ObjectMapper.class)); // jackson-core-*.jar jars.add(JarManager.findContainingJar(JsonParser.class)); // jackson-annotations-*.jar jars.add(JarManager.findContainingJar(JsonIgnore.class)); // commons-compress-*.jar jars.add(JarManager.findContainingJar(BZip2CompressorInputStream.class)); // commons-lang-*.jar jars.add(JarManager.findContainingJar(StringUtils.class)); // commons-collections-*.jar jars.add(JarManager.findContainingJar(ListUtils.class)); // common-io-*.jar jars.add(JarManager.findContainingJar(org.apache.commons.io.IOUtils.class)); // guava-*.jar jars.add(JarManager.findContainingJar(Splitter.class)); // encog-core-*.jar jars.add(JarManager.findContainingJar(MLDataSet.class)); // shifu-*.jar jars.add(JarManager.findContainingJar(getClass())); // guagua-core-*.jar jars.add(JarManager.findContainingJar(GuaguaConstants.class)); // guagua-mapreduce-*.jar jars.add(JarManager.findContainingJar(GuaguaMapReduceConstants.class)); // zookeeper-*.jar jars.add(JarManager.findContainingJar(ZooKeeper.class)); // netty-*.jar jars.add(JarManager.findContainingJar(ServerBootstrap.class)); jars.add(JarManager.findContainingJar(JexlException.class)); String hdpVersion = HDPUtils.getHdpVersionForHDP224(); if (StringUtils.isNotBlank(hdpVersion)) { // for hdp 2.2.4, hdp.version should be set and configuration files should be add to container class path jars.add(HDPUtils.findContainingFile("hdfs-site.xml")); jars.add(HDPUtils.findContainingFile("core-site.xml")); jars.add(HDPUtils.findContainingFile("mapred-site.xml")); jars.add(HDPUtils.findContainingFile("yarn-site.xml")); } return StringUtils.join(jars, NNConstants.LIB_JAR_SEPARATOR); } /** * Wrapper through {@link TrainModelProcessor} and a MapReduce job to analyze biggest sensitivity RMS. */ private void distributedSEWrapper() throws Exception { // 1. Train a model using current selected variables, if no variables selected, use all candidate variables. TrainModelProcessor trainModelProcessor = new TrainModelProcessor(); trainModelProcessor.setForVarSelect(true); trainModelProcessor.run(); // 2. Submit a MapReduce job to analyze sensitivity RMS. SourceType source = this.modelConfig.getDataSet().getSource(); Configuration conf = new Configuration(); // 2.1 prepare se job conf prepareSEJobConf(source, conf); // 2.2 get output path String varSelectMSEOutputPath = super.getPathFinder().getVarSelectMSEOutputPath(source); // 2.3 create se job Job job = createSEMapReduceJob(source, conf, varSelectMSEOutputPath); // 2.4 clean output firstly ShifuFileUtils.deleteFile(varSelectMSEOutputPath, source); // 2.5 submit job if (job.waitForCompletion(true)) { // 2.6 post process 4 var select if (super.modelConfig.getVarSelect().getFilterBySE()) { postProcess4SEVarSelect(source, varSelectMSEOutputPath); } else { log.info("Only print sensitivity analysis report."); log.info( "Sensitivity analysis report is in {}/{}-* file(s) with format 'column_index\tcolumn_name\tmean\trms\tvariance'.", varSelectMSEOutputPath, Constants.SHIFU_VARSELECT_SE_OUTPUT_NAME); } } else { log.error("VarSelect SE hadoop job is failed, please re-try varselect step."); } } private Job createSEMapReduceJob(SourceType source, Configuration conf, String varSelectMSEOutputPath) throws IOException { @SuppressWarnings("deprecation") Job job = new Job(conf, "Shifu: Variable Selection Wrapper Job : " + this.modelConfig.getModelSetName()); job.setJarByClass(getClass()); boolean isSEVarSelMulti = Boolean.TRUE.toString().equalsIgnoreCase( Environment.getProperty(Constants.SHIFU_VARSEL_SE_MULTI, Constants.SHIFU_DEFAULT_VARSEL_SE_MULTI)); if (isSEVarSelMulti) { job.setMapperClass(MultithreadedMapper.class); MultithreadedMapper.setMapperClass(job, VarSelectMapper.class); int threads; try { threads = Integer.parseInt(Environment.getProperty(Constants.SHIFU_VARSEL_SE_MULTI_THREAD, Constants.SHIFU_DEFAULT_VARSEL_SE_MULTI_THREAD + "")); } catch (Exception e) { Log.warn("'shifu.varsel.se.multi.thread' should be a int value, set default value: {}", Constants.SHIFU_DEFAULT_VARSEL_SE_MULTI_THREAD); threads = Constants.SHIFU_DEFAULT_VARSEL_SE_MULTI_THREAD; } MultithreadedMapper.setNumberOfThreads(job, threads); } else { job.setMapperClass(VarSelectMapper.class); } job.setMapOutputKeyClass(LongWritable.class); job.setMapOutputValueClass(ColumnInfo.class); job.setInputFormatClass(TextInputFormat.class); FileInputFormat.setInputPaths(job, ShifuFileUtils.getFileSystemBySourceType(source) .makeQualified(new Path(super.getPathFinder().getNormalizedDataPath()))); job.setReducerClass(VarSelectReducer.class); // Only one reducer, no need set combiner because of distinct keys in map outputs. job.setNumReduceTasks(1); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); job.setOutputFormatClass(TextOutputFormat.class); FileOutputFormat.setOutputPath(job, new Path(varSelectMSEOutputPath)); MultipleOutputs.addNamedOutput(job, Constants.SHIFU_VARSELECT_SE_OUTPUT_NAME, TextOutputFormat.class, Text.class, Text.class); return job; } private void prepareSEJobConf(SourceType source, Configuration conf) throws IOException { // add jars to hadoop mapper and reducer new GenericOptionsParser(conf, new String[] { "-libjars", addRuntimeJars() }); conf.setBoolean(GuaguaMapReduceConstants.MAPRED_MAP_TASKS_SPECULATIVE_EXECUTION, true); conf.setBoolean(GuaguaMapReduceConstants.MAPRED_REDUCE_TASKS_SPECULATIVE_EXECUTION, true); conf.set(Constants.SHIFU_MODEL_CONFIG, ShifuFileUtils.getFileSystemBySourceType(source) .makeQualified(new Path(super.getPathFinder().getModelConfigPath(source))).toString()); conf.set(Constants.SHIFU_COLUMN_CONFIG, ShifuFileUtils.getFileSystemBySourceType(source) .makeQualified(new Path(super.getPathFinder().getColumnConfigPath(source))).toString()); conf.set(NNConstants.MAPRED_JOB_QUEUE_NAME, Environment.getProperty(Environment.HADOOP_JOB_QUEUE, "default")); conf.set(Constants.SHIFU_MODELSET_SOURCE_TYPE, source.toString()); // set mapreduce.job.max.split.locations to 30 to suppress warnings conf.setInt(GuaguaMapReduceConstants.MAPREDUCE_JOB_MAX_SPLIT_LOCATIONS, 30); // Tmp set to false because of some cluster by default use gzip while CombineInputFormat will split gzip file (a // bug) conf.setBoolean(CombineInputFormat.SHIFU_VS_SPLIT_COMBINABLE, false); conf.set("mapred.reduce.slowstart.completed.maps", Environment.getProperty("mapred.reduce.slowstart.completed.maps", "0.9")); Float wrapperRatio = this.modelConfig.getVarSelect().getWrapperRatio(); if (wrapperRatio == null) { log.warn("wrapperRatio in var select is not set. Using default value 0.05."); wrapperRatio = 0.05f; } if (wrapperRatio.compareTo(Float.valueOf(1.0f)) >= 0) { throw new IllegalArgumentException("WrapperRatio should be in (0, 1)."); } conf.setFloat(Constants.SHIFU_VARSELECT_WRAPPER_RATIO, wrapperRatio); String hdpVersion = HDPUtils.getHdpVersionForHDP224(); if (StringUtils.isNotBlank(hdpVersion)) { // for hdp 2.2.4, hdp.version should be set and configuration files should be add to container class path conf.set("hdp.version", hdpVersion); HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("hdfs-site.xml"), conf); HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("core-site.xml"), conf); HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("mapred-site.xml"), conf); HDPUtils.addFileToClassPath(HDPUtils.findContainingFile("yarn-site.xml"), conf); } } private void postProcess4SEVarSelect(SourceType source, String varSelectMSEOutputPath) throws IOException { String outputFilePattern = varSelectMSEOutputPath + Path.SEPARATOR + "part-r-*"; if (!ShifuFileUtils.isFileExists(outputFilePattern, source)) { throw new RuntimeException("Var select MSE stats output file not exist."); } int selectCnt = 0; for (ColumnConfig config : super.columnConfigList) { if (config.isFinalSelect()) { config.setFinalSelect(false); } // enable ForceSelect if (config.isForceSelect()) { config.setFinalSelect(true); selectCnt++; log.info("Variable {} is selected, since it is in ForceSelect list.", config.getColumnName()); } } List<Scanner> scanners = null; try { // here only works for 1 reducer FileStatus[] globStatus = ShifuFileUtils.getFileSystemBySourceType(source) .globStatus(new Path(outputFilePattern)); if (globStatus == null || globStatus.length == 0) { throw new RuntimeException("Var select MSE stats output file not exist."); } scanners = ShifuFileUtils.getDataScanners(globStatus[0].getPath().toString(), source); String str = null; int targetCnt = 0; // total variable count that user want to select List<Integer> candidateColumnIdList = new ArrayList<Integer>(); Scanner scanner = scanners.get(0); while (scanner.hasNext()) { ++targetCnt; str = scanner.nextLine().trim(); candidateColumnIdList.add(Integer.parseInt(str)); } int i = 0; // try to select another (targetCnt - selectCnt) variables, but we need to exclude those // force-selected variables while (selectCnt < targetCnt && i < targetCnt) { Integer columnId = candidateColumnIdList.get(i++); ColumnConfig columnConfig = this.columnConfigList.get(columnId); if (!columnConfig.isForceSelect() && !columnConfig.isForceRemove()) { columnConfig.setFinalSelect(true); selectCnt++; log.info("Variable {} is selected.", columnConfig.getColumnName()); } } log.info("{} variables are selected.", selectCnt); log.info( "Sensitivity analysis report is in {}/{}-* file(s) with format 'column_index\tcolumn_name\tmean\trms\tvariance'.", varSelectMSEOutputPath, Constants.SHIFU_VARSELECT_SE_OUTPUT_NAME); } finally { if (scanners != null) { for (Scanner scanner : scanners) { if (scanner != null) { scanner.close(); } } } } this.saveColumnConfigListAndColumnStats(); this.syncDataToHdfs(this.modelConfig.getDataSet().getSource()); } private void setHeapSizeAndSplitSize(final List<String> args) { // args.add(String.format(NNConstants.MAPREDUCE_PARAM_FORMAT, GuaguaMapReduceConstants.MAPRED_CHILD_JAVA_OPTS, // "-Xmn128m -Xms1G -Xmx1G -verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps")); args.add(String.format(NNConstants.MAPREDUCE_PARAM_FORMAT, GuaguaMapReduceConstants.MAPRED_CHILD_JAVA_OPTS, "-Xmn128m -Xms1G -Xmx1G")); args.add(String.format(NNConstants.MAPREDUCE_PARAM_FORMAT, GuaguaConstants.GUAGUA_SPLIT_COMBINABLE, Environment.getProperty(GuaguaConstants.GUAGUA_SPLIT_COMBINABLE, "true"))); args.add(String.format(NNConstants.MAPREDUCE_PARAM_FORMAT, GuaguaConstants.GUAGUA_SPLIT_MAX_COMBINED_SPLIT_SIZE, Environment.getProperty(GuaguaConstants.GUAGUA_SPLIT_MAX_COMBINED_SPLIT_SIZE, "268435456"))); } /** * user wrapper to select variable * * @param selector * @throws Exception */ private void wrapper(VariableSelector selector) throws Exception { NormalizeModelProcessor n = new NormalizeModelProcessor(); n.run(); TrainModelProcessor t = new TrainModelProcessor(false, false); t.run(); AbstractTrainer trainer = t.getTrainer(0); if (trainer instanceof NNTrainer) { selector.selectByWrapper((NNTrainer) trainer); try { this.saveColumnConfigListAndColumnStats(); } catch (ShifuException e) { throw new ShifuException(ShifuErrorCode.ERROR_WRITE_COLCONFIG, e); } } } }