List of usage examples for org.apache.commons.cli Option setArgs
public void setArgs(int num)
From source file:knowledgeMiner.preprocessing.KnowledgeMinerPreprocessor.java
/** * Main method for running the CycMiner in precomputation or verbose mode. * /*from ww w .jav a 2 s .c o m*/ * @param args * The args to determine which mode to run in. */ public static void main(String[] args) { ENABLE_PREPROCESSING = true; Options options = new Options(); options.addOption("m", false, "If precomputing mined data."); options.addOption("w", false, "If mapping from article to ontology."); options.addOption("o", false, "If mapping from ontology to article."); options.addOption("f", false, "Force all heuristics to be run, even if they have stored results."); options.addOption("i", true, "The article/concept to mine. Defaults to all."); options.addOption("R", false, "If the process should run in reverse order."); Option heurOption = new Option("h", true, "The heuristic(s) to use. Defaults to all."); heurOption.setArgs(20); options.addOption(heurOption); CommandLineParser parser = new BasicParser(); try { CommandLine parse = parser.parse(options, args); KnowledgeMinerPreprocessor kmp = getInstance(); kmp.run(parse); } catch (Exception e) { e.printStackTrace(); System.exit(1); } }
From source file:com.basistech.ninja.Train.java
/** * Command line interface to train a model. * * <pre>//from www . jav a 2 s. com * usage: Train [options] * --batch-size <arg> batch size (default = 10) * --epochs <arg> epochs (default = 5) * --examples <arg> input examples file (required) * --layer-sizes <arg> layer sizes, including input/output, e.g. 3 4 2 (required) * --learning-rate <arg> learning-rate (default = 0.7) * --model <arg> output model file (required) * </pre> * * @param args command line arguments * @throws IOException */ public static void main(String[] args) throws IOException { String defaultBatchSize = "10"; String deafaultEpochs = "5"; String defaultLearningRate = "0.7"; Options options = new Options(); Option option; option = new Option(null, "examples", true, "input examples file (required)"); option.setRequired(true); options.addOption(option); option = new Option(null, "model", true, "output model file (required)"); option.setRequired(true); options.addOption(option); option = new Option(null, "layer-sizes", true, "layer sizes, including input/output, e.g. 3 4 2 (required)"); option.setRequired(true); option.setArgs(Option.UNLIMITED_VALUES); options.addOption(option); option = new Option(null, "batch-size", true, String.format("batch size (default = %s)", defaultBatchSize)); options.addOption(option); option = new Option(null, "epochs", true, String.format("epochs (default = %s)", deafaultEpochs)); options.addOption(option); option = new Option(null, "learning-rate", true, String.format("learning-rate (default = %s)", defaultLearningRate)); options.addOption(option); CommandLineParser parser = new GnuParser(); CommandLine cmdline = null; try { cmdline = parser.parse(options, args); } catch (org.apache.commons.cli.ParseException e) { System.err.println(e.getMessage()); usage(options); System.exit(1); } String[] remaining = cmdline.getArgs(); if (remaining == null) { usage(options); System.exit(1); } List<Integer> layerSizes = Lists.newArrayList(); for (String s : cmdline.getOptionValues("layer-sizes")) { layerSizes.add(Integer.parseInt(s)); } File examplesFile = new File(cmdline.getOptionValue("examples")); Train that = new Train(layerSizes, examplesFile); int batchSize = Integer.parseInt(cmdline.getOptionValue("batch-size", defaultBatchSize)); int epochs = Integer.parseInt(cmdline.getOptionValue("epochs", deafaultEpochs)); double learningRate = Double.parseDouble(cmdline.getOptionValue("learning-rate", defaultLearningRate)); File modelFile = new File(cmdline.getOptionValue("model")); that.train(batchSize, epochs, learningRate, modelFile); }
From source file:edu.ksu.cis.indus.staticanalyses.callgraphs.CallGraphXMLizerCLI.java
/** * The entry point to the program via command line. * //from w ww. java 2 s . c om * @param args is the command line arguments. * @throws RuntimeException when the analyses fail. */ public static void main(final String[] args) { final Options _options = new Options(); Option _option = new Option("c", "cumulative", false, "Builds one call graph that includes all root methods."); _options.addOption(_option); _option = new Option("o", "output", true, "Directory into which xml files will be written into. Defaults to current directory if omitted"); _option.setArgs(1); _option.setArgName("output-dir"); _options.addOption(_option); _option = new Option("j", "jimple", false, "Dump xmlized jimple."); _option.setArgName("dump-jimple"); _options.addOption(_option); _option = new Option("p", "soot-classpath", true, "Prepend this to soot class path."); _option.setArgs(1); _option.setArgName("classpath"); _option.setOptionalArg(false); _options.addOption(_option); _option = new Option("h", "help", false, "Display message."); _option.setOptionalArg(false); _options.addOption(_option); _option = new Option("t", "call-graph-type", true, "Call graph type. This has to be one of {cha, rta, ofa-oi, " + "ofa-oirt, ofa-os}."); _option.setArgs(1); _option.setArgName("type"); _option.setRequired(true); _options.addOption(_option); _option = new Option("S", "scope", true, "The scope that should be analyzed."); _option.setArgs(1); _option.setArgName("scope"); _option.setRequired(false); _options.addOption(_option); final PosixParser _parser = new PosixParser(); try { final CommandLine _cl = _parser.parse(_options, args); if (_cl.hasOption('h')) { printUsage(_options); System.exit(1); } String _outputDir = _cl.getOptionValue('o'); if (_outputDir == null) { if (LOGGER.isWarnEnabled()) { LOGGER.warn("Defaulting to current directory for output."); } _outputDir = "."; } if (_cl.getArgList().isEmpty()) { throw new MissingArgumentException("Please specify atleast one class."); } final CallGraphXMLizerCLI _cli = new CallGraphXMLizerCLI(); _cli.xmlizer.setXmlOutputDir(_outputDir); _cli.xmlizer.setGenerator(new UniqueJimpleIDGenerator()); _cli.setCumulative(_cl.hasOption('c')); _cli.setClassNames(_cl.getArgList()); _cli.addToSootClassPath(_cl.getOptionValue('p')); if (_cl.hasOption('S')) { _cli.setScopeSpecFile(_cl.getOptionValue('S')); } _cli.initialize(); _cli.execute(_cl.hasOption('j'), _cl.getOptionValue('t')); } catch (final ParseException _e) { LOGGER.error("Error while parsing command line.", _e); System.out.println("Error while parsing command line." + _e); printUsage(_options); } catch (final Throwable _e) { LOGGER.error("Beyond our control. May day! May day!", _e); throw new RuntimeException(_e); } }
From source file:edu.ksu.cis.indus.staticanalyses.dependency.DependencyXMLizerCLI.java
/** * This is the entry point via command-line. * //from ww w . j ava 2 s . c o m * @param args is the command line arguments. * @throws RuntimeException when an Throwable exception beyond our control occurs. * @pre args != null */ public static void main(final String[] args) { final Options _options = new Options(); Option _option = new Option("o", "output", true, "Directory into which xml files will be written into. Defaults to current directory if omitted"); _option.setArgs(1); _option.setArgName("output-directory"); _options.addOption(_option); _option = new Option("j", "jimple", false, "Dump xmlized jimple."); _options.addOption(_option); final DivergenceDA _fidda = DivergenceDA.getDivergenceDA(IDependencyAnalysis.Direction.FORWARD_DIRECTION); _fidda.setConsiderCallSites(true); final DivergenceDA _bidda = DivergenceDA.getDivergenceDA(IDependencyAnalysis.Direction.BACKWARD_DIRECTION); _bidda.setConsiderCallSites(true); final NonTerminationSensitiveEntryControlDA _ncda = new NonTerminationSensitiveEntryControlDA(); final Object[][] _dasOptions = { { "ibdda1", "Identifier based data dependence (Soot)", new IdentifierBasedDataDA() }, { "ibdda2", "Identifier based data dependence (Indus)", new IdentifierBasedDataDAv2() }, { "ibdda3", "Identifier based data dependence (Indus Optimized)", new IdentifierBasedDataDAv3() }, { "rbdda", "Reference based data dependence", new ReferenceBasedDataDA() }, { "nscda", "Non-termination sensitive Entry control dependence", _ncda }, { "nicda", "Non-termination insensitive Entry control dependence", new NonTerminationInsensitiveEntryControlDA(), }, { "xcda", "Exit control dependence", new ExitControlDA() }, { "sda", "Synchronization dependence", new SynchronizationDA() }, { "frda1", "Forward Ready dependence v1", ReadyDAv1.getForwardReadyDA() }, { "brda1", "Backward Ready dependence v1", ReadyDAv1.getBackwardReadyDA() }, { "frda2", "Forward Ready dependence v2", ReadyDAv2.getForwardReadyDA() }, { "brda2", "Backward Ready dependence v2", ReadyDAv2.getBackwardReadyDA() }, { "frda3", "Forward Ready dependence v3", ReadyDAv3.getForwardReadyDA() }, { "brda3", "Backward Ready dependence v3", ReadyDAv3.getBackwardReadyDA() }, { "ida1", "Interference dependence v1", new InterferenceDAv1() }, { "ida2", "Interference dependence v2", new InterferenceDAv2() }, { "ida3", "Interference dependence v3", new InterferenceDAv3() }, { "fdda", "Forward Intraprocedural Divergence dependence", DivergenceDA.getDivergenceDA(IDependencyAnalysis.Direction.FORWARD_DIRECTION), }, { "bdda", "Backward Intraprocedural Divergence dependence", DivergenceDA.getDivergenceDA(IDependencyAnalysis.Direction.BACKWARD_DIRECTION), }, { "fidda", "Forward Intra+Interprocedural Divergence dependence", _fidda }, { "bidda", "Backward Intra+Interprocedural Divergence dependence", _bidda }, { "fpidda", "Forward Interprocedural Divergence dependence", InterProceduralDivergenceDA .getDivergenceDA(IDependencyAnalysis.Direction.FORWARD_DIRECTION), }, { "bpidda", "Backward Interprocedural Divergence dependence", InterProceduralDivergenceDA .getDivergenceDA(IDependencyAnalysis.Direction.BACKWARD_DIRECTION), }, }; _option = new Option("h", "help", false, "Display message."); _option.setOptionalArg(false); _options.addOption(_option); _option = new Option("p", "soot-classpath", false, "Prepend this to soot class path."); _option.setArgs(1); _option.setArgName("classpath"); _option.setOptionalArg(false); _options.addOption(_option); _option = new Option("aliasedusedefv1", false, "Use version 1 of aliased use-def 
info."); _options.addOption(_option); _option = new Option("safelockanalysis", false, "Use safe-lock-analysis for ready dependence."); _options.addOption(_option); _option = new Option("ofaforinterference", false, "Use OFA for interference dependence."); _options.addOption(_option); _option = new Option("ofaforready", false, "Use OFA for ready dependence."); _options.addOption(_option); _option = new Option("exceptionalexits", false, "Consider exceptional exits for control dependence."); _options.addOption(_option); _option = new Option("commonuncheckedexceptions", false, "Consider common unchecked exceptions."); _options.addOption(_option); _option = new Option("S", "scope", true, "The scope that should be analyzed."); _option.setArgs(1); _option.setArgName("scope"); _option.setRequired(false); _options.addOption(_option); for (int _i = 0; _i < _dasOptions.length; _i++) { final String _shortOption = _dasOptions[_i][0].toString(); final String _description = _dasOptions[_i][1].toString(); _option = new Option(_shortOption, false, _description); _options.addOption(_option); } final CommandLineParser _parser = new GnuParser(); try { final CommandLine _cl = _parser.parse(_options, args); if (_cl.hasOption("h")) { printUsage(_options); System.exit(1); } final DependencyXMLizerCLI _xmlizerCLI = new DependencyXMLizerCLI(); String _outputDir = _cl.getOptionValue('o'); if (_outputDir == null) { if (LOGGER.isWarnEnabled()) { LOGGER.warn("Defaulting to current directory for output."); } _outputDir = "."; } _xmlizerCLI.xmlizer.setXmlOutputDir(_outputDir); if (_cl.hasOption('p')) { _xmlizerCLI.addToSootClassPath(_cl.getOptionValue('p')); } if (_cl.hasOption('S')) { _xmlizerCLI.setScopeSpecFile(_cl.getOptionValue('S')); } _xmlizerCLI.dumpJimple = _cl.hasOption('j'); _xmlizerCLI.useAliasedUseDefv1 = _cl.hasOption("aliasedusedefv1"); _xmlizerCLI.useSafeLockAnalysis = _cl.hasOption("safelockanalysis"); _xmlizerCLI.exceptionalExits = _cl.hasOption("exceptionalexits"); _xmlizerCLI.commonUncheckedException = _cl.hasOption("commonuncheckedexceptions"); final List<String> _classNames = _cl.getArgList(); if (_classNames.isEmpty()) { throw new MissingArgumentException("Please specify atleast one class."); } _xmlizerCLI.setClassNames(_classNames); final int _exitControlDAIndex = 6; if (_cl.hasOption(_dasOptions[_exitControlDAIndex][0].toString())) { _xmlizerCLI.das.add(_ncda); for (final Iterator<DependenceSort> _i = _ncda.getIds().iterator(); _i.hasNext();) { final DependenceSort _id = _i.next(); MapUtils.putIntoCollectionInMapUsingFactory(_xmlizerCLI.info, _id, _ncda, SetUtils.getFactory()); } } if (!parseForDependenceOptions(_dasOptions, _cl, _xmlizerCLI)) { throw new ParseException("Atleast one dependence analysis must be requested."); } _xmlizerCLI.<ITokens>execute(); } catch (final ParseException _e) { LOGGER.error("Error while parsing command line.", _e); System.out.println("Error while parsing command line." + _e); printUsage(_options); } catch (final Throwable _e) { LOGGER.error("Beyond our control. May day! May day!", _e); throw new RuntimeException(_e); } }
From source file:LineageSimulator.java
public static void main(String[] args) { Options options = new Options(); // commands// w w w.j av a 2 s .co m //options.addOption("simulate", false, "Simulate lineage trees"); //options.addOption("sample", false, "Sample from the simulated trees"); //options.addOption("evaluate", false, "Evaluate trees"); // tree simulation options.addOption("t", "nTrees", true, "Number of trees to simulate (default: 100)"); options.addOption("i", "nIter", true, "Number of tree growth iterations (default: 50)"); options.addOption("snv", "probSNV", true, "Per node probablity of generating a descendant cell population with an acquired SNV during a tree growth iteration (default: 0.15)"); options.addOption("cnv", "probCNV", true, "Per node probablity of generating a descendant cell population with an acquired CNV during a tree growth iteration (default: 0.02)"); options.addOption("probDeath", true, "Probablity of a cell population death in each tree growth iteration (default: 0.06)"); options.addOption("maxPopulationSize", true, "Max size of a cell population (default: 1000000)"); options.addOption("minNodes", true, "Minimum number of undead cell population nodes in a valid tree, tree growth will continue beyond the defined number of iterations until this value is reached (default: 10)"); options.addOption("maxNodes", true, "Maximum number of undead cell population nodes in a tree, tree growth will stop after the iteration in which this value is reached/first surpassed (default: 1000)"); // sampling Option samplesOption = new Option("s", "nSamples", true, "Number of samples to collect, accepts multiple values, e.g. 5 10 15 (default: 5)"); samplesOption.setArgs(Option.UNLIMITED_VALUES); options.addOption(samplesOption); Option covOption = new Option("c", "coverage", true, "Simulated coverage to generate the VAFs, accepts multiple values, e.g. 
500 1000 (default: 1000)"); covOption.setArgs(Option.UNLIMITED_VALUES); options.addOption(covOption); options.addOption("maxSubclones", true, "Max number of subclones per sample (default: 5)"); options.addOption("sampleSize", true, "Number of cells per sample (default: 100000)"); options.addOption("e", true, "Sequencing error (default: 0.001)"); options.addOption("minNC", true, "Minimum percentage of normal contamination per sample; the percentage will be randomly generated from the range [minNC maxNC] for each sample (default: 0)"); options.addOption("maxNC", true, "Maximum percentage of normal contamination per sample; if maxNC < minNC, maxNC will be automatically set to minNC; the percentage will be randomly generated from the range [minNC maxNC] for each sample (default: 20)"); //options.addOption("localized", false, "Enable localized sampling (default: random sampling)"); //options.addOption("mixSubclone", false, "With localized sampling, add an additional subclone from a different subtree to each sample; by default, the sample is localized to a single disjoint subtree"); // input/output/display options.addOption("dir", "outputDir", true, "Directory where the output files should be created [required]"); options.addOption("dot", false, "Produce DOT files for the simulated trees"); options.addOption("sdot", "sampledDot", false, "Produce DOT files for the simulated trees with indicated samples"); options.addOption("sampleProfile", false, "Output VAF file includes an additional column with the binary sample profile for each SNV"); // other options.addOption("v", "verbose", false, "Verbose mode"); options.addOption("h", "help", false, "Print usage"); // display order ArrayList<Option> optionsList = new ArrayList<Option>(); optionsList.add(options.getOption("dir")); optionsList.add(options.getOption("t")); optionsList.add(options.getOption("i")); optionsList.add(options.getOption("snv")); optionsList.add(options.getOption("cnv")); optionsList.add(options.getOption("probDeath")); optionsList.add(options.getOption("maxPopulationSize")); optionsList.add(options.getOption("minNodes")); optionsList.add(options.getOption("maxNodes")); optionsList.add(options.getOption("s")); optionsList.add(options.getOption("c")); optionsList.add(options.getOption("maxSubclones")); optionsList.add(options.getOption("sampleSize")); optionsList.add(options.getOption("e")); optionsList.add(options.getOption("minNC")); optionsList.add(options.getOption("maxNC")); optionsList.add(options.getOption("dot")); optionsList.add(options.getOption("sdot")); optionsList.add(options.getOption("sampleProfile")); optionsList.add(options.getOption("v")); optionsList.add(options.getOption("h")); CommandLineParser parser = new BasicParser(); CommandLine cmdLine = null; HelpFormatter hf = new HelpFormatter(); hf.setOptionComparator(new OptionComarator<Option>(optionsList)); try { cmdLine = parser.parse(options, args); } catch (ParseException e) { System.err.println(e.getMessage()); hf.printHelp(PROG_NAME, options); System.exit(-1); } Args params = new Args(); if (cmdLine.hasOption("dir")) { params.simPath = cmdLine.getOptionValue("dir") + "/" + SIMULATION_DATA_DIR; } else { System.err.println("Required parameter: output directory path [-dir]"); hf.printHelp(PROG_NAME, options); System.exit(-1); } if (cmdLine.hasOption("t")) { Parameters.NUM_TREES = Integer.parseInt(cmdLine.getOptionValue("t")); } if (cmdLine.hasOption("i")) { Parameters.NUM_ITERATIONS = Integer.parseInt(cmdLine.getOptionValue("i")); } if (cmdLine.hasOption("snv")) { 
Parameters.PROB_SNV = Double.parseDouble(cmdLine.getOptionValue("snv")); } if (cmdLine.hasOption("cnv")) { Parameters.PROB_CNV = Double.parseDouble(cmdLine.getOptionValue("cnv")); } if (cmdLine.hasOption("probDeath")) { Parameters.PROB_DEATH = Double.parseDouble(cmdLine.getOptionValue("probDeath")); } if (cmdLine.hasOption("maxPopulationSize")) { Parameters.MAX_POPULATION_SIZE = Integer.parseInt(cmdLine.getOptionValue("maxPopulationSize")); } if (cmdLine.hasOption("minNodes")) { Parameters.MIN_NUM_NODES = Integer.parseInt(cmdLine.getOptionValue("minNodes")); if (Parameters.MIN_NUM_NODES < 1) { System.err.println("Minimum number of nodes [-minNodes] must be at least 1"); System.exit(-1); } } if (cmdLine.hasOption("maxNodes")) { Parameters.MAX_NUM_NODES = Integer.parseInt(cmdLine.getOptionValue("maxNodes")); if (Parameters.MAX_NUM_NODES < 1 || Parameters.MAX_NUM_NODES < Parameters.MIN_NUM_NODES) { System.err.println( "Maximum number of nodes [-maxNodes] must be at least 1 and not less than [-minNodes]"); System.exit(-1); } } if (cmdLine.hasOption("s")) { String[] samples = cmdLine.getOptionValues("s"); Parameters.NUM_SAMPLES_ARRAY = new int[samples.length]; for (int i = 0; i < samples.length; i++) { Parameters.NUM_SAMPLES_ARRAY[i] = Integer.parseInt(samples[i]); } } if (cmdLine.hasOption("c")) { String[] cov = cmdLine.getOptionValues("c"); Parameters.COVERAGE_ARRAY = new int[cov.length]; for (int i = 0; i < cov.length; i++) { Parameters.COVERAGE_ARRAY[i] = Integer.parseInt(cov[i]); } } if (cmdLine.hasOption("maxSubclones")) { Parameters.MAX_NUM_SUBCLONES = Integer.parseInt(cmdLine.getOptionValue("maxSubclones")); } if (cmdLine.hasOption("sampleSize")) { Parameters.NUM_CELLS_PER_SAMPLE = Integer.parseInt(cmdLine.getOptionValue("sampleSize")); } if (cmdLine.hasOption("e")) { Parameters.SEQUENCING_ERROR = Double.parseDouble(cmdLine.getOptionValue("e")); } if (cmdLine.hasOption("minNC")) { Parameters.MIN_PERCENT_NORMAL_CONTAMINATION = Double.parseDouble(cmdLine.getOptionValue("minNC")); } if (cmdLine.hasOption("maxNC")) { Parameters.MAX_PERCENT_NORMAL_CONTAMINATION = Double.parseDouble(cmdLine.getOptionValue("maxNC")); } if (Parameters.MAX_PERCENT_NORMAL_CONTAMINATION < Parameters.MIN_PERCENT_NORMAL_CONTAMINATION) { Parameters.MAX_PERCENT_NORMAL_CONTAMINATION = Parameters.MIN_PERCENT_NORMAL_CONTAMINATION; } /*if(cmdLine.hasOption("localized")) { Parameters.LOCALIZED_SAMPLING = true; } if(cmdLine.hasOption("mixSubclone")) { Parameters.MIX_NBR_SUBTREE_SUBCLONE = true; }*/ if (cmdLine.hasOption("dot")) { params.generateDOT = true; } if (cmdLine.hasOption("sampledDot")) { params.generateSampledDOT = true; } if (cmdLine.hasOption("sampleProfile")) { params.outputSampleProfile = true; } if (cmdLine.hasOption("h")) { new HelpFormatter().printHelp(" ", options); } // logger ConsoleHandler h = new ConsoleHandler(); h.setFormatter(new LogFormatter()); h.setLevel(Level.INFO); logger.setLevel(Level.INFO); if (cmdLine.hasOption("v")) { h.setLevel(Level.FINEST); logger.setLevel(Level.FINEST); } logger.addHandler(h); logger.setUseParentHandlers(false); // validate settings if (Parameters.PROB_SNV + Parameters.PROB_CNV + Parameters.PROB_DEATH > 1) { System.err.println("The sum of SSNV, CNV, and cell death probabilities cannot exceed 1"); hf.printHelp(PROG_NAME, options); System.exit(-1); } simulateLineageTrees(params); }
From source file:edu.nyu.vida.data_polygamy.pre_processing.PreProcessing.java
/** * @param args//from w w w . ja v a 2 s. c o m * @throws IOException * @throws ClassNotFoundException * @throws InterruptedException */ @SuppressWarnings("deprecation") public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException { Options options = new Options(); Option nameOption = new Option("dn", "name", true, "the name of the dataset"); nameOption.setRequired(true); nameOption.setArgName("DATASET NAME"); options.addOption(nameOption); Option headerOption = new Option("dh", "header", true, "the file that contains the header of the dataset"); headerOption.setRequired(true); headerOption.setArgName("DATASET HEADER FILE"); options.addOption(headerOption); Option deafultsOption = new Option("dd", "defaults", true, "the file that contains the default values of the dataset"); deafultsOption.setRequired(true); deafultsOption.setArgName("DATASET DEFAULTS FILE"); options.addOption(deafultsOption); Option tempResOption = new Option("t", "temporal", true, "desired temporal resolution (hour, day, week, or month)"); tempResOption.setRequired(true); tempResOption.setArgName("TEMPORAL RESOLUTION"); options.addOption(tempResOption); Option spatialResOption = new Option("s", "spatial", true, "desired spatial resolution (points, nbhd, zip, grid, or city)"); spatialResOption.setRequired(true); spatialResOption.setArgName("SPATIAL RESOLUTION"); options.addOption(spatialResOption); Option currentSpatialResOption = new Option("cs", "current-spatial", true, "current spatial resolution (points, nbhd, zip, grid, or city)"); currentSpatialResOption.setRequired(true); currentSpatialResOption.setArgName("CURRENT SPATIAL RESOLUTION"); options.addOption(currentSpatialResOption); Option indexResOption = new Option("i", "index", true, "indexes of the temporal and spatial attributes"); indexResOption.setRequired(true); indexResOption.setArgName("INDEX OF SPATIO-TEMPORAL RESOLUTIONS"); indexResOption.setArgs(Option.UNLIMITED_VALUES); options.addOption(indexResOption); Option machineOption = new Option("m", "machine", true, "machine identifier"); machineOption.setRequired(true); machineOption.setArgName("MACHINE"); machineOption.setArgs(1); options.addOption(machineOption); Option nodesOption = new Option("n", "nodes", true, "number of nodes"); nodesOption.setRequired(true); nodesOption.setArgName("NODES"); nodesOption.setArgs(1); options.addOption(nodesOption); Option s3Option = new Option("s3", "s3", false, "data on Amazon S3"); s3Option.setRequired(false); options.addOption(s3Option); Option awsAccessKeyIdOption = new Option("aws_id", "aws-id", true, "aws access key id; " + "this is required if the execution is on aws"); awsAccessKeyIdOption.setRequired(false); awsAccessKeyIdOption.setArgName("AWS-ACCESS-KEY-ID"); awsAccessKeyIdOption.setArgs(1); options.addOption(awsAccessKeyIdOption); Option awsSecretAccessKeyOption = new Option("aws_key", "aws-id", true, "aws secrect access key; " + "this is required if the execution is on aws"); awsSecretAccessKeyOption.setRequired(false); awsSecretAccessKeyOption.setArgName("AWS-SECRET-ACCESS-KEY"); awsSecretAccessKeyOption.setArgs(1); options.addOption(awsSecretAccessKeyOption); Option bucketOption = new Option("b", "s3-bucket", true, "bucket on s3; " + "this is required if the execution is on aws"); bucketOption.setRequired(false); bucketOption.setArgName("S3-BUCKET"); bucketOption.setArgs(1); options.addOption(bucketOption); Option helpOption = new Option("h", "help", false, "display this message"); helpOption.setRequired(false); 
options.addOption(helpOption); HelpFormatter formatter = new HelpFormatter(); CommandLineParser parser = new PosixParser(); CommandLine cmd = null; try { cmd = parser.parse(options, args); } catch (ParseException e) { formatter.printHelp( "hadoop jar data-polygamy.jar " + "edu.nyu.vida.data_polygamy.pre_processing.PreProcessing", options, true); System.exit(0); } if (cmd.hasOption("h")) { formatter.printHelp( "hadoop jar data-polygamy.jar " + "edu.nyu.vida.data_polygamy.pre_processing.PreProcessing", options, true); System.exit(0); } boolean s3 = cmd.hasOption("s3"); String s3bucket = ""; String awsAccessKeyId = ""; String awsSecretAccessKey = ""; if (s3) { if ((!cmd.hasOption("aws_id")) || (!cmd.hasOption("aws_key")) || (!cmd.hasOption("b"))) { System.out.println( "Arguments 'aws_id', 'aws_key', and 'b'" + " are mandatory if execution is on AWS."); formatter.printHelp( "hadoop jar data-polygamy.jar " + "edu.nyu.vida.data_polygamy.pre_processing.PreProcessing", options, true); System.exit(0); } s3bucket = cmd.getOptionValue("b"); awsAccessKeyId = cmd.getOptionValue("aws_id"); awsSecretAccessKey = cmd.getOptionValue("aws_key"); } boolean snappyCompression = false; boolean bzip2Compression = false; String machine = cmd.getOptionValue("m"); int nbNodes = Integer.parseInt(cmd.getOptionValue("n")); Configuration s3conf = new Configuration(); if (s3) { s3conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId); s3conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey); s3conf.set("bucket", s3bucket); } Configuration conf = new Configuration(); Machine machineConf = new Machine(machine, nbNodes); String dataset = cmd.getOptionValue("dn"); String header = cmd.getOptionValue("dh"); String defaults = cmd.getOptionValue("dd"); String temporalResolution = cmd.getOptionValue("t"); String spatialResolution = cmd.getOptionValue("s"); String gridResolution = ""; String currentSpatialResolution = cmd.getOptionValue("cs"); if (spatialResolution.contains("grid")) { String[] res = spatialResolution.split("-"); spatialResolution = res[0]; gridResolution = res[1]; } conf.set("header", s3bucket + FrameworkUtils.dataDir + "/" + header); conf.set("defaults", s3bucket + FrameworkUtils.dataDir + "/" + defaults); conf.set("temporal-resolution", temporalResolution); conf.set("spatial-resolution", spatialResolution); conf.set("grid-resolution", gridResolution); conf.set("current-spatial-resolution", currentSpatialResolution); String[] indexes = cmd.getOptionValues("i"); String temporalPos = ""; Integer sizeSpatioTemp = 0; if (!(currentSpatialResolution.equals("points"))) { String spatialPos = ""; for (int i = 0; i < indexes.length; i++) { temporalPos += indexes[i] + ","; spatialPos += indexes[++i] + ","; sizeSpatioTemp++; } conf.set("spatial-pos", spatialPos); } else { String xPositions = "", yPositions = ""; for (int i = 0; i < indexes.length; i++) { temporalPos += indexes[i] + ","; xPositions += indexes[++i] + ","; yPositions += indexes[++i] + ","; sizeSpatioTemp++; } conf.set("xPositions", xPositions); conf.set("yPositions", yPositions); } conf.set("temporal-pos", temporalPos); conf.set("size-spatio-temporal", sizeSpatioTemp.toString()); // checking resolutions if (utils.spatialResolution(spatialResolution) < 0) { System.out.println("Invalid spatial resolution: " + spatialResolution); System.exit(-1); } if (utils.spatialResolution(spatialResolution) == FrameworkUtils.POINTS) { System.out.println("The data needs to be reduced at least to neighborhoods or grid."); System.exit(-1); } if 
(utils.spatialResolution(currentSpatialResolution) < 0) { System.out.println("Invalid spatial resolution: " + currentSpatialResolution); System.exit(-1); } if (utils.spatialResolution(currentSpatialResolution) > utils.spatialResolution(spatialResolution)) { System.out.println("The current spatial resolution is coarser than " + "the desired one. You can only navigate from a fine resolution" + " to a coarser one."); System.exit(-1); } if (utils.temporalResolution(temporalResolution) < 0) { System.out.println("Invalid temporal resolution: " + temporalResolution); System.exit(-1); } String fileName = s3bucket + FrameworkUtils.preProcessingDir + "/" + dataset + "-" + temporalResolution + "-" + spatialResolution + gridResolution; conf.set("aggregates", fileName + ".aggregates"); // making sure both files are removed, if they exist FrameworkUtils.removeFile(fileName, s3conf, s3); FrameworkUtils.removeFile(fileName + ".aggregates", s3conf, s3); /** * Hadoop Parameters * sources: http://www.slideshare.net/ImpetusInfo/ppt-on-advanced-hadoop-tuning-n-optimisation * https://cloudcelebrity.wordpress.com/2013/08/14/12-key-steps-to-keep-your-hadoop-cluster-running-strong-and-performing-optimum/ */ conf.set("mapreduce.tasktracker.map.tasks.maximum", String.valueOf(machineConf.getMaximumTasks())); conf.set("mapreduce.tasktracker.reduce.tasks.maximum", String.valueOf(machineConf.getMaximumTasks())); conf.set("mapreduce.jobtracker.maxtasks.perjob", "-1"); conf.set("mapreduce.reduce.shuffle.parallelcopies", "20"); conf.set("mapreduce.input.fileinputformat.split.minsize", "0"); conf.set("mapreduce.task.io.sort.mb", "200"); conf.set("mapreduce.task.io.sort.factor", "100"); // using SnappyCodec for intermediate and output data ? // TODO: for now, using SnappyCodec -- what about LZO + Protocol Buffer serialization? // LZO - http://www.oberhumer.com/opensource/lzo/#download // Hadoop-LZO - https://github.com/twitter/hadoop-lzo // Protocol Buffer - https://github.com/twitter/elephant-bird // General Info - http://www.devx.com/Java/Article/47913 // Compression - http://comphadoop.weebly.com/index.html if (snappyCompression) { conf.set("mapreduce.map.output.compress", "true"); conf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec"); conf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec"); } if (bzip2Compression) { conf.set("mapreduce.map.output.compress", "true"); conf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec"); conf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec"); } // TODO: this is dangerous! 
if (s3) { conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId); conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey); } Job job = new Job(conf); job.setJobName(dataset + "-" + temporalResolution + "-" + spatialResolution); job.setMapOutputKeyClass(MultipleSpatioTemporalWritable.class); job.setMapOutputValueClass(AggregationArrayWritable.class); job.setOutputKeyClass(MultipleSpatioTemporalWritable.class); job.setOutputValueClass(AggregationArrayWritable.class); job.setMapperClass(PreProcessingMapper.class); job.setCombinerClass(PreProcessingCombiner.class); job.setReducerClass(PreProcessingReducer.class); job.setNumReduceTasks(machineConf.getNumberReduces()); //job.setNumReduceTasks(1); job.setInputFormatClass(TextInputFormat.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); SequenceFileOutputFormat.setCompressOutput(job, true); SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK); FileInputFormat.setInputPaths(job, new Path(s3bucket + FrameworkUtils.dataDir + "/" + dataset)); FileOutputFormat.setOutputPath(job, new Path(fileName)); job.setJarByClass(PreProcessing.class); long start = System.currentTimeMillis(); job.submit(); job.waitForCompletion(true); System.out.println(fileName + "\t" + (System.currentTimeMillis() - start)); }
From source file:edu.nyu.vida.data_polygamy.standard_techniques.CorrelationTechniques.java
/** * @param args/*w ww .j a v a 2 s. co m*/ * @throws ParseException */ @SuppressWarnings({ "deprecation" }) public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException { Options options = new Options(); Option forceOption = new Option("f", "force", false, "force the computation of the relationship " + "even if files already exist"); forceOption.setRequired(false); options.addOption(forceOption); Option g1Option = new Option("g1", "first-group", true, "set first group of datasets"); g1Option.setRequired(true); g1Option.setArgName("FIRST GROUP"); g1Option.setArgs(Option.UNLIMITED_VALUES); options.addOption(g1Option); Option g2Option = new Option("g2", "second-group", true, "set second group of datasets"); g2Option.setRequired(false); g2Option.setArgName("SECOND GROUP"); g2Option.setArgs(Option.UNLIMITED_VALUES); options.addOption(g2Option); Option machineOption = new Option("m", "machine", true, "machine identifier"); machineOption.setRequired(true); machineOption.setArgName("MACHINE"); machineOption.setArgs(1); options.addOption(machineOption); Option nodesOption = new Option("n", "nodes", true, "number of nodes"); nodesOption.setRequired(true); nodesOption.setArgName("NODES"); nodesOption.setArgs(1); options.addOption(nodesOption); Option s3Option = new Option("s3", "s3", false, "data on Amazon S3"); s3Option.setRequired(false); options.addOption(s3Option); Option awsAccessKeyIdOption = new Option("aws_id", "aws-id", true, "aws access key id; " + "this is required if the execution is on aws"); awsAccessKeyIdOption.setRequired(false); awsAccessKeyIdOption.setArgName("AWS-ACCESS-KEY-ID"); awsAccessKeyIdOption.setArgs(1); options.addOption(awsAccessKeyIdOption); Option awsSecretAccessKeyOption = new Option("aws_key", "aws-id", true, "aws secrect access key; " + "this is required if the execution is on aws"); awsSecretAccessKeyOption.setRequired(false); awsSecretAccessKeyOption.setArgName("AWS-SECRET-ACCESS-KEY"); awsSecretAccessKeyOption.setArgs(1); options.addOption(awsSecretAccessKeyOption); Option bucketOption = new Option("b", "s3-bucket", true, "bucket on s3; " + "this is required if the execution is on aws"); bucketOption.setRequired(false); bucketOption.setArgName("S3-BUCKET"); bucketOption.setArgs(1); options.addOption(bucketOption); Option helpOption = new Option("h", "help", false, "display this message"); helpOption.setRequired(false); options.addOption(helpOption); HelpFormatter formatter = new HelpFormatter(); CommandLineParser parser = new PosixParser(); CommandLine cmd = null; try { cmd = parser.parse(options, args); } catch (ParseException e) { formatter.printHelp( "hadoop jar data-polygamy.jar " + "edu.nyu.vida.data_polygamy.standard_techniques.CorrelationTechniques", options, true); System.exit(0); } if (cmd.hasOption("h")) { formatter.printHelp( "hadoop jar data-polygamy.jar " + "edu.nyu.vida.data_polygamy.standard_techniques.CorrelationTechniques", options, true); System.exit(0); } boolean s3 = cmd.hasOption("s3"); String s3bucket = ""; String awsAccessKeyId = ""; String awsSecretAccessKey = ""; if (s3) { if ((!cmd.hasOption("aws_id")) || (!cmd.hasOption("aws_key")) || (!cmd.hasOption("b"))) { System.out.println( "Arguments 'aws_id', 'aws_key', and 'b'" + " are mandatory if execution is on AWS."); formatter.printHelp( "hadoop jar data-polygamy.jar " + "edu.nyu.vida.data_polygamy.standard_techniques.CorrelationTechniques", options, true); System.exit(0); } s3bucket = cmd.getOptionValue("b"); awsAccessKeyId = 
cmd.getOptionValue("aws_id"); awsSecretAccessKey = cmd.getOptionValue("aws_key"); } boolean snappyCompression = false; boolean bzip2Compression = false; String machine = cmd.getOptionValue("m"); int nbNodes = Integer.parseInt(cmd.getOptionValue("n")); Configuration s3conf = new Configuration(); if (s3) { s3conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId); s3conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey); s3conf.set("bucket", s3bucket); } Path path = null; FileSystem fs = FileSystem.get(new Configuration()); ArrayList<String> shortDataset = new ArrayList<String>(); ArrayList<String> firstGroup = new ArrayList<String>(); ArrayList<String> secondGroup = new ArrayList<String>(); HashMap<String, String> datasetAgg = new HashMap<String, String>(); boolean removeExistingFiles = cmd.hasOption("f"); String[] firstGroupCmd = cmd.getOptionValues("g1"); String[] secondGroupCmd = cmd.hasOption("g2") ? cmd.getOptionValues("g2") : new String[0]; addDatasets(firstGroupCmd, firstGroup, shortDataset, datasetAgg, path, fs, s3conf, s3, s3bucket); addDatasets(secondGroupCmd, secondGroup, shortDataset, datasetAgg, path, fs, s3conf, s3, s3bucket); if (shortDataset.size() == 0) { System.out.println("No datasets to process."); System.exit(0); } if (firstGroup.isEmpty()) { System.out.println("First group of datasets (G1) is empty. " + "Doing G1 = G2."); firstGroup.addAll(secondGroup); } if (secondGroup.isEmpty()) { System.out.println("Second group of datasets (G2) is empty. " + "Doing G2 = G1."); secondGroup.addAll(firstGroup); } // getting dataset ids String datasetNames = ""; String datasetIds = ""; HashMap<String, String> datasetId = new HashMap<String, String>(); Iterator<String> it = shortDataset.iterator(); while (it.hasNext()) { datasetId.put(it.next(), null); } if (s3) { path = new Path(s3bucket + FrameworkUtils.datasetsIndexDir); fs = FileSystem.get(path.toUri(), s3conf); } else { path = new Path(fs.getHomeDirectory() + "/" + FrameworkUtils.datasetsIndexDir); } BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(path))); String line = br.readLine(); while (line != null) { String[] dt = line.split("\t"); if (datasetId.containsKey(dt[0])) { datasetId.put(dt[0], dt[1]); datasetNames += dt[0] + ","; datasetIds += dt[1] + ","; } line = br.readLine(); } br.close(); if (s3) fs.close(); datasetNames = datasetNames.substring(0, datasetNames.length() - 1); datasetIds = datasetIds.substring(0, datasetIds.length() - 1); it = shortDataset.iterator(); while (it.hasNext()) { String dataset = it.next(); if (datasetId.get(dataset) == null) { System.out.println("No dataset id for " + dataset); System.exit(0); } } String firstGroupStr = ""; String secondGroupStr = ""; for (String dataset : firstGroup) { firstGroupStr += datasetId.get(dataset) + ","; } for (String dataset : secondGroup) { secondGroupStr += datasetId.get(dataset) + ","; } firstGroupStr = firstGroupStr.substring(0, firstGroupStr.length() - 1); secondGroupStr = secondGroupStr.substring(0, secondGroupStr.length() - 1); FrameworkUtils.createDir(s3bucket + FrameworkUtils.correlationTechniquesDir, s3conf, s3); String dataAttributesInputDirs = ""; String noRelationship = ""; HashSet<String> dirs = new HashSet<String>(); String dataset1; String dataset2; String datasetId1; String datasetId2; for (int i = 0; i < firstGroup.size(); i++) { for (int j = 0; j < secondGroup.size(); j++) { if (Integer.parseInt(datasetId.get(firstGroup.get(i))) < Integer .parseInt(datasetId.get(secondGroup.get(j)))) { dataset1 = firstGroup.get(i); dataset2 = 
secondGroup.get(j); } else { dataset1 = secondGroup.get(j); dataset2 = firstGroup.get(i); } datasetId1 = datasetId.get(dataset1); datasetId2 = datasetId.get(dataset2); if (dataset1.equals(dataset2)) continue; String correlationOutputFileName = s3bucket + FrameworkUtils.correlationTechniquesDir + "/" + dataset1 + "-" + dataset2 + "/"; if (removeExistingFiles) { FrameworkUtils.removeFile(correlationOutputFileName, s3conf, s3); } if (!FrameworkUtils.fileExists(correlationOutputFileName, s3conf, s3)) { dirs.add(s3bucket + FrameworkUtils.aggregatesDir + "/" + dataset1); dirs.add(s3bucket + FrameworkUtils.aggregatesDir + "/" + dataset2); } else { noRelationship += datasetId1 + "-" + datasetId2 + ","; } } } if (dirs.isEmpty()) { System.out.println("All the relationships were already computed."); System.out.println("Use -f in the beginning of the command line to force the computation."); System.exit(0); } for (String dir : dirs) { dataAttributesInputDirs += dir + ","; } Configuration conf = new Configuration(); Machine machineConf = new Machine(machine, nbNodes); String jobName = "correlation"; String correlationOutputDir = s3bucket + FrameworkUtils.correlationTechniquesDir + "/tmp/"; FrameworkUtils.removeFile(correlationOutputDir, s3conf, s3); for (int i = 0; i < shortDataset.size(); i++) { conf.set("dataset-" + datasetId.get(shortDataset.get(i)) + "-agg", datasetAgg.get(shortDataset.get(i))); } for (int i = 0; i < shortDataset.size(); i++) { conf.set("dataset-" + datasetId.get(shortDataset.get(i)) + "-agg-size", Integer.toString(datasetAgg.get(shortDataset.get(i)).split(",").length)); } conf.set("dataset-keys", datasetIds); conf.set("dataset-names", datasetNames); conf.set("first-group", firstGroupStr); conf.set("second-group", secondGroupStr); conf.set("main-dataset-id", datasetId.get(shortDataset.get(0))); if (noRelationship.length() > 0) { conf.set("no-relationship", noRelationship.substring(0, noRelationship.length() - 1)); } conf.set("mapreduce.tasktracker.map.tasks.maximum", String.valueOf(machineConf.getMaximumTasks())); conf.set("mapreduce.tasktracker.reduce.tasks.maximum", String.valueOf(machineConf.getMaximumTasks())); conf.set("mapreduce.jobtracker.maxtasks.perjob", "-1"); conf.set("mapreduce.reduce.shuffle.parallelcopies", "20"); conf.set("mapreduce.input.fileinputformat.split.minsize", "0"); conf.set("mapreduce.task.io.sort.mb", "200"); conf.set("mapreduce.task.io.sort.factor", "100"); conf.set("mapreduce.task.timeout", "2400000"); if (s3) { machineConf.setMachineConfiguration(conf); conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId); conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey); conf.set("bucket", s3bucket); } if (snappyCompression) { conf.set("mapreduce.map.output.compress", "true"); conf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec"); //conf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec"); } if (bzip2Compression) { conf.set("mapreduce.map.output.compress", "true"); conf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec"); //conf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec"); } Job job = new Job(conf); job.setJobName(jobName); job.setMapOutputKeyClass(PairAttributeWritable.class); job.setMapOutputValueClass(SpatioTemporalValueWritable.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); job.setMapperClass(CorrelationTechniquesMapper.class); 
job.setReducerClass(CorrelationTechniquesReducer.class); job.setNumReduceTasks(machineConf.getNumberReduces()); job.setInputFormatClass(SequenceFileInputFormat.class); LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class); FileInputFormat.setInputDirRecursive(job, true); FileInputFormat.setInputPaths(job, dataAttributesInputDirs.substring(0, dataAttributesInputDirs.length() - 1)); FileOutputFormat.setOutputPath(job, new Path(correlationOutputDir)); job.setJarByClass(CorrelationTechniques.class); long start = System.currentTimeMillis(); job.submit(); job.waitForCompletion(true); System.out.println(jobName + "\t" + (System.currentTimeMillis() - start)); // moving files to right place for (int i = 0; i < firstGroup.size(); i++) { for (int j = 0; j < secondGroup.size(); j++) { if (Integer.parseInt(datasetId.get(firstGroup.get(i))) < Integer .parseInt(datasetId.get(secondGroup.get(j)))) { dataset1 = firstGroup.get(i); dataset2 = secondGroup.get(j); } else { dataset1 = secondGroup.get(j); dataset2 = firstGroup.get(i); } if (dataset1.equals(dataset2)) continue; String from = s3bucket + FrameworkUtils.correlationTechniquesDir + "/tmp/" + dataset1 + "-" + dataset2 + "/"; String to = s3bucket + FrameworkUtils.correlationTechniquesDir + "/" + dataset1 + "-" + dataset2 + "/"; FrameworkUtils.renameFile(from, to, s3conf, s3); } } }
From source file:edu.nyu.vida.data_polygamy.feature_identification.IndexCreation.java
/** * @param args/*from w w w . j a va 2 s . c o m*/ */ @SuppressWarnings({ "deprecation" }) public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException { Options options = new Options(); Option forceOption = new Option("f", "force", false, "force the computation of the index and events " + "even if files already exist"); forceOption.setRequired(false); options.addOption(forceOption); Option thresholdOption = new Option("t", "use-custom-thresholds", false, "use custom thresholds for regular and rare events, defined in HDFS_HOME/" + FrameworkUtils.thresholdDir + " file"); thresholdOption.setRequired(false); options.addOption(thresholdOption); Option gOption = new Option("g", "group", true, "set group of datasets for which the indices and events" + " will be computed"); gOption.setRequired(true); gOption.setArgName("GROUP"); gOption.setArgs(Option.UNLIMITED_VALUES); options.addOption(gOption); Option machineOption = new Option("m", "machine", true, "machine identifier"); machineOption.setRequired(true); machineOption.setArgName("MACHINE"); machineOption.setArgs(1); options.addOption(machineOption); Option nodesOption = new Option("n", "nodes", true, "number of nodes"); nodesOption.setRequired(true); nodesOption.setArgName("NODES"); nodesOption.setArgs(1); options.addOption(nodesOption); Option s3Option = new Option("s3", "s3", false, "data on Amazon S3"); s3Option.setRequired(false); options.addOption(s3Option); Option awsAccessKeyIdOption = new Option("aws_id", "aws-id", true, "aws access key id; " + "this is required if the execution is on aws"); awsAccessKeyIdOption.setRequired(false); awsAccessKeyIdOption.setArgName("AWS-ACCESS-KEY-ID"); awsAccessKeyIdOption.setArgs(1); options.addOption(awsAccessKeyIdOption); Option awsSecretAccessKeyOption = new Option("aws_key", "aws-id", true, "aws secrect access key; " + "this is required if the execution is on aws"); awsSecretAccessKeyOption.setRequired(false); awsSecretAccessKeyOption.setArgName("AWS-SECRET-ACCESS-KEY"); awsSecretAccessKeyOption.setArgs(1); options.addOption(awsSecretAccessKeyOption); Option bucketOption = new Option("b", "s3-bucket", true, "bucket on s3; " + "this is required if the execution is on aws"); bucketOption.setRequired(false); bucketOption.setArgName("S3-BUCKET"); bucketOption.setArgs(1); options.addOption(bucketOption); Option helpOption = new Option("h", "help", false, "display this message"); helpOption.setRequired(false); options.addOption(helpOption); HelpFormatter formatter = new HelpFormatter(); CommandLineParser parser = new PosixParser(); CommandLine cmd = null; try { cmd = parser.parse(options, args); } catch (ParseException e) { formatter.printHelp("hadoop jar data-polygamy.jar " + "edu.nyu.vida.data_polygamy.feature_identification.IndexCreation", options, true); System.exit(0); } if (cmd.hasOption("h")) { formatter.printHelp("hadoop jar data-polygamy.jar " + "edu.nyu.vida.data_polygamy.feature_identification.IndexCreation", options, true); System.exit(0); } boolean s3 = cmd.hasOption("s3"); String s3bucket = ""; String awsAccessKeyId = ""; String awsSecretAccessKey = ""; if (s3) { if ((!cmd.hasOption("aws_id")) || (!cmd.hasOption("aws_key")) || (!cmd.hasOption("b"))) { System.out.println( "Arguments 'aws_id', 'aws_key', and 'b'" + " are mandatory if execution is on AWS."); formatter.printHelp("hadoop jar data-polygamy.jar " + "edu.nyu.vida.data_polygamy.feature_identification.IndexCreation", options, true); System.exit(0); } s3bucket = cmd.getOptionValue("b"); 
awsAccessKeyId = cmd.getOptionValue("aws_id"); awsSecretAccessKey = cmd.getOptionValue("aws_key"); } boolean snappyCompression = false; boolean bzip2Compression = false; String machine = cmd.getOptionValue("m"); int nbNodes = Integer.parseInt(cmd.getOptionValue("n")); Configuration s3conf = new Configuration(); if (s3) { s3conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId); s3conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey); s3conf.set("bucket", s3bucket); } String datasetNames = ""; String datasetIds = ""; ArrayList<String> shortDataset = new ArrayList<String>(); ArrayList<String> shortDatasetIndex = new ArrayList<String>(); HashMap<String, String> datasetAgg = new HashMap<String, String>(); HashMap<String, String> datasetId = new HashMap<String, String>(); HashMap<String, HashMap<Integer, Double>> datasetRegThreshold = new HashMap<String, HashMap<Integer, Double>>(); HashMap<String, HashMap<Integer, Double>> datasetRareThreshold = new HashMap<String, HashMap<Integer, Double>>(); Path path = null; FileSystem fs = FileSystem.get(new Configuration()); BufferedReader br; boolean removeExistingFiles = cmd.hasOption("f"); boolean isThresholdUserDefined = cmd.hasOption("t"); for (String dataset : cmd.getOptionValues("g")) { // getting aggregates String[] aggregate = FrameworkUtils.searchAggregates(dataset, s3conf, s3); if (aggregate.length == 0) { System.out.println("No aggregates found for " + dataset + "."); continue; } // getting aggregates header String aggregatesHeaderFileName = FrameworkUtils.searchAggregatesHeader(dataset, s3conf, s3); if (aggregatesHeaderFileName == null) { System.out.println("No aggregate header for " + dataset); continue; } String aggregatesHeader = s3bucket + FrameworkUtils.preProcessingDir + "/" + aggregatesHeaderFileName; shortDataset.add(dataset); datasetId.put(dataset, null); if (s3) { path = new Path(aggregatesHeader); fs = FileSystem.get(path.toUri(), s3conf); } else { path = new Path(fs.getHomeDirectory() + "/" + aggregatesHeader); } br = new BufferedReader(new InputStreamReader(fs.open(path))); datasetAgg.put(dataset, br.readLine().split("\t")[1]); br.close(); if (s3) fs.close(); } if (shortDataset.size() == 0) { System.out.println("No datasets to process."); System.exit(0); } // getting dataset id if (s3) { path = new Path(s3bucket + FrameworkUtils.datasetsIndexDir); fs = FileSystem.get(path.toUri(), s3conf); } else { path = new Path(fs.getHomeDirectory() + "/" + FrameworkUtils.datasetsIndexDir); } br = new BufferedReader(new InputStreamReader(fs.open(path))); String line = br.readLine(); while (line != null) { String[] dt = line.split("\t"); if (datasetId.containsKey(dt[0])) { datasetId.put(dt[0], dt[1]); datasetNames += dt[0] + ","; datasetIds += dt[1] + ","; } line = br.readLine(); } br.close(); datasetNames = datasetNames.substring(0, datasetNames.length() - 1); datasetIds = datasetIds.substring(0, datasetIds.length() - 1); Iterator<String> it = shortDataset.iterator(); while (it.hasNext()) { String dataset = it.next(); if (datasetId.get(dataset) == null) { System.out.println("No dataset id for " + dataset); System.exit(0); } } // getting user defined thresholds if (isThresholdUserDefined) { if (s3) { path = new Path(s3bucket + FrameworkUtils.thresholdDir); fs = FileSystem.get(path.toUri(), s3conf); } else { path = new Path(fs.getHomeDirectory() + "/" + FrameworkUtils.thresholdDir); } br = new BufferedReader(new InputStreamReader(fs.open(path))); line = br.readLine(); while (line != null) { // getting dataset name String dataset = line.trim(); 
HashMap<Integer, Double> regThresholds = new HashMap<Integer, Double>(); HashMap<Integer, Double> rareThresholds = new HashMap<Integer, Double>(); line = br.readLine(); while ((line != null) && (line.split("\t").length > 1)) { // getting attribute ids and thresholds String[] keyVals = line.trim().split("\t"); int att = Integer.parseInt(keyVals[0].trim()); regThresholds.put(att, Double.parseDouble(keyVals[1].trim())); rareThresholds.put(att, Double.parseDouble(keyVals[2].trim())); line = br.readLine(); } datasetRegThreshold.put(dataset, regThresholds); datasetRareThreshold.put(dataset, rareThresholds); } br.close(); } if (s3) fs.close(); // datasets that will use existing merge tree ArrayList<String> useMergeTree = new ArrayList<String>(); // creating index for each spatio-temporal resolution FrameworkUtils.createDir(s3bucket + FrameworkUtils.indexDir, s3conf, s3); HashSet<String> input = new HashSet<String>(); for (String dataset : shortDataset) { String indexCreationOutputFileName = s3bucket + FrameworkUtils.indexDir + "/" + dataset + "/"; String mergeTreeFileName = s3bucket + FrameworkUtils.mergeTreeDir + "/" + dataset + "/"; if (removeExistingFiles) { FrameworkUtils.removeFile(indexCreationOutputFileName, s3conf, s3); FrameworkUtils.removeFile(mergeTreeFileName, s3conf, s3); FrameworkUtils.createDir(mergeTreeFileName, s3conf, s3); } else if (datasetRegThreshold.containsKey(dataset)) { FrameworkUtils.removeFile(indexCreationOutputFileName, s3conf, s3); if (FrameworkUtils.fileExists(mergeTreeFileName, s3conf, s3)) { useMergeTree.add(dataset); } } if (!FrameworkUtils.fileExists(indexCreationOutputFileName, s3conf, s3)) { input.add(s3bucket + FrameworkUtils.aggregatesDir + "/" + dataset); shortDatasetIndex.add(dataset); } } if (input.isEmpty()) { System.out.println("All the input datasets have indices."); System.out.println("Use -f in the beginning of the command line to force the computation."); System.exit(0); } String aggregateDatasets = ""; it = input.iterator(); while (it.hasNext()) { aggregateDatasets += it.next() + ","; } Job icJob = null; Configuration icConf = new Configuration(); Machine machineConf = new Machine(machine, nbNodes); String jobName = "index"; String indexOutputDir = s3bucket + FrameworkUtils.indexDir + "/tmp/"; FrameworkUtils.removeFile(indexOutputDir, s3conf, s3); icConf.set("dataset-name", datasetNames); icConf.set("dataset-id", datasetIds); if (!useMergeTree.isEmpty()) { String useMergeTreeStr = ""; for (String dt : useMergeTree) { useMergeTreeStr += dt + ","; } icConf.set("use-merge-tree", useMergeTreeStr.substring(0, useMergeTreeStr.length() - 1)); } for (int i = 0; i < shortDataset.size(); i++) { String dataset = shortDataset.get(i); String id = datasetId.get(dataset); icConf.set("dataset-" + id + "-aggregates", datasetAgg.get(dataset)); if (datasetRegThreshold.containsKey(dataset)) { HashMap<Integer, Double> regThresholds = datasetRegThreshold.get(dataset); String thresholds = ""; for (int att : regThresholds.keySet()) { thresholds += String.valueOf(att) + "-" + String.valueOf(regThresholds.get(att)) + ","; } icConf.set("regular-" + id, thresholds.substring(0, thresholds.length() - 1)); } if (datasetRareThreshold.containsKey(dataset)) { HashMap<Integer, Double> rareThresholds = datasetRareThreshold.get(dataset); String thresholds = ""; for (int att : rareThresholds.keySet()) { thresholds += String.valueOf(att) + "-" + String.valueOf(rareThresholds.get(att)) + ","; } icConf.set("rare-" + id, thresholds.substring(0, thresholds.length() - 1)); } } 
icConf.set("mapreduce.tasktracker.map.tasks.maximum", String.valueOf(machineConf.getMaximumTasks())); icConf.set("mapreduce.tasktracker.reduce.tasks.maximum", String.valueOf(machineConf.getMaximumTasks())); icConf.set("mapreduce.jobtracker.maxtasks.perjob", "-1"); icConf.set("mapreduce.reduce.shuffle.parallelcopies", "20"); icConf.set("mapreduce.input.fileinputformat.split.minsize", "0"); icConf.set("mapreduce.task.io.sort.mb", "200"); icConf.set("mapreduce.task.io.sort.factor", "100"); //icConf.set("mapreduce.task.timeout", "1800000"); machineConf.setMachineConfiguration(icConf); if (s3) { machineConf.setMachineConfiguration(icConf); icConf.set("fs.s3.awsAccessKeyId", awsAccessKeyId); icConf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey); icConf.set("bucket", s3bucket); } if (snappyCompression) { icConf.set("mapreduce.map.output.compress", "true"); icConf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec"); //icConf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec"); } if (bzip2Compression) { icConf.set("mapreduce.map.output.compress", "true"); icConf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec"); //icConf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec"); } icJob = new Job(icConf); icJob.setJobName(jobName); icJob.setMapOutputKeyClass(AttributeResolutionWritable.class); icJob.setMapOutputValueClass(SpatioTemporalFloatWritable.class); icJob.setOutputKeyClass(AttributeResolutionWritable.class); icJob.setOutputValueClass(TopologyTimeSeriesWritable.class); //icJob.setOutputKeyClass(Text.class); //icJob.setOutputValueClass(Text.class); icJob.setMapperClass(IndexCreationMapper.class); icJob.setReducerClass(IndexCreationReducer.class); icJob.setNumReduceTasks(machineConf.getNumberReduces()); icJob.setInputFormatClass(SequenceFileInputFormat.class); //icJob.setOutputFormatClass(SequenceFileOutputFormat.class); LazyOutputFormat.setOutputFormatClass(icJob, SequenceFileOutputFormat.class); //LazyOutputFormat.setOutputFormatClass(icJob, TextOutputFormat.class); SequenceFileOutputFormat.setCompressOutput(icJob, true); SequenceFileOutputFormat.setOutputCompressionType(icJob, CompressionType.BLOCK); FileInputFormat.setInputDirRecursive(icJob, true); FileInputFormat.setInputPaths(icJob, aggregateDatasets.substring(0, aggregateDatasets.length() - 1)); FileOutputFormat.setOutputPath(icJob, new Path(indexOutputDir)); icJob.setJarByClass(IndexCreation.class); long start = System.currentTimeMillis(); icJob.submit(); icJob.waitForCompletion(true); System.out.println(jobName + "\t" + (System.currentTimeMillis() - start)); // moving files to right place for (String dataset : shortDatasetIndex) { String from = s3bucket + FrameworkUtils.indexDir + "/tmp/" + dataset + "/"; String to = s3bucket + FrameworkUtils.indexDir + "/" + dataset + "/"; FrameworkUtils.renameFile(from, to, s3conf, s3); } }
From source file:edu.nyu.vida.data_polygamy.scalar_function_computation.Aggregation.java
/** * @param args */ @SuppressWarnings({ "deprecation" }) public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException { Options options = new Options(); Option forceOption = new Option("f", "force", false, "force the computation of the aggregate functions " + "even if files already exist"); forceOption.setRequired(false); options.addOption(forceOption); Option gOption = new Option("g", "group", true, "set group of datasets for which the aggregate functions" + " will be computed, followed by their temporal and spatial attribute indices"); gOption.setRequired(true); gOption.setArgName("GROUP"); gOption.setArgs(Option.UNLIMITED_VALUES); options.addOption(gOption); Option machineOption = new Option("m", "machine", true, "machine identifier"); machineOption.setRequired(true); machineOption.setArgName("MACHINE"); machineOption.setArgs(1); options.addOption(machineOption); Option nodesOption = new Option("n", "nodes", true, "number of nodes"); nodesOption.setRequired(true); nodesOption.setArgName("NODES"); nodesOption.setArgs(1); options.addOption(nodesOption); Option s3Option = new Option("s3", "s3", false, "data on Amazon S3"); s3Option.setRequired(false); options.addOption(s3Option); Option awsAccessKeyIdOption = new Option("aws_id", "aws-id", true, "aws access key id; " + "this is required if the execution is on aws"); awsAccessKeyIdOption.setRequired(false); awsAccessKeyIdOption.setArgName("AWS-ACCESS-KEY-ID"); awsAccessKeyIdOption.setArgs(1); options.addOption(awsAccessKeyIdOption); Option awsSecretAccessKeyOption = new Option("aws_key", "aws-key", true, "aws secret access key; " + "this is required if the execution is on aws"); awsSecretAccessKeyOption.setRequired(false); awsSecretAccessKeyOption.setArgName("AWS-SECRET-ACCESS-KEY"); awsSecretAccessKeyOption.setArgs(1); options.addOption(awsSecretAccessKeyOption); Option bucketOption = new Option("b", "s3-bucket", true, "bucket on s3; " + "this is required if the execution is on aws"); bucketOption.setRequired(false); bucketOption.setArgName("S3-BUCKET"); bucketOption.setArgs(1); options.addOption(bucketOption); Option helpOption = new Option("h", "help", false, "display this message"); helpOption.setRequired(false); options.addOption(helpOption); HelpFormatter formatter = new HelpFormatter(); CommandLineParser parser = new PosixParser(); CommandLine cmd = null; try { cmd = parser.parse(options, args); } catch (ParseException e) { formatter.printHelp("hadoop jar data-polygamy.jar " + "edu.nyu.vida.data_polygamy.scalar_function_computation.Aggregation", options, true); System.exit(0); } if (cmd.hasOption("h")) { formatter.printHelp("hadoop jar data-polygamy.jar " + "edu.nyu.vida.data_polygamy.scalar_function_computation.Aggregation", options, true); System.exit(0); } boolean s3 = cmd.hasOption("s3"); String s3bucket = ""; String awsAccessKeyId = ""; String awsSecretAccessKey = ""; if (s3) { if ((!cmd.hasOption("aws_id")) || (!cmd.hasOption("aws_key")) || (!cmd.hasOption("b"))) { System.out.println( "Arguments 'aws_id', 'aws_key', and 'b'" + " are mandatory if execution is on AWS."); formatter.printHelp( "hadoop jar data-polygamy.jar " + "edu.nyu.vida.data_polygamy.scalar_function_computation.Aggregation", options, true); System.exit(0); } s3bucket = cmd.getOptionValue("b"); awsAccessKeyId = cmd.getOptionValue("aws_id"); awsSecretAccessKey = cmd.getOptionValue("aws_key"); } boolean snappyCompression = false; boolean bzip2Compression = false; String machine = 
cmd.getOptionValue("m"); int nbNodes = Integer.parseInt(cmd.getOptionValue("n")); Configuration s3conf = new Configuration(); if (s3) { s3conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId); s3conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey); s3conf.set("bucket", s3bucket); } String datasetNames = ""; String datasetIds = ""; String preProcessingDatasets = ""; ArrayList<String> shortDataset = new ArrayList<String>(); ArrayList<String> shortDatasetAggregation = new ArrayList<String>(); HashMap<String, String> datasetTempAtt = new HashMap<String, String>(); HashMap<String, String> datasetSpatialAtt = new HashMap<String, String>(); HashMap<String, String> preProcessingDataset = new HashMap<String, String>(); HashMap<String, String> datasetId = new HashMap<String, String>(); boolean removeExistingFiles = cmd.hasOption("f"); String[] datasetArgs = cmd.getOptionValues("g"); for (int i = 0; i < datasetArgs.length; i += 3) { String dataset = datasetArgs[i]; // getting pre-processing String tempPreProcessing = FrameworkUtils.searchPreProcessing(dataset, s3conf, s3); if (tempPreProcessing == null) { System.out.println("No pre-processing available for " + dataset); continue; } preProcessingDataset.put(dataset, tempPreProcessing); shortDataset.add(dataset); datasetTempAtt.put(dataset, ((datasetArgs[i + 1] == "null") ? null : datasetArgs[i + 1])); datasetSpatialAtt.put(dataset, ((datasetArgs[i + 2] == "null") ? null : datasetArgs[i + 2])); datasetId.put(dataset, null); } if (shortDataset.size() == 0) { System.out.println("No datasets to process."); System.exit(0); } // getting dataset id Path path = null; FileSystem fs = null; if (s3) { path = new Path(s3bucket + FrameworkUtils.datasetsIndexDir); fs = FileSystem.get(path.toUri(), s3conf); } else { fs = FileSystem.get(new Configuration()); path = new Path(fs.getHomeDirectory() + "/" + FrameworkUtils.datasetsIndexDir); } BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(path))); String line = br.readLine(); while (line != null) { String[] dt = line.split("\t"); if (datasetId.containsKey(dt[0])) { datasetId.put(dt[0], dt[1]); datasetNames += dt[0] + ","; datasetIds += dt[1] + ","; } line = br.readLine(); } br.close(); if (s3) fs.close(); datasetNames = datasetNames.substring(0, datasetNames.length() - 1); datasetIds = datasetIds.substring(0, datasetIds.length() - 1); Iterator<String> it = shortDataset.iterator(); while (it.hasNext()) { String dataset = it.next(); if (datasetId.get(dataset) == null) { System.out.println("No dataset id for " + dataset); System.exit(0); } } FrameworkUtils.createDir(s3bucket + FrameworkUtils.aggregatesDir, s3conf, s3); // getting smallest resolution HashMap<String, String> tempResMap = new HashMap<String, String>(); HashMap<String, String> spatialResMap = new HashMap<String, String>(); HashMap<String, String> datasetTemporalStrMap = new HashMap<String, String>(); HashMap<String, String> datasetSpatialStrMap = new HashMap<String, String>(); HashSet<String> input = new HashSet<String>(); for (String dataset : shortDataset) { String[] datasetArray = preProcessingDataset.get(dataset).split("-"); String datasetTemporalStr = datasetArray[datasetArray.length - 2]; int datasetTemporal = utils.temporalResolution(datasetTemporalStr); String datasetSpatialStr = datasetArray[datasetArray.length - 1]; int datasetSpatial = utils.spatialResolution(datasetSpatialStr); // finding all possible resolutions String[] temporalResolutions = FrameworkUtils.getAggTempResolutions(datasetTemporal); String[] spatialResolutions = 
FrameworkUtils.getAggSpatialResolutions(datasetSpatial); String temporalResolution = ""; String spatialResolution = ""; String tempRes = ""; String spatialRes = ""; boolean dataAdded = false; for (int i = 0; i < temporalResolutions.length; i++) { for (int j = 0; j < spatialResolutions.length; j++) { temporalResolution = temporalResolutions[i]; spatialResolution = spatialResolutions[j]; String aggregatesOutputFileName = s3bucket + FrameworkUtils.aggregatesDir + "/" + dataset + "/"; if (removeExistingFiles) { FrameworkUtils.removeFile(aggregatesOutputFileName, s3conf, s3); } if (!FrameworkUtils.fileExists(aggregatesOutputFileName, s3conf, s3)) { dataAdded = true; tempRes += temporalResolution + "-"; spatialRes += spatialResolution + "-"; } } } if (dataAdded) { input.add(s3bucket + FrameworkUtils.preProcessingDir + "/" + preProcessingDataset.get(dataset)); shortDatasetAggregation.add(dataset); tempResMap.put(dataset, tempRes.substring(0, tempRes.length() - 1)); spatialResMap.put(dataset, spatialRes.substring(0, spatialRes.length() - 1)); datasetTemporalStrMap.put(dataset, datasetTemporalStr); datasetSpatialStrMap.put(dataset, datasetSpatialStr); } } if (input.isEmpty()) { System.out.println("All the input datasets have aggregates."); System.out.println("Use -f in the beginning of the command line to force the computation."); System.exit(0); } it = input.iterator(); while (it.hasNext()) { preProcessingDatasets += it.next() + ","; } Job aggJob = null; String aggregatesOutputDir = s3bucket + FrameworkUtils.aggregatesDir + "/tmp/"; String jobName = "aggregates"; FrameworkUtils.removeFile(aggregatesOutputDir, s3conf, s3); Configuration aggConf = new Configuration(); Machine machineConf = new Machine(machine, nbNodes); aggConf.set("dataset-name", datasetNames); aggConf.set("dataset-id", datasetIds); for (int i = 0; i < shortDatasetAggregation.size(); i++) { String dataset = shortDatasetAggregation.get(i); String id = datasetId.get(dataset); aggConf.set("dataset-" + id + "-temporal-resolutions", tempResMap.get(dataset)); aggConf.set("dataset-" + id + "-spatial-resolutions", spatialResMap.get(dataset)); aggConf.set("dataset-" + id + "-temporal-att", datasetTempAtt.get(dataset)); aggConf.set("dataset-" + id + "-spatial-att", datasetSpatialAtt.get(dataset)); aggConf.set("dataset-" + id + "-temporal", datasetTemporalStrMap.get(dataset)); aggConf.set("dataset-" + id + "-spatial", datasetSpatialStrMap.get(dataset)); if (s3) aggConf.set("dataset-" + id, s3bucket + FrameworkUtils.preProcessingDir + "/" + preProcessingDataset.get(dataset)); else aggConf.set("dataset-" + id, FileSystem.get(new Configuration()).getHomeDirectory() + "/" + FrameworkUtils.preProcessingDir + "/" + preProcessingDataset.get(dataset)); } aggConf.set("mapreduce.tasktracker.map.tasks.maximum", String.valueOf(machineConf.getMaximumTasks())); aggConf.set("mapreduce.tasktracker.reduce.tasks.maximum", String.valueOf(machineConf.getMaximumTasks())); aggConf.set("mapreduce.jobtracker.maxtasks.perjob", "-1"); aggConf.set("mapreduce.reduce.shuffle.parallelcopies", "20"); aggConf.set("mapreduce.input.fileinputformat.split.minsize", "0"); aggConf.set("mapreduce.task.io.sort.mb", "200"); aggConf.set("mapreduce.task.io.sort.factor", "100"); machineConf.setMachineConfiguration(aggConf); if (s3) { machineConf.setMachineConfiguration(aggConf); aggConf.set("fs.s3.awsAccessKeyId", awsAccessKeyId); aggConf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey); } if (snappyCompression) { aggConf.set("mapreduce.map.output.compress", "true"); 
aggConf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec"); //aggConf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec"); } if (bzip2Compression) { aggConf.set("mapreduce.map.output.compress", "true"); aggConf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec"); //aggConf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec"); } aggJob = new Job(aggConf); aggJob.setJobName(jobName); aggJob.setMapOutputKeyClass(SpatioTemporalWritable.class); aggJob.setMapOutputValueClass(AggregationArrayWritable.class); aggJob.setOutputKeyClass(SpatioTemporalWritable.class); aggJob.setOutputValueClass(FloatArrayWritable.class); //aggJob.setOutputKeyClass(Text.class); //aggJob.setOutputValueClass(Text.class); aggJob.setMapperClass(AggregationMapper.class); aggJob.setCombinerClass(AggregationCombiner.class); aggJob.setReducerClass(AggregationReducer.class); aggJob.setNumReduceTasks(machineConf.getNumberReduces()); aggJob.setInputFormatClass(SequenceFileInputFormat.class); //aggJob.setOutputFormatClass(SequenceFileOutputFormat.class); LazyOutputFormat.setOutputFormatClass(aggJob, SequenceFileOutputFormat.class); //LazyOutputFormat.setOutputFormatClass(aggJob, TextOutputFormat.class); SequenceFileOutputFormat.setCompressOutput(aggJob, true); SequenceFileOutputFormat.setOutputCompressionType(aggJob, CompressionType.BLOCK); FileInputFormat.setInputDirRecursive(aggJob, true); FileInputFormat.setInputPaths(aggJob, preProcessingDatasets.substring(0, preProcessingDatasets.length() - 1)); FileOutputFormat.setOutputPath(aggJob, new Path(aggregatesOutputDir)); aggJob.setJarByClass(Aggregation.class); long start = System.currentTimeMillis(); aggJob.submit(); aggJob.waitForCompletion(true); System.out.println(jobName + "\t" + (System.currentTimeMillis() - start)); // moving files to right place for (String dataset : shortDatasetAggregation) { String from = s3bucket + FrameworkUtils.aggregatesDir + "/tmp/" + dataset + "/"; String to = s3bucket + FrameworkUtils.aggregatesDir + "/" + dataset + "/"; FrameworkUtils.renameFile(from, to, s3conf, s3); } }
From source file:tuit.java
@SuppressWarnings("ConstantConditions") public static void main(String[] args) { System.out.println(licence);/*w ww . j ava 2 s . com*/ //Declare variables File inputFile; File outputFile; File tmpDir; File blastnExecutable; File properties; File blastOutputFile = null; // TUITPropertiesLoader tuitPropertiesLoader; TUITProperties tuitProperties; // String[] parameters = null; // Connection connection = null; MySQL_Connector mySQL_connector; // Map<Ranks, TUITCutoffSet> cutoffMap; // BLASTIdentifier blastIdentifier = null; // RamDb ramDb = null; CommandLineParser parser = new GnuParser(); Options options = new Options(); options.addOption(tuit.IN, "input<file>", true, "Input file (currently fasta-formatted only)"); options.addOption(tuit.OUT, "output<file>", true, "Output file (in " + tuit.TUIT_EXT + " format)"); options.addOption(tuit.P, "prop<file>", true, "Properties file (XML formatted)"); options.addOption(tuit.V, "verbose", false, "Enable verbose output"); options.addOption(tuit.B, "blast_output<file>", true, "Perform on a pre-BLASTed output"); options.addOption(tuit.DEPLOY, "deploy", false, "Deploy the taxonomic databases"); options.addOption(tuit.UPDATE, "update", false, "Update the taxonomic databases"); options.addOption(tuit.USE_DB, "usedb", false, "Use RDBMS instead of RAM-based taxonomy"); Option option = new Option(tuit.REDUCE, "reduce", true, "Pack identical (100% similar sequences) records in the given sample file"); option.setArgs(Option.UNLIMITED_VALUES); options.addOption(option); option = new Option(tuit.COMBINE, "combine", true, "Combine a set of given reduction files into an HMP Tree-compatible taxonomy"); option.setArgs(Option.UNLIMITED_VALUES); options.addOption(option); options.addOption(tuit.NORMALIZE, "normalize", false, "If used in combination with -combine ensures that the values are normalized by the root value"); HelpFormatter formatter = new HelpFormatter(); try { //Get TUIT directory final File tuitDir = new File( new File(tuit.class.getProtectionDomain().getCodeSource().getLocation().toURI().getPath()) .getParent()); final File ramDbFile = new File(tuitDir, tuit.RAM_DB); //Setup logger Log.getInstance().setLogName("tuit.log"); //Read command line final CommandLine commandLine = parser.parse(options, args, true); //Check if the REDUCE option is on if (commandLine.hasOption(tuit.REDUCE)) { final String[] fileList = commandLine.getOptionValues(tuit.REDUCE); for (String s : fileList) { final Path path = Paths.get(s); Log.getInstance().log(Level.INFO, "Processing " + path.toString() + "..."); final NucleotideFastaSequenceReductor nucleotideFastaSequenceReductor = NucleotideFastaSequenceReductor .fromPath(path); ReductorFileOperator.save(nucleotideFastaSequenceReductor, path.resolveSibling(path.getFileName().toString() + ".rdc")); } Log.getInstance().log(Level.FINE, "Task done, exiting..."); return; } //Check if COMBINE is on if (commandLine.hasOption(tuit.COMBINE)) { final boolean normalize = commandLine.hasOption(tuit.NORMALIZE); final String[] fileList = commandLine.getOptionValues(tuit.COMBINE); //TODO: implement a test for format here final List<TreeFormatter.TreeFormatterFormat.HMPTreesOutput> hmpTreesOutputs = new ArrayList<>(); final TreeFormatter treeFormatter = TreeFormatter .newInstance(new TreeFormatter.TuitLineTreeFormatterFormat()); for (String s : fileList) { final Path path = Paths.get(s); Log.getInstance().log(Level.INFO, "Merging " + path.toString() + "..."); treeFormatter.loadFromPath(path); final 
TreeFormatter.TreeFormatterFormat.HMPTreesOutput output = TreeFormatter.TreeFormatterFormat.HMPTreesOutput .newInstance(treeFormatter.toHMPTree(normalize), s.substring(0, s.indexOf("."))); hmpTreesOutputs.add(output); treeFormatter.erase(); } final Path destination; if (commandLine.hasOption(OUT)) { destination = Paths.get(commandLine.getOptionValue(tuit.OUT)); } else { destination = Paths.get("merge.tcf"); } CombinatorFileOperator.save(hmpTreesOutputs, treeFormatter, destination); Log.getInstance().log(Level.FINE, "Task done, exiting..."); return; } if (!commandLine.hasOption(tuit.P)) { throw new ParseException("No properties file option found, exiting."); } else { properties = new File(commandLine.getOptionValue(tuit.P)); } //Load properties tuitPropertiesLoader = TUITPropertiesLoader.newInstanceFromFile(properties); tuitProperties = tuitPropertiesLoader.getTuitProperties(); //Create tmp directory and blastn executable tmpDir = new File(tuitProperties.getTMPDir().getPath()); blastnExecutable = new File(tuitProperties.getBLASTNPath().getPath()); //Check for deploy if (commandLine.hasOption(tuit.DEPLOY)) { if (commandLine.hasOption(tuit.USE_DB)) { NCBITablesDeployer.fastDeployNCBIDatabasesFromNCBI(connection, tmpDir); } else { NCBITablesDeployer.fastDeployNCBIRamDatabaseFromNCBI(tmpDir, ramDbFile); } Log.getInstance().log(Level.FINE, "Task done, exiting..."); return; } //Check for update if (commandLine.hasOption(tuit.UPDATE)) { if (commandLine.hasOption(tuit.USE_DB)) { NCBITablesDeployer.updateDatabasesFromNCBI(connection, tmpDir); } else { //No need to specify a different way to update the database other than just deploy in case of the RAM database NCBITablesDeployer.fastDeployNCBIRamDatabaseFromNCBI(tmpDir, ramDbFile); } Log.getInstance().log(Level.FINE, "Task done, exiting..."); return; } //Connect to the database if (commandLine.hasOption(tuit.USE_DB)) { mySQL_connector = MySQL_Connector.newDefaultInstance( "jdbc:mysql://" + tuitProperties.getDBConnection().getUrl().trim() + "/", tuitProperties.getDBConnection().getLogin().trim(), tuitProperties.getDBConnection().getPassword().trim()); mySQL_connector.connectToDatabase(); connection = mySQL_connector.getConnection(); } else { //Probe for ram database if (ramDbFile.exists() && ramDbFile.canRead()) { Log.getInstance().log(Level.INFO, "Loading RAM taxonomic map..."); try { ramDb = RamDb.loadSelfFromFile(ramDbFile); } catch (IOException ie) { if (ie instanceof java.io.InvalidClassException) throw new IOException("The RAM-based taxonomic database needs to be updated."); } } else { Log.getInstance().log(Level.SEVERE, "The RAM database either has not been deployed, or is not accessible." + "Please use the --deploy option and check permissions on the TUIT directory. 
" + "If you were looking to use the RDBMS as a taxonomic reference, plese use the -usedb option."); return; } } if (commandLine.hasOption(tuit.B)) { blastOutputFile = new File(commandLine.getOptionValue(tuit.B)); if (!blastOutputFile.exists() || !blastOutputFile.canRead()) { throw new Exception("BLAST output file either does not exist, or is not readable."); } else if (blastOutputFile.isDirectory()) { throw new Exception("BLAST output file points to a directory."); } } //Check vital parameters if (!commandLine.hasOption(tuit.IN)) { throw new ParseException("No input file option found, exiting."); } else { inputFile = new File(commandLine.getOptionValue(tuit.IN)); Log.getInstance().setLogName(inputFile.getName().split("\\.")[0] + ".tuit.log"); } //Correct the output file option if needed if (!commandLine.hasOption(tuit.OUT)) { outputFile = new File((inputFile.getPath()).split("\\.")[0] + tuit.TUIT_EXT); } else { outputFile = new File(commandLine.getOptionValue(tuit.OUT)); } //Adjust the output level if (commandLine.hasOption(tuit.V)) { Log.getInstance().setLevel(Level.FINE); Log.getInstance().log(Level.INFO, "Using verbose output for the log"); } else { Log.getInstance().setLevel(Level.INFO); } //Try all files if (inputFile != null) { if (!inputFile.exists() || !inputFile.canRead()) { throw new Exception("Input file either does not exist, or is not readable."); } else if (inputFile.isDirectory()) { throw new Exception("Input file points to a directory."); } } if (!properties.exists() || !properties.canRead()) { throw new Exception("Properties file either does not exist, or is not readable."); } else if (properties.isDirectory()) { throw new Exception("Properties file points to a directory."); } //Create blast parameters final StringBuilder stringBuilder = new StringBuilder(); for (Database database : tuitProperties.getBLASTNParameters().getDatabase()) { stringBuilder.append(database.getUse()); stringBuilder.append(" ");//Gonna insert an extra space for the last database } String remote; String entrez_query; if (tuitProperties.getBLASTNParameters().getRemote().getDelegate().equals("yes")) { remote = "-remote"; entrez_query = "-entrez_query"; parameters = new String[] { "-db", stringBuilder.toString(), remote, entrez_query, tuitProperties.getBLASTNParameters().getEntrezQuery().getValue(), "-evalue", tuitProperties.getBLASTNParameters().getExpect().getValue() }; } else { if (!commandLine.hasOption(tuit.B)) { if (tuitProperties.getBLASTNParameters().getEntrezQuery().getValue().toUpperCase() .startsWith("NOT") || tuitProperties.getBLASTNParameters().getEntrezQuery().getValue().toUpperCase() .startsWith("ALL")) { parameters = new String[] { "-db", stringBuilder.toString(), "-evalue", tuitProperties.getBLASTNParameters().getExpect().getValue(), "-negative_gilist", TUITFileOperatorHelper.restrictToEntrez(tmpDir, tuitProperties.getBLASTNParameters().getEntrezQuery().getValue() .toUpperCase().replace("NOT", "OR")) .getAbsolutePath(), "-num_threads", tuitProperties.getBLASTNParameters().getNumThreads().getValue() }; } else if (tuitProperties.getBLASTNParameters().getEntrezQuery().getValue().toUpperCase() .equals("")) { parameters = new String[] { "-db", stringBuilder.toString(), "-evalue", tuitProperties.getBLASTNParameters().getExpect().getValue(), "-num_threads", tuitProperties.getBLASTNParameters().getNumThreads().getValue() }; } else { parameters = new String[] { "-db", stringBuilder.toString(), "-evalue", tuitProperties.getBLASTNParameters().getExpect().getValue(), /*"-gilist", 
TUITFileOperatorHelper.restrictToEntrez( tmpDir, tuitProperties.getBLASTNParameters().getEntrezQuery().getValue()).getAbsolutePath(),*/ //TODO remove comment!!!!! "-num_threads", tuitProperties.getBLASTNParameters().getNumThreads().getValue() }; } } } //Prepare a cutoff Map if (tuitProperties.getSpecificationParameters() != null && tuitProperties.getSpecificationParameters().size() > 0) { cutoffMap = new HashMap<Ranks, TUITCutoffSet>(tuitProperties.getSpecificationParameters().size()); for (SpecificationParameters specificationParameters : tuitProperties .getSpecificationParameters()) { cutoffMap.put(Ranks.valueOf(specificationParameters.getCutoffSet().getRank()), TUITCutoffSet.newDefaultInstance( Double.parseDouble( specificationParameters.getCutoffSet().getPIdentCutoff().getValue()), Double.parseDouble(specificationParameters.getCutoffSet() .getQueryCoverageCutoff().getValue()), Double.parseDouble( specificationParameters.getCutoffSet().getAlpha().getValue()))); } } else { cutoffMap = new HashMap<Ranks, TUITCutoffSet>(); } final TUITFileOperatorHelper.OutputFormat format; if (tuitProperties.getBLASTNParameters().getOutputFormat().getFormat().equals("rdp")) { format = TUITFileOperatorHelper.OutputFormat.RDP_FIXRANK; } else { format = TUITFileOperatorHelper.OutputFormat.TUIT; } try (TUITFileOperator<NucleotideFasta> nucleotideFastaTUITFileOperator = NucleotideFastaTUITFileOperator .newInstance(format, cutoffMap);) { nucleotideFastaTUITFileOperator.setInputFile(inputFile); nucleotideFastaTUITFileOperator.setOutputFile(outputFile); final String cleanupString = tuitProperties.getBLASTNParameters().getKeepBLASTOuts().getKeep(); final boolean cleanup; if (cleanupString.equals("no")) { Log.getInstance().log(Level.INFO, "Temporary BLAST files will be deleted."); cleanup = true; } else { Log.getInstance().log(Level.INFO, "Temporary BLAST files will be kept."); cleanup = false; } //Create blast identifier ExecutorService executorService = Executors.newSingleThreadExecutor(); if (commandLine.hasOption(tuit.USE_DB)) { if (blastOutputFile == null) { blastIdentifier = TUITBLASTIdentifierDB.newInstanceFromFileOperator(tmpDir, blastnExecutable, parameters, nucleotideFastaTUITFileOperator, connection, cutoffMap, Integer.parseInt( tuitProperties.getBLASTNParameters().getMaxFilesInBatch().getValue()), cleanup); } else { try { blastIdentifier = TUITBLASTIdentifierDB.newInstanceFromBLASTOutput( nucleotideFastaTUITFileOperator, connection, cutoffMap, blastOutputFile, Integer.parseInt( tuitProperties.getBLASTNParameters().getMaxFilesInBatch().getValue()), cleanup); } catch (JAXBException e) { Log.getInstance().log(Level.SEVERE, "Error reading " + blastOutputFile.getName() + ", please check input. The file must be XML formatted."); } catch (Exception e) { e.printStackTrace(); } } } else { if (blastOutputFile == null) { blastIdentifier = TUITBLASTIdentifierRAM.newInstanceFromFileOperator(tmpDir, blastnExecutable, parameters, nucleotideFastaTUITFileOperator, cutoffMap, Integer.parseInt( tuitProperties.getBLASTNParameters().getMaxFilesInBatch().getValue()), cleanup, ramDb); } else { try { blastIdentifier = TUITBLASTIdentifierRAM.newInstanceFromBLASTOutput( nucleotideFastaTUITFileOperator, cutoffMap, blastOutputFile, Integer.parseInt( tuitProperties.getBLASTNParameters().getMaxFilesInBatch().getValue()), cleanup, ramDb); } catch (JAXBException e) { Log.getInstance().log(Level.SEVERE, "Error reading " + blastOutputFile.getName() + ", please check input. 
The file must be XML formatted."); } catch (Exception e) { e.printStackTrace(); } } } Future<?> runnableFuture = executorService.submit(blastIdentifier); runnableFuture.get(); executorService.shutdown(); } } catch (ParseException pe) { Log.getInstance().log(Level.SEVERE, (pe.getMessage())); formatter.printHelp("tuit", options); } catch (SAXException saxe) { Log.getInstance().log(Level.SEVERE, saxe.getMessage()); } catch (FileNotFoundException fnfe) { Log.getInstance().log(Level.SEVERE, fnfe.getMessage()); } catch (TUITPropertyBadFormatException tpbfe) { Log.getInstance().log(Level.SEVERE, tpbfe.getMessage()); } catch (ClassCastException cce) { Log.getInstance().log(Level.SEVERE, cce.getMessage()); } catch (JAXBException jaxbee) { Log.getInstance().log(Level.SEVERE, "The properties file is not well formatted. Please ensure that the XML is consistent with the io.properties.dtd schema."); } catch (ClassNotFoundException cnfe) { //Probably won't happen unless the library deleted from the .jar Log.getInstance().log(Level.SEVERE, cnfe.getMessage()); //cnfe.printStackTrace(); } catch (SQLException sqle) { Log.getInstance().log(Level.SEVERE, "A database communication error occurred with the following message:\n" + sqle.getMessage()); //sqle.printStackTrace(); if (sqle.getMessage().contains("Access denied for user")) { Log.getInstance().log(Level.SEVERE, "Please use standard database login: " + NCBITablesDeployer.login + " and password: " + NCBITablesDeployer.password); } } catch (Exception e) { Log.getInstance().log(Level.SEVERE, e.getMessage()); e.printStackTrace(); } finally { if (connection != null) { try { connection.close(); } catch (SQLException sqle) { Log.getInstance().log(Level.SEVERE, "Problem closing the database connection: " + sqle); } } Log.getInstance().log(Level.FINE, "Task done, exiting..."); } }
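In tuit, setArgs(Option.UNLIMITED_VALUES) is what allows -reduce and -combine to each take a whole list of sample files after a single flag; the handlers then simply iterate commandLine.getOptionValues(...). Below is a stripped-down sketch of the -combine path only; the class name is a placeholder, and the short option strings reuse the long names because tuit's option constants are not shown here.

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;

public class MultiFileOptionSketch {
    public static void main(String[] args) throws ParseException {
        Options options = new Options();
        // One flag, arbitrarily many file arguments.
        Option combine = new Option("combine", "combine", true,
                "combine a set of reduction files into one taxonomy");
        combine.setArgs(Option.UNLIMITED_VALUES);
        options.addOption(combine);
        options.addOption("normalize", "normalize", false, "normalize values by the root value");

        // stopAtNonOption = true, as in the parse call of the example above.
        CommandLine commandLine = new GnuParser().parse(options, args, true);

        if (commandLine.hasOption("combine")) {
            boolean normalize = commandLine.hasOption("normalize");
            // Every token following -combine (up to the next option) lands here.
            for (String s : commandLine.getOptionValues("combine")) {
                System.out.println("would merge " + s + " (normalize=" + normalize + ")");
            }
        }
    }
}

Run as -combine a.rdc b.rdc c.rdc -normalize, it lists the three files with normalization enabled, which is the same shape of input the COMBINE branch in the example consumes.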