Example usage for org.apache.commons.cli Option setArgs

List of usage examples for org.apache.commons.cli Option setArgs

Introduction

On this page you can find example usage for org.apache.commons.cli Option.setArgs.

Prototype

public void setArgs(int num) 

Document

Sets the number of argument values this Option can take.
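
Before the project examples below, here is a minimal, self-contained sketch of setArgs using hypothetical option names ("names" and "tags"); it assumes the standard org.apache.commons.cli imports and Commons CLI 1.3+ for DefaultParser, and shows how the values consumed by an option are read back with getOptionValues:

public static void main(String[] args) throws ParseException {
    Options options = new Options();

    // hypothetical option that can take up to three values, e.g. -n a b c
    Option nameOpt = new Option("n", "names", true, "up to three names");
    nameOpt.setArgs(3);
    options.addOption(nameOpt);

    // hypothetical option that can take any number of values
    Option tagOpt = new Option("t", "tags", true, "one or more tags");
    tagOpt.setArgs(Option.UNLIMITED_VALUES);
    options.addOption(tagOpt);

    CommandLine cmd = new DefaultParser().parse(options, args);
    String[] names = cmd.getOptionValues("n"); // all values given for -n, or null if absent
    String[] tags = cmd.getOptionValues("t");
}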

Usage

From source file:knowledgeMiner.preprocessing.KnowledgeMinerPreprocessor.java

/**
 * Main method for running the CycMiner in precomputation or verbose mode.
 *
 * @param args
 *            The args to determine which mode to run in.
 */
public static void main(String[] args) {
    ENABLE_PREPROCESSING = true;
    Options options = new Options();
    options.addOption("m", false, "If precomputing mined data.");
    options.addOption("w", false, "If mapping from article to ontology.");
    options.addOption("o", false, "If mapping from ontology to article.");
    options.addOption("f", false, "Force all heuristics to be run, even if they have stored results.");
    options.addOption("i", true, "The article/concept to mine. Defaults to all.");
    options.addOption("R", false, "If the process should run in reverse order.");
    Option heurOption = new Option("h", true, "The heuristic(s) to use. Defaults to all.");
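    // allow the -h option to take up to 20 heuristic values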
    heurOption.setArgs(20);
    options.addOption(heurOption);

    CommandLineParser parser = new BasicParser();
    try {
        CommandLine parse = parser.parse(options, args);
        KnowledgeMinerPreprocessor kmp = getInstance();
        kmp.run(parse);
    } catch (Exception e) {
        e.printStackTrace();
        System.exit(1);
    }
}

From source file:com.basistech.ninja.Train.java

/**
 * Command line interface to train a model.
 *
 * <pre>
 *  usage: Train [options]
 *  --batch-size <arg>      batch size (default = 10)
 *  --epochs <arg>          epochs (default = 5)
 *  --examples <arg>        input examples file (required)
 *  --layer-sizes <arg>     layer sizes, including input/output, e.g. 3 4 2 (required)
 *  --learning-rate <arg>   learning-rate (default = 0.7)
 *  --model <arg>           output model file (required)
 * </pre>
 *
 * @param args command line arguments
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
    String defaultBatchSize = "10";
    String defaultEpochs = "5";
    String defaultLearningRate = "0.7";

    Options options = new Options();
    Option option;
    option = new Option(null, "examples", true, "input examples file (required)");
    option.setRequired(true);
    options.addOption(option);
    option = new Option(null, "model", true, "output model file (required)");
    option.setRequired(true);
    options.addOption(option);
    option = new Option(null, "layer-sizes", true,
            "layer sizes, including input/output, e.g. 3 4 2 (required)");
    option.setRequired(true);
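    // --layer-sizes accepts an unlimited number of values, e.g. 3 4 2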
    option.setArgs(Option.UNLIMITED_VALUES);
    options.addOption(option);
    option = new Option(null, "batch-size", true, String.format("batch size (default = %s)", defaultBatchSize));
    options.addOption(option);
    option = new Option(null, "epochs", true, String.format("epochs (default = %s)", deafaultEpochs));
    options.addOption(option);
    option = new Option(null, "learning-rate", true,
            String.format("learning-rate (default = %s)", defaultLearningRate));
    options.addOption(option);

    CommandLineParser parser = new GnuParser();
    CommandLine cmdline = null;
    try {
        cmdline = parser.parse(options, args);
    } catch (org.apache.commons.cli.ParseException e) {
        System.err.println(e.getMessage());
        usage(options);
        System.exit(1);
    }
    String[] remaining = cmdline.getArgs();
    if (remaining == null) {
        usage(options);
        System.exit(1);
    }

    List<Integer> layerSizes = Lists.newArrayList();
    for (String s : cmdline.getOptionValues("layer-sizes")) {
        layerSizes.add(Integer.parseInt(s));
    }

    File examplesFile = new File(cmdline.getOptionValue("examples"));
    Train that = new Train(layerSizes, examplesFile);
    int batchSize = Integer.parseInt(cmdline.getOptionValue("batch-size", defaultBatchSize));
    int epochs = Integer.parseInt(cmdline.getOptionValue("epochs", defaultEpochs));
    double learningRate = Double.parseDouble(cmdline.getOptionValue("learning-rate", defaultLearningRate));
    File modelFile = new File(cmdline.getOptionValue("model"));

    that.train(batchSize, epochs, learningRate, modelFile);
}

From source file:edu.ksu.cis.indus.staticanalyses.callgraphs.CallGraphXMLizerCLI.java

/**
 * The entry point to the program via command line.
 *
 * @param args is the command line arguments.
 * @throws RuntimeException when the analyses fail.
 */
public static void main(final String[] args) {
    final Options _options = new Options();
    Option _option = new Option("c", "cumulative", false,
            "Builds one call graph that includes all root methods.");
    _options.addOption(_option);
    _option = new Option("o", "output", true,
            "Directory into which xml files will be written into.  Defaults to current directory if omitted");
    _option.setArgs(1);
    _option.setArgName("output-dir");
    _options.addOption(_option);
    _option = new Option("j", "jimple", false, "Dump xmlized jimple.");
    _option.setArgName("dump-jimple");
    _options.addOption(_option);
    _option = new Option("p", "soot-classpath", true, "Prepend this to soot class path.");
    _option.setArgs(1);
    _option.setArgName("classpath");
    _option.setOptionalArg(false);
    _options.addOption(_option);
    _option = new Option("h", "help", false, "Display message.");
    _option.setOptionalArg(false);
    _options.addOption(_option);
    _option = new Option("t", "call-graph-type", true,
            "Call graph type.  This has to be one of {cha, rta, ofa-oi, " + "ofa-oirt, ofa-os}.");
    _option.setArgs(1);
    _option.setArgName("type");
    _option.setRequired(true);
    _options.addOption(_option);
    _option = new Option("S", "scope", true, "The scope that should be analyzed.");
    _option.setArgs(1);
    _option.setArgName("scope");
    _option.setRequired(false);
    _options.addOption(_option);

    final PosixParser _parser = new PosixParser();

    try {
        final CommandLine _cl = _parser.parse(_options, args);

        if (_cl.hasOption('h')) {
            printUsage(_options);
            System.exit(1);
        }

        String _outputDir = _cl.getOptionValue('o');

        if (_outputDir == null) {
            if (LOGGER.isWarnEnabled()) {
                LOGGER.warn("Defaulting to current directory for output.");
            }
            _outputDir = ".";
        }

        if (_cl.getArgList().isEmpty()) {
            throw new MissingArgumentException("Please specify at least one class.");
        }

        final CallGraphXMLizerCLI _cli = new CallGraphXMLizerCLI();

        _cli.xmlizer.setXmlOutputDir(_outputDir);
        _cli.xmlizer.setGenerator(new UniqueJimpleIDGenerator());
        _cli.setCumulative(_cl.hasOption('c'));
        _cli.setClassNames(_cl.getArgList());
        _cli.addToSootClassPath(_cl.getOptionValue('p'));

        if (_cl.hasOption('S')) {
            _cli.setScopeSpecFile(_cl.getOptionValue('S'));
        }

        _cli.initialize();

        _cli.execute(_cl.hasOption('j'), _cl.getOptionValue('t'));
    } catch (final ParseException _e) {
        LOGGER.error("Error while parsing command line.", _e);
        System.out.println("Error while parsing command line." + _e);
        printUsage(_options);
    } catch (final Throwable _e) {
        LOGGER.error("Beyond our control. May day! May day!", _e);
        throw new RuntimeException(_e);
    }
}

From source file:edu.ksu.cis.indus.staticanalyses.dependency.DependencyXMLizerCLI.java

/**
 * This is the entry point via command-line.
 *
 * @param args is the command line arguments.
 * @throws RuntimeException when an Throwable exception beyond our control occurs.
 * @pre args != null
 */
public static void main(final String[] args) {
    final Options _options = new Options();
    Option _option = new Option("o", "output", true,
            "Directory into which xml files will be written into.  Defaults to current directory if omitted");
    _option.setArgs(1);
    _option.setArgName("output-directory");
    _options.addOption(_option);
    _option = new Option("j", "jimple", false, "Dump xmlized jimple.");
    _options.addOption(_option);

    final DivergenceDA _fidda = DivergenceDA.getDivergenceDA(IDependencyAnalysis.Direction.FORWARD_DIRECTION);
    _fidda.setConsiderCallSites(true);

    final DivergenceDA _bidda = DivergenceDA.getDivergenceDA(IDependencyAnalysis.Direction.BACKWARD_DIRECTION);
    _bidda.setConsiderCallSites(true);

    final NonTerminationSensitiveEntryControlDA _ncda = new NonTerminationSensitiveEntryControlDA();
    final Object[][] _dasOptions = {
            { "ibdda1", "Identifier based data dependence (Soot)", new IdentifierBasedDataDA() },
            { "ibdda2", "Identifier based data dependence (Indus)", new IdentifierBasedDataDAv2() },
            { "ibdda3", "Identifier based data dependence (Indus Optimized)", new IdentifierBasedDataDAv3() },
            { "rbdda", "Reference based data dependence", new ReferenceBasedDataDA() },
            { "nscda", "Non-termination sensitive Entry control dependence", _ncda },
            { "nicda", "Non-termination insensitive Entry control dependence",
                    new NonTerminationInsensitiveEntryControlDA(), },
            { "xcda", "Exit control dependence", new ExitControlDA() },
            { "sda", "Synchronization dependence", new SynchronizationDA() },
            { "frda1", "Forward Ready dependence v1", ReadyDAv1.getForwardReadyDA() },
            { "brda1", "Backward Ready dependence v1", ReadyDAv1.getBackwardReadyDA() },
            { "frda2", "Forward Ready dependence v2", ReadyDAv2.getForwardReadyDA() },
            { "brda2", "Backward Ready dependence v2", ReadyDAv2.getBackwardReadyDA() },
            { "frda3", "Forward Ready dependence v3", ReadyDAv3.getForwardReadyDA() },
            { "brda3", "Backward Ready dependence v3", ReadyDAv3.getBackwardReadyDA() },
            { "ida1", "Interference dependence v1", new InterferenceDAv1() },
            { "ida2", "Interference dependence v2", new InterferenceDAv2() },
            { "ida3", "Interference dependence v3", new InterferenceDAv3() },
            { "fdda", "Forward Intraprocedural Divergence dependence",
                    DivergenceDA.getDivergenceDA(IDependencyAnalysis.Direction.FORWARD_DIRECTION), },
            { "bdda", "Backward Intraprocedural Divergence dependence",
                    DivergenceDA.getDivergenceDA(IDependencyAnalysis.Direction.BACKWARD_DIRECTION), },
            { "fidda", "Forward Intra+Interprocedural Divergence dependence", _fidda },
            { "bidda", "Backward Intra+Interprocedural Divergence dependence", _bidda },
            { "fpidda", "Forward Interprocedural Divergence dependence",
                    InterProceduralDivergenceDA
                            .getDivergenceDA(IDependencyAnalysis.Direction.FORWARD_DIRECTION), },
            { "bpidda", "Backward Interprocedural Divergence dependence", InterProceduralDivergenceDA
                    .getDivergenceDA(IDependencyAnalysis.Direction.BACKWARD_DIRECTION), }, };
    _option = new Option("h", "help", false, "Display message.");
    _option.setOptionalArg(false);
    _options.addOption(_option);
    _option = new Option("p", "soot-classpath", false, "Prepend this to soot class path.");
    _option.setArgs(1);
    _option.setArgName("classpath");
    _option.setOptionalArg(false);
    _options.addOption(_option);
    _option = new Option("aliasedusedefv1", false, "Use version 1 of aliased use-def info.");
    _options.addOption(_option);
    _option = new Option("safelockanalysis", false, "Use safe-lock-analysis for ready dependence.");
    _options.addOption(_option);
    _option = new Option("ofaforinterference", false, "Use OFA for interference dependence.");
    _options.addOption(_option);
    _option = new Option("ofaforready", false, "Use OFA for ready dependence.");
    _options.addOption(_option);
    _option = new Option("exceptionalexits", false, "Consider exceptional exits for control dependence.");
    _options.addOption(_option);
    _option = new Option("commonuncheckedexceptions", false, "Consider common unchecked exceptions.");
    _options.addOption(_option);
    _option = new Option("S", "scope", true, "The scope that should be analyzed.");
    _option.setArgs(1);
    _option.setArgName("scope");
    _option.setRequired(false);
    _options.addOption(_option);

    for (int _i = 0; _i < _dasOptions.length; _i++) {
        final String _shortOption = _dasOptions[_i][0].toString();
        final String _description = _dasOptions[_i][1].toString();
        _option = new Option(_shortOption, false, _description);
        _options.addOption(_option);
    }

    final CommandLineParser _parser = new GnuParser();

    try {
        final CommandLine _cl = _parser.parse(_options, args);

        if (_cl.hasOption("h")) {
            printUsage(_options);
            System.exit(1);
        }

        final DependencyXMLizerCLI _xmlizerCLI = new DependencyXMLizerCLI();
        String _outputDir = _cl.getOptionValue('o');

        if (_outputDir == null) {
            if (LOGGER.isWarnEnabled()) {
                LOGGER.warn("Defaulting to current directory for output.");
            }
            _outputDir = ".";
        }

        _xmlizerCLI.xmlizer.setXmlOutputDir(_outputDir);

        if (_cl.hasOption('p')) {
            _xmlizerCLI.addToSootClassPath(_cl.getOptionValue('p'));
        }

        if (_cl.hasOption('S')) {
            _xmlizerCLI.setScopeSpecFile(_cl.getOptionValue('S'));
        }

        _xmlizerCLI.dumpJimple = _cl.hasOption('j');
        _xmlizerCLI.useAliasedUseDefv1 = _cl.hasOption("aliasedusedefv1");
        _xmlizerCLI.useSafeLockAnalysis = _cl.hasOption("safelockanalysis");
        _xmlizerCLI.exceptionalExits = _cl.hasOption("exceptionalexits");
        _xmlizerCLI.commonUncheckedException = _cl.hasOption("commonuncheckedexceptions");

        final List<String> _classNames = _cl.getArgList();

        if (_classNames.isEmpty()) {
            throw new MissingArgumentException("Please specify at least one class.");
        }
        _xmlizerCLI.setClassNames(_classNames);

        final int _exitControlDAIndex = 6;

        if (_cl.hasOption(_dasOptions[_exitControlDAIndex][0].toString())) {
            _xmlizerCLI.das.add(_ncda);

            for (final Iterator<DependenceSort> _i = _ncda.getIds().iterator(); _i.hasNext();) {
                final DependenceSort _id = _i.next();
                MapUtils.putIntoCollectionInMapUsingFactory(_xmlizerCLI.info, _id, _ncda,
                        SetUtils.getFactory());
            }
        }

        if (!parseForDependenceOptions(_dasOptions, _cl, _xmlizerCLI)) {
            throw new ParseException("Atleast one dependence analysis must be requested.");
        }

        _xmlizerCLI.<ITokens>execute();
    } catch (final ParseException _e) {
        LOGGER.error("Error while parsing command line.", _e);
        System.out.println("Error while parsing command line." + _e);
        printUsage(_options);
    } catch (final Throwable _e) {
        LOGGER.error("Beyond our control. May day! May day!", _e);
        throw new RuntimeException(_e);
    }
}

From source file:LineageSimulator.java

public static void main(String[] args) {
    Options options = new Options();
    // commands
    //options.addOption("simulate", false, "Simulate lineage trees");
    //options.addOption("sample", false, "Sample from the simulated trees");
    //options.addOption("evaluate", false, "Evaluate trees");

    // tree simulation
    options.addOption("t", "nTrees", true, "Number of trees to simulate (default: 100)");
    options.addOption("i", "nIter", true, "Number of tree growth iterations (default: 50)");
    options.addOption("snv", "probSNV", true,
            "Per node probablity of generating a descendant cell population with an acquired SNV during a tree growth iteration (default: 0.15)");
    options.addOption("cnv", "probCNV", true,
            "Per node probablity of generating a descendant cell population with an acquired CNV during a tree growth iteration (default: 0.02)");
    options.addOption("probDeath", true,
            "Probablity of a cell population death in each tree growth iteration (default: 0.06)");
    options.addOption("maxPopulationSize", true, "Max size of a cell population (default: 1000000)");
    options.addOption("minNodes", true,
            "Minimum number of undead cell population nodes in a valid tree, tree growth will continue beyond the defined number of iterations until this value is reached (default: 10)");
    options.addOption("maxNodes", true,
            "Maximum number of undead cell population nodes in a tree, tree growth will stop after the iteration in which this value is reached/first surpassed (default: 1000)");

    // sampling
    Option samplesOption = new Option("s", "nSamples", true,
            "Number of samples to collect, accepts multiple values, e.g. 5 10 15 (default: 5)");
    samplesOption.setArgs(Option.UNLIMITED_VALUES);
    options.addOption(samplesOption);
    Option covOption = new Option("c", "coverage", true,
            "Simulated coverage to generate the VAFs, accepts multiple values, e.g. 500 1000 (default: 1000)");
    covOption.setArgs(Option.UNLIMITED_VALUES);
    options.addOption(covOption);
    options.addOption("maxSubclones", true, "Max number of subclones per sample (default: 5)");
    options.addOption("sampleSize", true, "Number of cells per sample (default: 100000)");
    options.addOption("e", true, "Sequencing error (default: 0.001)");
    options.addOption("minNC", true,
            "Minimum percentage of normal contamination per sample; the percentage will be randomly generated from the range [minNC maxNC] for each sample (default: 0)");
    options.addOption("maxNC", true,
            "Maximum percentage of normal contamination per sample; if maxNC < minNC, maxNC will be automatically set to minNC; the percentage will be randomly generated from the range [minNC maxNC] for each sample (default: 20)");
    //options.addOption("localized", false, "Enable localized sampling (default: random sampling)");
    //options.addOption("mixSubclone", false, "With localized sampling, add an additional subclone from a different subtree to each sample; by default, the sample is localized to a single disjoint subtree");

    // input/output/display
    options.addOption("dir", "outputDir", true,
            "Directory where the output files should be created [required]");
    options.addOption("dot", false, "Produce DOT files for the simulated trees");
    options.addOption("sdot", "sampledDot", false,
            "Produce DOT files for the simulated trees with indicated samples");
    options.addOption("sampleProfile", false,
            "Output VAF file includes an additional column with the binary sample profile for each SNV");

    // other
    options.addOption("v", "verbose", false, "Verbose mode");
    options.addOption("h", "help", false, "Print usage");

    // display order
    ArrayList<Option> optionsList = new ArrayList<Option>();
    optionsList.add(options.getOption("dir"));
    optionsList.add(options.getOption("t"));
    optionsList.add(options.getOption("i"));
    optionsList.add(options.getOption("snv"));
    optionsList.add(options.getOption("cnv"));
    optionsList.add(options.getOption("probDeath"));
    optionsList.add(options.getOption("maxPopulationSize"));
    optionsList.add(options.getOption("minNodes"));
    optionsList.add(options.getOption("maxNodes"));
    optionsList.add(options.getOption("s"));
    optionsList.add(options.getOption("c"));
    optionsList.add(options.getOption("maxSubclones"));
    optionsList.add(options.getOption("sampleSize"));
    optionsList.add(options.getOption("e"));
    optionsList.add(options.getOption("minNC"));
    optionsList.add(options.getOption("maxNC"));
    optionsList.add(options.getOption("dot"));
    optionsList.add(options.getOption("sdot"));
    optionsList.add(options.getOption("sampleProfile"));
    optionsList.add(options.getOption("v"));
    optionsList.add(options.getOption("h"));

    CommandLineParser parser = new BasicParser();
    CommandLine cmdLine = null;
    HelpFormatter hf = new HelpFormatter();
    hf.setOptionComparator(new OptionComarator<Option>(optionsList));
    try {
        cmdLine = parser.parse(options, args);
    } catch (ParseException e) {
        System.err.println(e.getMessage());
        hf.printHelp(PROG_NAME, options);
        System.exit(-1);
    }
    Args params = new Args();
    if (cmdLine.hasOption("dir")) {
        params.simPath = cmdLine.getOptionValue("dir") + "/" + SIMULATION_DATA_DIR;
    } else {
        System.err.println("Required parameter: output directory path [-dir]");
        hf.printHelp(PROG_NAME, options);
        System.exit(-1);
    }
    if (cmdLine.hasOption("t")) {
        Parameters.NUM_TREES = Integer.parseInt(cmdLine.getOptionValue("t"));
    }
    if (cmdLine.hasOption("i")) {
        Parameters.NUM_ITERATIONS = Integer.parseInt(cmdLine.getOptionValue("i"));
    }
    if (cmdLine.hasOption("snv")) {
        Parameters.PROB_SNV = Double.parseDouble(cmdLine.getOptionValue("snv"));
    }
    if (cmdLine.hasOption("cnv")) {
        Parameters.PROB_CNV = Double.parseDouble(cmdLine.getOptionValue("cnv"));
    }
    if (cmdLine.hasOption("probDeath")) {
        Parameters.PROB_DEATH = Double.parseDouble(cmdLine.getOptionValue("probDeath"));
    }
    if (cmdLine.hasOption("maxPopulationSize")) {
        Parameters.MAX_POPULATION_SIZE = Integer.parseInt(cmdLine.getOptionValue("maxPopulationSize"));
    }
    if (cmdLine.hasOption("minNodes")) {
        Parameters.MIN_NUM_NODES = Integer.parseInt(cmdLine.getOptionValue("minNodes"));
        if (Parameters.MIN_NUM_NODES < 1) {
            System.err.println("Minimum number of nodes [-minNodes] must be at least 1");
            System.exit(-1);
        }
    }
    if (cmdLine.hasOption("maxNodes")) {
        Parameters.MAX_NUM_NODES = Integer.parseInt(cmdLine.getOptionValue("maxNodes"));
        if (Parameters.MAX_NUM_NODES < 1 || Parameters.MAX_NUM_NODES < Parameters.MIN_NUM_NODES) {
            System.err.println(
                    "Maximum number of nodes [-maxNodes] must be at least 1 and not less than [-minNodes]");
            System.exit(-1);
        }
    }
    if (cmdLine.hasOption("s")) {
        String[] samples = cmdLine.getOptionValues("s");
        Parameters.NUM_SAMPLES_ARRAY = new int[samples.length];
        for (int i = 0; i < samples.length; i++) {
            Parameters.NUM_SAMPLES_ARRAY[i] = Integer.parseInt(samples[i]);
        }
    }
    if (cmdLine.hasOption("c")) {
        String[] cov = cmdLine.getOptionValues("c");
        Parameters.COVERAGE_ARRAY = new int[cov.length];
        for (int i = 0; i < cov.length; i++) {
            Parameters.COVERAGE_ARRAY[i] = Integer.parseInt(cov[i]);
        }
    }
    if (cmdLine.hasOption("maxSubclones")) {
        Parameters.MAX_NUM_SUBCLONES = Integer.parseInt(cmdLine.getOptionValue("maxSubclones"));
    }
    if (cmdLine.hasOption("sampleSize")) {
        Parameters.NUM_CELLS_PER_SAMPLE = Integer.parseInt(cmdLine.getOptionValue("sampleSize"));
    }
    if (cmdLine.hasOption("e")) {
        Parameters.SEQUENCING_ERROR = Double.parseDouble(cmdLine.getOptionValue("e"));
    }
    if (cmdLine.hasOption("minNC")) {
        Parameters.MIN_PERCENT_NORMAL_CONTAMINATION = Double.parseDouble(cmdLine.getOptionValue("minNC"));
    }
    if (cmdLine.hasOption("maxNC")) {
        Parameters.MAX_PERCENT_NORMAL_CONTAMINATION = Double.parseDouble(cmdLine.getOptionValue("maxNC"));
    }
    if (Parameters.MAX_PERCENT_NORMAL_CONTAMINATION < Parameters.MIN_PERCENT_NORMAL_CONTAMINATION) {
        Parameters.MAX_PERCENT_NORMAL_CONTAMINATION = Parameters.MIN_PERCENT_NORMAL_CONTAMINATION;
    }

    /*if(cmdLine.hasOption("localized")) {
       Parameters.LOCALIZED_SAMPLING = true;
    }
    if(cmdLine.hasOption("mixSubclone")) {
       Parameters.MIX_NBR_SUBTREE_SUBCLONE = true;
    }*/

    if (cmdLine.hasOption("dot")) {
        params.generateDOT = true;
    }
    if (cmdLine.hasOption("sampledDot")) {
        params.generateSampledDOT = true;
    }
    if (cmdLine.hasOption("sampleProfile")) {
        params.outputSampleProfile = true;
    }
    if (cmdLine.hasOption("h")) {
        new HelpFormatter().printHelp(" ", options);
    }
    // logger
    ConsoleHandler h = new ConsoleHandler();
    h.setFormatter(new LogFormatter());
    h.setLevel(Level.INFO);
    logger.setLevel(Level.INFO);
    if (cmdLine.hasOption("v")) {
        h.setLevel(Level.FINEST);
        logger.setLevel(Level.FINEST);
    }
    logger.addHandler(h);
    logger.setUseParentHandlers(false);

    // validate settings
    if (Parameters.PROB_SNV + Parameters.PROB_CNV + Parameters.PROB_DEATH > 1) {
        System.err.println("The sum of SSNV, CNV, and cell death probabilities cannot exceed 1");
        hf.printHelp(PROG_NAME, options);
        System.exit(-1);
    }
    simulateLineageTrees(params);
}

From source file:edu.nyu.vida.data_polygamy.pre_processing.PreProcessing.java

/**
 * @param args
 * @throws IOException 
 * @throws ClassNotFoundException 
 * @throws InterruptedException 
 */
@SuppressWarnings("deprecation")
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

    Options options = new Options();

    Option nameOption = new Option("dn", "name", true, "the name of the dataset");
    nameOption.setRequired(true);
    nameOption.setArgName("DATASET NAME");
    options.addOption(nameOption);

    Option headerOption = new Option("dh", "header", true, "the file that contains the header of the dataset");
    headerOption.setRequired(true);
    headerOption.setArgName("DATASET HEADER FILE");
    options.addOption(headerOption);

    Option defaultsOption = new Option("dd", "defaults", true,
            "the file that contains the default values of the dataset");
    defaultsOption.setRequired(true);
    defaultsOption.setArgName("DATASET DEFAULTS FILE");
    options.addOption(defaultsOption);

    Option tempResOption = new Option("t", "temporal", true,
            "desired temporal resolution (hour, day, week, or month)");
    tempResOption.setRequired(true);
    tempResOption.setArgName("TEMPORAL RESOLUTION");
    options.addOption(tempResOption);

    Option spatialResOption = new Option("s", "spatial", true,
            "desired spatial resolution (points, nbhd, zip, grid, or city)");
    spatialResOption.setRequired(true);
    spatialResOption.setArgName("SPATIAL RESOLUTION");
    options.addOption(spatialResOption);

    Option currentSpatialResOption = new Option("cs", "current-spatial", true,
            "current spatial resolution (points, nbhd, zip, grid, or city)");
    currentSpatialResOption.setRequired(true);
    currentSpatialResOption.setArgName("CURRENT SPATIAL RESOLUTION");
    options.addOption(currentSpatialResOption);

    Option indexResOption = new Option("i", "index", true, "indexes of the temporal and spatial attributes");
    indexResOption.setRequired(true);
    indexResOption.setArgName("INDEX OF SPATIO-TEMPORAL RESOLUTIONS");
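    // -i accepts an arbitrary number of index values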
    indexResOption.setArgs(Option.UNLIMITED_VALUES);
    options.addOption(indexResOption);

    Option machineOption = new Option("m", "machine", true, "machine identifier");
    machineOption.setRequired(true);
    machineOption.setArgName("MACHINE");
    machineOption.setArgs(1);
    options.addOption(machineOption);

    Option nodesOption = new Option("n", "nodes", true, "number of nodes");
    nodesOption.setRequired(true);
    nodesOption.setArgName("NODES");
    nodesOption.setArgs(1);
    options.addOption(nodesOption);

    Option s3Option = new Option("s3", "s3", false, "data on Amazon S3");
    s3Option.setRequired(false);
    options.addOption(s3Option);

    Option awsAccessKeyIdOption = new Option("aws_id", "aws-id", true,
            "aws access key id; " + "this is required if the execution is on aws");
    awsAccessKeyIdOption.setRequired(false);
    awsAccessKeyIdOption.setArgName("AWS-ACCESS-KEY-ID");
    awsAccessKeyIdOption.setArgs(1);
    options.addOption(awsAccessKeyIdOption);

    Option awsSecretAccessKeyOption = new Option("aws_key", "aws-id", true,
            "aws secrect access key; " + "this is required if the execution is on aws");
    awsSecretAccessKeyOption.setRequired(false);
    awsSecretAccessKeyOption.setArgName("AWS-SECRET-ACCESS-KEY");
    awsSecretAccessKeyOption.setArgs(1);
    options.addOption(awsSecretAccessKeyOption);

    Option bucketOption = new Option("b", "s3-bucket", true,
            "bucket on s3; " + "this is required if the execution is on aws");
    bucketOption.setRequired(false);
    bucketOption.setArgName("S3-BUCKET");
    bucketOption.setArgs(1);
    options.addOption(bucketOption);

    Option helpOption = new Option("h", "help", false, "display this message");
    helpOption.setRequired(false);
    options.addOption(helpOption);

    HelpFormatter formatter = new HelpFormatter();
    CommandLineParser parser = new PosixParser();
    CommandLine cmd = null;

    try {
        cmd = parser.parse(options, args);
    } catch (ParseException e) {
        formatter.printHelp(
                "hadoop jar data-polygamy.jar " + "edu.nyu.vida.data_polygamy.pre_processing.PreProcessing",
                options, true);
        System.exit(0);
    }

    if (cmd.hasOption("h")) {
        formatter.printHelp(
                "hadoop jar data-polygamy.jar " + "edu.nyu.vida.data_polygamy.pre_processing.PreProcessing",
                options, true);
        System.exit(0);
    }

    boolean s3 = cmd.hasOption("s3");
    String s3bucket = "";
    String awsAccessKeyId = "";
    String awsSecretAccessKey = "";

    if (s3) {
        if ((!cmd.hasOption("aws_id")) || (!cmd.hasOption("aws_key")) || (!cmd.hasOption("b"))) {
            System.out.println(
                    "Arguments 'aws_id', 'aws_key', and 'b'" + " are mandatory if execution is on AWS.");
            formatter.printHelp(
                    "hadoop jar data-polygamy.jar " + "edu.nyu.vida.data_polygamy.pre_processing.PreProcessing",
                    options, true);
            System.exit(0);
        }
        s3bucket = cmd.getOptionValue("b");
        awsAccessKeyId = cmd.getOptionValue("aws_id");
        awsSecretAccessKey = cmd.getOptionValue("aws_key");
    }

    boolean snappyCompression = false;
    boolean bzip2Compression = false;
    String machine = cmd.getOptionValue("m");
    int nbNodes = Integer.parseInt(cmd.getOptionValue("n"));

    Configuration s3conf = new Configuration();
    if (s3) {
        s3conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId);
        s3conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey);
        s3conf.set("bucket", s3bucket);
    }

    Configuration conf = new Configuration();
    Machine machineConf = new Machine(machine, nbNodes);
    String dataset = cmd.getOptionValue("dn");
    String header = cmd.getOptionValue("dh");
    String defaults = cmd.getOptionValue("dd");
    String temporalResolution = cmd.getOptionValue("t");
    String spatialResolution = cmd.getOptionValue("s");
    String gridResolution = "";
    String currentSpatialResolution = cmd.getOptionValue("cs");

    if (spatialResolution.contains("grid")) {
        String[] res = spatialResolution.split("-");
        spatialResolution = res[0];
        gridResolution = res[1];
    }

    conf.set("header", s3bucket + FrameworkUtils.dataDir + "/" + header);
    conf.set("defaults", s3bucket + FrameworkUtils.dataDir + "/" + defaults);
    conf.set("temporal-resolution", temporalResolution);
    conf.set("spatial-resolution", spatialResolution);
    conf.set("grid-resolution", gridResolution);
    conf.set("current-spatial-resolution", currentSpatialResolution);

    String[] indexes = cmd.getOptionValues("i");
    String temporalPos = "";
    Integer sizeSpatioTemp = 0;
    if (!(currentSpatialResolution.equals("points"))) {
        String spatialPos = "";
        for (int i = 0; i < indexes.length; i++) {
            temporalPos += indexes[i] + ",";
            spatialPos += indexes[++i] + ",";
            sizeSpatioTemp++;
        }
        conf.set("spatial-pos", spatialPos);
    } else {
        String xPositions = "", yPositions = "";
        for (int i = 0; i < indexes.length; i++) {
            temporalPos += indexes[i] + ",";
            xPositions += indexes[++i] + ",";
            yPositions += indexes[++i] + ",";
            sizeSpatioTemp++;
        }
        conf.set("xPositions", xPositions);
        conf.set("yPositions", yPositions);
    }
    conf.set("temporal-pos", temporalPos);

    conf.set("size-spatio-temporal", sizeSpatioTemp.toString());

    // checking resolutions

    if (utils.spatialResolution(spatialResolution) < 0) {
        System.out.println("Invalid spatial resolution: " + spatialResolution);
        System.exit(-1);
    }

    if (utils.spatialResolution(spatialResolution) == FrameworkUtils.POINTS) {
        System.out.println("The data needs to be reduced at least to neighborhoods or grid.");
        System.exit(-1);
    }

    if (utils.spatialResolution(currentSpatialResolution) < 0) {
        System.out.println("Invalid spatial resolution: " + currentSpatialResolution);
        System.exit(-1);
    }

    if (utils.spatialResolution(currentSpatialResolution) > utils.spatialResolution(spatialResolution)) {
        System.out.println("The current spatial resolution is coarser than "
                + "the desired one. You can only navigate from a fine resolution" + " to a coarser one.");
        System.exit(-1);
    }

    if (utils.temporalResolution(temporalResolution) < 0) {
        System.out.println("Invalid temporal resolution: " + temporalResolution);
        System.exit(-1);
    }

    String fileName = s3bucket + FrameworkUtils.preProcessingDir + "/" + dataset + "-" + temporalResolution
            + "-" + spatialResolution + gridResolution;
    conf.set("aggregates", fileName + ".aggregates");

    // making sure both files are removed, if they exist
    FrameworkUtils.removeFile(fileName, s3conf, s3);
    FrameworkUtils.removeFile(fileName + ".aggregates", s3conf, s3);

    /**
     * Hadoop Parameters
     * sources: http://www.slideshare.net/ImpetusInfo/ppt-on-advanced-hadoop-tuning-n-optimisation
     *          https://cloudcelebrity.wordpress.com/2013/08/14/12-key-steps-to-keep-your-hadoop-cluster-running-strong-and-performing-optimum/
     */

    conf.set("mapreduce.tasktracker.map.tasks.maximum", String.valueOf(machineConf.getMaximumTasks()));
    conf.set("mapreduce.tasktracker.reduce.tasks.maximum", String.valueOf(machineConf.getMaximumTasks()));
    conf.set("mapreduce.jobtracker.maxtasks.perjob", "-1");
    conf.set("mapreduce.reduce.shuffle.parallelcopies", "20");
    conf.set("mapreduce.input.fileinputformat.split.minsize", "0");
    conf.set("mapreduce.task.io.sort.mb", "200");
    conf.set("mapreduce.task.io.sort.factor", "100");

    // using SnappyCodec for intermediate and output data ?
    // TODO: for now, using SnappyCodec -- what about LZO + Protocol Buffer serialization?
    //   LZO - http://www.oberhumer.com/opensource/lzo/#download
    //   Hadoop-LZO - https://github.com/twitter/hadoop-lzo
    //   Protocol Buffer - https://github.com/twitter/elephant-bird
    //   General Info - http://www.devx.com/Java/Article/47913
    //   Compression - http://comphadoop.weebly.com/index.html
    if (snappyCompression) {
        conf.set("mapreduce.map.output.compress", "true");
        conf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
        conf.set("mapreduce.output.fileoutputformat.compress.codec",
                "org.apache.hadoop.io.compress.SnappyCodec");
    }
    if (bzip2Compression) {
        conf.set("mapreduce.map.output.compress", "true");
        conf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec");
        conf.set("mapreduce.output.fileoutputformat.compress.codec",
                "org.apache.hadoop.io.compress.BZip2Codec");
    }

    // TODO: this is dangerous!
    if (s3) {
        conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId);
        conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey);
    }

    Job job = new Job(conf);
    job.setJobName(dataset + "-" + temporalResolution + "-" + spatialResolution);

    job.setMapOutputKeyClass(MultipleSpatioTemporalWritable.class);
    job.setMapOutputValueClass(AggregationArrayWritable.class);

    job.setOutputKeyClass(MultipleSpatioTemporalWritable.class);
    job.setOutputValueClass(AggregationArrayWritable.class);

    job.setMapperClass(PreProcessingMapper.class);
    job.setCombinerClass(PreProcessingCombiner.class);
    job.setReducerClass(PreProcessingReducer.class);
    job.setNumReduceTasks(machineConf.getNumberReduces());
    //job.setNumReduceTasks(1);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    FileInputFormat.setInputPaths(job, new Path(s3bucket + FrameworkUtils.dataDir + "/" + dataset));
    FileOutputFormat.setOutputPath(job, new Path(fileName));

    job.setJarByClass(PreProcessing.class);

    long start = System.currentTimeMillis();
    job.submit();
    job.waitForCompletion(true);
    System.out.println(fileName + "\t" + (System.currentTimeMillis() - start));

}

From source file:edu.nyu.vida.data_polygamy.standard_techniques.CorrelationTechniques.java

/**
 * @param args
 * @throws ParseException 
 */
@SuppressWarnings({ "deprecation" })
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

    Options options = new Options();

    Option forceOption = new Option("f", "force", false,
            "force the computation of the relationship " + "even if files already exist");
    forceOption.setRequired(false);
    options.addOption(forceOption);

    Option g1Option = new Option("g1", "first-group", true, "set first group of datasets");
    g1Option.setRequired(true);
    g1Option.setArgName("FIRST GROUP");
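    // -g1 accepts multiple dataset names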
    g1Option.setArgs(Option.UNLIMITED_VALUES);
    options.addOption(g1Option);

    Option g2Option = new Option("g2", "second-group", true, "set second group of datasets");
    g2Option.setRequired(false);
    g2Option.setArgName("SECOND GROUP");
    g2Option.setArgs(Option.UNLIMITED_VALUES);
    options.addOption(g2Option);

    Option machineOption = new Option("m", "machine", true, "machine identifier");
    machineOption.setRequired(true);
    machineOption.setArgName("MACHINE");
    machineOption.setArgs(1);
    options.addOption(machineOption);

    Option nodesOption = new Option("n", "nodes", true, "number of nodes");
    nodesOption.setRequired(true);
    nodesOption.setArgName("NODES");
    nodesOption.setArgs(1);
    options.addOption(nodesOption);

    Option s3Option = new Option("s3", "s3", false, "data on Amazon S3");
    s3Option.setRequired(false);
    options.addOption(s3Option);

    Option awsAccessKeyIdOption = new Option("aws_id", "aws-id", true,
            "aws access key id; " + "this is required if the execution is on aws");
    awsAccessKeyIdOption.setRequired(false);
    awsAccessKeyIdOption.setArgName("AWS-ACCESS-KEY-ID");
    awsAccessKeyIdOption.setArgs(1);
    options.addOption(awsAccessKeyIdOption);

    Option awsSecretAccessKeyOption = new Option("aws_key", "aws-id", true,
            "aws secrect access key; " + "this is required if the execution is on aws");
    awsSecretAccessKeyOption.setRequired(false);
    awsSecretAccessKeyOption.setArgName("AWS-SECRET-ACCESS-KEY");
    awsSecretAccessKeyOption.setArgs(1);
    options.addOption(awsSecretAccessKeyOption);

    Option bucketOption = new Option("b", "s3-bucket", true,
            "bucket on s3; " + "this is required if the execution is on aws");
    bucketOption.setRequired(false);
    bucketOption.setArgName("S3-BUCKET");
    bucketOption.setArgs(1);
    options.addOption(bucketOption);

    Option helpOption = new Option("h", "help", false, "display this message");
    helpOption.setRequired(false);
    options.addOption(helpOption);

    HelpFormatter formatter = new HelpFormatter();
    CommandLineParser parser = new PosixParser();
    CommandLine cmd = null;

    try {
        cmd = parser.parse(options, args);
    } catch (ParseException e) {
        formatter.printHelp(
                "hadoop jar data-polygamy.jar "
                        + "edu.nyu.vida.data_polygamy.standard_techniques.CorrelationTechniques",
                options, true);
        System.exit(0);
    }

    if (cmd.hasOption("h")) {
        formatter.printHelp(
                "hadoop jar data-polygamy.jar "
                        + "edu.nyu.vida.data_polygamy.standard_techniques.CorrelationTechniques",
                options, true);
        System.exit(0);
    }

    boolean s3 = cmd.hasOption("s3");
    String s3bucket = "";
    String awsAccessKeyId = "";
    String awsSecretAccessKey = "";

    if (s3) {
        if ((!cmd.hasOption("aws_id")) || (!cmd.hasOption("aws_key")) || (!cmd.hasOption("b"))) {
            System.out.println(
                    "Arguments 'aws_id', 'aws_key', and 'b'" + " are mandatory if execution is on AWS.");
            formatter.printHelp(
                    "hadoop jar data-polygamy.jar "
                            + "edu.nyu.vida.data_polygamy.standard_techniques.CorrelationTechniques",
                    options, true);
            System.exit(0);
        }
        s3bucket = cmd.getOptionValue("b");
        awsAccessKeyId = cmd.getOptionValue("aws_id");
        awsSecretAccessKey = cmd.getOptionValue("aws_key");
    }

    boolean snappyCompression = false;
    boolean bzip2Compression = false;
    String machine = cmd.getOptionValue("m");
    int nbNodes = Integer.parseInt(cmd.getOptionValue("n"));

    Configuration s3conf = new Configuration();
    if (s3) {
        s3conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId);
        s3conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey);
        s3conf.set("bucket", s3bucket);
    }

    Path path = null;
    FileSystem fs = FileSystem.get(new Configuration());

    ArrayList<String> shortDataset = new ArrayList<String>();
    ArrayList<String> firstGroup = new ArrayList<String>();
    ArrayList<String> secondGroup = new ArrayList<String>();
    HashMap<String, String> datasetAgg = new HashMap<String, String>();

    boolean removeExistingFiles = cmd.hasOption("f");

    String[] firstGroupCmd = cmd.getOptionValues("g1");
    String[] secondGroupCmd = cmd.hasOption("g2") ? cmd.getOptionValues("g2") : new String[0];
    addDatasets(firstGroupCmd, firstGroup, shortDataset, datasetAgg, path, fs, s3conf, s3, s3bucket);
    addDatasets(secondGroupCmd, secondGroup, shortDataset, datasetAgg, path, fs, s3conf, s3, s3bucket);

    if (shortDataset.size() == 0) {
        System.out.println("No datasets to process.");
        System.exit(0);
    }

    if (firstGroup.isEmpty()) {
        System.out.println("First group of datasets (G1) is empty. " + "Doing G1 = G2.");
        firstGroup.addAll(secondGroup);
    }

    if (secondGroup.isEmpty()) {
        System.out.println("Second group of datasets (G2) is empty. " + "Doing G2 = G1.");
        secondGroup.addAll(firstGroup);
    }

    // getting dataset ids

    String datasetNames = "";
    String datasetIds = "";
    HashMap<String, String> datasetId = new HashMap<String, String>();
    Iterator<String> it = shortDataset.iterator();
    while (it.hasNext()) {
        datasetId.put(it.next(), null);
    }

    if (s3) {
        path = new Path(s3bucket + FrameworkUtils.datasetsIndexDir);
        fs = FileSystem.get(path.toUri(), s3conf);
    } else {
        path = new Path(fs.getHomeDirectory() + "/" + FrameworkUtils.datasetsIndexDir);
    }
    BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(path)));
    String line = br.readLine();
    while (line != null) {
        String[] dt = line.split("\t");
        if (datasetId.containsKey(dt[0])) {
            datasetId.put(dt[0], dt[1]);
            datasetNames += dt[0] + ",";
            datasetIds += dt[1] + ",";
        }
        line = br.readLine();
    }
    br.close();
    if (s3)
        fs.close();

    datasetNames = datasetNames.substring(0, datasetNames.length() - 1);
    datasetIds = datasetIds.substring(0, datasetIds.length() - 1);
    it = shortDataset.iterator();
    while (it.hasNext()) {
        String dataset = it.next();
        if (datasetId.get(dataset) == null) {
            System.out.println("No dataset id for " + dataset);
            System.exit(0);
        }
    }

    String firstGroupStr = "";
    String secondGroupStr = "";
    for (String dataset : firstGroup) {
        firstGroupStr += datasetId.get(dataset) + ",";
    }
    for (String dataset : secondGroup) {
        secondGroupStr += datasetId.get(dataset) + ",";
    }
    firstGroupStr = firstGroupStr.substring(0, firstGroupStr.length() - 1);
    secondGroupStr = secondGroupStr.substring(0, secondGroupStr.length() - 1);

    FrameworkUtils.createDir(s3bucket + FrameworkUtils.correlationTechniquesDir, s3conf, s3);

    String dataAttributesInputDirs = "";
    String noRelationship = "";

    HashSet<String> dirs = new HashSet<String>();

    String dataset1;
    String dataset2;
    String datasetId1;
    String datasetId2;
    for (int i = 0; i < firstGroup.size(); i++) {
        for (int j = 0; j < secondGroup.size(); j++) {

            if (Integer.parseInt(datasetId.get(firstGroup.get(i))) < Integer
                    .parseInt(datasetId.get(secondGroup.get(j)))) {
                dataset1 = firstGroup.get(i);
                dataset2 = secondGroup.get(j);
            } else {
                dataset1 = secondGroup.get(j);
                dataset2 = firstGroup.get(i);
            }

            datasetId1 = datasetId.get(dataset1);
            datasetId2 = datasetId.get(dataset2);

            if (dataset1.equals(dataset2))
                continue;
            String correlationOutputFileName = s3bucket + FrameworkUtils.correlationTechniquesDir + "/"
                    + dataset1 + "-" + dataset2 + "/";

            if (removeExistingFiles) {
                FrameworkUtils.removeFile(correlationOutputFileName, s3conf, s3);
            }
            if (!FrameworkUtils.fileExists(correlationOutputFileName, s3conf, s3)) {
                dirs.add(s3bucket + FrameworkUtils.aggregatesDir + "/" + dataset1);
                dirs.add(s3bucket + FrameworkUtils.aggregatesDir + "/" + dataset2);
            } else {
                noRelationship += datasetId1 + "-" + datasetId2 + ",";
            }
        }
    }

    if (dirs.isEmpty()) {
        System.out.println("All the relationships were already computed.");
        System.out.println("Use -f in the beginning of the command line to force the computation.");
        System.exit(0);
    }

    for (String dir : dirs) {
        dataAttributesInputDirs += dir + ",";
    }

    Configuration conf = new Configuration();
    Machine machineConf = new Machine(machine, nbNodes);

    String jobName = "correlation";
    String correlationOutputDir = s3bucket + FrameworkUtils.correlationTechniquesDir + "/tmp/";

    FrameworkUtils.removeFile(correlationOutputDir, s3conf, s3);

    for (int i = 0; i < shortDataset.size(); i++) {
        conf.set("dataset-" + datasetId.get(shortDataset.get(i)) + "-agg", datasetAgg.get(shortDataset.get(i)));
    }
    for (int i = 0; i < shortDataset.size(); i++) {
        conf.set("dataset-" + datasetId.get(shortDataset.get(i)) + "-agg-size",
                Integer.toString(datasetAgg.get(shortDataset.get(i)).split(",").length));
    }
    conf.set("dataset-keys", datasetIds);
    conf.set("dataset-names", datasetNames);
    conf.set("first-group", firstGroupStr);
    conf.set("second-group", secondGroupStr);
    conf.set("main-dataset-id", datasetId.get(shortDataset.get(0)));
    if (noRelationship.length() > 0) {
        conf.set("no-relationship", noRelationship.substring(0, noRelationship.length() - 1));
    }

    conf.set("mapreduce.tasktracker.map.tasks.maximum", String.valueOf(machineConf.getMaximumTasks()));
    conf.set("mapreduce.tasktracker.reduce.tasks.maximum", String.valueOf(machineConf.getMaximumTasks()));
    conf.set("mapreduce.jobtracker.maxtasks.perjob", "-1");
    conf.set("mapreduce.reduce.shuffle.parallelcopies", "20");
    conf.set("mapreduce.input.fileinputformat.split.minsize", "0");
    conf.set("mapreduce.task.io.sort.mb", "200");
    conf.set("mapreduce.task.io.sort.factor", "100");
    conf.set("mapreduce.task.timeout", "2400000");

    if (s3) {
        machineConf.setMachineConfiguration(conf);
        conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId);
        conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey);
        conf.set("bucket", s3bucket);
    }

    if (snappyCompression) {
        conf.set("mapreduce.map.output.compress", "true");
        conf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
        //conf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
    }
    if (bzip2Compression) {
        conf.set("mapreduce.map.output.compress", "true");
        conf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec");
        //conf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec");
    }

    Job job = new Job(conf);
    job.setJobName(jobName);

    job.setMapOutputKeyClass(PairAttributeWritable.class);
    job.setMapOutputValueClass(SpatioTemporalValueWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(CorrelationTechniquesMapper.class);
    job.setReducerClass(CorrelationTechniquesReducer.class);
    job.setNumReduceTasks(machineConf.getNumberReduces());

    job.setInputFormatClass(SequenceFileInputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    FileInputFormat.setInputDirRecursive(job, true);
    FileInputFormat.setInputPaths(job,
            dataAttributesInputDirs.substring(0, dataAttributesInputDirs.length() - 1));
    FileOutputFormat.setOutputPath(job, new Path(correlationOutputDir));

    job.setJarByClass(CorrelationTechniques.class);

    long start = System.currentTimeMillis();
    job.submit();
    job.waitForCompletion(true);
    System.out.println(jobName + "\t" + (System.currentTimeMillis() - start));

    // moving files to right place
    for (int i = 0; i < firstGroup.size(); i++) {
        for (int j = 0; j < secondGroup.size(); j++) {

            if (Integer.parseInt(datasetId.get(firstGroup.get(i))) < Integer
                    .parseInt(datasetId.get(secondGroup.get(j)))) {
                dataset1 = firstGroup.get(i);
                dataset2 = secondGroup.get(j);
            } else {
                dataset1 = secondGroup.get(j);
                dataset2 = firstGroup.get(i);
            }

            if (dataset1.equals(dataset2))
                continue;

            String from = s3bucket + FrameworkUtils.correlationTechniquesDir + "/tmp/" + dataset1 + "-"
                    + dataset2 + "/";
            String to = s3bucket + FrameworkUtils.correlationTechniquesDir + "/" + dataset1 + "-" + dataset2
                    + "/";
            FrameworkUtils.renameFile(from, to, s3conf, s3);
        }
    }
}

From source file:edu.nyu.vida.data_polygamy.feature_identification.IndexCreation.java

/**
 * @param args
 */
@SuppressWarnings({ "deprecation" })
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

    Options options = new Options();

    Option forceOption = new Option("f", "force", false,
            "force the computation of the index and events " + "even if files already exist");
    forceOption.setRequired(false);
    options.addOption(forceOption);

    Option thresholdOption = new Option("t", "use-custom-thresholds", false,
            "use custom thresholds for regular and rare events, defined in HDFS_HOME/"
                    + FrameworkUtils.thresholdDir + " file");
    thresholdOption.setRequired(false);
    options.addOption(thresholdOption);

    Option gOption = new Option("g", "group", true,
            "set group of datasets for which the indices and events" + " will be computed");
    gOption.setRequired(true);
    gOption.setArgName("GROUP");
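    // -g accepts multiple dataset names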
    gOption.setArgs(Option.UNLIMITED_VALUES);
    options.addOption(gOption);

    Option machineOption = new Option("m", "machine", true, "machine identifier");
    machineOption.setRequired(true);
    machineOption.setArgName("MACHINE");
    machineOption.setArgs(1);
    options.addOption(machineOption);

    Option nodesOption = new Option("n", "nodes", true, "number of nodes");
    nodesOption.setRequired(true);
    nodesOption.setArgName("NODES");
    nodesOption.setArgs(1);
    options.addOption(nodesOption);

    Option s3Option = new Option("s3", "s3", false, "data on Amazon S3");
    s3Option.setRequired(false);
    options.addOption(s3Option);

    Option awsAccessKeyIdOption = new Option("aws_id", "aws-id", true,
            "aws access key id; " + "this is required if the execution is on aws");
    awsAccessKeyIdOption.setRequired(false);
    awsAccessKeyIdOption.setArgName("AWS-ACCESS-KEY-ID");
    awsAccessKeyIdOption.setArgs(1);
    options.addOption(awsAccessKeyIdOption);

    Option awsSecretAccessKeyOption = new Option("aws_key", "aws-id", true,
            "aws secrect access key; " + "this is required if the execution is on aws");
    awsSecretAccessKeyOption.setRequired(false);
    awsSecretAccessKeyOption.setArgName("AWS-SECRET-ACCESS-KEY");
    awsSecretAccessKeyOption.setArgs(1);
    options.addOption(awsSecretAccessKeyOption);

    Option bucketOption = new Option("b", "s3-bucket", true,
            "bucket on s3; " + "this is required if the execution is on aws");
    bucketOption.setRequired(false);
    bucketOption.setArgName("S3-BUCKET");
    bucketOption.setArgs(1);
    options.addOption(bucketOption);

    Option helpOption = new Option("h", "help", false, "display this message");
    helpOption.setRequired(false);
    options.addOption(helpOption);

    HelpFormatter formatter = new HelpFormatter();
    CommandLineParser parser = new PosixParser();
    CommandLine cmd = null;

    try {
        cmd = parser.parse(options, args);
    } catch (ParseException e) {
        formatter.printHelp("hadoop jar data-polygamy.jar "
                + "edu.nyu.vida.data_polygamy.feature_identification.IndexCreation", options, true);
        System.exit(0);
    }

    if (cmd.hasOption("h")) {
        formatter.printHelp("hadoop jar data-polygamy.jar "
                + "edu.nyu.vida.data_polygamy.feature_identification.IndexCreation", options, true);
        System.exit(0);
    }

    boolean s3 = cmd.hasOption("s3");
    String s3bucket = "";
    String awsAccessKeyId = "";
    String awsSecretAccessKey = "";

    if (s3) {
        if ((!cmd.hasOption("aws_id")) || (!cmd.hasOption("aws_key")) || (!cmd.hasOption("b"))) {
            System.out.println(
                    "Arguments 'aws_id', 'aws_key', and 'b'" + " are mandatory if execution is on AWS.");
            formatter.printHelp("hadoop jar data-polygamy.jar "
                    + "edu.nyu.vida.data_polygamy.feature_identification.IndexCreation", options, true);
            System.exit(0);
        }
        s3bucket = cmd.getOptionValue("b");
        awsAccessKeyId = cmd.getOptionValue("aws_id");
        awsSecretAccessKey = cmd.getOptionValue("aws_key");
    }

    boolean snappyCompression = false;
    boolean bzip2Compression = false;
    String machine = cmd.getOptionValue("m");
    int nbNodes = Integer.parseInt(cmd.getOptionValue("n"));

    Configuration s3conf = new Configuration();
    if (s3) {
        s3conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId);
        s3conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey);
        s3conf.set("bucket", s3bucket);
    }

    String datasetNames = "";
    String datasetIds = "";

    ArrayList<String> shortDataset = new ArrayList<String>();
    ArrayList<String> shortDatasetIndex = new ArrayList<String>();
    HashMap<String, String> datasetAgg = new HashMap<String, String>();
    HashMap<String, String> datasetId = new HashMap<String, String>();
    HashMap<String, HashMap<Integer, Double>> datasetRegThreshold = new HashMap<String, HashMap<Integer, Double>>();
    HashMap<String, HashMap<Integer, Double>> datasetRareThreshold = new HashMap<String, HashMap<Integer, Double>>();

    Path path = null;
    FileSystem fs = FileSystem.get(new Configuration());
    BufferedReader br;

    boolean removeExistingFiles = cmd.hasOption("f");
    boolean isThresholdUserDefined = cmd.hasOption("t");

    for (String dataset : cmd.getOptionValues("g")) {

        // getting aggregates
        String[] aggregate = FrameworkUtils.searchAggregates(dataset, s3conf, s3);
        if (aggregate.length == 0) {
            System.out.println("No aggregates found for " + dataset + ".");
            continue;
        }

        // getting aggregates header
        String aggregatesHeaderFileName = FrameworkUtils.searchAggregatesHeader(dataset, s3conf, s3);
        if (aggregatesHeaderFileName == null) {
            System.out.println("No aggregate header for " + dataset);
            continue;
        }

        String aggregatesHeader = s3bucket + FrameworkUtils.preProcessingDir + "/" + aggregatesHeaderFileName;

        shortDataset.add(dataset);
        datasetId.put(dataset, null);

        if (s3) {
            path = new Path(aggregatesHeader);
            fs = FileSystem.get(path.toUri(), s3conf);
        } else {
            path = new Path(fs.getHomeDirectory() + "/" + aggregatesHeader);
        }

        br = new BufferedReader(new InputStreamReader(fs.open(path)));
        datasetAgg.put(dataset, br.readLine().split("\t")[1]);
        br.close();
        if (s3)
            fs.close();
    }

    if (shortDataset.size() == 0) {
        System.out.println("No datasets to process.");
        System.exit(0);
    }

    // getting dataset id

    if (s3) {
        path = new Path(s3bucket + FrameworkUtils.datasetsIndexDir);
        fs = FileSystem.get(path.toUri(), s3conf);
    } else {
        path = new Path(fs.getHomeDirectory() + "/" + FrameworkUtils.datasetsIndexDir);
    }
    br = new BufferedReader(new InputStreamReader(fs.open(path)));
    String line = br.readLine();
    while (line != null) {
        String[] dt = line.split("\t");
        if (datasetId.containsKey(dt[0])) {
            datasetId.put(dt[0], dt[1]);
            datasetNames += dt[0] + ",";
            datasetIds += dt[1] + ",";
        }
        line = br.readLine();
    }
    br.close();

    datasetNames = datasetNames.substring(0, datasetNames.length() - 1);
    datasetIds = datasetIds.substring(0, datasetIds.length() - 1);
    Iterator<String> it = shortDataset.iterator();
    while (it.hasNext()) {
        String dataset = it.next();
        if (datasetId.get(dataset) == null) {
            System.out.println("No dataset id for " + dataset);
            System.exit(0);
        }
    }

    // getting user defined thresholds

    if (isThresholdUserDefined) {
        if (s3) {
            path = new Path(s3bucket + FrameworkUtils.thresholdDir);
            fs = FileSystem.get(path.toUri(), s3conf);
        } else {
            path = new Path(fs.getHomeDirectory() + "/" + FrameworkUtils.thresholdDir);
        }
        br = new BufferedReader(new InputStreamReader(fs.open(path)));
        line = br.readLine();
        while (line != null) {
            // getting dataset name
            String dataset = line.trim();
            HashMap<Integer, Double> regThresholds = new HashMap<Integer, Double>();
            HashMap<Integer, Double> rareThresholds = new HashMap<Integer, Double>();
            line = br.readLine();
            while ((line != null) && (line.split("\t").length > 1)) {
                // getting attribute ids and thresholds
                String[] keyVals = line.trim().split("\t");
                int att = Integer.parseInt(keyVals[0].trim());
                regThresholds.put(att, Double.parseDouble(keyVals[1].trim()));
                rareThresholds.put(att, Double.parseDouble(keyVals[2].trim()));
                line = br.readLine();
            }
            datasetRegThreshold.put(dataset, regThresholds);
            datasetRareThreshold.put(dataset, rareThresholds);
        }
        br.close();
    }
    if (s3)
        fs.close();

    // datasets that will use existing merge tree
    ArrayList<String> useMergeTree = new ArrayList<String>();

    // creating index for each spatio-temporal resolution

    FrameworkUtils.createDir(s3bucket + FrameworkUtils.indexDir, s3conf, s3);

    HashSet<String> input = new HashSet<String>();

    for (String dataset : shortDataset) {

        String indexCreationOutputFileName = s3bucket + FrameworkUtils.indexDir + "/" + dataset + "/";
        String mergeTreeFileName = s3bucket + FrameworkUtils.mergeTreeDir + "/" + dataset + "/";

        if (removeExistingFiles) {
            FrameworkUtils.removeFile(indexCreationOutputFileName, s3conf, s3);
            FrameworkUtils.removeFile(mergeTreeFileName, s3conf, s3);
            FrameworkUtils.createDir(mergeTreeFileName, s3conf, s3);
        } else if (datasetRegThreshold.containsKey(dataset)) {
            FrameworkUtils.removeFile(indexCreationOutputFileName, s3conf, s3);
            if (FrameworkUtils.fileExists(mergeTreeFileName, s3conf, s3)) {
                useMergeTree.add(dataset);
            }
        }

        if (!FrameworkUtils.fileExists(indexCreationOutputFileName, s3conf, s3)) {
            input.add(s3bucket + FrameworkUtils.aggregatesDir + "/" + dataset);
            shortDatasetIndex.add(dataset);
        }

    }

    if (input.isEmpty()) {
        System.out.println("All the input datasets have indices.");
        System.out.println("Use -f in the beginning of the command line to force the computation.");
        System.exit(0);
    }

    String aggregateDatasets = "";
    it = input.iterator();
    while (it.hasNext()) {
        aggregateDatasets += it.next() + ",";
    }

    Job icJob = null;
    Configuration icConf = new Configuration();
    Machine machineConf = new Machine(machine, nbNodes);

    String jobName = "index";
    String indexOutputDir = s3bucket + FrameworkUtils.indexDir + "/tmp/";

    FrameworkUtils.removeFile(indexOutputDir, s3conf, s3);

    icConf.set("dataset-name", datasetNames);
    icConf.set("dataset-id", datasetIds);

    if (!useMergeTree.isEmpty()) {
        String useMergeTreeStr = "";
        for (String dt : useMergeTree) {
            useMergeTreeStr += dt + ",";
        }
        icConf.set("use-merge-tree", useMergeTreeStr.substring(0, useMergeTreeStr.length() - 1));
    }

    for (int i = 0; i < shortDataset.size(); i++) {
        String dataset = shortDataset.get(i);
        String id = datasetId.get(dataset);
        icConf.set("dataset-" + id + "-aggregates", datasetAgg.get(dataset));
        if (datasetRegThreshold.containsKey(dataset)) {
            HashMap<Integer, Double> regThresholds = datasetRegThreshold.get(dataset);
            String thresholds = "";
            for (int att : regThresholds.keySet()) {
                thresholds += String.valueOf(att) + "-" + String.valueOf(regThresholds.get(att)) + ",";
            }
            icConf.set("regular-" + id, thresholds.substring(0, thresholds.length() - 1));
        }

        if (datasetRareThreshold.containsKey(dataset)) {
            HashMap<Integer, Double> rareThresholds = datasetRareThreshold.get(dataset);
            String thresholds = "";
            for (int att : rareThresholds.keySet()) {
                thresholds += String.valueOf(att) + "-" + String.valueOf(rareThresholds.get(att)) + ",";
            }
            icConf.set("rare-" + id, thresholds.substring(0, thresholds.length() - 1));
        }
    }

    icConf.set("mapreduce.tasktracker.map.tasks.maximum", String.valueOf(machineConf.getMaximumTasks()));
    icConf.set("mapreduce.tasktracker.reduce.tasks.maximum", String.valueOf(machineConf.getMaximumTasks()));
    icConf.set("mapreduce.jobtracker.maxtasks.perjob", "-1");
    icConf.set("mapreduce.reduce.shuffle.parallelcopies", "20");
    icConf.set("mapreduce.input.fileinputformat.split.minsize", "0");
    icConf.set("mapreduce.task.io.sort.mb", "200");
    icConf.set("mapreduce.task.io.sort.factor", "100");
    //icConf.set("mapreduce.task.timeout", "1800000");
    machineConf.setMachineConfiguration(icConf);

    if (s3) {
        machineConf.setMachineConfiguration(icConf);
        icConf.set("fs.s3.awsAccessKeyId", awsAccessKeyId);
        icConf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey);
        icConf.set("bucket", s3bucket);
    }

    if (snappyCompression) {
        icConf.set("mapreduce.map.output.compress", "true");
        icConf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
        //icConf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
    }
    if (bzip2Compression) {
        icConf.set("mapreduce.map.output.compress", "true");
        icConf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec");
        //icConf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec");
    }

    icJob = new Job(icConf);
    icJob.setJobName(jobName);

    icJob.setMapOutputKeyClass(AttributeResolutionWritable.class);
    icJob.setMapOutputValueClass(SpatioTemporalFloatWritable.class);
    icJob.setOutputKeyClass(AttributeResolutionWritable.class);
    icJob.setOutputValueClass(TopologyTimeSeriesWritable.class);
    //icJob.setOutputKeyClass(Text.class);
    //icJob.setOutputValueClass(Text.class);

    icJob.setMapperClass(IndexCreationMapper.class);
    icJob.setReducerClass(IndexCreationReducer.class);
    icJob.setNumReduceTasks(machineConf.getNumberReduces());

    icJob.setInputFormatClass(SequenceFileInputFormat.class);
    //icJob.setOutputFormatClass(SequenceFileOutputFormat.class);
    LazyOutputFormat.setOutputFormatClass(icJob, SequenceFileOutputFormat.class);
    //LazyOutputFormat.setOutputFormatClass(icJob, TextOutputFormat.class);
    SequenceFileOutputFormat.setCompressOutput(icJob, true);
    SequenceFileOutputFormat.setOutputCompressionType(icJob, CompressionType.BLOCK);

    FileInputFormat.setInputDirRecursive(icJob, true);
    FileInputFormat.setInputPaths(icJob, aggregateDatasets.substring(0, aggregateDatasets.length() - 1));
    FileOutputFormat.setOutputPath(icJob, new Path(indexOutputDir));

    icJob.setJarByClass(IndexCreation.class);

    long start = System.currentTimeMillis();
    icJob.submit();
    icJob.waitForCompletion(true);
    System.out.println(jobName + "\t" + (System.currentTimeMillis() - start));

    // moving files to right place
    for (String dataset : shortDatasetIndex) {
        String from = s3bucket + FrameworkUtils.indexDir + "/tmp/" + dataset + "/";
        String to = s3bucket + FrameworkUtils.indexDir + "/" + dataset + "/";
        FrameworkUtils.renameFile(from, to, s3conf, s3);
    }

}
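
The setArgs calls above determine how values come back out of the parsed command line: the -g group option is declared with setArgs(Option.UNLIMITED_VALUES) and read with getOptionValues, while -m and -n use setArgs(1) and are read with getOptionValue. A minimal, self-contained sketch of that difference (the option names mirror the snippet above; the sample dataset and machine values are invented for illustration):

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;

public class SetArgsDemo {
    public static void main(String[] args) throws ParseException {
        Options options = new Options();

        Option group = new Option("g", "group", true, "one or more dataset names");
        group.setArgs(Option.UNLIMITED_VALUES); // accept any number of values
        options.addOption(group);

        Option machine = new Option("m", "machine", true, "machine identifier");
        machine.setArgs(1); // exactly one value
        options.addOption(machine);

        CommandLineParser parser = new PosixParser();
        CommandLine cmd = parser.parse(options,
                new String[] { "-g", "taxi", "weather", "crime", "-m", "cluster-a" });

        // the unlimited option collected every token up to the next recognized option
        for (String dataset : cmd.getOptionValues("g")) {
            System.out.println("group value: " + dataset); // taxi, weather, crime
        }
        // the single-argument option keeps exactly one value
        System.out.println("machine: " + cmd.getOptionValue("m")); // cluster-a
    }
}

The unlimited option stops collecting at the next token the parser recognizes as an option, which is why the group list can be followed directly by -m on the command line.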

From source file:edu.nyu.vida.data_polygamy.scalar_function_computation.Aggregation.java

/**
 * @param args
 */
@SuppressWarnings({ "deprecation" })
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

    Options options = new Options();

    Option forceOption = new Option("f", "force", false,
            "force the computation of the aggregate functions " + "even if files already exist");
    forceOption.setRequired(false);
    options.addOption(forceOption);

    Option gOption = new Option("g", "group", true, "set group of datasets for which the aggregate functions"
            + " will be computed, followed by their temporal and spatial attribute indices");
    gOption.setRequired(true);
    gOption.setArgName("GROUP");
    gOption.setArgs(Option.UNLIMITED_VALUES);
    options.addOption(gOption);

    Option machineOption = new Option("m", "machine", true, "machine identifier");
    machineOption.setRequired(true);
    machineOption.setArgName("MACHINE");
    machineOption.setArgs(1);
    options.addOption(machineOption);

    Option nodesOption = new Option("n", "nodes", true, "number of nodes");
    nodesOption.setRequired(true);
    nodesOption.setArgName("NODES");
    nodesOption.setArgs(1);
    options.addOption(nodesOption);

    Option s3Option = new Option("s3", "s3", false, "data on Amazon S3");
    s3Option.setRequired(false);
    options.addOption(s3Option);

    Option awsAccessKeyIdOption = new Option("aws_id", "aws-id", true,
            "aws access key id; " + "this is required if the execution is on aws");
    awsAccessKeyIdOption.setRequired(false);
    awsAccessKeyIdOption.setArgName("AWS-ACCESS-KEY-ID");
    awsAccessKeyIdOption.setArgs(1);
    options.addOption(awsAccessKeyIdOption);

    Option awsSecretAccessKeyOption = new Option("aws_key", "aws-id", true,
            "aws secrect access key; " + "this is required if the execution is on aws");
    awsSecretAccessKeyOption.setRequired(false);
    awsSecretAccessKeyOption.setArgName("AWS-SECRET-ACCESS-KEY");
    awsSecretAccessKeyOption.setArgs(1);
    options.addOption(awsSecretAccessKeyOption);

    Option bucketOption = new Option("b", "s3-bucket", true,
            "bucket on s3; " + "this is required if the execution is on aws");
    bucketOption.setRequired(false);
    bucketOption.setArgName("S3-BUCKET");
    bucketOption.setArgs(1);
    options.addOption(bucketOption);

    Option helpOption = new Option("h", "help", false, "display this message");
    helpOption.setRequired(false);
    options.addOption(helpOption);

    HelpFormatter formatter = new HelpFormatter();
    CommandLineParser parser = new PosixParser();
    CommandLine cmd = null;

    try {
        cmd = parser.parse(options, args);
    } catch (ParseException e) {
        formatter.printHelp("hadoop jar data-polygamy.jar "
                + "edu.nyu.vida.data_polygamy.scalar_function_computation.Aggregation", options, true);
        System.exit(0);
    }

    if (cmd.hasOption("h")) {
        formatter.printHelp("hadoop jar data-polygamy.jar "
                + "edu.nyu.vida.data_polygamy.scalar_function_computation.Aggregation", options, true);
        System.exit(0);
    }

    boolean s3 = cmd.hasOption("s3");
    String s3bucket = "";
    String awsAccessKeyId = "";
    String awsSecretAccessKey = "";

    if (s3) {
        if ((!cmd.hasOption("aws_id")) || (!cmd.hasOption("aws_key")) || (!cmd.hasOption("b"))) {
            System.out.println(
                    "Arguments 'aws_id', 'aws_key', and 'b'" + " are mandatory if execution is on AWS.");
            formatter.printHelp(
                    "hadoop jar data-polygamy.jar "
                            + "edu.nyu.vida.data_polygamy.scalar_function_computation.Aggregation",
                    options, true);
            System.exit(0);
        }
        s3bucket = cmd.getOptionValue("b");
        awsAccessKeyId = cmd.getOptionValue("aws_id");
        awsSecretAccessKey = cmd.getOptionValue("aws_key");
    }

    boolean snappyCompression = false;
    boolean bzip2Compression = false;
    String machine = cmd.getOptionValue("m");
    int nbNodes = Integer.parseInt(cmd.getOptionValue("n"));

    Configuration s3conf = new Configuration();
    if (s3) {
        s3conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId);
        s3conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey);
        s3conf.set("bucket", s3bucket);
    }

    String datasetNames = "";
    String datasetIds = "";
    String preProcessingDatasets = "";

    ArrayList<String> shortDataset = new ArrayList<String>();
    ArrayList<String> shortDatasetAggregation = new ArrayList<String>();
    HashMap<String, String> datasetTempAtt = new HashMap<String, String>();
    HashMap<String, String> datasetSpatialAtt = new HashMap<String, String>();
    HashMap<String, String> preProcessingDataset = new HashMap<String, String>();
    HashMap<String, String> datasetId = new HashMap<String, String>();

    boolean removeExistingFiles = cmd.hasOption("f");
    String[] datasetArgs = cmd.getOptionValues("g");

    for (int i = 0; i < datasetArgs.length; i += 3) {
        String dataset = datasetArgs[i];

        // getting pre-processing
        String tempPreProcessing = FrameworkUtils.searchPreProcessing(dataset, s3conf, s3);
        if (tempPreProcessing == null) {
            System.out.println("No pre-processing available for " + dataset);
            continue;
        }
        preProcessingDataset.put(dataset, tempPreProcessing);

        shortDataset.add(dataset);
        datasetTempAtt.put(dataset, ("null".equals(datasetArgs[i + 1]) ? null : datasetArgs[i + 1]));
        datasetSpatialAtt.put(dataset, ("null".equals(datasetArgs[i + 2]) ? null : datasetArgs[i + 2]));

        datasetId.put(dataset, null);
    }

    if (shortDataset.size() == 0) {
        System.out.println("No datasets to process.");
        System.exit(0);
    }

    // getting dataset id

    Path path = null;
    FileSystem fs = null;

    if (s3) {
        path = new Path(s3bucket + FrameworkUtils.datasetsIndexDir);
        fs = FileSystem.get(path.toUri(), s3conf);
    } else {
        fs = FileSystem.get(new Configuration());
        path = new Path(fs.getHomeDirectory() + "/" + FrameworkUtils.datasetsIndexDir);
    }
    BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(path)));
    String line = br.readLine();
    while (line != null) {
        String[] dt = line.split("\t");
        if (datasetId.containsKey(dt[0])) {
            datasetId.put(dt[0], dt[1]);
            datasetNames += dt[0] + ",";
            datasetIds += dt[1] + ",";
        }
        line = br.readLine();
    }
    br.close();
    if (s3)
        fs.close();

    datasetNames = datasetNames.substring(0, datasetNames.length() - 1);
    datasetIds = datasetIds.substring(0, datasetIds.length() - 1);
    Iterator<String> it = shortDataset.iterator();
    while (it.hasNext()) {
        String dataset = it.next();
        if (datasetId.get(dataset) == null) {
            System.out.println("No dataset id for " + dataset);
            System.exit(0);
        }
    }

    FrameworkUtils.createDir(s3bucket + FrameworkUtils.aggregatesDir, s3conf, s3);

    // getting smallest resolution

    HashMap<String, String> tempResMap = new HashMap<String, String>();
    HashMap<String, String> spatialResMap = new HashMap<String, String>();

    HashMap<String, String> datasetTemporalStrMap = new HashMap<String, String>();
    HashMap<String, String> datasetSpatialStrMap = new HashMap<String, String>();

    HashSet<String> input = new HashSet<String>();

    for (String dataset : shortDataset) {

        String[] datasetArray = preProcessingDataset.get(dataset).split("-");

        String datasetTemporalStr = datasetArray[datasetArray.length - 2];
        int datasetTemporal = utils.temporalResolution(datasetTemporalStr);

        String datasetSpatialStr = datasetArray[datasetArray.length - 1];
        int datasetSpatial = utils.spatialResolution(datasetSpatialStr);

        // finding all possible resolutions

        String[] temporalResolutions = FrameworkUtils.getAggTempResolutions(datasetTemporal);
        String[] spatialResolutions = FrameworkUtils.getAggSpatialResolutions(datasetSpatial);

        String temporalResolution = "";
        String spatialResolution = "";

        String tempRes = "";
        String spatialRes = "";

        boolean dataAdded = false;

        for (int i = 0; i < temporalResolutions.length; i++) {
            for (int j = 0; j < spatialResolutions.length; j++) {

                temporalResolution = temporalResolutions[i];
                spatialResolution = spatialResolutions[j];

                String aggregatesOutputFileName = s3bucket + FrameworkUtils.aggregatesDir + "/" + dataset + "/";

                if (removeExistingFiles) {
                    FrameworkUtils.removeFile(aggregatesOutputFileName, s3conf, s3);
                }

                if (!FrameworkUtils.fileExists(aggregatesOutputFileName, s3conf, s3)) {

                    dataAdded = true;

                    tempRes += temporalResolution + "-";
                    spatialRes += spatialResolution + "-";
                }
            }
        }

        if (dataAdded) {
            input.add(s3bucket + FrameworkUtils.preProcessingDir + "/" + preProcessingDataset.get(dataset));
            shortDatasetAggregation.add(dataset);

            tempResMap.put(dataset, tempRes.substring(0, tempRes.length() - 1));
            spatialResMap.put(dataset, spatialRes.substring(0, spatialRes.length() - 1));

            datasetTemporalStrMap.put(dataset, datasetTemporalStr);
            datasetSpatialStrMap.put(dataset, datasetSpatialStr);
        }
    }

    if (input.isEmpty()) {
        System.out.println("All the input datasets have aggregates.");
        System.out.println("Use -f in the beginning of the command line to force the computation.");
        System.exit(0);
    }

    it = input.iterator();
    while (it.hasNext()) {
        preProcessingDatasets += it.next() + ",";
    }

    Job aggJob = null;
    String aggregatesOutputDir = s3bucket + FrameworkUtils.aggregatesDir + "/tmp/";
    String jobName = "aggregates";

    FrameworkUtils.removeFile(aggregatesOutputDir, s3conf, s3);

    Configuration aggConf = new Configuration();
    Machine machineConf = new Machine(machine, nbNodes);

    aggConf.set("dataset-name", datasetNames);
    aggConf.set("dataset-id", datasetIds);

    for (int i = 0; i < shortDatasetAggregation.size(); i++) {
        String dataset = shortDatasetAggregation.get(i);
        String id = datasetId.get(dataset);
        aggConf.set("dataset-" + id + "-temporal-resolutions", tempResMap.get(dataset));
        aggConf.set("dataset-" + id + "-spatial-resolutions", spatialResMap.get(dataset));
        aggConf.set("dataset-" + id + "-temporal-att", datasetTempAtt.get(dataset));
        aggConf.set("dataset-" + id + "-spatial-att", datasetSpatialAtt.get(dataset));
        aggConf.set("dataset-" + id + "-temporal", datasetTemporalStrMap.get(dataset));
        aggConf.set("dataset-" + id + "-spatial", datasetSpatialStrMap.get(dataset));

        if (s3)
            aggConf.set("dataset-" + id,
                    s3bucket + FrameworkUtils.preProcessingDir + "/" + preProcessingDataset.get(dataset));
        else
            aggConf.set("dataset-" + id, FileSystem.get(new Configuration()).getHomeDirectory() + "/"
                    + FrameworkUtils.preProcessingDir + "/" + preProcessingDataset.get(dataset));
    }

    aggConf.set("mapreduce.tasktracker.map.tasks.maximum", String.valueOf(machineConf.getMaximumTasks()));
    aggConf.set("mapreduce.tasktracker.reduce.tasks.maximum", String.valueOf(machineConf.getMaximumTasks()));
    aggConf.set("mapreduce.jobtracker.maxtasks.perjob", "-1");
    aggConf.set("mapreduce.reduce.shuffle.parallelcopies", "20");
    aggConf.set("mapreduce.input.fileinputformat.split.minsize", "0");
    aggConf.set("mapreduce.task.io.sort.mb", "200");
    aggConf.set("mapreduce.task.io.sort.factor", "100");
    machineConf.setMachineConfiguration(aggConf);

    if (s3) {
        machineConf.setMachineConfiguration(aggConf);
        aggConf.set("fs.s3.awsAccessKeyId", awsAccessKeyId);
        aggConf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey);
    }

    if (snappyCompression) {
        aggConf.set("mapreduce.map.output.compress", "true");
        aggConf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
        //aggConf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
    }
    if (bzip2Compression) {
        aggConf.set("mapreduce.map.output.compress", "true");
        aggConf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec");
        //aggConf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec");
    }

    aggJob = new Job(aggConf);
    aggJob.setJobName(jobName);

    aggJob.setMapOutputKeyClass(SpatioTemporalWritable.class);
    aggJob.setMapOutputValueClass(AggregationArrayWritable.class);
    aggJob.setOutputKeyClass(SpatioTemporalWritable.class);
    aggJob.setOutputValueClass(FloatArrayWritable.class);
    //aggJob.setOutputKeyClass(Text.class);
    //aggJob.setOutputValueClass(Text.class);

    aggJob.setMapperClass(AggregationMapper.class);
    aggJob.setCombinerClass(AggregationCombiner.class);
    aggJob.setReducerClass(AggregationReducer.class);
    aggJob.setNumReduceTasks(machineConf.getNumberReduces());

    aggJob.setInputFormatClass(SequenceFileInputFormat.class);
    //aggJob.setOutputFormatClass(SequenceFileOutputFormat.class);
    LazyOutputFormat.setOutputFormatClass(aggJob, SequenceFileOutputFormat.class);
    //LazyOutputFormat.setOutputFormatClass(aggJob, TextOutputFormat.class);
    SequenceFileOutputFormat.setCompressOutput(aggJob, true);
    SequenceFileOutputFormat.setOutputCompressionType(aggJob, CompressionType.BLOCK);

    FileInputFormat.setInputDirRecursive(aggJob, true);
    FileInputFormat.setInputPaths(aggJob,
            preProcessingDatasets.substring(0, preProcessingDatasets.length() - 1));
    FileOutputFormat.setOutputPath(aggJob, new Path(aggregatesOutputDir));

    aggJob.setJarByClass(Aggregation.class);

    long start = System.currentTimeMillis();
    aggJob.submit();
    aggJob.waitForCompletion(true);
    System.out.println(jobName + "\t" + (System.currentTimeMillis() - start));

    // moving files to right place
    for (String dataset : shortDatasetAggregation) {
        String from = s3bucket + FrameworkUtils.aggregatesDir + "/tmp/" + dataset + "/";
        String to = s3bucket + FrameworkUtils.aggregatesDir + "/" + dataset + "/";
        FrameworkUtils.renameFile(from, to, s3conf, s3);
    }

}
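
In the Aggregation example the unlimited -g list is interpreted as repeating triples of dataset name, temporal attribute index, and spatial attribute index, walked with a stride of three. A small sketch of that pattern with an explicit guard for a malformed list; the triple layout is a convention of this example rather than anything commons-cli enforces, and the sample values are invented:

import java.util.LinkedHashMap;
import java.util.Map;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;

public class GroupTriplesDemo {
    public static void main(String[] args) throws ParseException {
        Options options = new Options();
        Option group = new Option("g", "group", true,
                "dataset, temporal attribute index, spatial attribute index (repeated)");
        group.setArgs(Option.UNLIMITED_VALUES);
        options.addOption(group);

        CommandLine cmd = new PosixParser().parse(options,
                new String[] { "-g", "taxi", "2", "5", "weather", "0", "null" });

        String[] values = cmd.getOptionValues("g");
        if (values.length % 3 != 0) {
            throw new ParseException("-g expects triples: <dataset> <temporal-att> <spatial-att>");
        }

        Map<String, String[]> attributes = new LinkedHashMap<String, String[]>();
        for (int i = 0; i < values.length; i += 3) {
            // "null" marks a missing attribute index, as in the snippet above
            String temporal = "null".equals(values[i + 1]) ? null : values[i + 1];
            String spatial = "null".equals(values[i + 2]) ? null : values[i + 2];
            attributes.put(values[i], new String[] { temporal, spatial });
        }
        System.out.println("datasets parsed: " + attributes.keySet()); // [taxi, weather]
    }
}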

From source file:tuit.java

@SuppressWarnings("ConstantConditions")
public static void main(String[] args) {
    System.out.println(licence);
    //Declare variables
    File inputFile;
    File outputFile;
    File tmpDir;
    File blastnExecutable;
    File properties;
    File blastOutputFile = null;
    //
    TUITPropertiesLoader tuitPropertiesLoader;
    TUITProperties tuitProperties;
    //
    String[] parameters = null;
    //
    Connection connection = null;
    MySQL_Connector mySQL_connector;
    //
    Map<Ranks, TUITCutoffSet> cutoffMap;
    //
    BLASTIdentifier blastIdentifier = null;
    //
    RamDb ramDb = null;

    CommandLineParser parser = new GnuParser();
    Options options = new Options();

    options.addOption(tuit.IN, "input<file>", true, "Input file (currently fasta-formatted only)");
    options.addOption(tuit.OUT, "output<file>", true, "Output file (in " + tuit.TUIT_EXT + " format)");
    options.addOption(tuit.P, "prop<file>", true, "Properties file (XML formatted)");
    options.addOption(tuit.V, "verbose", false, "Enable verbose output");
    options.addOption(tuit.B, "blast_output<file>", true, "Perform on a pre-BLASTed output");
    options.addOption(tuit.DEPLOY, "deploy", false, "Deploy the taxonomic databases");
    options.addOption(tuit.UPDATE, "update", false, "Update the taxonomic databases");
    options.addOption(tuit.USE_DB, "usedb", false, "Use RDBMS instead of RAM-based taxonomy");

    Option option = new Option(tuit.REDUCE, "reduce", true,
            "Pack identical (100% similar sequences) records in the given sample file");
    option.setArgs(Option.UNLIMITED_VALUES);
    options.addOption(option);
    option = new Option(tuit.COMBINE, "combine", true,
            "Combine a set of given reduction files into an HMP Tree-compatible taxonomy");
    option.setArgs(Option.UNLIMITED_VALUES);
    options.addOption(option);
    options.addOption(tuit.NORMALIZE, "normalize", false,
            "If used in combination with -combine ensures that the values are normalized by the root value");

    HelpFormatter formatter = new HelpFormatter();

    try {

        //Get TUIT directory
        final File tuitDir = new File(
                new File(tuit.class.getProtectionDomain().getCodeSource().getLocation().toURI().getPath())
                        .getParent());
        final File ramDbFile = new File(tuitDir, tuit.RAM_DB);

        //Setup logger
        Log.getInstance().setLogName("tuit.log");

        //Read command line
        final CommandLine commandLine = parser.parse(options, args, true);

        //Check if the REDUCE option is on
        if (commandLine.hasOption(tuit.REDUCE)) {

            final String[] fileList = commandLine.getOptionValues(tuit.REDUCE);
            for (String s : fileList) {
                final Path path = Paths.get(s);
                Log.getInstance().log(Level.INFO, "Processing " + path.toString() + "...");
                final NucleotideFastaSequenceReductor nucleotideFastaSequenceReductor = NucleotideFastaSequenceReductor
                        .fromPath(path);
                ReductorFileOperator.save(nucleotideFastaSequenceReductor,
                        path.resolveSibling(path.getFileName().toString() + ".rdc"));
            }

            Log.getInstance().log(Level.FINE, "Task done, exiting...");
            return;
        }

        //Check if COMBINE is on
        if (commandLine.hasOption(tuit.COMBINE)) {
            final boolean normalize = commandLine.hasOption(tuit.NORMALIZE);
            final String[] fileList = commandLine.getOptionValues(tuit.COMBINE);
            //TODO: implement a test for format here

            final List<TreeFormatter.TreeFormatterFormat.HMPTreesOutput> hmpTreesOutputs = new ArrayList<>();
            final TreeFormatter treeFormatter = TreeFormatter
                    .newInstance(new TreeFormatter.TuitLineTreeFormatterFormat());
            for (String s : fileList) {
                final Path path = Paths.get(s);
                Log.getInstance().log(Level.INFO, "Merging " + path.toString() + "...");
                treeFormatter.loadFromPath(path);
                final TreeFormatter.TreeFormatterFormat.HMPTreesOutput output = TreeFormatter.TreeFormatterFormat.HMPTreesOutput
                        .newInstance(treeFormatter.toHMPTree(normalize), s.substring(0, s.indexOf(".")));
                hmpTreesOutputs.add(output);
                treeFormatter.erase();
            }
            final Path destination;
            if (commandLine.hasOption(OUT)) {
                destination = Paths.get(commandLine.getOptionValue(tuit.OUT));
            } else {
                destination = Paths.get("merge.tcf");
            }
            CombinatorFileOperator.save(hmpTreesOutputs, treeFormatter, destination);
            Log.getInstance().log(Level.FINE, "Task done, exiting...");
            return;
        }

        if (!commandLine.hasOption(tuit.P)) {
            throw new ParseException("No properties file option found, exiting.");
        } else {
            properties = new File(commandLine.getOptionValue(tuit.P));
        }

        //Load properties
        tuitPropertiesLoader = TUITPropertiesLoader.newInstanceFromFile(properties);
        tuitProperties = tuitPropertiesLoader.getTuitProperties();

        //Create tmp directory and blastn executable
        tmpDir = new File(tuitProperties.getTMPDir().getPath());
        blastnExecutable = new File(tuitProperties.getBLASTNPath().getPath());

        //Check for deploy
        if (commandLine.hasOption(tuit.DEPLOY)) {
            if (commandLine.hasOption(tuit.USE_DB)) {
                NCBITablesDeployer.fastDeployNCBIDatabasesFromNCBI(connection, tmpDir);
            } else {
                NCBITablesDeployer.fastDeployNCBIRamDatabaseFromNCBI(tmpDir, ramDbFile);
            }

            Log.getInstance().log(Level.FINE, "Task done, exiting...");
            return;
        }
        //Check for update
        if (commandLine.hasOption(tuit.UPDATE)) {
            if (commandLine.hasOption(tuit.USE_DB)) {
                NCBITablesDeployer.updateDatabasesFromNCBI(connection, tmpDir);
            } else {
                //For the RAM database an update is simply a fresh deploy, so the deploy routine is reused
                NCBITablesDeployer.fastDeployNCBIRamDatabaseFromNCBI(tmpDir, ramDbFile);
            }
            Log.getInstance().log(Level.FINE, "Task done, exiting...");
            return;
        }

        //Connect to the database
        if (commandLine.hasOption(tuit.USE_DB)) {
            mySQL_connector = MySQL_Connector.newDefaultInstance(
                    "jdbc:mysql://" + tuitProperties.getDBConnection().getUrl().trim() + "/",
                    tuitProperties.getDBConnection().getLogin().trim(),
                    tuitProperties.getDBConnection().getPassword().trim());
            mySQL_connector.connectToDatabase();
            connection = mySQL_connector.getConnection();
        } else {
            //Probe for ram database

            if (ramDbFile.exists() && ramDbFile.canRead()) {
                Log.getInstance().log(Level.INFO, "Loading RAM taxonomic map...");
                try {
                    ramDb = RamDb.loadSelfFromFile(ramDbFile);
                } catch (IOException ie) {
                    if (ie instanceof java.io.InvalidClassException)
                        throw new IOException("The RAM-based taxonomic database needs to be updated.");
                }

            } else {
                Log.getInstance().log(Level.SEVERE,
                        "The RAM database either has not been deployed, or is not accessible. "
                                + "Please use the --deploy option and check permissions on the TUIT directory. "
                                + "If you were looking to use the RDBMS as a taxonomic reference, please use the -usedb option.");
                return;
            }
        }

        if (commandLine.hasOption(tuit.B)) {
            blastOutputFile = new File(commandLine.getOptionValue(tuit.B));
            if (!blastOutputFile.exists() || !blastOutputFile.canRead()) {
                throw new Exception("BLAST output file either does not exist, or is not readable.");
            } else if (blastOutputFile.isDirectory()) {
                throw new Exception("BLAST output file points to a directory.");
            }
        }
        //Check vital parameters
        if (!commandLine.hasOption(tuit.IN)) {
            throw new ParseException("No input file option found, exiting.");
        } else {
            inputFile = new File(commandLine.getOptionValue(tuit.IN));
            Log.getInstance().setLogName(inputFile.getName().split("\\.")[0] + ".tuit.log");
        }
        //Correct the output file option if needed
        if (!commandLine.hasOption(tuit.OUT)) {
            outputFile = new File((inputFile.getPath()).split("\\.")[0] + tuit.TUIT_EXT);
        } else {
            outputFile = new File(commandLine.getOptionValue(tuit.OUT));
        }

        //Adjust the output level
        if (commandLine.hasOption(tuit.V)) {
            Log.getInstance().setLevel(Level.FINE);
            Log.getInstance().log(Level.INFO, "Using verbose output for the log");
        } else {
            Log.getInstance().setLevel(Level.INFO);
        }
        //Try all files
        if (inputFile != null) {
            if (!inputFile.exists() || !inputFile.canRead()) {
                throw new Exception("Input file either does not exist, or is not readable.");
            } else if (inputFile.isDirectory()) {
                throw new Exception("Input file points to a directory.");
            }
        }

        if (!properties.exists() || !properties.canRead()) {
            throw new Exception("Properties file either does not exist, or is not readable.");
        } else if (properties.isDirectory()) {
            throw new Exception("Properties file points to a directory.");
        }

        //Create blast parameters
        final StringBuilder stringBuilder = new StringBuilder();
        for (Database database : tuitProperties.getBLASTNParameters().getDatabase()) {
            stringBuilder.append(database.getUse());
            stringBuilder.append(" ");//Gonna insert an extra space for the last database
        }
        String remote;
        String entrez_query;
        if (tuitProperties.getBLASTNParameters().getRemote().getDelegate().equals("yes")) {
            remote = "-remote";
            entrez_query = "-entrez_query";
            parameters = new String[] { "-db", stringBuilder.toString(), remote, entrez_query,
                    tuitProperties.getBLASTNParameters().getEntrezQuery().getValue(), "-evalue",
                    tuitProperties.getBLASTNParameters().getExpect().getValue() };
        } else {
            if (!commandLine.hasOption(tuit.B)) {
                if (tuitProperties.getBLASTNParameters().getEntrezQuery().getValue().toUpperCase()
                        .startsWith("NOT")
                        || tuitProperties.getBLASTNParameters().getEntrezQuery().getValue().toUpperCase()
                                .startsWith("ALL")) {
                    parameters = new String[] { "-db", stringBuilder.toString(), "-evalue",
                            tuitProperties.getBLASTNParameters().getExpect().getValue(), "-negative_gilist",
                            TUITFileOperatorHelper.restrictToEntrez(tmpDir,
                                    tuitProperties.getBLASTNParameters().getEntrezQuery().getValue()
                                            .toUpperCase().replace("NOT", "OR"))
                                    .getAbsolutePath(),
                            "-num_threads", tuitProperties.getBLASTNParameters().getNumThreads().getValue() };
                } else if (tuitProperties.getBLASTNParameters().getEntrezQuery().getValue().toUpperCase()
                        .equals("")) {
                    parameters = new String[] { "-db", stringBuilder.toString(), "-evalue",
                            tuitProperties.getBLASTNParameters().getExpect().getValue(), "-num_threads",
                            tuitProperties.getBLASTNParameters().getNumThreads().getValue() };
                } else {
                    parameters = new String[] { "-db", stringBuilder.toString(), "-evalue",
                            tuitProperties.getBLASTNParameters().getExpect().getValue(),
                            /*"-gilist", TUITFileOperatorHelper.restrictToEntrez(
                            tmpDir, tuitProperties.getBLASTNParameters().getEntrezQuery().getValue()).getAbsolutePath(),*/ //TODO remove comment!!!!!
                            "-num_threads", tuitProperties.getBLASTNParameters().getNumThreads().getValue() };
                }
            }
        }
        //Prepare a cutoff Map
        if (tuitProperties.getSpecificationParameters() != null
                && tuitProperties.getSpecificationParameters().size() > 0) {
            cutoffMap = new HashMap<Ranks, TUITCutoffSet>(tuitProperties.getSpecificationParameters().size());
            for (SpecificationParameters specificationParameters : tuitProperties
                    .getSpecificationParameters()) {
                cutoffMap.put(Ranks.valueOf(specificationParameters.getCutoffSet().getRank()),
                        TUITCutoffSet.newDefaultInstance(
                                Double.parseDouble(
                                        specificationParameters.getCutoffSet().getPIdentCutoff().getValue()),
                                Double.parseDouble(specificationParameters.getCutoffSet()
                                        .getQueryCoverageCutoff().getValue()),
                                Double.parseDouble(
                                        specificationParameters.getCutoffSet().getAlpha().getValue())));
            }
        } else {
            cutoffMap = new HashMap<Ranks, TUITCutoffSet>();
        }
        final TUITFileOperatorHelper.OutputFormat format;
        if (tuitProperties.getBLASTNParameters().getOutputFormat().getFormat().equals("rdp")) {
            format = TUITFileOperatorHelper.OutputFormat.RDP_FIXRANK;
        } else {
            format = TUITFileOperatorHelper.OutputFormat.TUIT;
        }

        try (TUITFileOperator<NucleotideFasta> nucleotideFastaTUITFileOperator = NucleotideFastaTUITFileOperator
                .newInstance(format, cutoffMap);) {
            nucleotideFastaTUITFileOperator.setInputFile(inputFile);
            nucleotideFastaTUITFileOperator.setOutputFile(outputFile);
            final String cleanupString = tuitProperties.getBLASTNParameters().getKeepBLASTOuts().getKeep();
            final boolean cleanup;
            if (cleanupString.equals("no")) {
                Log.getInstance().log(Level.INFO, "Temporary BLAST files will be deleted.");
                cleanup = true;
            } else {
                Log.getInstance().log(Level.INFO, "Temporary BLAST files will be kept.");
                cleanup = false;
            }
            //Create blast identifier
            ExecutorService executorService = Executors.newSingleThreadExecutor();
            if (commandLine.hasOption(tuit.USE_DB)) {

                if (blastOutputFile == null) {
                    blastIdentifier = TUITBLASTIdentifierDB.newInstanceFromFileOperator(tmpDir,
                            blastnExecutable, parameters, nucleotideFastaTUITFileOperator, connection,
                            cutoffMap,
                            Integer.parseInt(
                                    tuitProperties.getBLASTNParameters().getMaxFilesInBatch().getValue()),
                            cleanup);

                } else {
                    try {
                        blastIdentifier = TUITBLASTIdentifierDB.newInstanceFromBLASTOutput(
                                nucleotideFastaTUITFileOperator, connection, cutoffMap, blastOutputFile,
                                Integer.parseInt(
                                        tuitProperties.getBLASTNParameters().getMaxFilesInBatch().getValue()),
                                cleanup);

                    } catch (JAXBException e) {
                        Log.getInstance().log(Level.SEVERE, "Error reading " + blastOutputFile.getName()
                                + ", please check input. The file must be XML formatted.");
                    } catch (Exception e) {
                        e.printStackTrace();
                    }
                }

            } else {
                if (blastOutputFile == null) {
                    blastIdentifier = TUITBLASTIdentifierRAM.newInstanceFromFileOperator(tmpDir,
                            blastnExecutable, parameters, nucleotideFastaTUITFileOperator, cutoffMap,
                            Integer.parseInt(
                                    tuitProperties.getBLASTNParameters().getMaxFilesInBatch().getValue()),
                            cleanup, ramDb);

                } else {
                    try {
                        blastIdentifier = TUITBLASTIdentifierRAM.newInstanceFromBLASTOutput(
                                nucleotideFastaTUITFileOperator, cutoffMap, blastOutputFile,
                                Integer.parseInt(
                                        tuitProperties.getBLASTNParameters().getMaxFilesInBatch().getValue()),
                                cleanup, ramDb);

                    } catch (JAXBException e) {
                        Log.getInstance().log(Level.SEVERE, "Error reading " + blastOutputFile.getName()
                                + ", please check input. The file must be XML formatted.");
                    } catch (Exception e) {
                        e.printStackTrace();
                    }
                }
            }
            Future<?> runnableFuture = executorService.submit(blastIdentifier);
            runnableFuture.get();
            executorService.shutdown();
        }
    } catch (ParseException pe) {
        Log.getInstance().log(Level.SEVERE, (pe.getMessage()));
        formatter.printHelp("tuit", options);
    } catch (SAXException saxe) {
        Log.getInstance().log(Level.SEVERE, saxe.getMessage());
    } catch (FileNotFoundException fnfe) {
        Log.getInstance().log(Level.SEVERE, fnfe.getMessage());
    } catch (TUITPropertyBadFormatException tpbfe) {
        Log.getInstance().log(Level.SEVERE, tpbfe.getMessage());
    } catch (ClassCastException cce) {
        Log.getInstance().log(Level.SEVERE, cce.getMessage());
    } catch (JAXBException jaxbee) {
        Log.getInstance().log(Level.SEVERE,
                "The properties file is not well formatted. Please ensure that the XML is consistent with the io.properties.dtd schema.");
    } catch (ClassNotFoundException cnfe) {
        //Probably won't happen unless the library deleted from the .jar
        Log.getInstance().log(Level.SEVERE, cnfe.getMessage());
        //cnfe.printStackTrace();
    } catch (SQLException sqle) {
        Log.getInstance().log(Level.SEVERE,
                "A database communication error occurred with the following message:\n" + sqle.getMessage());
        //sqle.printStackTrace();
        if (sqle.getMessage().contains("Access denied for user")) {
            Log.getInstance().log(Level.SEVERE, "Please use standard database login: "
                    + NCBITablesDeployer.login + " and password: " + NCBITablesDeployer.password);
        }
    } catch (Exception e) {
        Log.getInstance().log(Level.SEVERE, e.getMessage());
        e.printStackTrace();
    } finally {
        if (connection != null) {
            try {
                connection.close();
            } catch (SQLException sqle) {
                Log.getInstance().log(Level.SEVERE, "Problem closing the database connection: " + sqle);
            }
        }
        Log.getInstance().log(Level.FINE, "Task done, exiting...");
    }
}
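
Both -reduce and -combine above take a variable-length file list via setArgs(Option.UNLIMITED_VALUES). If the values should instead arrive as a single delimited token, Option.setValueSeparator can be combined with setArgs; a brief sketch, with the comma separator and the .rdc file names chosen purely for illustration:

import java.util.Arrays;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;

public class ValueSeparatorDemo {
    public static void main(String[] args) throws ParseException {
        Options options = new Options();

        Option combine = new Option("c", "combine", true, "reduction files to merge");
        combine.setArgs(Option.UNLIMITED_VALUES); // still allow any number of values
        combine.setValueSeparator(',');           // split a single token on commas
        options.addOption(combine);

        CommandLine cmd = new GnuParser().parse(options,
                new String[] { "-c", "sample1.rdc,sample2.rdc,sample3.rdc" });

        // each comma-separated piece becomes its own value
        System.out.println(Arrays.asList(cmd.getOptionValues("c")));
        // prints [sample1.rdc, sample2.rdc, sample3.rdc]
    }
}

This is the same splitting mechanism commonly shown for -Dkey=value style properties options; whether space-separated or delimiter-separated values are friendlier depends on how the tool is usually invoked.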