Example usage for org.apache.hadoop.mapreduce.lib.output SequenceFileOutputFormat setOutputCompressionType

Introduction

This page collects example usage of org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat#setOutputCompressionType from several open-source projects.

Prototype

public static void setOutputCompressionType(Job job, CompressionType style) 

Document

Set the CompressionType for the output SequenceFile.
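
Before the project examples below, here is a minimal, self-contained sketch of the method in context: an identity MapReduce job whose output is written as a block-compressed SequenceFile. This sketch is not taken from any of the source files on this page; the class name CompressedSequenceFileJob and the args-based input/output paths are illustrative placeholders.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

public class CompressedSequenceFileJob {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "compressed-sequencefile-output");
        job.setJarByClass(CompressedSequenceFileJob.class);

        // Identity mapper/reducer: TextInputFormat's (LongWritable, Text) pairs pass straight through.
        job.setMapperClass(Mapper.class);
        job.setReducerClass(Reducer.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Write the output as a SequenceFile and compress whole blocks of records.
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        FileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

SequenceFile.CompressionType has three values: NONE, RECORD (each value compressed individually), and BLOCK (batches of keys and values compressed together, which usually gives better compression ratios). The projects below consistently choose BLOCK.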

Usage

From source file: edu.isi.mavuno.app.nlp.TratzParse.java

License: Apache License

@SuppressWarnings({ "unchecked", "rawtypes" })
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.Parse.CorpusPath", conf);
    String corpusClass = MavunoUtils.getRequiredParam("Mavuno.Parse.CorpusClass", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.Parse.OutputPath", conf);

    // optional parameter that allows the parsed documents to be output in text format
    String textOutput = MavunoUtils.getOptionalParam("Mavuno.Parse.TextOutputFormat", conf);
    boolean textOutputFormat = false;
    if (textOutput != null && Boolean.parseBoolean(textOutput)) {
        textOutputFormat = true;
    }

    sLogger.info("Tool name: TratzParse");
    sLogger.info(" - Corpus path: " + corpusPath);
    sLogger.info(" - Corpus class: " + corpusClass);
    sLogger.info(" - Output path: " + outputPath);

    Job job = new Job(conf);
    job.setJobName("TratzParse");

    MavunoUtils.recursivelyAddInputPaths(job, corpusPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass));

    // output format -- either plain text or sequencefile (default)
    if (textOutputFormat) {
        job.setOutputFormatClass(TextOutputFormat.class);
    } else {
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        FileOutputFormat.setCompressOutput(job, true);
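        // BLOCK compression compresses batches of records together; RECORD would compress each value individually.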
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
    }

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(TratzParsedDocument.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(TratzParsedDocument.class);

    job.setMapperClass(MyMapper.class);

    job.setJarByClass(TratzParse.class);

    // no reducers needed
    job.setNumReduceTasks(0);

    // run job
    job.waitForCompletion(true);

    // print job statistics
    Counters counters = job.getCounters();
    sLogger.info(" - Total documents: " + counters.findCounter(StatCounters.TOTAL_DOCUMENTS).getValue());
    sLogger.info(" - Total sentences: " + counters.findCounter(StatCounters.TOTAL_SENTENCES).getValue());
    sLogger.info(" - Total tokens: " + counters.findCounter(StatCounters.TOTAL_TOKENS).getValue());
    sLogger.info(" - Total dropped sentences: "
            + counters.findCounter(StatCounters.TOTAL_DROPPED_SENTENCES).getValue());
    sLogger.info(
            " - Total tokenization time (ms): " + counters.findCounter(StatCounters.TOKENIZE_TIME).getValue());
    sLogger.info(
            " - Total POS tagging time (ms): " + counters.findCounter(StatCounters.POSTAG_TIME).getValue());
    sLogger.info(" - Total chunking time (ms): " + counters.findCounter(StatCounters.CHUNK_TIME).getValue());
    sLogger.info(" - Total named entity tagging time (ms): "
            + counters.findCounter(StatCounters.NETAG_TIME).getValue());
    sLogger.info(" - Total parse time (ms): " + counters.findCounter(StatCounters.PARSE_TIME).getValue());

    return 0;
}

From source file: edu.isi.mavuno.extract.CombineGlobalStats.java

License: Apache License

public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String inputPath = MavunoUtils.getRequiredParam("Mavuno.CombineGlobalStats.InputPath", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.CombineGlobalStats.OutputPath", conf);
    int numSplits = conf.getInt("Mavuno.CombineGlobalStats.TotalSplits", 1);

    sLogger.info("Tool name: CombineGlobalStats");
    sLogger.info(" - Input path: " + inputPath);
    sLogger.info(" - Output path: " + outputPath);
    sLogger.info(" - Number of splits: " + numSplits);

    Job job = new Job(conf);
    job.setJobName("CombineGlobalStats");

    for (int split = 0; split < numSplits; split++) {
        FileInputFormat.addInputPath(job, new Path(inputPath + "/" + split));
    }
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    FileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    job.setMapOutputKeyClass(ContextPatternWritable.class);
    job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
    job.setMapOutputValueClass(ContextPatternStatsWritable.class);

    job.setOutputKeyClass(ContextPatternWritable.class);
    job.setOutputValueClass(ContextPatternStatsWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyReducer.class);
    job.setReducerClass(MyReducer.class);

    job.waitForCompletion(true);

    return 0;
}

From source file: edu.isi.mavuno.extract.CombineSplits.java

License: Apache License

public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String examplesPath = MavunoUtils.getRequiredParam("Mavuno.CombineSplits.ExamplesPath", conf);
    String exampleStatsPath = MavunoUtils.getRequiredParam("Mavuno.CombineSplits.ExampleStatsPath", conf);
    String splitKey = MavunoUtils.getRequiredParam("Mavuno.CombineSplits.SplitKey", conf).toLowerCase();
    int numSplits = conf.getInt("Mavuno.CombineSplits.TotalSplits", 1);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.CombineSplits.OutputPath", conf);

    sLogger.info("Tool name: CombineSplits");
    sLogger.info(" - Examples path: " + examplesPath);
    sLogger.info(" - Example stats path: " + exampleStatsPath);
    sLogger.info(" - Split key: " + splitKey);
    sLogger.info(" - Total splits: " + numSplits);
    sLogger.info(" - Output path: " + outputPath);

    Job job = new Job(conf);
    job.setJobName("CombineSplits");

    for (int split = 0; split < numSplits; split++) {
        FileInputFormat.addInputPath(job, new Path(examplesPath + "/" + split));
    }

    if (MavunoUtils.pathExists(conf, exampleStatsPath)) {
        FileInputFormat.addInputPath(job, new Path(exampleStatsPath));
    }

    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    FileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    job.setMapOutputKeyClass(ContextPatternWritable.class);

    if ("pattern".equals(splitKey)) {
        job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
    } else if ("context".equals(splitKey)) {
        job.setSortComparatorClass(ContextPatternWritable.IdPatternComparator.class);
    } else if ("pattern+context".equals(splitKey)) {
        job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
    } else {
        throw new RuntimeException("Invalid SplitKey in CombineSplits! -- " + splitKey);
    }

    job.setMapOutputValueClass(ContextPatternStatsWritable.class);

    job.setOutputKeyClass(ContextPatternWritable.class);
    job.setOutputValueClass(ContextPatternStatsWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    job.waitForCompletion(true);
    return 0;
}

From source file: edu.isi.mavuno.extract.Extract.java

License: Apache License

@SuppressWarnings({ "unchecked", "rawtypes" })
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String inputPath = MavunoUtils.getRequiredParam("Mavuno.Extract.InputPath", conf);
    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.Extract.CorpusPath", conf);
    String corpusClass = MavunoUtils.getRequiredParam("Mavuno.Extract.CorpusClass", conf);
    String extractorClass = MavunoUtils.getRequiredParam("Mavuno.Extract.ExtractorClass", conf);
    String extractorArgs = MavunoUtils.getRequiredParam("Mavuno.Extract.ExtractorArgs", conf);
    String extractorTarget = MavunoUtils.getRequiredParam("Mavuno.Extract.ExtractorTarget", conf).toLowerCase();
    int minContextMatches = Integer.parseInt(MavunoUtils.getRequiredParam("Mavuno.Extract.MinMatches", conf));
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.Extract.OutputPath", conf);

    sLogger.info("Tool name: Extract");
    sLogger.info(" - Input path: " + inputPath);
    sLogger.info(" - Corpus path: " + corpusPath);
    sLogger.info(" - Corpus class: " + corpusClass);
    sLogger.info(" - Extractor class: " + extractorClass);
    sLogger.info(" - Extractor arguments: " + extractorArgs);
    sLogger.info(" - Extractor target: " + extractorTarget);
    sLogger.info(" - Min context matches: " + minContextMatches);
    sLogger.info(" - Output path: " + outputPath);

    Job job = new Job(conf);
    job.setJobName("Extract");

    MavunoUtils.recursivelyAddInputPaths(job, corpusPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass));
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    FileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    if ("pattern".equals(extractorTarget)) {
        job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
        job.setPartitionerClass(ContextPatternWritable.IdContextPartitioner.class);
    } else if ("context".equals(extractorTarget)) {
        job.setSortComparatorClass(ContextPatternWritable.IdPatternComparator.class);
        job.setPartitionerClass(ContextPatternWritable.IdPatternPartitioner.class);
    } else {
        throw new RuntimeException("Invalid extractor target in Extract -- " + extractorTarget);
    }

    job.setMapOutputKeyClass(ContextPatternWritable.class);
    job.setMapOutputValueClass(ContextPatternStatsWritable.class);

    job.setOutputKeyClass(ContextPatternWritable.class);
    job.setOutputValueClass(ContextPatternStatsWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyCombiner.class);
    job.setReducerClass(MyReducer.class);

    job.waitForCompletion(true);
    return 0;
}

From source file: edu.isi.mavuno.extract.ExtractGlobalStats.java

License: Apache License

@SuppressWarnings({ "unchecked", "rawtypes" })
public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String inputPath = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.InputPath", conf);
    String corpusPath = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.CorpusPath", conf);
    String corpusClass = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.CorpusClass", conf);
    String extractorClass = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.ExtractorClass", conf);
    String extractorArgs = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.ExtractorArgs", conf);
    String extractorTarget = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.ExtractorTarget", conf)
            .toLowerCase();
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.ExtractGlobalStats.OutputPath", conf);

    // split examples
    conf.set("Mavuno.Split.InputPath", inputPath);
    conf.set("Mavuno.Split.OutputPath", outputPath + "/../split");
    conf.set("Mavuno.Split.SplitKey", extractorTarget);
    new Split(conf).run();

    // get splits
    FileStatus[] files = MavunoUtils.getDirectoryListing(conf, outputPath + "/../split");
    int split = 0;
    for (FileStatus file : files) {
        if (!file.getPath().getName().endsWith(".examples")) {
            continue;
        }

        conf.set("Mavuno.ExtractGlobalStats.ExamplesPath", file.getPath().toString());

        sLogger.info("Tool name: ExtractGlobalStats");
        sLogger.info(" - Input path: " + inputPath);
        sLogger.info(" - Examples path: " + file.getPath());
        sLogger.info(" - Example split: " + split);
        sLogger.info(" - Corpus path: " + corpusPath);
        sLogger.info(" - Corpus class: " + corpusClass);
        sLogger.info(" - Extractor class: " + extractorClass);
        sLogger.info(" - Extractor arguments: " + extractorArgs);
        sLogger.info(" - Extractor target: " + extractorTarget);
        sLogger.info(" - Output path: " + outputPath);

        Job job = new Job(conf);
        job.setJobName("ExtractGlobalStats");

        MavunoUtils.recursivelyAddInputPaths(job, corpusPath);
        FileOutputFormat.setOutputPath(job, new Path(outputPath + "/../split/" + split));

        job.setInputFormatClass((Class<? extends InputFormat>) Class.forName(corpusClass));
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        FileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

        job.setMapOutputKeyClass(ContextPatternWritable.class);
        job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
        job.setPartitionerClass(ContextPatternWritable.FullPartitioner.class);
        job.setMapOutputValueClass(ContextPatternStatsWritable.class);

        job.setOutputKeyClass(ContextPatternWritable.class);
        job.setOutputValueClass(ContextPatternStatsWritable.class);

        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);

        job.waitForCompletion(true);

        split++;
    }

    // combine splits
    conf.setInt("Mavuno.CombineGlobalStats.TotalSplits", split);
    conf.set("Mavuno.CombineGlobalStats.InputPath", outputPath + "/../split/");
    conf.set("Mavuno.CombineGlobalStats.OutputPath", outputPath);
    new CombineGlobalStats(conf).run();

    MavunoUtils.removeDirectory(conf, outputPath + "/../split");

    return 0;
}

From source file: edu.isi.mavuno.score.ScoreContexts.java

License: Apache License

public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String inputPath = MavunoUtils.getRequiredParam("Mavuno.ScoreContexts.InputPath", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.ScoreContexts.OutputPath", conf);
    String contextScorerClass = MavunoUtils.getRequiredParam("Mavuno.Scorer.Class", conf);
    String contextScorerArgs = MavunoUtils.getRequiredParam("Mavuno.Scorer.Args", conf);

    sLogger.info("Tool name: ScoreContexts");
    sLogger.info(" - Input path: " + inputPath);
    sLogger.info(" - Output path: " + outputPath);
    sLogger.info(" - Context scorer class: " + contextScorerClass);
    sLogger.info(" - Context scorer args: " + contextScorerArgs);

    Job job = new Job(conf);
    job.setJobName("ScoreContexts");

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setSortComparatorClass(ContextPatternWritable.Comparator.class);
    job.setPartitionerClass(ContextPatternWritable.IdPartitioner.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    FileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    job.setMapOutputKeyClass(ContextPatternWritable.class);
    job.setMapOutputValueClass(ScoreWritable.class);

    job.setOutputKeyClass(ContextPatternWritable.class);
    job.setOutputValueClass(DoubleWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    job.waitForCompletion(true);

    return 0;
}

From source file: edu.isi.mavuno.score.ScorePatterns.java

License: Apache License

public int run() throws ClassNotFoundException, InterruptedException, IOException {
    Configuration conf = getConf();

    String inputPath = MavunoUtils.getRequiredParam("Mavuno.ScorePatterns.InputPath", conf);
    String outputPath = MavunoUtils.getRequiredParam("Mavuno.ScorePatterns.OutputPath", conf);
    String patternScorerClass = MavunoUtils.getRequiredParam("Mavuno.Scorer.Class", conf);
    String patternScorerArgs = MavunoUtils.getRequiredParam("Mavuno.Scorer.Args", conf);

    sLogger.info("Tool name: ScorePatterns");
    sLogger.info(" - Input path: " + inputPath);
    sLogger.info(" - Output path: " + outputPath);
    sLogger.info(" - Pattern scorer class: " + patternScorerClass);
    sLogger.info(" - Pattern scorer args: " + patternScorerArgs);

    Job job = new Job(conf);
    job.setJobName("ScorePatterns");

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setSortComparatorClass(ContextPatternWritable.IdPatternComparator.class);
    job.setPartitionerClass(ContextPatternWritable.IdPartitioner.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    FileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    job.setMapOutputKeyClass(ContextPatternWritable.class);
    job.setMapOutputValueClass(ScoreWritable.class);

    job.setOutputKeyClass(ContextPatternWritable.class);
    job.setOutputValueClass(DoubleWritable.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    job.waitForCompletion(true);

    return 0;
}

From source file: edu.nyu.vida.data_polygamy.feature_identification.IndexCreation.java

License: BSD License

/**
 * @param args
 */
@SuppressWarnings({ "deprecation" })
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

    Options options = new Options();

    Option forceOption = new Option("f", "force", false,
            "force the computation of the index and events " + "even if files already exist");
    forceOption.setRequired(false);
    options.addOption(forceOption);

    Option thresholdOption = new Option("t", "use-custom-thresholds", false,
            "use custom thresholds for regular and rare events, defined in HDFS_HOME/"
                    + FrameworkUtils.thresholdDir + " file");
    thresholdOption.setRequired(false);
    options.addOption(thresholdOption);

    Option gOption = new Option("g", "group", true,
            "set group of datasets for which the indices and events" + " will be computed");
    gOption.setRequired(true);
    gOption.setArgName("GROUP");
    gOption.setArgs(Option.UNLIMITED_VALUES);
    options.addOption(gOption);

    Option machineOption = new Option("m", "machine", true, "machine identifier");
    machineOption.setRequired(true);
    machineOption.setArgName("MACHINE");
    machineOption.setArgs(1);
    options.addOption(machineOption);

    Option nodesOption = new Option("n", "nodes", true, "number of nodes");
    nodesOption.setRequired(true);
    nodesOption.setArgName("NODES");
    nodesOption.setArgs(1);
    options.addOption(nodesOption);

    Option s3Option = new Option("s3", "s3", false, "data on Amazon S3");
    s3Option.setRequired(false);
    options.addOption(s3Option);

    Option awsAccessKeyIdOption = new Option("aws_id", "aws-id", true,
            "aws access key id; " + "this is required if the execution is on aws");
    awsAccessKeyIdOption.setRequired(false);
    awsAccessKeyIdOption.setArgName("AWS-ACCESS-KEY-ID");
    awsAccessKeyIdOption.setArgs(1);
    options.addOption(awsAccessKeyIdOption);

    Option awsSecretAccessKeyOption = new Option("aws_key", "aws-key", true,
            "aws secret access key; " + "this is required if the execution is on aws");
    awsSecretAccessKeyOption.setRequired(false);
    awsSecretAccessKeyOption.setArgName("AWS-SECRET-ACCESS-KEY");
    awsSecretAccessKeyOption.setArgs(1);
    options.addOption(awsSecretAccessKeyOption);

    Option bucketOption = new Option("b", "s3-bucket", true,
            "bucket on s3; " + "this is required if the execution is on aws");
    bucketOption.setRequired(false);
    bucketOption.setArgName("S3-BUCKET");
    bucketOption.setArgs(1);
    options.addOption(bucketOption);

    Option helpOption = new Option("h", "help", false, "display this message");
    helpOption.setRequired(false);
    options.addOption(helpOption);

    HelpFormatter formatter = new HelpFormatter();
    CommandLineParser parser = new PosixParser();
    CommandLine cmd = null;

    try {
        cmd = parser.parse(options, args);
    } catch (ParseException e) {
        formatter.printHelp("hadoop jar data-polygamy.jar "
                + "edu.nyu.vida.data_polygamy.feature_identification.IndexCreation", options, true);
        System.exit(0);
    }

    if (cmd.hasOption("h")) {
        formatter.printHelp("hadoop jar data-polygamy.jar "
                + "edu.nyu.vida.data_polygamy.feature_identification.IndexCreation", options, true);
        System.exit(0);
    }

    boolean s3 = cmd.hasOption("s3");
    String s3bucket = "";
    String awsAccessKeyId = "";
    String awsSecretAccessKey = "";

    if (s3) {
        if ((!cmd.hasOption("aws_id")) || (!cmd.hasOption("aws_key")) || (!cmd.hasOption("b"))) {
            System.out.println(
                    "Arguments 'aws_id', 'aws_key', and 'b'" + " are mandatory if execution is on AWS.");
            formatter.printHelp("hadoop jar data-polygamy.jar "
                    + "edu.nyu.vida.data_polygamy.feature_identification.IndexCreation", options, true);
            System.exit(0);
        }
        s3bucket = cmd.getOptionValue("b");
        awsAccessKeyId = cmd.getOptionValue("aws_id");
        awsSecretAccessKey = cmd.getOptionValue("aws_key");
    }

    boolean snappyCompression = false;
    boolean bzip2Compression = false;
    String machine = cmd.getOptionValue("m");
    int nbNodes = Integer.parseInt(cmd.getOptionValue("n"));

    Configuration s3conf = new Configuration();
    if (s3) {
        s3conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId);
        s3conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey);
        s3conf.set("bucket", s3bucket);
    }

    String datasetNames = "";
    String datasetIds = "";

    ArrayList<String> shortDataset = new ArrayList<String>();
    ArrayList<String> shortDatasetIndex = new ArrayList<String>();
    HashMap<String, String> datasetAgg = new HashMap<String, String>();
    HashMap<String, String> datasetId = new HashMap<String, String>();
    HashMap<String, HashMap<Integer, Double>> datasetRegThreshold = new HashMap<String, HashMap<Integer, Double>>();
    HashMap<String, HashMap<Integer, Double>> datasetRareThreshold = new HashMap<String, HashMap<Integer, Double>>();

    Path path = null;
    FileSystem fs = FileSystem.get(new Configuration());
    BufferedReader br;

    boolean removeExistingFiles = cmd.hasOption("f");
    boolean isThresholdUserDefined = cmd.hasOption("t");

    for (String dataset : cmd.getOptionValues("g")) {

        // getting aggregates
        String[] aggregate = FrameworkUtils.searchAggregates(dataset, s3conf, s3);
        if (aggregate.length == 0) {
            System.out.println("No aggregates found for " + dataset + ".");
            continue;
        }

        // getting aggregates header
        String aggregatesHeaderFileName = FrameworkUtils.searchAggregatesHeader(dataset, s3conf, s3);
        if (aggregatesHeaderFileName == null) {
            System.out.println("No aggregate header for " + dataset);
            continue;
        }

        String aggregatesHeader = s3bucket + FrameworkUtils.preProcessingDir + "/" + aggregatesHeaderFileName;

        shortDataset.add(dataset);
        datasetId.put(dataset, null);

        if (s3) {
            path = new Path(aggregatesHeader);
            fs = FileSystem.get(path.toUri(), s3conf);
        } else {
            path = new Path(fs.getHomeDirectory() + "/" + aggregatesHeader);
        }

        br = new BufferedReader(new InputStreamReader(fs.open(path)));
        datasetAgg.put(dataset, br.readLine().split("\t")[1]);
        br.close();
        if (s3)
            fs.close();
    }

    if (shortDataset.size() == 0) {
        System.out.println("No datasets to process.");
        System.exit(0);
    }

    // getting dataset id

    if (s3) {
        path = new Path(s3bucket + FrameworkUtils.datasetsIndexDir);
        fs = FileSystem.get(path.toUri(), s3conf);
    } else {
        path = new Path(fs.getHomeDirectory() + "/" + FrameworkUtils.datasetsIndexDir);
    }
    br = new BufferedReader(new InputStreamReader(fs.open(path)));
    String line = br.readLine();
    while (line != null) {
        String[] dt = line.split("\t");
        if (datasetId.containsKey(dt[0])) {
            datasetId.put(dt[0], dt[1]);
            datasetNames += dt[0] + ",";
            datasetIds += dt[1] + ",";
        }
        line = br.readLine();
    }
    br.close();

    datasetNames = datasetNames.substring(0, datasetNames.length() - 1);
    datasetIds = datasetIds.substring(0, datasetIds.length() - 1);
    Iterator<String> it = shortDataset.iterator();
    while (it.hasNext()) {
        String dataset = it.next();
        if (datasetId.get(dataset) == null) {
            System.out.println("No dataset id for " + dataset);
            System.exit(0);
        }
    }

    // getting user defined thresholds

    if (isThresholdUserDefined) {
        if (s3) {
            path = new Path(s3bucket + FrameworkUtils.thresholdDir);
            fs = FileSystem.get(path.toUri(), s3conf);
        } else {
            path = new Path(fs.getHomeDirectory() + "/" + FrameworkUtils.thresholdDir);
        }
        br = new BufferedReader(new InputStreamReader(fs.open(path)));
        line = br.readLine();
        while (line != null) {
            // getting dataset name
            String dataset = line.trim();
            HashMap<Integer, Double> regThresholds = new HashMap<Integer, Double>();
            HashMap<Integer, Double> rareThresholds = new HashMap<Integer, Double>();
            line = br.readLine();
            while ((line != null) && (line.split("\t").length > 1)) {
                // getting attribute ids and thresholds
                String[] keyVals = line.trim().split("\t");
                int att = Integer.parseInt(keyVals[0].trim());
                regThresholds.put(att, Double.parseDouble(keyVals[1].trim()));
                rareThresholds.put(att, Double.parseDouble(keyVals[2].trim()));
                line = br.readLine();
            }
            datasetRegThreshold.put(dataset, regThresholds);
            datasetRareThreshold.put(dataset, rareThresholds);
        }
        br.close();
    }
    if (s3)
        fs.close();

    // datasets that will use existing merge tree
    ArrayList<String> useMergeTree = new ArrayList<String>();

    // creating index for each spatio-temporal resolution

    FrameworkUtils.createDir(s3bucket + FrameworkUtils.indexDir, s3conf, s3);

    HashSet<String> input = new HashSet<String>();

    for (String dataset : shortDataset) {

        String indexCreationOutputFileName = s3bucket + FrameworkUtils.indexDir + "/" + dataset + "/";
        String mergeTreeFileName = s3bucket + FrameworkUtils.mergeTreeDir + "/" + dataset + "/";

        if (removeExistingFiles) {
            FrameworkUtils.removeFile(indexCreationOutputFileName, s3conf, s3);
            FrameworkUtils.removeFile(mergeTreeFileName, s3conf, s3);
            FrameworkUtils.createDir(mergeTreeFileName, s3conf, s3);
        } else if (datasetRegThreshold.containsKey(dataset)) {
            FrameworkUtils.removeFile(indexCreationOutputFileName, s3conf, s3);
            if (FrameworkUtils.fileExists(mergeTreeFileName, s3conf, s3)) {
                useMergeTree.add(dataset);
            }
        }

        if (!FrameworkUtils.fileExists(indexCreationOutputFileName, s3conf, s3)) {
            input.add(s3bucket + FrameworkUtils.aggregatesDir + "/" + dataset);
            shortDatasetIndex.add(dataset);
        }

    }

    if (input.isEmpty()) {
        System.out.println("All the input datasets have indices.");
        System.out.println("Use -f in the beginning of the command line to force the computation.");
        System.exit(0);
    }

    String aggregateDatasets = "";
    it = input.iterator();
    while (it.hasNext()) {
        aggregateDatasets += it.next() + ",";
    }

    Job icJob = null;
    Configuration icConf = new Configuration();
    Machine machineConf = new Machine(machine, nbNodes);

    String jobName = "index";
    String indexOutputDir = s3bucket + FrameworkUtils.indexDir + "/tmp/";

    FrameworkUtils.removeFile(indexOutputDir, s3conf, s3);

    icConf.set("dataset-name", datasetNames);
    icConf.set("dataset-id", datasetIds);

    if (!useMergeTree.isEmpty()) {
        String useMergeTreeStr = "";
        for (String dt : useMergeTree) {
            useMergeTreeStr += dt + ",";
        }
        icConf.set("use-merge-tree", useMergeTreeStr.substring(0, useMergeTreeStr.length() - 1));
    }

    for (int i = 0; i < shortDataset.size(); i++) {
        String dataset = shortDataset.get(i);
        String id = datasetId.get(dataset);
        icConf.set("dataset-" + id + "-aggregates", datasetAgg.get(dataset));
        if (datasetRegThreshold.containsKey(dataset)) {
            HashMap<Integer, Double> regThresholds = datasetRegThreshold.get(dataset);
            String thresholds = "";
            for (int att : regThresholds.keySet()) {
                thresholds += String.valueOf(att) + "-" + String.valueOf(regThresholds.get(att)) + ",";
            }
            icConf.set("regular-" + id, thresholds.substring(0, thresholds.length() - 1));
        }

        if (datasetRareThreshold.containsKey(dataset)) {
            HashMap<Integer, Double> rareThresholds = datasetRareThreshold.get(dataset);
            String thresholds = "";
            for (int att : rareThresholds.keySet()) {
                thresholds += String.valueOf(att) + "-" + String.valueOf(rareThresholds.get(att)) + ",";
            }
            icConf.set("rare-" + id, thresholds.substring(0, thresholds.length() - 1));
        }
    }

    icConf.set("mapreduce.tasktracker.map.tasks.maximum", String.valueOf(machineConf.getMaximumTasks()));
    icConf.set("mapreduce.tasktracker.reduce.tasks.maximum", String.valueOf(machineConf.getMaximumTasks()));
    icConf.set("mapreduce.jobtracker.maxtasks.perjob", "-1");
    icConf.set("mapreduce.reduce.shuffle.parallelcopies", "20");
    icConf.set("mapreduce.input.fileinputformat.split.minsize", "0");
    icConf.set("mapreduce.task.io.sort.mb", "200");
    icConf.set("mapreduce.task.io.sort.factor", "100");
    //icConf.set("mapreduce.task.timeout", "1800000");
    machineConf.setMachineConfiguration(icConf);

    if (s3) {
        machineConf.setMachineConfiguration(icConf);
        icConf.set("fs.s3.awsAccessKeyId", awsAccessKeyId);
        icConf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey);
        icConf.set("bucket", s3bucket);
    }

    if (snappyCompression) {
        icConf.set("mapreduce.map.output.compress", "true");
        icConf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
        //icConf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
    }
    if (bzip2Compression) {
        icConf.set("mapreduce.map.output.compress", "true");
        icConf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec");
        //icConf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec");
    }

    icJob = new Job(icConf);
    icJob.setJobName(jobName);

    icJob.setMapOutputKeyClass(AttributeResolutionWritable.class);
    icJob.setMapOutputValueClass(SpatioTemporalFloatWritable.class);
    icJob.setOutputKeyClass(AttributeResolutionWritable.class);
    icJob.setOutputValueClass(TopologyTimeSeriesWritable.class);
    //icJob.setOutputKeyClass(Text.class);
    //icJob.setOutputValueClass(Text.class);

    icJob.setMapperClass(IndexCreationMapper.class);
    icJob.setReducerClass(IndexCreationReducer.class);
    icJob.setNumReduceTasks(machineConf.getNumberReduces());

    icJob.setInputFormatClass(SequenceFileInputFormat.class);
    //icJob.setOutputFormatClass(SequenceFileOutputFormat.class);
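    // LazyOutputFormat wraps SequenceFileOutputFormat so that output files are only
    // created once the first record is written; the compression settings below still apply.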
    LazyOutputFormat.setOutputFormatClass(icJob, SequenceFileOutputFormat.class);
    //LazyOutputFormat.setOutputFormatClass(icJob, TextOutputFormat.class);
    SequenceFileOutputFormat.setCompressOutput(icJob, true);
    SequenceFileOutputFormat.setOutputCompressionType(icJob, CompressionType.BLOCK);

    FileInputFormat.setInputDirRecursive(icJob, true);
    FileInputFormat.setInputPaths(icJob, aggregateDatasets.substring(0, aggregateDatasets.length() - 1));
    FileOutputFormat.setOutputPath(icJob, new Path(indexOutputDir));

    icJob.setJarByClass(IndexCreation.class);

    long start = System.currentTimeMillis();
    icJob.submit();
    icJob.waitForCompletion(true);
    System.out.println(jobName + "\t" + (System.currentTimeMillis() - start));

    // moving files to right place
    for (String dataset : shortDatasetIndex) {
        String from = s3bucket + FrameworkUtils.indexDir + "/tmp/" + dataset + "/";
        String to = s3bucket + FrameworkUtils.indexDir + "/" + dataset + "/";
        FrameworkUtils.renameFile(from, to, s3conf, s3);
    }

}

From source file: edu.nyu.vida.data_polygamy.pre_processing.PreProcessing.java

License: BSD License

/**
 * @param args
 * @throws IOException 
 * @throws ClassNotFoundException 
 * @throws InterruptedException 
 */
@SuppressWarnings("deprecation")
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

    Options options = new Options();

    Option nameOption = new Option("dn", "name", true, "the name of the dataset");
    nameOption.setRequired(true);
    nameOption.setArgName("DATASET NAME");
    options.addOption(nameOption);

    Option headerOption = new Option("dh", "header", true, "the file that contains the header of the dataset");
    headerOption.setRequired(true);
    headerOption.setArgName("DATASET HEADER FILE");
    options.addOption(headerOption);

    Option defaultsOption = new Option("dd", "defaults", true,
            "the file that contains the default values of the dataset");
    defaultsOption.setRequired(true);
    defaultsOption.setArgName("DATASET DEFAULTS FILE");
    options.addOption(defaultsOption);

    Option tempResOption = new Option("t", "temporal", true,
            "desired temporal resolution (hour, day, week, or month)");
    tempResOption.setRequired(true);
    tempResOption.setArgName("TEMPORAL RESOLUTION");
    options.addOption(tempResOption);

    Option spatialResOption = new Option("s", "spatial", true,
            "desired spatial resolution (points, nbhd, zip, grid, or city)");
    spatialResOption.setRequired(true);
    spatialResOption.setArgName("SPATIAL RESOLUTION");
    options.addOption(spatialResOption);

    Option currentSpatialResOption = new Option("cs", "current-spatial", true,
            "current spatial resolution (points, nbhd, zip, grid, or city)");
    currentSpatialResOption.setRequired(true);
    currentSpatialResOption.setArgName("CURRENT SPATIAL RESOLUTION");
    options.addOption(currentSpatialResOption);

    Option indexResOption = new Option("i", "index", true, "indexes of the temporal and spatial attributes");
    indexResOption.setRequired(true);
    indexResOption.setArgName("INDEX OF SPATIO-TEMPORAL RESOLUTIONS");
    indexResOption.setArgs(Option.UNLIMITED_VALUES);
    options.addOption(indexResOption);

    Option machineOption = new Option("m", "machine", true, "machine identifier");
    machineOption.setRequired(true);
    machineOption.setArgName("MACHINE");
    machineOption.setArgs(1);
    options.addOption(machineOption);

    Option nodesOption = new Option("n", "nodes", true, "number of nodes");
    nodesOption.setRequired(true);
    nodesOption.setArgName("NODES");
    nodesOption.setArgs(1);
    options.addOption(nodesOption);

    Option s3Option = new Option("s3", "s3", false, "data on Amazon S3");
    s3Option.setRequired(false);
    options.addOption(s3Option);

    Option awsAccessKeyIdOption = new Option("aws_id", "aws-id", true,
            "aws access key id; " + "this is required if the execution is on aws");
    awsAccessKeyIdOption.setRequired(false);
    awsAccessKeyIdOption.setArgName("AWS-ACCESS-KEY-ID");
    awsAccessKeyIdOption.setArgs(1);
    options.addOption(awsAccessKeyIdOption);

    Option awsSecretAccessKeyOption = new Option("aws_key", "aws-key", true,
            "aws secret access key; " + "this is required if the execution is on aws");
    awsSecretAccessKeyOption.setRequired(false);
    awsSecretAccessKeyOption.setArgName("AWS-SECRET-ACCESS-KEY");
    awsSecretAccessKeyOption.setArgs(1);
    options.addOption(awsSecretAccessKeyOption);

    Option bucketOption = new Option("b", "s3-bucket", true,
            "bucket on s3; " + "this is required if the execution is on aws");
    bucketOption.setRequired(false);
    bucketOption.setArgName("S3-BUCKET");
    bucketOption.setArgs(1);
    options.addOption(bucketOption);

    Option helpOption = new Option("h", "help", false, "display this message");
    helpOption.setRequired(false);
    options.addOption(helpOption);

    HelpFormatter formatter = new HelpFormatter();
    CommandLineParser parser = new PosixParser();
    CommandLine cmd = null;

    try {
        cmd = parser.parse(options, args);
    } catch (ParseException e) {
        formatter.printHelp(
                "hadoop jar data-polygamy.jar " + "edu.nyu.vida.data_polygamy.pre_processing.PreProcessing",
                options, true);
        System.exit(0);
    }

    if (cmd.hasOption("h")) {
        formatter.printHelp(
                "hadoop jar data-polygamy.jar " + "edu.nyu.vida.data_polygamy.pre_processing.PreProcessing",
                options, true);
        System.exit(0);
    }

    boolean s3 = cmd.hasOption("s3");
    String s3bucket = "";
    String awsAccessKeyId = "";
    String awsSecretAccessKey = "";

    if (s3) {
        if ((!cmd.hasOption("aws_id")) || (!cmd.hasOption("aws_key")) || (!cmd.hasOption("b"))) {
            System.out.println(
                    "Arguments 'aws_id', 'aws_key', and 'b'" + " are mandatory if execution is on AWS.");
            formatter.printHelp(
                    "hadoop jar data-polygamy.jar " + "edu.nyu.vida.data_polygamy.pre_processing.PreProcessing",
                    options, true);
            System.exit(0);
        }
        s3bucket = cmd.getOptionValue("b");
        awsAccessKeyId = cmd.getOptionValue("aws_id");
        awsSecretAccessKey = cmd.getOptionValue("aws_key");
    }

    boolean snappyCompression = false;
    boolean bzip2Compression = false;
    String machine = cmd.getOptionValue("m");
    int nbNodes = Integer.parseInt(cmd.getOptionValue("n"));

    Configuration s3conf = new Configuration();
    if (s3) {
        s3conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId);
        s3conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey);
        s3conf.set("bucket", s3bucket);
    }

    Configuration conf = new Configuration();
    Machine machineConf = new Machine(machine, nbNodes);
    String dataset = cmd.getOptionValue("dn");
    String header = cmd.getOptionValue("dh");
    String defaults = cmd.getOptionValue("dd");
    String temporalResolution = cmd.getOptionValue("t");
    String spatialResolution = cmd.getOptionValue("s");
    String gridResolution = "";
    String currentSpatialResolution = cmd.getOptionValue("cs");

    if (spatialResolution.contains("grid")) {
        String[] res = spatialResolution.split("-");
        spatialResolution = res[0];
        gridResolution = res[1];
    }

    conf.set("header", s3bucket + FrameworkUtils.dataDir + "/" + header);
    conf.set("defaults", s3bucket + FrameworkUtils.dataDir + "/" + defaults);
    conf.set("temporal-resolution", temporalResolution);
    conf.set("spatial-resolution", spatialResolution);
    conf.set("grid-resolution", gridResolution);
    conf.set("current-spatial-resolution", currentSpatialResolution);

    String[] indexes = cmd.getOptionValues("i");
    String temporalPos = "";
    Integer sizeSpatioTemp = 0;
    if (!(currentSpatialResolution.equals("points"))) {
        String spatialPos = "";
        for (int i = 0; i < indexes.length; i++) {
            temporalPos += indexes[i] + ",";
            spatialPos += indexes[++i] + ",";
            sizeSpatioTemp++;
        }
        conf.set("spatial-pos", spatialPos);
    } else {
        String xPositions = "", yPositions = "";
        for (int i = 0; i < indexes.length; i++) {
            temporalPos += indexes[i] + ",";
            xPositions += indexes[++i] + ",";
            yPositions += indexes[++i] + ",";
            sizeSpatioTemp++;
        }
        conf.set("xPositions", xPositions);
        conf.set("yPositions", yPositions);
    }
    conf.set("temporal-pos", temporalPos);

    conf.set("size-spatio-temporal", sizeSpatioTemp.toString());

    // checking resolutions

    if (utils.spatialResolution(spatialResolution) < 0) {
        System.out.println("Invalid spatial resolution: " + spatialResolution);
        System.exit(-1);
    }

    if (utils.spatialResolution(spatialResolution) == FrameworkUtils.POINTS) {
        System.out.println("The data needs to be reduced at least to neighborhoods or grid.");
        System.exit(-1);
    }

    if (utils.spatialResolution(currentSpatialResolution) < 0) {
        System.out.println("Invalid spatial resolution: " + currentSpatialResolution);
        System.exit(-1);
    }

    if (utils.spatialResolution(currentSpatialResolution) > utils.spatialResolution(spatialResolution)) {
        System.out.println("The current spatial resolution is coarser than "
                + "the desired one. You can only navigate from a fine resolution" + " to a coarser one.");
        System.exit(-1);
    }

    if (utils.temporalResolution(temporalResolution) < 0) {
        System.out.println("Invalid temporal resolution: " + temporalResolution);
        System.exit(-1);
    }

    String fileName = s3bucket + FrameworkUtils.preProcessingDir + "/" + dataset + "-" + temporalResolution
            + "-" + spatialResolution + gridResolution;
    conf.set("aggregates", fileName + ".aggregates");

    // making sure both files are removed, if they exist
    FrameworkUtils.removeFile(fileName, s3conf, s3);
    FrameworkUtils.removeFile(fileName + ".aggregates", s3conf, s3);

    /**
     * Hadoop Parameters
     * sources: http://www.slideshare.net/ImpetusInfo/ppt-on-advanced-hadoop-tuning-n-optimisation
     *          https://cloudcelebrity.wordpress.com/2013/08/14/12-key-steps-to-keep-your-hadoop-cluster-running-strong-and-performing-optimum/
     */

    conf.set("mapreduce.tasktracker.map.tasks.maximum", String.valueOf(machineConf.getMaximumTasks()));
    conf.set("mapreduce.tasktracker.reduce.tasks.maximum", String.valueOf(machineConf.getMaximumTasks()));
    conf.set("mapreduce.jobtracker.maxtasks.perjob", "-1");
    conf.set("mapreduce.reduce.shuffle.parallelcopies", "20");
    conf.set("mapreduce.input.fileinputformat.split.minsize", "0");
    conf.set("mapreduce.task.io.sort.mb", "200");
    conf.set("mapreduce.task.io.sort.factor", "100");

    // using SnappyCodec for intermediate and output data ?
    // TODO: for now, using SnappyCodec -- what about LZO + Protocol Buffer serialization?
    //   LZO - http://www.oberhumer.com/opensource/lzo/#download
    //   Hadoop-LZO - https://github.com/twitter/hadoop-lzo
    //   Protocol Buffer - https://github.com/twitter/elephant-bird
    //   General Info - http://www.devx.com/Java/Article/47913
    //   Compression - http://comphadoop.weebly.com/index.html
    if (snappyCompression) {
        conf.set("mapreduce.map.output.compress", "true");
        conf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
        conf.set("mapreduce.output.fileoutputformat.compress.codec",
                "org.apache.hadoop.io.compress.SnappyCodec");
    }
    if (bzip2Compression) {
        conf.set("mapreduce.map.output.compress", "true");
        conf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec");
        conf.set("mapreduce.output.fileoutputformat.compress.codec",
                "org.apache.hadoop.io.compress.BZip2Codec");
    }

    // TODO: this is dangerous!
    if (s3) {
        conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId);
        conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey);
    }

    Job job = new Job(conf);
    job.setJobName(dataset + "-" + temporalResolution + "-" + spatialResolution);

    job.setMapOutputKeyClass(MultipleSpatioTemporalWritable.class);
    job.setMapOutputValueClass(AggregationArrayWritable.class);

    job.setOutputKeyClass(MultipleSpatioTemporalWritable.class);
    job.setOutputValueClass(AggregationArrayWritable.class);

    job.setMapperClass(PreProcessingMapper.class);
    job.setCombinerClass(PreProcessingCombiner.class);
    job.setReducerClass(PreProcessingReducer.class);
    job.setNumReduceTasks(machineConf.getNumberReduces());
    //job.setNumReduceTasks(1);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    FileInputFormat.setInputPaths(job, new Path(s3bucket + FrameworkUtils.dataDir + "/" + dataset));
    FileOutputFormat.setOutputPath(job, new Path(fileName));

    job.setJarByClass(PreProcessing.class);

    long start = System.currentTimeMillis();
    job.submit();
    job.waitForCompletion(true);
    System.out.println(fileName + "\t" + (System.currentTimeMillis() - start));

}

From source file: edu.nyu.vida.data_polygamy.scalar_function_computation.Aggregation.java

License: BSD License

/**
 * @param args
 */
@SuppressWarnings({ "deprecation" })
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

    Options options = new Options();

    Option forceOption = new Option("f", "force", false,
            "force the computation of the aggregate functions " + "even if files already exist");
    forceOption.setRequired(false);
    options.addOption(forceOption);

    Option gOption = new Option("g", "group", true, "set group of datasets for which the aggregate functions"
            + " will be computed, followed by their temporal and spatial attribute indices");
    gOption.setRequired(true);
    gOption.setArgName("GROUP");
    gOption.setArgs(Option.UNLIMITED_VALUES);
    options.addOption(gOption);

    Option machineOption = new Option("m", "machine", true, "machine identifier");
    machineOption.setRequired(true);
    machineOption.setArgName("MACHINE");
    machineOption.setArgs(1);
    options.addOption(machineOption);

    Option nodesOption = new Option("n", "nodes", true, "number of nodes");
    nodesOption.setRequired(true);
    nodesOption.setArgName("NODES");
    nodesOption.setArgs(1);
    options.addOption(nodesOption);

    Option s3Option = new Option("s3", "s3", false, "data on Amazon S3");
    s3Option.setRequired(false);
    options.addOption(s3Option);

    Option awsAccessKeyIdOption = new Option("aws_id", "aws-id", true,
            "aws access key id; " + "this is required if the execution is on aws");
    awsAccessKeyIdOption.setRequired(false);
    awsAccessKeyIdOption.setArgName("AWS-ACCESS-KEY-ID");
    awsAccessKeyIdOption.setArgs(1);
    options.addOption(awsAccessKeyIdOption);

    Option awsSecretAccessKeyOption = new Option("aws_key", "aws-key", true,
            "aws secret access key; " + "this is required if the execution is on aws");
    awsSecretAccessKeyOption.setRequired(false);
    awsSecretAccessKeyOption.setArgName("AWS-SECRET-ACCESS-KEY");
    awsSecretAccessKeyOption.setArgs(1);
    options.addOption(awsSecretAccessKeyOption);

    Option bucketOption = new Option("b", "s3-bucket", true,
            "bucket on s3; " + "this is required if the execution is on aws");
    bucketOption.setRequired(false);
    bucketOption.setArgName("S3-BUCKET");
    bucketOption.setArgs(1);
    options.addOption(bucketOption);

    Option helpOption = new Option("h", "help", false, "display this message");
    helpOption.setRequired(false);
    options.addOption(helpOption);

    HelpFormatter formatter = new HelpFormatter();
    CommandLineParser parser = new PosixParser();
    CommandLine cmd = null;

    try {
        cmd = parser.parse(options, args);
    } catch (ParseException e) {
        formatter.printHelp("hadoop jar data-polygamy.jar "
                + "edu.nyu.vida.data_polygamy.scalar_function_computation.Aggregation", options, true);
        System.exit(0);
    }

    if (cmd.hasOption("h")) {
        formatter.printHelp("hadoop jar data-polygamy.jar "
                + "edu.nyu.vida.data_polygamy.scalar_function_computation.Aggregation", options, true);
        System.exit(0);
    }

    boolean s3 = cmd.hasOption("s3");
    String s3bucket = "";
    String awsAccessKeyId = "";
    String awsSecretAccessKey = "";

    if (s3) {
        if ((!cmd.hasOption("aws_id")) || (!cmd.hasOption("aws_key")) || (!cmd.hasOption("b"))) {
            System.out.println(
                    "Arguments 'aws_id', 'aws_key', and 'b'" + " are mandatory if execution is on AWS.");
            formatter.printHelp(
                    "hadoop jar data-polygamy.jar "
                            + "edu.nyu.vida.data_polygamy.scalar_function_computation.Aggregation",
                    options, true);
            System.exit(0);
        }
        s3bucket = cmd.getOptionValue("b");
        awsAccessKeyId = cmd.getOptionValue("aws_id");
        awsSecretAccessKey = cmd.getOptionValue("aws_key");
    }

    boolean snappyCompression = false;
    boolean bzip2Compression = false;
    String machine = cmd.getOptionValue("m");
    int nbNodes = Integer.parseInt(cmd.getOptionValue("n"));

    Configuration s3conf = new Configuration();
    if (s3) {
        s3conf.set("fs.s3.awsAccessKeyId", awsAccessKeyId);
        s3conf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey);
        s3conf.set("bucket", s3bucket);
    }

    String datasetNames = "";
    String datasetIds = "";
    String preProcessingDatasets = "";

    ArrayList<String> shortDataset = new ArrayList<String>();
    ArrayList<String> shortDatasetAggregation = new ArrayList<String>();
    HashMap<String, String> datasetTempAtt = new HashMap<String, String>();
    HashMap<String, String> datasetSpatialAtt = new HashMap<String, String>();
    HashMap<String, String> preProcessingDataset = new HashMap<String, String>();
    HashMap<String, String> datasetId = new HashMap<String, String>();

    boolean removeExistingFiles = cmd.hasOption("f");
    String[] datasetArgs = cmd.getOptionValues("g");

    for (int i = 0; i < datasetArgs.length; i += 3) {
        String dataset = datasetArgs[i];

        // getting pre-processing
        String tempPreProcessing = FrameworkUtils.searchPreProcessing(dataset, s3conf, s3);
        if (tempPreProcessing == null) {
            System.out.println("No pre-processing available for " + dataset);
            continue;
        }
        preProcessingDataset.put(dataset, tempPreProcessing);

        shortDataset.add(dataset);
        datasetTempAtt.put(dataset, ("null".equals(datasetArgs[i + 1]) ? null : datasetArgs[i + 1]));
        datasetSpatialAtt.put(dataset, ("null".equals(datasetArgs[i + 2]) ? null : datasetArgs[i + 2]));

        datasetId.put(dataset, null);
    }

    if (shortDataset.size() == 0) {
        System.out.println("No datasets to process.");
        System.exit(0);
    }

    // getting dataset id

    Path path = null;
    FileSystem fs = null;

    if (s3) {
        path = new Path(s3bucket + FrameworkUtils.datasetsIndexDir);
        fs = FileSystem.get(path.toUri(), s3conf);
    } else {
        fs = FileSystem.get(new Configuration());
        path = new Path(fs.getHomeDirectory() + "/" + FrameworkUtils.datasetsIndexDir);
    }
    BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(path)));
    String line = br.readLine();
    while (line != null) {
        String[] dt = line.split("\t");
        if (datasetId.containsKey(dt[0])) {
            datasetId.put(dt[0], dt[1]);
            datasetNames += dt[0] + ",";
            datasetIds += dt[1] + ",";
        }
        line = br.readLine();
    }
    br.close();
    if (s3)
        fs.close();

    datasetNames = datasetNames.substring(0, datasetNames.length() - 1);
    datasetIds = datasetIds.substring(0, datasetIds.length() - 1);
    Iterator<String> it = shortDataset.iterator();
    while (it.hasNext()) {
        String dataset = it.next();
        if (datasetId.get(dataset) == null) {
            System.out.println("No dataset id for " + dataset);
            System.exit(0);
        }
    }

    FrameworkUtils.createDir(s3bucket + FrameworkUtils.aggregatesDir, s3conf, s3);

    // getting smallest resolution

    HashMap<String, String> tempResMap = new HashMap<String, String>();
    HashMap<String, String> spatialResMap = new HashMap<String, String>();

    HashMap<String, String> datasetTemporalStrMap = new HashMap<String, String>();
    HashMap<String, String> datasetSpatialStrMap = new HashMap<String, String>();

    HashSet<String> input = new HashSet<String>();

    for (String dataset : shortDataset) {

        String[] datasetArray = preProcessingDataset.get(dataset).split("-");

        String datasetTemporalStr = datasetArray[datasetArray.length - 2];
        int datasetTemporal = utils.temporalResolution(datasetTemporalStr);

        String datasetSpatialStr = datasetArray[datasetArray.length - 1];
        int datasetSpatial = utils.spatialResolution(datasetSpatialStr);

        // finding all possible resolutions

        String[] temporalResolutions = FrameworkUtils.getAggTempResolutions(datasetTemporal);
        String[] spatialResolutions = FrameworkUtils.getAggSpatialResolutions(datasetSpatial);

        String temporalResolution = "";
        String spatialResolution = "";

        String tempRes = "";
        String spatialRes = "";

        boolean dataAdded = false;

        for (int i = 0; i < temporalResolutions.length; i++) {
            for (int j = 0; j < spatialResolutions.length; j++) {

                temporalResolution = temporalResolutions[i];
                spatialResolution = spatialResolutions[j];

                String aggregatesOutputFileName = s3bucket + FrameworkUtils.aggregatesDir + "/" + dataset + "/";

                if (removeExistingFiles) {
                    FrameworkUtils.removeFile(aggregatesOutputFileName, s3conf, s3);
                }

                if (!FrameworkUtils.fileExists(aggregatesOutputFileName, s3conf, s3)) {

                    dataAdded = true;

                    tempRes += temporalResolution + "-";
                    spatialRes += spatialResolution + "-";
                }
            }
        }

        if (dataAdded) {
            input.add(s3bucket + FrameworkUtils.preProcessingDir + "/" + preProcessingDataset.get(dataset));
            shortDatasetAggregation.add(dataset);

            tempResMap.put(dataset, tempRes.substring(0, tempRes.length() - 1));
            spatialResMap.put(dataset, spatialRes.substring(0, spatialRes.length() - 1));

            datasetTemporalStrMap.put(dataset, datasetTemporalStr);
            datasetSpatialStrMap.put(dataset, datasetSpatialStr);
        }
    }

    if (input.isEmpty()) {
        System.out.println("All the input datasets have aggregates.");
        System.out.println("Use -f in the beginning of the command line to force the computation.");
        System.exit(0);
    }

    it = input.iterator();
    while (it.hasNext()) {
        preProcessingDatasets += it.next() + ",";
    }

    Job aggJob = null;
    String aggregatesOutputDir = s3bucket + FrameworkUtils.aggregatesDir + "/tmp/";
    String jobName = "aggregates";

    FrameworkUtils.removeFile(aggregatesOutputDir, s3conf, s3);

    Configuration aggConf = new Configuration();
    Machine machineConf = new Machine(machine, nbNodes);

    aggConf.set("dataset-name", datasetNames);
    aggConf.set("dataset-id", datasetIds);

    for (int i = 0; i < shortDatasetAggregation.size(); i++) {
        String dataset = shortDatasetAggregation.get(i);
        String id = datasetId.get(dataset);
        aggConf.set("dataset-" + id + "-temporal-resolutions", tempResMap.get(dataset));
        aggConf.set("dataset-" + id + "-spatial-resolutions", spatialResMap.get(dataset));
        aggConf.set("dataset-" + id + "-temporal-att", datasetTempAtt.get(dataset));
        aggConf.set("dataset-" + id + "-spatial-att", datasetSpatialAtt.get(dataset));
        aggConf.set("dataset-" + id + "-temporal", datasetTemporalStrMap.get(dataset));
        aggConf.set("dataset-" + id + "-spatial", datasetSpatialStrMap.get(dataset));

        if (s3)
            aggConf.set("dataset-" + id,
                    s3bucket + FrameworkUtils.preProcessingDir + "/" + preProcessingDataset.get(dataset));
        else
            aggConf.set("dataset-" + id, FileSystem.get(new Configuration()).getHomeDirectory() + "/"
                    + FrameworkUtils.preProcessingDir + "/" + preProcessingDataset.get(dataset));
    }

    aggConf.set("mapreduce.tasktracker.map.tasks.maximum", String.valueOf(machineConf.getMaximumTasks()));
    aggConf.set("mapreduce.tasktracker.reduce.tasks.maximum", String.valueOf(machineConf.getMaximumTasks()));
    aggConf.set("mapreduce.jobtracker.maxtasks.perjob", "-1");
    aggConf.set("mapreduce.reduce.shuffle.parallelcopies", "20");
    aggConf.set("mapreduce.input.fileinputformat.split.minsize", "0");
    aggConf.set("mapreduce.task.io.sort.mb", "200");
    aggConf.set("mapreduce.task.io.sort.factor", "100");
    machineConf.setMachineConfiguration(aggConf);

    if (s3) {
        machineConf.setMachineConfiguration(aggConf);
        aggConf.set("fs.s3.awsAccessKeyId", awsAccessKeyId);
        aggConf.set("fs.s3.awsSecretAccessKey", awsSecretAccessKey);
    }

    if (snappyCompression) {
        aggConf.set("mapreduce.map.output.compress", "true");
        aggConf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
        //aggConf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.SnappyCodec");
    }
    if (bzip2Compression) {
        aggConf.set("mapreduce.map.output.compress", "true");
        aggConf.set("mapreduce.map.output.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec");
        //aggConf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.BZip2Codec");
    }

    aggJob = new Job(aggConf);
    aggJob.setJobName(jobName);

    aggJob.setMapOutputKeyClass(SpatioTemporalWritable.class);
    aggJob.setMapOutputValueClass(AggregationArrayWritable.class);
    aggJob.setOutputKeyClass(SpatioTemporalWritable.class);
    aggJob.setOutputValueClass(FloatArrayWritable.class);
    //aggJob.setOutputKeyClass(Text.class);
    //aggJob.setOutputValueClass(Text.class);

    aggJob.setMapperClass(AggregationMapper.class);
    aggJob.setCombinerClass(AggregationCombiner.class);
    aggJob.setReducerClass(AggregationReducer.class);
    aggJob.setNumReduceTasks(machineConf.getNumberReduces());

    aggJob.setInputFormatClass(SequenceFileInputFormat.class);
    //aggJob.setOutputFormatClass(SequenceFileOutputFormat.class);
    LazyOutputFormat.setOutputFormatClass(aggJob, SequenceFileOutputFormat.class);
    //LazyOutputFormat.setOutputFormatClass(aggJob, TextOutputFormat.class);
    SequenceFileOutputFormat.setCompressOutput(aggJob, true);
    SequenceFileOutputFormat.setOutputCompressionType(aggJob, CompressionType.BLOCK);

    FileInputFormat.setInputDirRecursive(aggJob, true);
    FileInputFormat.setInputPaths(aggJob,
            preProcessingDatasets.substring(0, preProcessingDatasets.length() - 1));
    FileOutputFormat.setOutputPath(aggJob, new Path(aggregatesOutputDir));

    aggJob.setJarByClass(Aggregation.class);

    long start = System.currentTimeMillis();
    aggJob.submit();
    aggJob.waitForCompletion(true);
    System.out.println(jobName + "\t" + (System.currentTimeMillis() - start));

    // moving files to right place
    for (String dataset : shortDatasetAggregation) {
        String from = s3bucket + FrameworkUtils.aggregatesDir + "/tmp/" + dataset + "/";
        String to = s3bucket + FrameworkUtils.aggregatesDir + "/" + dataset + "/";
        FrameworkUtils.renameFile(from, to, s3conf, s3);
    }

}