Example usage for org.apache.hadoop.mapreduce Job setPartitionerClass

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce Job setPartitionerClass.

Prototype

public void setPartitionerClass(Class<? extends Partitioner> cls) throws IllegalStateException 

Document

Set the Partitioner for the job.
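
The Partitioner decides which reduce task receives each map output key (Hadoop's default is HashPartitioner), and setPartitionerClass throws IllegalStateException if it is called after the job has been submitted, so it belongs in the job-configuration phase. Before the project examples below, here is a minimal self-contained sketch; the class names (PartitionerExample, FirstLetterPartitioner, TokenMapper, SumReducer) and the word-count scenario are illustrative only and are not taken from any of the projects listed under Usage.

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class PartitionerExample {

    // Tokenize each input line and emit (word, 1).
    public static class TokenMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        private final Text word = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer tokenizer = new StringTokenizer(value.toString());
            while (tokenizer.hasMoreTokens()) {
                word.set(tokenizer.nextToken());
                context.write(word, ONE);
            }
        }
    }

    // Sum the counts for each word.
    public static class SumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable v : values) {
                sum += v.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    // Route every word that starts with the same (lower-cased) character to the same reducer.
    public static class FirstLetterPartitioner extends Partitioner<Text, IntWritable> {
        @Override
        public int getPartition(Text key, IntWritable value, int numPartitions) {
            int firstChar = key.getLength() > 0 ? key.charAt(0) : 0;
            return (Character.toLowerCase(firstChar) & Integer.MAX_VALUE) % numPartitions;
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "word count, partitioned by first letter");
        job.setJarByClass(PartitionerExample.class);
        job.setMapperClass(TokenMapper.class);
        job.setReducerClass(SumReducer.class);
        // Must be called while the job is still being configured;
        // once the job is submitted this throws IllegalStateException.
        job.setPartitionerClass(FirstLetterPartitioner.class);
        job.setNumReduceTasks(4); // a custom partitioner only matters with more than one reducer
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}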

Usage

From source file:it.polito.dbdmg.searum.ARM.java

License:Apache License

/**
 * Run the rule aggregator job over mined rules.
 *
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public static void startRuleAggregating(Parameters params, Configuration conf)
        throws IOException, ClassNotFoundException, InterruptedException {
    conf.set("mapred.compress.map.output", "true");
    conf.set("mapred.output.compression.type", "BLOCK");

    Path input = new Path(params.get(OUTPUT), RULES);
    Job job = new Job(conf, "Rule aggregator driver running over input: " + input);
    job.setJarByClass(ARM.class);
    FileInputFormat.addInputPath(job, input);
    Path outPath = new Path(params.get(OUTPUT), RULESBYCONCLUSION);
    FileOutputFormat.setOutputPath(job, outPath);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(RuleAggregatorMapper.class);
    job.setReducerClass(RuleAggregatorReducer.class);
    job.setPartitionerClass(RulePartitionerByConclusion.class);
    job.setSortComparatorClass(RulesWritableComparator.class);
    job.setGroupingComparatorClass(RulesGroupingWritableComparator.class);

    HadoopUtil.delete(conf, outPath);
    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}
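
This example pairs the partitioner with a sort comparator and a grouping comparator, the usual secondary-sort setup: the partitioner decides which reducer receives each rule, and the comparators then order and group the rules inside that reducer. RulePartitionerByConclusion itself is not shown on this page; the sketch below only illustrates the general shape of a partition-by-conclusion partitioner and assumes a tab-separated Text key whose last field is the conclusion, which is purely a guess for the sake of the example.

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Illustrative sketch only: the real RulePartitionerByConclusion belongs to the
// it.polito.dbdmg.searum project and is not reproduced on this page. The key
// layout assumed here (tab-separated fields, conclusion last) is a guess.
public class RulePartitionerByConclusionSketch extends Partitioner<Text, Text> {
    @Override
    public int getPartition(Text key, Text value, int numPartitions) {
        String[] fields = key.toString().split("\t");
        String conclusion = fields[fields.length - 1];
        // Hash only the conclusion so every rule with the same conclusion
        // reaches the same reducer; the sort and grouping comparators then
        // order and group those rules within that reducer.
        return (conclusion.hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}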

From source file:ivory.core.index.BuildIPInvertedIndexDocSorted.java

License:Apache License

public int runTool() throws Exception {
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get(Constants.IndexPath);
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    String collectionName = env.readCollectionName();

    int reduceTasks = conf.getInt(Constants.NumReduceTasks, 0);
    int minSplitSize = conf.getInt(Constants.MinSplitSize, 0);
    int collectionDocCnt = env.readCollectionDocumentCount();

    String postingsType = conf.get(Constants.PostingsListsType,
            ivory.core.data.index.PostingsListDocSortedPositional.class.getCanonicalName());
    @SuppressWarnings("unchecked")
    Class<? extends PostingsList> postingsClass = (Class<? extends PostingsList>) Class.forName(postingsType);

    LOG.info("PowerTool: " + BuildIPInvertedIndexDocSorted.class.getCanonicalName());
    LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath));
    LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName));
    LOG.info(String.format(" - %s: %s", Constants.CollectionDocumentCount, collectionDocCnt));
    LOG.info(String.format(" - %s: %s", Constants.PostingsListsType, postingsClass.getCanonicalName()));
    LOG.info(String.format(" - %s: %s", Constants.NumReduceTasks, reduceTasks));
    LOG.info(String.format(" - %s: %s", Constants.MinSplitSize, minSplitSize));

    if (!fs.exists(new Path(indexPath))) {
        fs.mkdirs(new Path(indexPath));
    }

    Path inputPath = new Path(env.getIntDocVectorsDirectory());
    Path postingsPath = new Path(env.getPostingsDirectory());

    if (fs.exists(postingsPath)) {
        LOG.info("Postings already exist: no indexing will be performed.");
        return 0;
    }

    conf.setInt(Constants.CollectionDocumentCount, collectionDocCnt);

    conf.setInt("mapred.min.split.size", minSplitSize);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    Job job = new Job(conf, BuildIPInvertedIndexDocSorted.class.getSimpleName() + ":" + collectionName);
    job.setJarByClass(BuildIPInvertedIndexDocSorted.class);

    job.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, postingsPath);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(PairOfInts.class);
    job.setMapOutputValueClass(TermPositions.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(postingsClass);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);
    job.setPartitionerClass(MyPartitioner.class);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    env.writePostingsType(postingsClass.getCanonicalName());

    return 0;
}
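
In this example the map output key is a composite PairOfInts, so MyPartitioner (an inner class of BuildIPInvertedIndexDocSorted, not reproduced on this page) has to route keys by only part of the pair if all postings for a term are to reach the same reducer. The sketch below shows that pattern; the edu.umd.cloud9.io.pair.PairOfInts import path and the getLeftElement() accessor are assumptions based on the Cloud9 pair classes, and the value type is left as a generic Writable.

import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Partitioner;

import edu.umd.cloud9.io.pair.PairOfInts;

// Illustrative sketch only; not Ivory's actual MyPartitioner.
public class TermIdPartitionerSketch extends Partitioner<PairOfInts, Writable> {
    @Override
    public int getPartition(PairOfInts key, Writable value, int numPartitions) {
        // Hash only the left element of the pair (e.g. the term id) so every
        // (termId, docId) key for the same term is routed to the same reduce task.
        return (key.getLeftElement() & Integer.MAX_VALUE) % numPartitions;
    }
}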

From source file:kogiri.mapreduce.libra.kmersimilarity_r.KmerSimilarityReduce.java

License:Open Source License

private int runJob(LibraConfig lConfig) throws Exception {
    // check config
    validateLibraConfig(lConfig);

    // configuration
    Configuration conf = this.getConf();

    Job job = new Job(conf, "Kogiri Libra - Computing similarity between samples");
    conf = job.getConfiguration();

    // set user configuration
    lConfig.getClusterConfiguration().configureTo(conf);
    lConfig.saveTo(conf);

    job.setJarByClass(KmerSimilarityReduce.class);

    // Mapper
    job.setMapperClass(KmerSimilarityMapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapOutputKeyClass(CompressedSequenceWritable.class);
    job.setMapOutputValueClass(CompressedIntArrayWritable.class);

    // Partitioner
    job.setPartitionerClass(KmerSimilarityPartitioner.class);

    // Reducer
    job.setReducerClass(KmerSimilarityReducer.class);

    // Specify key / value
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // Inputs
    Path[] kmerIndexFiles = KmerIndexHelper.getAllKmerIndexIndexFilePath(conf, lConfig.getKmerIndexPath());
    List<Path> indexPartFileArray = new ArrayList<Path>();
    for (Path kmerIndexFile : kmerIndexFiles) {
        Path[] inputKmerIndexPartFiles = KmerIndexHelper.getKmerIndexPartFilePath(conf, kmerIndexFile);
        for (Path indexPartFile : inputKmerIndexPartFiles) {
            Path[] kmerIndexPartDataFiles = KmerIndexHelper.getAllKmerIndexPartDataFilePath(conf,
                    indexPartFile);
            for (Path kmerIndexPartDataFile : kmerIndexPartDataFiles) {
                indexPartFileArray.add(kmerIndexPartDataFile);
            }
        }
    }

    SequenceFileInputFormat.addInputPaths(job,
            FileSystemHelper.makeCommaSeparated(indexPartFileArray.toArray(new Path[0])));

    LOG.info("Input kmer index files : " + kmerIndexFiles.length);
    for (Path inputFile : kmerIndexFiles) {
        LOG.info("> " + inputFile.toString());
    }

    int kmerSize = 0;
    for (Path inputFile : kmerIndexFiles) {
        // check kmerSize
        int myKmerSize = KmerIndexHelper.getKmerSize(inputFile);
        if (kmerSize == 0) {
            kmerSize = myKmerSize;
        } else {
            if (kmerSize != myKmerSize) {
                throw new Exception("kmer size must be the same over all given kmer indices");
            }
        }
    }

    KmerMatchFileMapping fileMapping = new KmerMatchFileMapping();
    for (Path kmerIndexFile : kmerIndexFiles) {
        String fastaFilename = KmerIndexHelper.getFastaFileName(kmerIndexFile.getName());
        fileMapping.addFastaFile(fastaFilename);
    }
    fileMapping.saveTo(conf);

    int MRNodes = MapReduceClusterHelper.getNodeNum(conf);

    LOG.info("MapReduce nodes detected : " + MRNodes);

    FileOutputFormat.setOutputPath(job, new Path(lConfig.getOutputPath()));
    job.setOutputFormatClass(TextOutputFormat.class);

    // Reducer
    // Use many reducers
    int reducersPerNode = lConfig.getClusterConfiguration().getMachineCores() / 2;
    if (reducersPerNode < 1) {
        reducersPerNode = 1;
    }
    int reducers = lConfig.getClusterConfiguration().getMachineNum()
            * (lConfig.getClusterConfiguration().getMachineCores() / 2);
    LOG.info("Reducers : " + reducers);
    job.setNumReduceTasks(reducers);

    // Execute job and return status
    boolean result = job.waitForCompletion(true);

    // commit results
    if (result) {
        commit(new Path(lConfig.getOutputPath()), conf);

        Path tableFilePath = new Path(lConfig.getOutputPath(),
                KmerSimilarityHelper.makeKmerSimilarityTableFileName());
        FileSystem fs = tableFilePath.getFileSystem(conf);
        fileMapping.saveTo(fs, tableFilePath);

        // combine results
        sumScores(new Path(lConfig.getOutputPath()), conf);
    }

    // report
    if (lConfig.getReportPath() != null && !lConfig.getReportPath().isEmpty()) {
        Report report = new Report();
        report.addJob(job);
        report.writeTo(lConfig.getReportPath());
    }

    return result ? 0 : 1;
}

From source file:kogiri.mapreduce.preprocess.indexing.stage2.KmerIndexBuilder.java

License:Open Source License

private int runJob(PreprocessorConfig ppConfig) throws Exception {
    // check config
    validatePreprocessorConfig(ppConfig);

    // configuration
    Configuration conf = this.getConf();

    // set user configuration
    ppConfig.getClusterConfiguration().configureTo(conf);
    ppConfig.saveTo(conf);

    Path[] inputFiles = FileSystemHelper.getAllFastaFilePath(conf, ppConfig.getFastaPath());

    boolean job_result = true;
    List<Job> jobs = new ArrayList<Job>();

    for (int round = 0; round < inputFiles.length; round++) {
        Path roundInputFile = inputFiles[round];
        String roundOutputPath = ppConfig.getKmerIndexPath() + "_round" + round;

        Job job = new Job(conf,
                "Kogiri Preprocessor - Building Kmer Indices (" + round + " of " + inputFiles.length + ")");
        job.setJarByClass(KmerIndexBuilder.class);

        // Mapper
        job.setMapperClass(KmerIndexBuilderMapper.class);
        job.setInputFormatClass(FastaReadInputFormat.class);
        job.setMapOutputKeyClass(CompressedSequenceWritable.class);
        job.setMapOutputValueClass(CompressedIntArrayWritable.class);

        // Combiner
        job.setCombinerClass(KmerIndexBuilderCombiner.class);

        // Partitioner
        job.setPartitionerClass(KmerIndexBuilderPartitioner.class);

        // Reducer
        job.setReducerClass(KmerIndexBuilderReducer.class);

        // Specify key / value
        job.setOutputKeyClass(CompressedSequenceWritable.class);
        job.setOutputValueClass(CompressedIntArrayWritable.class);

        // Inputs
        FileInputFormat.addInputPaths(job, roundInputFile.toString());

        LOG.info("Input file : ");
        LOG.info("> " + roundInputFile.toString());

        String histogramFileName = KmerHistogramHelper.makeKmerHistogramFileName(roundInputFile.getName());
        Path histogramPath = new Path(ppConfig.getKmerHistogramPath(), histogramFileName);

        KmerIndexBuilderPartitioner.setHistogramPath(job.getConfiguration(), histogramPath);

        FileOutputFormat.setOutputPath(job, new Path(roundOutputPath));
        job.setOutputFormatClass(MapFileOutputFormat.class);

        // Use many reducers
        int reducersPerNode = ppConfig.getClusterConfiguration().getMachineCores() / 2;
        if (reducersPerNode < 1) {
            reducersPerNode = 1;
        }
        int reducers = ppConfig.getClusterConfiguration().getMachineNum()
                * (ppConfig.getClusterConfiguration().getMachineCores() / 2);
        LOG.info("Reducers : " + reducers);
        job.setNumReduceTasks(reducers);

        // Execute job and return status
        boolean result = job.waitForCompletion(true);

        jobs.add(job);

        // commit results
        if (result) {
            commitRoundIndexOutputFiles(roundInputFile, new Path(roundOutputPath),
                    new Path(ppConfig.getKmerIndexPath()), job.getConfiguration(), ppConfig.getKmerSize());

            // create index of index
            createIndexOfIndex(new Path(ppConfig.getKmerIndexPath()), roundInputFile, job.getConfiguration(),
                    ppConfig.getKmerSize());
        }

        if (!result) {
            LOG.error("job failed at round " + round + " of " + inputFiles.length);
            job_result = false;
            break;
        }
    }

    // report
    if (ppConfig.getReportPath() != null && !ppConfig.getReportPath().isEmpty()) {
        Report report = new Report();
        report.addJob(jobs);
        report.writeTo(ppConfig.getReportPath());
    }

    return job_result ? 0 : 1;
}

From source file:kogiri.mapreduce.readfrequency.modecount.ModeCounter.java

License:Open Source License

private int runJob(ReadFrequencyCounterConfig rfConfig) throws Exception {
    // check config
    validateReadFrequencyCounterConfig(rfConfig);

    // configuration
    Configuration conf = this.getConf();

    // set user configuration
    rfConfig.getClusterConfiguration().configureTo(conf);
    rfConfig.saveTo(conf);

    // table file
    Path tableFilePath = new Path(rfConfig.getKmerMatchPath(), KmerMatchHelper.makeKmerMatchTableFileName());
    FileSystem fs = tableFilePath.getFileSystem(conf);
    KmerMatchFileMapping fileMapping = KmerMatchFileMapping.createInstance(fs, tableFilePath);

    Path[] inputFiles = KmerMatchHelper.getAllKmerMatchResultFilePath(conf, rfConfig.getKmerMatchPath());

    // Register named outputs
    NamedOutputs namedOutputs = new NamedOutputs();
    for (int i = 0; i < fileMapping.getSize(); i++) {
        String fastaFileName = fileMapping.getFastaFileFromID(i);
        namedOutputs.add(fastaFileName);
    }
    namedOutputs.saveTo(conf);

    boolean job_result = true;
    List<Job> jobs = new ArrayList<Job>();

    for (int round = 0; round < fileMapping.getSize(); round++) {
        String roundOutputPath = rfConfig.getReadFrequencyPath() + "_round" + round;

        Job job = new Job(conf, "Kogiri Preprocessor - Computing Mode of Kmer Frequency (" + round + " of "
                + fileMapping.getSize() + ")");
        job.setJarByClass(ModeCounter.class);

        // Mapper
        job.setMapperClass(ModeCounterMapper.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setMapOutputKeyClass(MultiFileIntWritable.class);
        job.setMapOutputValueClass(CompressedIntArrayWritable.class);

        // Combiner
        job.setCombinerClass(ModeCounterCombiner.class);

        // Partitioner
        job.setPartitionerClass(ModeCounterPartitioner.class);

        // Reducer
        job.setReducerClass(ModeCounterReducer.class);

        // Specify key / value
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // Inputs
        FileInputFormat.addInputPaths(job, FileSystemHelper.makeCommaSeparated(inputFiles));

        ModeCounterConfig modeCounterConfig = new ModeCounterConfig();
        modeCounterConfig.setMasterFileID(round);
        modeCounterConfig.saveTo(job.getConfiguration());

        FileOutputFormat.setOutputPath(job, new Path(roundOutputPath));
        job.setOutputFormatClass(TextOutputFormat.class);

        for (NamedOutputRecord namedOutput : namedOutputs.getRecord()) {
            MultipleOutputs.addNamedOutput(job, namedOutput.getIdentifier(), TextOutputFormat.class, Text.class,
                    Text.class);
        }

        // Execute job and return status
        boolean result = job.waitForCompletion(true);

        jobs.add(job);

        // commit results
        if (result) {
            commitRoundOutputFiles(new Path(roundOutputPath), new Path(rfConfig.getReadFrequencyPath()),
                    job.getConfiguration(), namedOutputs, round);
        }

        if (!result) {
            LOG.error("job failed at round " + round + " of " + fileMapping.getSize());
            job_result = false;
            break;
        }
    }

    // report
    if (rfConfig.getReportPath() != null && !rfConfig.getReportPath().isEmpty()) {
        Report report = new Report();
        report.addJob(jobs);
        report.writeTo(rfConfig.getReportPath());
    }

    return job_result ? 0 : 1;
}

From source file:layer.AutoCoder.java

License:Apache License

/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers")
            .create(NUM_REDUCERS));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();

    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath0 = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS))
            : 1;

    LOG.info("Tool: " + AutoCoder.class.getSimpleName());
    LOG.info(" - input path: " + inputPath0);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - number of reducers: " + reduceTasks);

    Configuration conf = getConf();
    initialParameters(conf);

    for (int iterations = 1; iterations < GlobalUtil.NUM_LAYER + 1; iterations++) {
        LOG.info("** Layer: " + iterations);
        try {

            Job job = Job.getInstance(conf);
            job.setJobName(AutoCoder.class.getSimpleName());
            job.setJarByClass(AutoCoder.class);
            // set the path of the information of k clusters in this iteration
            job.getConfiguration().set("sidepath", inputPath0 + "/side_output");
            job.getConfiguration().setInt("layer_ind", iterations);
            job.setNumReduceTasks(reduceTasks);

            String inputPath = inputPath0 + "/train";
            dataShuffle();

            FileInputFormat.setInputPaths(job, new Path(inputPath));
            FileOutputFormat.setOutputPath(job, new Path(outputPath));

            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(ModelNode.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(ModelNode.class);

            job.setMapperClass(MyMapper.class);
            job.setReducerClass(MyReducer.class);
            job.setPartitionerClass(MyPartitioner.class);

            // Delete the output directory if it exists already.
            Path outputDir = new Path(outputPath);
            FileSystem.get(getConf()).delete(outputDir, true);

            long startTime = System.currentTimeMillis();
            job.waitForCompletion(true);
            LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

            prepareNextIteration(inputPath0, outputPath, iterations, conf, reduceTasks);
        } catch (Exception exp) {
            exp.printStackTrace();
        }
    }

    return 0;
}

From source file:ldbc.snb.datagen.hadoop.HadoopPersonActivityGenerator.java

public void run(String inputFileName) throws AssertionError, Exception {

    FileSystem fs = FileSystem.get(conf);

    System.out.println("RANKING");
    String rankedFileName = conf.get("ldbc.snb.datagen.serializer.hadoopDir") + "/ranked";
    HadoopFileRanker hadoopFileRanker = new HadoopFileRanker(conf, TupleKey.class, Person.class, null);
    hadoopFileRanker.run(inputFileName, rankedFileName);

    System.out.println("GENERATING");
    int numThreads = Integer.parseInt(conf.get("ldbc.snb.datagen.generator.numThreads"));
    Job job = Job.getInstance(conf, "Person Activity Generator/Serializer");
    job.setMapOutputKeyClass(BlockKey.class);
    job.setMapOutputValueClass(Person.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Person.class);
    job.setJarByClass(HadoopBlockMapper.class);
    job.setMapperClass(HadoopBlockMapper.class);
    job.setReducerClass(HadoopPersonActivityGeneratorReducer.class);
    job.setNumReduceTasks(numThreads);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setSortComparatorClass(BlockKeyComparator.class);
    job.setGroupingComparatorClass(BlockKeyGroupComparator.class);
    job.setPartitionerClass(HadoopBlockPartitioner.class);

    /** PROFILING OPTIONS **/
    //job.setProfileEnabled(true);
    //job.setProfileParams("-agentlib:hprof=cpu=samples,heap=sites,depth=4,thread=y,format=b,file=%s");
    //job.setProfileTaskRange(true,"0-1");
    //job.setProfileTaskRange(false,"0-1");
    /****/

    FileInputFormat.setInputPaths(job, new Path(rankedFileName));
    FileOutputFormat.setOutputPath(job, new Path(conf.get("ldbc.snb.datagen.serializer.hadoopDir") + "/aux"));
    long start = System.currentTimeMillis();
    try {
        if (!job.waitForCompletion(true)) {
            throw new Exception();
        }
    } catch (AssertionError e) {
        throw e;
    }
    System.out.println("Real time to generate activity: " + (System.currentTimeMillis() - start) / 1000.0f);

    try {
        fs.delete(new Path(rankedFileName), true);
        fs.delete(new Path(conf.get("ldbc.snb.datagen.serializer.hadoopDir") + "/aux"), true);
    } catch (IOException e) {
        System.err.println(e.getMessage());
        e.printStackTrace();
    }
}

From source file:ldbc.socialnet.dbgen.generator.MRGenerateUsers.java

License:Open Source License

public int runGenerateJob(Configuration conf) throws Exception {
    FileSystem fs = FileSystem.get(conf);
    String hadoopDir = new String(conf.get("outputDir") + "/hadoop");
    String socialNetDir = new String(conf.get("outputDir") + "/social_network");
    int numThreads = Integer.parseInt(conf.get("numThreads"));
    System.out.println("NUMBER OF THREADS " + numThreads);

    /// --------- Execute Jobs ------
    long start = System.currentTimeMillis();

    /// --------------- First job Generating users----------------
    printProgress("Starting: Person generation");
    conf.set("pass", Integer.toString(0));
    Job job = new Job(conf, "SIB Generate Users & 1st Dimension");
    job.setMapOutputKeyClass(TupleKey.class);
    job.setMapOutputValueClass(ReducedUserProfile.class);
    job.setOutputKeyClass(TupleKey.class);
    job.setOutputValueClass(ReducedUserProfile.class);
    job.setJarByClass(GenerateUsersMapper.class);
    job.setMapperClass(GenerateUsersMapper.class);
    job.setNumReduceTasks(numThreads);
    job.setInputFormatClass(NLineInputFormat.class);
    conf.setInt("mapred.line.input.format.linespermap", 1);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileInputFormat.setInputPaths(job, new Path(hadoopDir) + "/mrInputFile");
    FileOutputFormat.setOutputPath(job, new Path(hadoopDir + "/sib"));
    job.waitForCompletion(true);

    /// --------------- Sorting by first dimension  ----------------
    printProgress("Starting: Sorting by first dimension");
    HadoopFileRanker fileRanker = new HadoopFileRanker(conf, TupleKey.class, ReducedUserProfile.class);
    fileRanker.run(hadoopDir + "/sib", hadoopDir + "/sibSorting");
    fs.delete(new Path(hadoopDir + "/sib"), true);

    /// --------------- job Generating First dimension Friendships  ----------------
    printProgress("Starting: Friendship generation 1.");
    conf.set("pass", Integer.toString(0));
    conf.set("dimension", Integer.toString(1));
    job = new Job(conf, "SIB Generate Friendship - Interest");
    job.setMapOutputKeyClass(ComposedKey.class);
    job.setMapOutputValueClass(ReducedUserProfile.class);
    job.setOutputKeyClass(TupleKey.class);
    job.setOutputValueClass(ReducedUserProfile.class);
    job.setJarByClass(HadoopBlockMapper.class);
    job.setMapperClass(HadoopBlockMapper.class);
    job.setReducerClass(DimensionReducer.class);
    job.setNumReduceTasks(numThreads);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setPartitionerClass(HadoopBlockPartitioner.class);
    job.setSortComparatorClass(ComposedKeyComparator.class);
    job.setGroupingComparatorClass(ComposedKeyGroupComparator.class);

    FileInputFormat.setInputPaths(job, new Path(hadoopDir + "/sibSorting"));
    FileOutputFormat.setOutputPath(job, new Path(hadoopDir + "/sib2"));
    job.waitForCompletion(true);
    fs.delete(new Path(hadoopDir + "/sibSorting"), true);

    /// --------------- Sorting phase 2  ----------------
    printProgress("Starting: Sorting by second dimension");
    fileRanker = new HadoopFileRanker(conf, TupleKey.class, ReducedUserProfile.class);
    fileRanker.run(hadoopDir + "/sib2", hadoopDir + "/sibSorting2");
    fs.delete(new Path(hadoopDir + "/sib2"), true);

    /// --------------- Second job Generating Friendships  ----------------
    printProgress("Starting: Friendship generation 2.");
    conf.set("pass", Integer.toString(1));
    conf.set("dimension", Integer.toString(2));
    job = new Job(conf, "SIB Generate Friendship - Interest");
    job.setMapOutputKeyClass(ComposedKey.class);
    job.setMapOutputValueClass(ReducedUserProfile.class);
    job.setOutputKeyClass(TupleKey.class);
    job.setOutputValueClass(ReducedUserProfile.class);
    job.setJarByClass(HadoopBlockMapper.class);
    job.setMapperClass(HadoopBlockMapper.class);
    job.setReducerClass(DimensionReducer.class);
    job.setNumReduceTasks(numThreads);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setPartitionerClass(HadoopBlockPartitioner.class);
    job.setSortComparatorClass(ComposedKeyComparator.class);
    job.setGroupingComparatorClass(ComposedKeyGroupComparator.class);
    FileInputFormat.setInputPaths(job, new Path(hadoopDir + "/sibSorting2"));
    FileOutputFormat.setOutputPath(job, new Path(hadoopDir + "/sib3"));
    job.waitForCompletion(true);
    fs.delete(new Path(hadoopDir + "/sibSorting2"), true);

    /// --------------- Sorting phase 3--------------
    printProgress("Starting: Sorting by third dimension");
    fileRanker = new HadoopFileRanker(conf, TupleKey.class, ReducedUserProfile.class);
    fileRanker.run(hadoopDir + "/sib3", hadoopDir + "/sibSorting3");
    fs.delete(new Path(hadoopDir + "/sib3"), true);

    /// --------------- Third job Generating Friendships----------------
    printProgress("Starting: Friendship generation 3.");
    conf.set("pass", Integer.toString(2));
    conf.set("dimension", Integer.toString(2));
    job = new Job(conf, "SIB Generate Friendship - Random");
    job.setMapOutputKeyClass(ComposedKey.class);
    job.setMapOutputValueClass(ReducedUserProfile.class);
    job.setOutputKeyClass(TupleKey.class);
    job.setOutputValueClass(ReducedUserProfile.class);
    job.setJarByClass(HadoopBlockMapper.class);
    job.setMapperClass(HadoopBlockMapper.class);
    job.setReducerClass(DimensionReducer.class);
    job.setNumReduceTasks(numThreads);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setPartitionerClass(HadoopBlockPartitioner.class);
    job.setSortComparatorClass(ComposedKeyComparator.class);
    job.setGroupingComparatorClass(ComposedKeyGroupComparator.class);
    FileInputFormat.setInputPaths(job, new Path(hadoopDir + "/sibSorting3"));
    FileOutputFormat.setOutputPath(job, new Path(hadoopDir + "/sib4"));
    job.waitForCompletion(true);
    fs.delete(new Path(hadoopDir + "/sibSorting3"), true);

    /// --------------- Sorting phase 3--------------

    printProgress("Starting: Sorting by third dimension (for activity generation)");
    fileRanker = new HadoopFileRanker(conf, TupleKey.class, ReducedUserProfile.class);
    fileRanker.run(hadoopDir + "/sib4", hadoopDir + "/sibSorting4");
    fs.delete(new Path(hadoopDir + "/sib4"), true);

    /// --------------- Fourth job: Serialize static network ----------------

    printProgress("Starting: Generating person activity");
    job = new Job(conf, "Generate user activity");
    job.setMapOutputKeyClass(ComposedKey.class);
    job.setMapOutputValueClass(ReducedUserProfile.class);
    job.setOutputKeyClass(TupleKey.class);
    job.setOutputValueClass(ReducedUserProfile.class);
    job.setJarByClass(HadoopBlockMapper.class);
    job.setMapperClass(HadoopBlockMapper.class);
    job.setReducerClass(UserActivityReducer.class);
    job.setNumReduceTasks(numThreads);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setPartitionerClass(HadoopBlockPartitioner.class);
    job.setSortComparatorClass(ComposedKeyComparator.class);
    job.setGroupingComparatorClass(ComposedKeyGroupComparator.class);
    FileInputFormat.setInputPaths(job, new Path(hadoopDir + "/sibSorting4"));
    FileOutputFormat.setOutputPath(job, new Path(hadoopDir + "/sib5"));
    job.waitForCompletion(true);
    fs.delete(new Path(hadoopDir + "/sib5"), true);

    int numEvents = 0;
    long min = Long.MAX_VALUE;
    long max = Long.MIN_VALUE;

    if (conf.getBoolean("updateStreams", false)) {
        for (int i = 0; i < numThreads; ++i) {
            int numPartitions = conf.getInt("numUpdatePartitions", 1);
            for (int j = 0; j < numPartitions; ++j) {
                /// --------------- Fifth job: Sort update streams ----------------
                conf.setInt("mapred.line.input.format.linespermap", 1000000);
                conf.setInt("reducerId", i);
                conf.setInt("partitionId", j);
                conf.set("streamType", "forum");
                Job jobForum = new Job(conf, "Sorting update streams " + j + " of reducer " + i);
                jobForum.setMapOutputKeyClass(LongWritable.class);
                jobForum.setMapOutputValueClass(Text.class);
                jobForum.setOutputKeyClass(LongWritable.class);
                jobForum.setOutputValueClass(Text.class);
                jobForum.setJarByClass(UpdateEventMapper.class);
                jobForum.setMapperClass(UpdateEventMapper.class);
                jobForum.setReducerClass(UpdateEventReducer.class);
                jobForum.setNumReduceTasks(1);
                jobForum.setInputFormatClass(SequenceFileInputFormat.class);
                jobForum.setOutputFormatClass(SequenceFileOutputFormat.class);
                jobForum.setPartitionerClass(UpdateEventPartitioner.class);
                FileInputFormat.addInputPath(jobForum,
                        new Path(socialNetDir + "/temp_updateStream_" + i + "_" + j + "_forum"));
                FileOutputFormat.setOutputPath(jobForum, new Path(hadoopDir + "/sibEnd"));
                printProgress("Starting: Sorting update streams");
                jobForum.waitForCompletion(true);
                fs.delete(new Path(socialNetDir + "/temp_updateStream_" + i + "_" + j + "_forum"), false);
                fs.delete(new Path(hadoopDir + "/sibEnd"), true);

                conf.setInt("mapred.line.input.format.linespermap", 1000000);
                conf.setInt("reducerId", i);
                conf.setInt("partitionId", j);
                conf.set("streamType", "person");
                Job jobPerson = new Job(conf, "Sorting update streams " + j + " of reducer " + i);
                jobPerson.setMapOutputKeyClass(LongWritable.class);
                jobPerson.setMapOutputValueClass(Text.class);
                jobPerson.setOutputKeyClass(LongWritable.class);
                jobPerson.setOutputValueClass(Text.class);
                jobPerson.setJarByClass(UpdateEventMapper.class);
                jobPerson.setMapperClass(UpdateEventMapper.class);
                jobPerson.setReducerClass(UpdateEventReducer.class);
                jobPerson.setNumReduceTasks(1);
                jobPerson.setInputFormatClass(SequenceFileInputFormat.class);
                jobPerson.setOutputFormatClass(SequenceFileOutputFormat.class);
                jobPerson.setPartitionerClass(UpdateEventPartitioner.class);
                FileInputFormat.addInputPath(jobPerson,
                        new Path(socialNetDir + "/temp_updateStream_" + i + "_" + j + "_person"));
                FileOutputFormat.setOutputPath(jobPerson, new Path(hadoopDir + "/sibEnd"));
                printProgress("Starting: Sorting update streams");
                jobPerson.waitForCompletion(true);
                fs.delete(new Path(socialNetDir + "/temp_updateStream_" + i + "_" + j + "_person"), false);
                fs.delete(new Path(hadoopDir + "/sibEnd"), true);

                if (conf.getBoolean("updateStreams", false)) {
                    Properties properties = new Properties();
                    FSDataInputStream file = fs.open(new Path(conf.get("outputDir")
                            + "/social_network/updateStream_" + i + "_" + j + "_person.properties"));
                    properties.load(file);
                    if (properties.getProperty("min_write_event_start_time") != null) {
                        Long auxMin = Long.parseLong(properties.getProperty("min_write_event_start_time"));
                        min = auxMin < min ? auxMin : min;
                        Long auxMax = Long.parseLong(properties.getProperty("max_write_event_start_time"));
                        max = auxMax > max ? auxMax : max;
                        numEvents += Long.parseLong(properties.getProperty("num_events"));
                    }
                    file.close();
                    file = fs.open(new Path(conf.get("outputDir") + "/social_network/updateStream_" + i + "_"
                            + j + "_forum.properties"));
                    properties.load(file);
                    if (properties.getProperty("min_write_event_start_time") != null) {
                        Long auxMin = Long.parseLong(properties.getProperty("min_write_event_start_time"));
                        min = auxMin < min ? auxMin : min;
                        Long auxMax = Long.parseLong(properties.getProperty("max_write_event_start_time"));
                        max = auxMax > max ? auxMax : max;
                        numEvents += Long.parseLong(properties.getProperty("num_events"));
                    }
                    file.close();
                    fs.delete(new Path(conf.get("outputDir") + "/social_network/updateStream_" + i + "_" + j
                            + "_person.properties"), true);
                    fs.delete(new Path(conf.get("outputDir") + "/social_network/updateStream_" + i + "_" + j
                            + "_forum.properties"), true);
                }
            }
        }

        if (conf.getBoolean("updateStreams", false)) {
            OutputStream output = fs
                    .create(new Path(conf.get("outputDir") + "/social_network/updateStream.properties"));
            output.write(new String("ldbc.snb.interactive.gct_delta_duration:" + conf.get("deltaTime") + "\n")
                    .getBytes());
            output.write(
                    new String("ldbc.snb.interactive.min_write_event_start_time:" + min + "\n").getBytes());
            output.write(
                    new String("ldbc.snb.interactive.max_write_event_start_time:" + max + "\n").getBytes());
            output.write(new String("ldbc.snb.interactive.update_interleave:" + (max - min) / numEvents + "\n")
                    .getBytes());
            output.write(new String("ldbc.snb.interactive.num_events:" + numEvents).getBytes());
            output.close();
        }
    }

    /// --------------- Sixth job: Materialize the friends lists ----------------
    /*        Job job6 = new Job(conf,"Dump the friends lists");
            job6.setMapOutputKeyClass(ComposedKey.class);
            job6.setMapOutputValueClass(ReducedUserProfile.class);
            job6.setOutputKeyClass(ComposedKey.class);
            job6.setOutputValueClass(ReducedUserProfile.class);
            job6.setJarByClass(HadoopBlockMapper.class);
            job6.setMapperClass(HadoopBlockMapper.class);
            job6.setReducerClass(FriendListOutputReducer.class);
            job6.setNumReduceTasks(numThreads);
            job6.setInputFormatClass(SequenceFileInputFormat.class);
            job6.setOutputFormatClass(SequenceFileOutputFormat.class);
            job6.setPartitionerClass(HadoopBlockPartitioner.class);
            job6.setSortComparatorClass(ComposedKeyComparator.class);
            job6.setGroupingComparatorClass(ComposedKeyGroupComparator.class);
            FileInputFormat.setInputPaths(job6, new Path(hadoopDir + "/sibSorting4"));
            FileOutputFormat.setOutputPath(job6, new Path(hadoopDir + "/job6") );
            
            
            printProgress("Starting: Materialize friends for substitution parameters");
            int resMaterializeFriends = job6.waitForCompletion(true) ? 0 : 1;
            fs.delete(new Path(hadoopDir + "/sibSorting3"),true);
            */

    long end = System.currentTimeMillis();
    System.out.println(((end - start) / 1000) + " total seconds");
    for (int i = 0; i < numThreads; ++i) {
        fs.copyToLocalFile(new Path(socialNetDir + "/m" + i + "factors.txt"), new Path("./"));
        fs.copyToLocalFile(new Path(socialNetDir + "/m0friendList" + i + ".csv"), new Path("./"));
    }
    return 0;
}

From source file:libra.core.kmersimilarity_r.KmerSimilarityReduce.java

License:Apache License

private int runJob(CoreConfig cConfig) throws Exception {
    // check config
    validateCoreConfig(cConfig);

    // configuration
    Configuration conf = this.getConf();

    Job job = new Job(conf, "Libra Core - Computing similarity between samples");
    conf = job.getConfiguration();

    // set user configuration
    cConfig.saveTo(conf);

    job.setJarByClass(KmerSimilarityReduce.class);

    // Mapper
    job.setMapperClass(KmerSimilarityMapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapOutputKeyClass(CompressedSequenceWritable.class);
    job.setMapOutputValueClass(CompressedIntArrayWritable.class);

    // Partitioner
    job.setPartitionerClass(KmerSimilarityPartitioner.class);

    // Reducer
    job.setReducerClass(KmerSimilarityReducer.class);

    // Specify key / value
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // Inputs
    Path[] kmerIndexFiles = KmerIndexHelper.getAllKmerIndexIndexFilePath(conf, cConfig.getKmerIndexPath());
    List<Path> indexPartFileArray = new ArrayList<Path>();
    for (Path kmerIndexFile : kmerIndexFiles) {
        Path[] inputKmerIndexPartFiles = KmerIndexHelper.getKmerIndexPartFilePath(conf, kmerIndexFile);
        for (Path indexPartFile : inputKmerIndexPartFiles) {
            Path[] kmerIndexPartDataFiles = KmerIndexHelper.getAllKmerIndexPartDataFilePath(conf,
                    indexPartFile);
            for (Path kmerIndexPartDataFile : kmerIndexPartDataFiles) {
                indexPartFileArray.add(kmerIndexPartDataFile);
            }
        }
    }

    SequenceFileInputFormat.addInputPaths(job,
            FileSystemHelper.makeCommaSeparated(indexPartFileArray.toArray(new Path[0])));

    LOG.info("Input kmer index files : " + kmerIndexFiles.length);
    for (Path inputFile : kmerIndexFiles) {
        LOG.info("> " + inputFile.toString());
    }

    int kmerSize = 0;
    for (Path inputFile : kmerIndexFiles) {
        // check kmerSize
        int myKmerSize = KmerIndexHelper.getKmerSize(inputFile);
        if (kmerSize == 0) {
            kmerSize = myKmerSize;
        } else {
            if (kmerSize != myKmerSize) {
                throw new Exception("kmer size must be the same over all given kmer indices");
            }
        }
    }

    KmerMatchFileMapping fileMapping = new KmerMatchFileMapping();
    for (Path kmerIndexFile : kmerIndexFiles) {
        String fastaFilename = KmerIndexHelper.getFastaFileName(kmerIndexFile.getName());
        fileMapping.addFastaFile(fastaFilename);
    }
    fileMapping.saveTo(conf);

    FileOutputFormat.setOutputPath(job, new Path(cConfig.getOutputPath()));
    job.setOutputFormatClass(TextOutputFormat.class);

    // Reducer
    // Use many reducers
    int reducers = conf.getInt("mapred.reduce.tasks", 0);
    if (reducers <= 0) {
        int MRNodes = MapReduceClusterHelper.getNodeNum(conf);
        reducers = MRNodes * 2;
        job.setNumReduceTasks(reducers);
    }
    LOG.info("Reducers : " + reducers);

    // Execute job and return status
    boolean result = job.waitForCompletion(true);

    // commit results
    if (result) {
        commit(new Path(cConfig.getOutputPath()), conf);

        Path tableFilePath = new Path(cConfig.getOutputPath(),
                KmerSimilarityHelper.makeKmerSimilarityTableFileName());
        FileSystem fs = tableFilePath.getFileSystem(conf);
        fileMapping.saveTo(fs, tableFilePath);

        // combine results
        sumScores(new Path(cConfig.getOutputPath()), conf);
    }

    // report
    if (cConfig.getReportPath() != null && !cConfig.getReportPath().isEmpty()) {
        Report report = new Report();
        report.addJob(job);
        report.writeTo(cConfig.getReportPath());
    }

    return result ? 0 : 1;
}

From source file:libra.preprocess.stage2.KmerIndexBuilder.java

License:Apache License

private int runJob(PreprocessorConfig ppConfig) throws Exception {
    // check config
    validatePreprocessorConfig(ppConfig);

    // configuration
    Configuration conf = this.getConf();

    // set user configuration
    ppConfig.saveTo(conf);

    Path[] inputFiles = FileSystemHelper.getAllFastaFilePath(conf, ppConfig.getFastaPath());

    boolean job_result = true;
    List<Job> jobs = new ArrayList<Job>();

    for (int round = 0; round < inputFiles.length; round++) {
        Path roundInputFile = inputFiles[round];
        String roundOutputPath = ppConfig.getKmerIndexPath() + "_round" + round;

        Job job = new Job(conf,
                "Libra Preprocessor - Building Kmer Indexes (" + round + " of " + inputFiles.length + ")");
        job.setJarByClass(KmerIndexBuilder.class);

        // Mapper
        job.setMapperClass(KmerIndexBuilderMapper.class);
        FastaKmerInputFormat.setKmerSize(conf, ppConfig.getKmerSize());
        job.setInputFormatClass(FastaKmerInputFormat.class);
        job.setMapOutputKeyClass(CompressedSequenceWritable.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Combiner
        job.setCombinerClass(KmerIndexBuilderCombiner.class);

        // Partitioner
        job.setPartitionerClass(KmerIndexBuilderPartitioner.class);

        // Reducer
        job.setReducerClass(KmerIndexBuilderReducer.class);

        // Specify key / value
        job.setOutputKeyClass(CompressedSequenceWritable.class);
        job.setOutputValueClass(IntWritable.class);

        // Inputs
        FileInputFormat.addInputPaths(job, roundInputFile.toString());

        LOG.info("Input file : ");
        LOG.info("> " + roundInputFile.toString());

        String histogramFileName = KmerHistogramHelper.makeKmerHistogramFileName(roundInputFile.getName());
        Path histogramPath = new Path(ppConfig.getKmerHistogramPath(), histogramFileName);

        KmerIndexBuilderPartitioner.setHistogramPath(job.getConfiguration(), histogramPath);

        FileOutputFormat.setOutputPath(job, new Path(roundOutputPath));
        job.setOutputFormatClass(MapFileOutputFormat.class);

        // Use many reducers
        int reducers = conf.getInt("mapred.reduce.tasks", 0);
        if (reducers <= 0) {
            int MRNodes = MapReduceClusterHelper.getNodeNum(conf);
            reducers = MRNodes * 2;
            job.setNumReduceTasks(reducers);
        }
        LOG.info("Reducers : " + reducers);

        // Execute job and return status
        boolean result = job.waitForCompletion(true);

        jobs.add(job);

        // commit results
        if (result) {
            commitRoundIndexOutputFiles(roundInputFile, new Path(roundOutputPath),
                    new Path(ppConfig.getKmerIndexPath()), job.getConfiguration(), ppConfig.getKmerSize());

            // create index of index
            createIndexOfIndex(new Path(ppConfig.getKmerIndexPath()), roundInputFile, job.getConfiguration(),
                    ppConfig.getKmerSize());

            // create statistics of index
            createStatisticsOfIndex(new Path(ppConfig.getKmerStatisticsPath()), roundInputFile,
                    job.getConfiguration(), job.getCounters(), ppConfig.getKmerSize());
        }

        if (!result) {
            LOG.error("job failed at round " + round + " of " + inputFiles.length);
            job_result = false;
            break;
        }
    }

    // report
    if (ppConfig.getReportPath() != null && !ppConfig.getReportPath().isEmpty()) {
        Report report = new Report();
        report.addJob(jobs);
        report.writeTo(ppConfig.getReportPath());
    }

    return job_result ? 0 : 1;
}