Example usage for org.apache.hadoop.mapreduce Job setPartitionerClass

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce Job setPartitionerClass.

Prototype

public void setPartitionerClass(Class<? extends Partitioner> cls) throws IllegalStateException 

Document

Set the Partitioner for the job.
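
The Partitioner decides which reduce task receives each map output key (Hadoop's default is HashPartitioner), and setPartitionerClass throws IllegalStateException if it is called after the job has been submitted, so it belongs in the job-configuration phase. Before the project examples below, here is a minimal self-contained sketch; the class names (PartitionerExample, FirstLetterPartitioner, TokenMapper, SumReducer) and the word-count scenario are illustrative only and are not taken from any of the projects listed under Usage.

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class PartitionerExample {

    // Tokenize each input line and emit (word, 1).
    public static class TokenMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        private final Text word = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer tokenizer = new StringTokenizer(value.toString());
            while (tokenizer.hasMoreTokens()) {
                word.set(tokenizer.nextToken());
                context.write(word, ONE);
            }
        }
    }

    // Sum the counts for each word.
    public static class SumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable v : values) {
                sum += v.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    // Route every word that starts with the same (lower-cased) character to the same reducer.
    public static class FirstLetterPartitioner extends Partitioner<Text, IntWritable> {
        @Override
        public int getPartition(Text key, IntWritable value, int numPartitions) {
            int firstChar = key.getLength() > 0 ? key.charAt(0) : 0;
            return (Character.toLowerCase(firstChar) & Integer.MAX_VALUE) % numPartitions;
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "word count, partitioned by first letter");
        job.setJarByClass(PartitionerExample.class);
        job.setMapperClass(TokenMapper.class);
        job.setReducerClass(SumReducer.class);
        // Must be called while the job is still being configured;
        // once the job is submitted this throws IllegalStateException.
        job.setPartitionerClass(FirstLetterPartitioner.class);
        job.setNumReduceTasks(4); // a custom partitioner only matters with more than one reducer
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}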

Usage

From source file:it.polito.dbdmg.searum.ARM.java

License:Apache License

/**
 * Run the rule aggregator job over mined rules.
 *
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public static void startRuleAggregating(Parameters params, Configuration conf)
        throws IOException, ClassNotFoundException, InterruptedException {
    conf.set("mapred.compress.map.output", "true");
    conf.set("mapred.output.compression.type", "BLOCK");

    Path input = new Path(params.get(OUTPUT), RULES);
    Job job = new Job(conf, "Rule aggregator driver running over input: " + input);
    job.setJarByClass(ARM.class);
    FileInputFormat.addInputPath(job, input);
    Path outPath = new Path(params.get(OUTPUT), RULESBYCONCLUSION);
    FileOutputFormat.setOutputPath(job, outPath);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(RuleAggregatorMapper.class);
    job.setReducerClass(RuleAggregatorReducer.class);
    job.setPartitionerClass(RulePartitionerByConclusion.class);
    job.setSortComparatorClass(RulesWritableComparator.class);
    job.setGroupingComparatorClass(RulesGroupingWritableComparator.class);

    HadoopUtil.delete(conf, outPath);
    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}
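
This example pairs the partitioner with a sort comparator and a grouping comparator, the usual secondary-sort setup: the partitioner decides which reducer receives each rule, and the comparators then order and group the rules inside that reducer. RulePartitionerByConclusion itself is not shown on this page; the sketch below only illustrates the general shape of a partition-by-conclusion partitioner and assumes a tab-separated Text key whose last field is the conclusion, which is purely a guess for the sake of the example.

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Illustrative sketch only: the real RulePartitionerByConclusion belongs to the
// it.polito.dbdmg.searum project and is not reproduced on this page. The key
// layout assumed here (tab-separated fields, conclusion last) is a guess.
public class RulePartitionerByConclusionSketch extends Partitioner<Text, Text> {
    @Override
    public int getPartition(Text key, Text value, int numPartitions) {
        String[] fields = key.toString().split("\t");
        String conclusion = fields[fields.length - 1];
        // Hash only the conclusion so every rule with the same conclusion
        // reaches the same reducer; the sort and grouping comparators then
        // order and group those rules within that reducer.
        return (conclusion.hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}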

From source file:ivory.core.index.BuildIPInvertedIndexDocSorted.java

License:Apache License

public int runTool() throws Exception {
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get(Constants.IndexPath);
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    String collectionName = env.readCollectionName();

    int reduceTasks = conf.getInt(Constants.NumReduceTasks, 0);
    int minSplitSize = conf.getInt(Constants.MinSplitSize, 0);
    int collectionDocCnt = env.readCollectionDocumentCount();

    String postingsType = conf.get(Constants.PostingsListsType,
            ivory.core.data.index.PostingsListDocSortedPositional.class.getCanonicalName());
    @SuppressWarnings("unchecked")
    Class<? extends PostingsList> postingsClass = (Class<? extends PostingsList>) Class.forName(postingsType);

    LOG.info("PowerTool: " + BuildIPInvertedIndexDocSorted.class.getCanonicalName());
    LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath));
    LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName));
    LOG.info(String.format(" - %s: %s", Constants.CollectionDocumentCount, collectionDocCnt));
    LOG.info(String.format(" - %s: %s", Constants.PostingsListsType, postingsClass.getCanonicalName()));
    LOG.info(String.format(" - %s: %s", Constants.NumReduceTasks, reduceTasks));
    LOG.info(String.format(" - %s: %s", Constants.MinSplitSize, minSplitSize));

    if (!fs.exists(new Path(indexPath))) {
        fs.mkdirs(new Path(indexPath));
    }

    Path inputPath = new Path(env.getIntDocVectorsDirectory());
    Path postingsPath = new Path(env.getPostingsDirectory());

    if (fs.exists(postingsPath)) {
        LOG.info("Postings already exist: no indexing will be performed.");
        return 0;
    }

    conf.setInt(Constants.CollectionDocumentCount, collectionDocCnt);

    conf.setInt("mapred.min.split.size", minSplitSize);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    Job job = new Job(conf, BuildIPInvertedIndexDocSorted.class.getSimpleName() + ":" + collectionName);
    job.setJarByClass(BuildIPInvertedIndexDocSorted.class);

    job.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, postingsPath);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(PairOfInts.class);
    job.setMapOutputValueClass(TermPositions.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(postingsClass);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);
    job.setPartitionerClass(MyPartitioner.class);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    env.writePostingsType(postingsClass.getCanonicalName());

    return 0;
}
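
In this example the map output key is a composite PairOfInts, so MyPartitioner (an inner class of BuildIPInvertedIndexDocSorted, not reproduced on this page) has to route keys by only part of the pair if all postings for a term are to reach the same reducer. The sketch below shows that pattern; the edu.umd.cloud9.io.pair.PairOfInts import path and the getLeftElement() accessor are assumptions based on the Cloud9 pair classes, and the value type is left as a generic Writable.

import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Partitioner;

import edu.umd.cloud9.io.pair.PairOfInts;

// Illustrative sketch only; not Ivory's actual MyPartitioner.
public class TermIdPartitionerSketch extends Partitioner<PairOfInts, Writable> {
    @Override
    public int getPartition(PairOfInts key, Writable value, int numPartitions) {
        // Hash only the left element of the pair (e.g. the term id) so every
        // (termId, docId) key for the same term is routed to the same reduce task.
        return (key.getLeftElement() & Integer.MAX_VALUE) % numPartitions;
    }
}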

From source file:kogiri.mapreduce.libra.kmersimilarity_r.KmerSimilarityReduce.java

License:Open Source License

private int runJob(LibraConfig lConfig) throws Exception {
    // check config
    validateLibraConfig(lConfig);

    // configuration
    Configuration conf = this.getConf();

    Job job = new Job(conf, "Kogiri Libra - Computing similarity between samples");
    conf = job.getConfiguration();

    // set user configuration
    lConfig.getClusterConfiguration().configureTo(conf);
    lConfig.saveTo(conf);

    job.setJarByClass(KmerSimilarityReduce.class);

    // Mapper
    job.setMapperClass(KmerSimilarityMapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapOutputKeyClass(CompressedSequenceWritable.class);
    job.setMapOutputValueClass(CompressedIntArrayWritable.class);

    // Partitioner
    job.setPartitionerClass(KmerSimilarityPartitioner.class);

    // Reducer
    job.setReducerClass(KmerSimilarityReducer.class);

    // Specify key / value
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // Inputs
    Path[] kmerIndexFiles = KmerIndexHelper.getAllKmerIndexIndexFilePath(conf, lConfig.getKmerIndexPath());
    List<Path> indexPartFileArray = new ArrayList<Path>();
    for (Path kmerIndexFile : kmerIndexFiles) {
        Path[] inputKmerIndexPartFiles = KmerIndexHelper.getKmerIndexPartFilePath(conf, kmerIndexFile);
        for (Path indexPartFile : inputKmerIndexPartFiles) {
            Path[] kmerIndexPartDataFiles = KmerIndexHelper.getAllKmerIndexPartDataFilePath(conf,
                    indexPartFile);
            for (Path kmerIndexPartDataFile : kmerIndexPartDataFiles) {
                indexPartFileArray.add(kmerIndexPartDataFile);
            }
        }
    }

    SequenceFileInputFormat.addInputPaths(job,
            FileSystemHelper.makeCommaSeparated(indexPartFileArray.toArray(new Path[0])));

    LOG.info("Input kmer index files : " + kmerIndexFiles.length);
    for (Path inputFile : kmerIndexFiles) {
        LOG.info("> " + inputFile.toString());
    }

    int kmerSize = 0;
    for (Path inputFile : kmerIndexFiles) {
        // check kmerSize
        int myKmerSize = KmerIndexHelper.getKmerSize(inputFile);
        if (kmerSize == 0) {
            kmerSize = myKmerSize;
        } else {
            if (kmerSize != myKmerSize) {
                throw new Exception("kmer size must be the same over all given kmer indices");
            }
        }
    }

    KmerMatchFileMapping fileMapping = new KmerMatchFileMapping();
    for (Path kmerIndexFile : kmerIndexFiles) {
        String fastaFilename = KmerIndexHelper.getFastaFileName(kmerIndexFile.getName());
        fileMapping.addFastaFile(fastaFilename);
    }
    fileMapping.saveTo(conf);

    int MRNodes = MapReduceClusterHelper.getNodeNum(conf);

    LOG.info("MapReduce nodes detected : " + MRNodes);

    FileOutputFormat.setOutputPath(job, new Path(lConfig.getOutputPath()));
    job.setOutputFormatClass(TextOutputFormat.class);

    // Reducer
    // Use many reducers
    int reducersPerNode = lConfig.getClusterConfiguration().getMachineCores() / 2;
    if (reducersPerNode < 1) {
        reducersPerNode = 1;
    }
    int reducers = lConfig.getClusterConfiguration().getMachineNum()
            * (lConfig.getClusterConfiguration().getMachineCores() / 2);
    LOG.info("Reducers : " + reducers);
    job.setNumReduceTasks(reducers);

    // Execute job and return status
    boolean result = job.waitForCompletion(true);

    // commit results
    if (result) {
        commit(new Path(lConfig.getOutputPath()), conf);

        Path tableFilePath = new Path(lConfig.getOutputPath(),
                KmerSimilarityHelper.makeKmerSimilarityTableFileName());
        FileSystem fs = tableFilePath.getFileSystem(conf);
        fileMapping.saveTo(fs, tableFilePath);

        // combine results
        sumScores(new Path(lConfig.getOutputPath()), conf);
    }

    // report
    if (lConfig.getReportPath() != null && !lConfig.getReportPath().isEmpty()) {
        Report report = new Report();
        report.addJob(job);
        report.writeTo(lConfig.getReportPath());
    }

    return result ? 0 : 1;
}

From source file:kogiri.mapreduce.preprocess.indexing.stage2.KmerIndexBuilder.java

License:Open Source License

private int runJob(PreprocessorConfig ppConfig) throws Exception {
    // check config
    validatePreprocessorConfig(ppConfig);

    // configuration
    Configuration conf = this.getConf();

    // set user configuration
    ppConfig.getClusterConfiguration().configureTo(conf);
    ppConfig.saveTo(conf);

    Path[] inputFiles = FileSystemHelper.getAllFastaFilePath(conf, ppConfig.getFastaPath());

    boolean job_result = true;
    List<Job> jobs = new ArrayList<Job>();

    for (int round = 0; round < inputFiles.length; round++) {
        Path roundInputFile = inputFiles[round];
        String roundOutputPath = ppConfig.getKmerIndexPath() + "_round" + round;

        Job job = new Job(conf,
                "Kogiri Preprocessor - Building Kmer Indices (" + round + " of " + inputFiles.length + ")");
        job.setJarByClass(KmerIndexBuilder.class);

        // Mapper
        job.setMapperClass(KmerIndexBuilderMapper.class);
        job.setInputFormatClass(FastaReadInputFormat.class);
        job.setMapOutputKeyClass(CompressedSequenceWritable.class);
        job.setMapOutputValueClass(CompressedIntArrayWritable.class);

        // Combiner
        job.setCombinerClass(KmerIndexBuilderCombiner.class);

        // Partitioner
        job.setPartitionerClass(KmerIndexBuilderPartitioner.class);

        // Reducer
        job.setReducerClass(KmerIndexBuilderReducer.class);

        // Specify key / value
        job.setOutputKeyClass(CompressedSequenceWritable.class);
        job.setOutputValueClass(CompressedIntArrayWritable.class);

        // Inputs
        FileInputFormat.addInputPaths(job, roundInputFile.toString());

        LOG.info("Input file : ");
        LOG.info("> " + roundInputFile.toString());

        String histogramFileName = KmerHistogramHelper.makeKmerHistogramFileName(roundInputFile.getName());
        Path histogramPath = new Path(ppConfig.getKmerHistogramPath(), histogramFileName);

        KmerIndexBuilderPartitioner.setHistogramPath(job.getConfiguration(), histogramPath);

        FileOutputFormat.setOutputPath(job, new Path(roundOutputPath));
        job.setOutputFormatClass(MapFileOutputFormat.class);

        // Use many reducers
        int reducersPerNode = ppConfig.getClusterConfiguration().getMachineCores() / 2;
        if (reducersPerNode < 1) {
            reducersPerNode = 1;
        }
        int reducers = ppConfig.getClusterConfiguration().getMachineNum()
                * (ppConfig.getClusterConfiguration().getMachineCores() / 2);
        LOG.info("Reducers : " + reducers);
        job.setNumReduceTasks(reducers);

        // Execute job and return status
        boolean result = job.waitForCompletion(true);

        jobs.add(job);

        // commit results
        if (result) {
            commitRoundIndexOutputFiles(roundInputFile, new Path(roundOutputPath),
                    new Path(ppConfig.getKmerIndexPath()), job.getConfiguration(), ppConfig.getKmerSize());

            // create index of index
            createIndexOfIndex(new Path(ppConfig.getKmerIndexPath()), roundInputFile, job.getConfiguration(),
                    ppConfig.getKmerSize());
        }

        if (!result) {
            LOG.error("job failed at round " + round + " of " + inputFiles.length);
            job_result = false;
            break;
        }
    }

    // report
    if (ppConfig.getReportPath() != null && !ppConfig.getReportPath().isEmpty()) {
        Report report = new Report();
        report.addJob(jobs);
        report.writeTo(ppConfig.getReportPath());
    }

    return job_result ? 0 : 1;
}

From source file:kogiri.mapreduce.readfrequency.modecount.ModeCounter.java

License:Open Source License

private int runJob(ReadFrequencyCounterConfig rfConfig) throws Exception {
    // check config
    validateReadFrequencyCounterConfig(rfConfig);

    // configuration
    Configuration conf = this.getConf();

    // set user configuration
    rfConfig.getClusterConfiguration().configureTo(conf);
    rfConfig.saveTo(conf);

    // table file
    Path tableFilePath = new Path(rfConfig.getKmerMatchPath(), KmerMatchHelper.makeKmerMatchTableFileName());
    FileSystem fs = tableFilePath.getFileSystem(conf);
    KmerMatchFileMapping fileMapping = KmerMatchFileMapping.createInstance(fs, tableFilePath);

    Path[] inputFiles = KmerMatchHelper.getAllKmerMatchResultFilePath(conf, rfConfig.getKmerMatchPath());

    // Register named outputs
    NamedOutputs namedOutputs = new NamedOutputs();
    for (int i = 0; i < fileMapping.getSize(); i++) {
        String fastaFileName = fileMapping.getFastaFileFromID(i);
        namedOutputs.add(fastaFileName);
    }
    namedOutputs.saveTo(conf);

    boolean job_result = true;
    List<Job> jobs = new ArrayList<Job>();

    for (int round = 0; round < fileMapping.getSize(); round++) {
        String roundOutputPath = rfConfig.getReadFrequencyPath() + "_round" + round;

        Job job = new Job(conf, "Kogiri Preprocessor - Computing Mode of Kmer Frequency (" + round + " of "
                + fileMapping.getSize() + ")");
        job.setJarByClass(ModeCounter.class);

        // Mapper
        job.setMapperClass(ModeCounterMapper.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setMapOutputKeyClass(MultiFileIntWritable.class);
        job.setMapOutputValueClass(CompressedIntArrayWritable.class);

        // Combiner
        job.setCombinerClass(ModeCounterCombiner.class);

        // Partitioner
        job.setPartitionerClass(ModeCounterPartitioner.class);

        // Reducer
        job.setReducerClass(ModeCounterReducer.class);

        // Specify key / value
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // Inputs
        FileInputFormat.addInputPaths(job, FileSystemHelper.makeCommaSeparated(inputFiles));

        ModeCounterConfig modeCounterConfig = new ModeCounterConfig();
        modeCounterConfig.setMasterFileID(round);
        modeCounterConfig.saveTo(job.getConfiguration());

        FileOutputFormat.setOutputPath(job, new Path(roundOutputPath));
        job.setOutputFormatClass(TextOutputFormat.class);

        for (NamedOutputRecord namedOutput : namedOutputs.getRecord()) {
            MultipleOutputs.addNamedOutput(job, namedOutput.getIdentifier(), TextOutputFormat.class, Text.class,
                    Text.class);
        }

        // Execute job and return status
        boolean result = job.waitForCompletion(true);

        jobs.add(job);

        // commit results
        if (result) {
            commitRoundOutputFiles(new Path(roundOutputPath), new Path(rfConfig.getReadFrequencyPath()),
                    job.getConfiguration(), namedOutputs, round);
        }

        if (!result) {
            LOG.error("job failed at round " + round + " of " + fileMapping.getSize());
            job_result = false;
            break;
        }
    }

    // report
    if (rfConfig.getReportPath() != null && !rfConfig.getReportPath().isEmpty()) {
        Report report = new Report();
        report.addJob(jobs);
        report.writeTo(rfConfig.getReportPath());
    }

    return job_result ? 0 : 1;
}

From source file:layer.AutoCoder.java

License:Apache License

/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers")
            .create(NUM_REDUCERS));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();

    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath0 = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS))
            : 1;

    LOG.info("Tool: " + AutoCoder.class.getSimpleName());
    LOG.info(" - input path: " + inputPath0);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - number of reducers: " + reduceTasks);

    Configuration conf = getConf();
    initialParameters(conf);

    for (int iterations = 1; iterations < GlobalUtil.NUM_LAYER + 1; iterations++) {
        LOG.info("** Layer: " + iterations);
        try {

            Job job = Job.getInstance(conf);
            job.setJobName(AutoCoder.class.getSimpleName());
            job.setJarByClass(AutoCoder.class);
            // set the path of the information of k clusters in this iteration
            job.getConfiguration().set("sidepath", inputPath0 + "/side_output");
            job.getConfiguration().setInt("layer_ind", iterations);
            job.setNumReduceTasks(reduceTasks);

            String inputPath = inputPath0 + "/train";
            dataShuffle();

            FileInputFormat.setInputPaths(job, new Path(inputPath));
            FileOutputFormat.setOutputPath(job, new Path(outputPath));

            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(ModelNode.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(ModelNode.class);

            job.setMapperClass(MyMapper.class);
            job.setReducerClass(MyReducer.class);
            job.setPartitionerClass(MyPartitioner.class);

            // Delete the output directory if it exists already.
            Path outputDir = new Path(outputPath);
            FileSystem.get(getConf()).delete(outputDir, true);

            long startTime = System.currentTimeMillis();
            job.waitForCompletion(true);
            LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

            prepareNextIteration(inputPath0, outputPath, iterations, conf, reduceTasks);
        } catch (Exception exp) {
            exp.printStackTrace();
        }
    }

    return 0;
}

From source file:ldbc.snb.datagen.hadoop.HadoopPersonActivityGenerator.java

public void run(String inputFileName) throws AssertionError, Exception {

    FileSystem fs = FileSystem.get(conf);

    System.out.println("RANKING");
    String rankedFileName = conf.get("ldbc.snb.datagen.serializer.hadoopDir") + "/ranked";
    HadoopFileRanker hadoopFileRanker = new HadoopFileRanker(conf, TupleKey.class, Person.class, null);
    hadoopFileRanker.run(inputFileName, rankedFileName);

    System.out.println("GENERATING");
    int numThreads = Integer.parseInt(conf.get("ldbc.snb.datagen.generator.numThreads"));
    Job job = Job.getInstance(conf, "Person Activity Generator/Serializer");
    job.setMapOutputKeyClass(BlockKey.class);
    job.setMapOutputValueClass(Person.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Person.class);
    job.setJarByClass(HadoopBlockMapper.class);
    job.setMapperClass(HadoopBlockMapper.class);
    job.setReducerClass(HadoopPersonActivityGeneratorReducer.class);
    job.setNumReduceTasks(numThreads);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setSortComparatorClass(BlockKeyComparator.class);
    job.setGroupingComparatorClass(BlockKeyGroupComparator.class);
    job.setPartitionerClass(HadoopBlockPartitioner.class);

    /** PROFILING OPTIONS **/
    //job.setProfileEnabled(true);
    //job.setProfileParams("-agentlib:hprof=cpu=samples,heap=sites,depth=4,thread=y,format=b,file=%s");
    //job.setProfileTaskRange(true,"0-1");
    //job.setProfileTaskRange(false,"0-1");
    /****/

    FileInputFormat.setInputPaths(job, new Path(rankedFileName));
    FileOutputFormat.setOutputPath(job, new Path(conf.get("ldbc.snb.datagen.serializer.hadoopDir") + "/aux"));
    long start = System.currentTimeMillis();
    try {
        if (!job.waitForCompletion(true)) {
            throw new Exception();
        }
    } catch (AssertionError e) {
        throw e;
    }
    System.out.println("Real time to generate activity: " + (System.currentTimeMillis() - start) / 1000.0f);

    try {
        fs.delete(new Path(rankedFileName), true);
        fs.delete(new Path(conf.get("ldbc.snb.datagen.serializer.hadoopDir") + "/aux"), true);
    } catch (IOException e) {
        System.err.println(e.getMessage());
        e.printStackTrace();
    }
}

From source file:ldbc.socialnet.dbgen.generator.MRGenerateUsers.java

License:Open Source License

public int runGenerateJob(Configuration conf) throws Exception {
    FileSystem fs = FileSystem.get(conf);
    String hadoopDir = new String(conf.get("outputDir") + "/hadoop");
    String socialNetDir = new String(conf.get("outputDir") + "/social_network");
    int numThreads = Integer.parseInt(conf.get("numThreads"));
    System.out.println("NUMBER OF THREADS " + numThreads);

    /// --------- Execute Jobs ------
    long start = System.currentTimeMillis();

    /// --------------- First job Generating users----------------
    printProgress("Starting: Person generation");
    conf.set("pass", Integer.toString(0));
    Job job = new Job(conf, "SIB Generate Users & 1st Dimension");
    job.setMapOutputKeyClass(TupleKey.class);
    job.setMapOutputValueClass(ReducedUserProfile.class);
    job.setOutputKeyClass(TupleKey.class);
    job.setOutputValueClass(ReducedUserProfile.class);
    job.setJarByClass(GenerateUsersMapper.class);
    job.setMapperClass(GenerateUsersMapper.class);
    job.setNumReduceTasks(numThreads);
    job.setInputFormatClass(NLineInputFormat.class);
    conf.setInt("mapred.line.input.format.linespermap", 1);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileInputFormat.setInputPaths(job, new Path(hadoopDir) + "/mrInputFile");
    FileOutputFormat.setOutputPath(job, new Path(hadoopDir + "/sib"));
    job.waitForCompletion(true);

    /// --------------- Sorting by first dimension  ----------------
    printProgress("Starting: Sorting by first dimension");
    HadoopFileRanker fileRanker = new HadoopFileRanker(conf, TupleKey.class, ReducedUserProfile.class);
    fileRanker.run(hadoopDir + "/sib", hadoopDir + "/sibSorting");
    fs.delete(new Path(hadoopDir + "/sib"), true);

    /// --------------- job Generating First dimension Friendships  ----------------
    printProgress("Starting: Friendship generation 1.");
    conf.set("pass", Integer.toString(0));
    conf.set("dimension", Integer.toString(1));
    job = new Job(conf, "SIB Generate Friendship - Interest");
    job.setMapOutputKeyClass(ComposedKey.class);
    job.setMapOutputValueClass(ReducedUserProfile.class);
    job.setOutputKeyClass(TupleKey.class);
    job.setOutputValueClass(ReducedUserProfile.class);
    job.setJarByClass(HadoopBlockMapper.class);
    job.setMapperClass(HadoopBlockMapper.class);
    job.setReducerClass(DimensionReducer.class);
    job.setNumReduceTasks(numThreads);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setPartitionerClass(HadoopBlockPartitioner.class);
    job.setSortComparatorClass(ComposedKeyComparator.class);
    job.setGroupingComparatorClass(ComposedKeyGroupComparator.class);

    FileInputFormat.setInputPaths(job, new Path(hadoopDir + "/sibSorting"));
    FileOutputFormat.setOutputPath(job, new Path(hadoopDir + "/sib2"));
    job.waitForCompletion(true);
    fs.delete(new Path(hadoopDir + "/sibSorting"), true);

    /// --------------- Sorting phase 2  ----------------
    printProgress("Starting: Sorting by second dimension");
    fileRanker = new HadoopFileRanker(conf, TupleKey.class, ReducedUserProfile.class);
    fileRanker.run(hadoopDir + "/sib2", hadoopDir + "/sibSorting2");
    fs.delete(new Path(hadoopDir + "/sib2"), true);

    /// --------------- Second job Generating Friendships  ----------------
    printProgress("Starting: Friendship generation 2.");
    conf.set("pass", Integer.toString(1));
    conf.set("dimension", Integer.toString(2));
    job = new Job(conf, "SIB Generate Friendship - Interest");
    job.setMapOutputKeyClass(ComposedKey.class);
    job.setMapOutputValueClass(ReducedUserProfile.class);
    job.setOutputKeyClass(TupleKey.class);
    job.setOutputValueClass(ReducedUserProfile.class);
    job.setJarByClass(HadoopBlockMapper.class);
    job.setMapperClass(HadoopBlockMapper.class);
    job.setReducerClass(DimensionReducer.class);
    job.setNumReduceTasks(numThreads);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setPartitionerClass(HadoopBlockPartitioner.class);
    job.setSortComparatorClass(ComposedKeyComparator.class);
    job.setGroupingComparatorClass(ComposedKeyGroupComparator.class);
    FileInputFormat.setInputPaths(job, new Path(hadoopDir + "/sibSorting2"));
    FileOutputFormat.setOutputPath(job, new Path(hadoopDir + "/sib3"));
    job.waitForCompletion(true);
    fs.delete(new Path(hadoopDir + "/sibSorting2"), true);

    /// --------------- Sorting phase 3--------------
    printProgress("Starting: Sorting by third dimension");
    fileRanker = new HadoopFileRanker(conf, TupleKey.class, ReducedUserProfile.class);
    fileRanker.run(hadoopDir + "/sib3", hadoopDir + "/sibSorting3");
    fs.delete(new Path(hadoopDir + "/sib3"), true);

    /// --------------- Third job Generating Friendships----------------
    printProgress("Starting: Friendship generation 3.");
    conf.set("pass", Integer.toString(2));
    conf.set("dimension", Integer.toString(2));
    job = new Job(conf, "SIB Generate Friendship - Random");
    job.setMapOutputKeyClass(ComposedKey.class);
    job.setMapOutputValueClass(ReducedUserProfile.class);
    job.setOutputKeyClass(TupleKey.class);
    job.setOutputValueClass(ReducedUserProfile.class);
    job.setJarByClass(HadoopBlockMapper.class);
    job.setMapperClass(HadoopBlockMapper.class);
    job.setReducerClass(DimensionReducer.class);
    job.setNumReduceTasks(numThreads);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setPartitionerClass(HadoopBlockPartitioner.class);
    job.setSortComparatorClass(ComposedKeyComparator.class);
    job.setGroupingComparatorClass(ComposedKeyGroupComparator.class);
    FileInputFormat.setInputPaths(job, new Path(hadoopDir + "/sibSorting3"));
    FileOutputFormat.setOutputPath(job, new Path(hadoopDir + "/sib4"));
    job.waitForCompletion(true);
    fs.delete(new Path(hadoopDir + "/sibSorting3"), true);

    /// --------------- Sorting phase 3--------------

    printProgress("Starting: Sorting by third dimension (for activity generation)");
    fileRanker = new HadoopFileRanker(conf, TupleKey.class, ReducedUserProfile.class);
    fileRanker.run(hadoopDir + "/sib4", hadoopDir + "/sibSorting4");
    fs.delete(new Path(hadoopDir + "/sib4"), true);

    /// --------------- Fourth job: Serialize static network ----------------

    printProgress("Starting: Generating person activity");
    job = new Job(conf, "Generate user activity");
    job.setMapOutputKeyClass(ComposedKey.class);
    job.setMapOutputValueClass(ReducedUserProfile.class);
    job.setOutputKeyClass(TupleKey.class);
    job.setOutputValueClass(ReducedUserProfile.class);
    job.setJarByClass(HadoopBlockMapper.class);
    job.setMapperClass(HadoopBlockMapper.class);
    job.setReducerClass(UserActivityReducer.class);
    job.setNumReduceTasks(numThreads);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setPartitionerClass(HadoopBlockPartitioner.class);
    job.setSortComparatorClass(ComposedKeyComparator.class);
    job.setGroupingComparatorClass(ComposedKeyGroupComparator.class);
    FileInputFormat.setInputPaths(job, new Path(hadoopDir + "/sibSorting4"));
    FileOutputFormat.setOutputPath(job, new Path(hadoopDir + "/sib5"));
    job.waitForCompletion(true);
    fs.delete(new Path(hadoopDir + "/sib5"), true);

    int numEvents = 0;
    long min = Long.MAX_VALUE;
    long max = Long.MIN_VALUE;

    if (conf.getBoolean("updateStreams", false)) {
        for (int i = 0; i < numThreads; ++i) {
            int numPartitions = conf.getInt("numUpdatePartitions", 1);
            for (int j = 0; j < numPartitions; ++j) {
                /// --------------- Fifth job: Sort update streams ----------------
                conf.setInt("mapred.line.input.format.linespermap", 1000000);
                conf.setInt("reducerId", i);
                conf.setInt("partitionId", j);
                conf.set("streamType", "forum");
                Job jobForum = new Job(conf, "Sorting update streams " + j + " of reducer " + i);
                jobForum.setMapOutputKeyClass(LongWritable.class);
                jobForum.setMapOutputValueClass(Text.class);
                jobForum.setOutputKeyClass(LongWritable.class);
                jobForum.setOutputValueClass(Text.class);
                jobForum.setJarByClass(UpdateEventMapper.class);
                jobForum.setMapperClass(UpdateEventMapper.class);
                jobForum.setReducerClass(UpdateEventReducer.class);
                jobForum.setNumReduceTasks(1);
                jobForum.setInputFormatClass(SequenceFileInputFormat.class);
                jobForum.setOutputFormatClass(SequenceFileOutputFormat.class);
                jobForum.setPartitionerClass(UpdateEventPartitioner.class);
                FileInputFormat.addInputPath(jobForum,
                        new Path(socialNetDir + "/temp_updateStream_" + i + "_" + j + "_forum"));
                FileOutputFormat.setOutputPath(jobForum, new Path(hadoopDir + "/sibEnd"));
                printProgress("Starting: Sorting update streams");
                jobForum.waitForCompletion(true);
                fs.delete(new Path(socialNetDir + "/temp_updateStream_" + i + "_" + j + "_forum"), false);
                fs.delete(new Path(hadoopDir + "/sibEnd"), true);

                conf.setInt("mapred.line.input.format.linespermap", 1000000);
                conf.setInt("reducerId", i);
                conf.setInt("partitionId", j);
                conf.set("streamType", "person");
                Job jobPerson = new Job(conf, "Sorting update streams " + j + " of reducer " + i);
                jobPerson.setMapOutputKeyClass(LongWritable.class);
                jobPerson.setMapOutputValueClass(Text.class);
                jobPerson.setOutputKeyClass(LongWritable.class);
                jobPerson.setOutputValueClass(Text.class);
                jobPerson.setJarByClass(UpdateEventMapper.class);
                jobPerson.setMapperClass(UpdateEventMapper.class);
                jobPerson.setReducerClass(UpdateEventReducer.class);
                jobPerson.setNumReduceTasks(1);
                jobPerson.setInputFormatClass(SequenceFileInputFormat.class);
                jobPerson.setOutputFormatClass(SequenceFileOutputFormat.class);
                jobPerson.setPartitionerClass(UpdateEventPartitioner.class);
                FileInputFormat.addInputPath(jobPerson,
                        new Path(socialNetDir + "/temp_updateStream_" + i + "_" + j + "_person"));
                FileOutputFormat.setOutputPath(jobPerson, new Path(hadoopDir + "/sibEnd"));
                printProgress("Starting: Sorting update streams");
                jobPerson.waitForCompletion(true);
                fs.delete(new Path(socialNetDir + "/temp_updateStream_" + i + "_" + j + "_person"), false);
                fs.delete(new Path(hadoopDir + "/sibEnd"), true);

                if (conf.getBoolean("updateStreams", false)) {
                    Properties properties = new Properties();
                    FSDataInputStream file = fs.open(new Path(conf.get("outputDir")
                            + "/social_network/updateStream_" + i + "_" + j + "_person.properties"));
                    properties.load(file);
                    if (properties.getProperty("min_write_event_start_time") != null) {
                        Long auxMin = Long.parseLong(properties.getProperty("min_write_event_start_time"));
                        min = auxMin < min ? auxMin : min;
                        Long auxMax = Long.parseLong(properties.getProperty("max_write_event_start_time"));
                        max = auxMax > max ? auxMax : max;
                        numEvents += Long.parseLong(properties.getProperty("num_events"));
                    }
                    file.close();
                    file = fs.open(new Path(conf.get("outputDir") + "/social_network/updateStream_" + i + "_"
                            + j + "_forum.properties"));
                    properties.load(file);
                    if (properties.getProperty("min_write_event_start_time") != null) {
                        Long auxMin = Long.parseLong(properties.getProperty("min_write_event_start_time"));
                        min = auxMin < min ? auxMin : min;
                        Long auxMax = Long.parseLong(properties.getProperty("max_write_event_start_time"));
                        max = auxMax > max ? auxMax : max;
                        numEvents += Long.parseLong(properties.getProperty("num_events"));
                    }
                    file.close();
                    fs.delete(new Path(conf.get("outputDir") + "/social_network/updateStream_" + i + "_" + j
                            + "_person.properties"), true);
                    fs.delete(new Path(conf.get("outputDir") + "/social_network/updateStream_" + i + "_" + j
                            + "_forum.properties"), true);
                }
            }
        }

        if (conf.getBoolean("updateStreams", false)) {
            OutputStream output = fs
                    .create(new Path(conf.get("outputDir") + "/social_network/updateStream.properties"));
            output.write(new String("ldbc.snb.interactive.gct_delta_duration:" + conf.get("deltaTime") + "\n")
                    .getBytes());
            output.write(
                    new String("ldbc.snb.interactive.min_write_event_start_time:" + min + "\n").getBytes());
            output.write(
                    new String("ldbc.snb.interactive.max_write_event_start_time:" + max + "\n").getBytes());
            output.write(new String("ldbc.snb.interactive.update_interleave:" + (max - min) / numEvents + "\n")
                    .getBytes());
            output.write(new String("ldbc.snb.interactive.num_events:" + numEvents).getBytes());
            output.close();
        }
    }

    /// --------------- Sixth job: Materialize the friends lists ----------------
    /*        Job job6 = new Job(conf,"Dump the friends lists");
            job6.setMapOutputKeyClass(ComposedKey.class);
            job6.setMapOutputValueClass(ReducedUserProfile.class);
            job6.setOutputKeyClass(ComposedKey.class);
            job6.setOutputValueClass(ReducedUserProfile.class);
            job6.setJarByClass(HadoopBlockMapper.class);
            job6.setMapperClass(HadoopBlockMapper.class);
            job6.setReducerClass(FriendListOutputReducer.class);
            job6.setNumReduceTasks(numThreads);
            job6.setInputFormatClass(SequenceFileInputFormat.class);
            job6.setOutputFormatClass(SequenceFileOutputFormat.class);
            job6.setPartitionerClass(HadoopBlockPartitioner.class);
            job6.setSortComparatorClass(ComposedKeyComparator.class);
            job6.setGroupingComparatorClass(ComposedKeyGroupComparator.class);
            FileInputFormat.setInputPaths(job6, new Path(hadoopDir + "/sibSorting4"));
            FileOutputFormat.setOutputPath(job6, new Path(hadoopDir + "/job6") );
            
            
            printProgress("Starting: Materialize friends for substitution parameters");
            int resMaterializeFriends = job6.waitForCompletion(true) ? 0 : 1;
            fs.delete(new Path(hadoopDir + "/sibSorting3"),true);
            */

    long end = System.currentTimeMillis();
    System.out.println(((end - start) / 1000) + " total seconds");
    for (int i = 0; i < numThreads; ++i) {
        fs.copyToLocalFile(new Path(socialNetDir + "/m" + i + "factors.txt"), new Path("./"));
        fs.copyToLocalFile(new Path(socialNetDir + "/m0friendList" + i + ".csv"), new Path("./"));
    }
    return 0;
}

From source file:libra.core.kmersimilarity_r.KmerSimilarityReduce.java

License:Apache License

private int runJob(CoreConfig cConfig) throws Exception {
    // check config
    validateCoreConfig(cConfig);

    // configuration
    Configuration conf = this.getConf();

    Job job = new Job(conf, "Libra Core - Computing similarity between samples");
    conf = job.getConfiguration();

    // set user configuration
    cConfig.saveTo(conf);

    job.setJarByClass(KmerSimilarityReduce.class);

    // Mapper
    job.setMapperClass(KmerSimilarityMapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapOutputKeyClass(CompressedSequenceWritable.class);
    job.setMapOutputValueClass(CompressedIntArrayWritable.class);

    // Partitioner
    job.setPartitionerClass(KmerSimilarityPartitioner.class);

    // Reducer
    job.setReducerClass(KmerSimilarityReducer.class);

    // Specify key / value
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // Inputs
    Path[] kmerIndexFiles = KmerIndexHelper.getAllKmerIndexIndexFilePath(conf, cConfig.getKmerIndexPath());
    List<Path> indexPartFileArray = new ArrayList<Path>();
    for (Path kmerIndexFile : kmerIndexFiles) {
        Path[] inputKmerIndexPartFiles = KmerIndexHelper.getKmerIndexPartFilePath(conf, kmerIndexFile);
        for (Path indexPartFile : inputKmerIndexPartFiles) {
            Path[] kmerIndexPartDataFiles = KmerIndexHelper.getAllKmerIndexPartDataFilePath(conf,
                    indexPartFile);
            for (Path kmerIndexPartDataFile : kmerIndexPartDataFiles) {
                indexPartFileArray.add(kmerIndexPartDataFile);
            }
        }
    }

    SequenceFileInputFormat.addInputPaths(job,
            FileSystemHelper.makeCommaSeparated(indexPartFileArray.toArray(new Path[0])));

    LOG.info("Input kmer index files : " + kmerIndexFiles.length);
    for (Path inputFile : kmerIndexFiles) {
        LOG.info("> " + inputFile.toString());
    }

    int kmerSize = 0;
    for (Path inputFile : kmerIndexFiles) {
        // check kmerSize
        int myKmerSize = KmerIndexHelper.getKmerSize(inputFile);
        if (kmerSize == 0) {
            kmerSize = myKmerSize;
        } else {
            if (kmerSize != myKmerSize) {
                throw new Exception("kmer size must be the same over all given kmer indices");
            }
        }
    }

    KmerMatchFileMapping fileMapping = new KmerMatchFileMapping();
    for (Path kmerIndexFile : kmerIndexFiles) {
        String fastaFilename = KmerIndexHelper.getFastaFileName(kmerIndexFile.getName());
        fileMapping.addFastaFile(fastaFilename);
    }
    fileMapping.saveTo(conf);

    FileOutputFormat.setOutputPath(job, new Path(cConfig.getOutputPath()));
    job.setOutputFormatClass(TextOutputFormat.class);

    // Reducer
    // Use many reducers
    int reducers = conf.getInt("mapred.reduce.tasks", 0);
    if (reducers <= 0) {
        int MRNodes = MapReduceClusterHelper.getNodeNum(conf);
        reducers = MRNodes * 2;
        job.setNumReduceTasks(reducers);
    }
    LOG.info("Reducers : " + reducers);

    // Execute job and return status
    boolean result = job.waitForCompletion(true);

    // commit results
    if (result) {
        commit(new Path(cConfig.getOutputPath()), conf);

        Path tableFilePath = new Path(cConfig.getOutputPath(),
                KmerSimilarityHelper.makeKmerSimilarityTableFileName());
        FileSystem fs = tableFilePath.getFileSystem(conf);
        fileMapping.saveTo(fs, tableFilePath);

        // combine results
        sumScores(new Path(cConfig.getOutputPath()), conf);
    }

    // report
    if (cConfig.getReportPath() != null && !cConfig.getReportPath().isEmpty()) {
        Report report = new Report();
        report.addJob(job);
        report.writeTo(cConfig.getReportPath());
    }

    return result ? 0 : 1;
}

From source file:libra.preprocess.stage2.KmerIndexBuilder.java

License:Apache License

private int runJob(PreprocessorConfig ppConfig) throws Exception {
    // check config
    validatePreprocessorConfig(ppConfig);

    // configuration
    Configuration conf = this.getConf();

    // set user configuration
    ppConfig.saveTo(conf);

    Path[] inputFiles = FileSystemHelper.getAllFastaFilePath(conf, ppConfig.getFastaPath());

    boolean job_result = true;
    List<Job> jobs = new ArrayList<Job>();

    for (int round = 0; round < inputFiles.length; round++) {
        Path roundInputFile = inputFiles[round];
        String roundOutputPath = ppConfig.getKmerIndexPath() + "_round" + round;

        Job job = new Job(conf,
                "Libra Preprocessor - Building Kmer Indexes (" + round + " of " + inputFiles.length + ")");
        job.setJarByClass(KmerIndexBuilder.class);

        // Mapper
        job.setMapperClass(KmerIndexBuilderMapper.class);
        FastaKmerInputFormat.setKmerSize(conf, ppConfig.getKmerSize());
        job.setInputFormatClass(FastaKmerInputFormat.class);
        job.setMapOutputKeyClass(CompressedSequenceWritable.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Combiner
        job.setCombinerClass(KmerIndexBuilderCombiner.class);

        // Partitioner
        job.setPartitionerClass(KmerIndexBuilderPartitioner.class);

        // Reducer
        job.setReducerClass(KmerIndexBuilderReducer.class);

        // Specify key / value
        job.setOutputKeyClass(CompressedSequenceWritable.class);
        job.setOutputValueClass(IntWritable.class);

        // Inputs
        FileInputFormat.addInputPaths(job, roundInputFile.toString());

        LOG.info("Input file : ");
        LOG.info("> " + roundInputFile.toString());

        String histogramFileName = KmerHistogramHelper.makeKmerHistogramFileName(roundInputFile.getName());
        Path histogramPath = new Path(ppConfig.getKmerHistogramPath(), histogramFileName);

        KmerIndexBuilderPartitioner.setHistogramPath(job.getConfiguration(), histogramPath);

        FileOutputFormat.setOutputPath(job, new Path(roundOutputPath));
        job.setOutputFormatClass(MapFileOutputFormat.class);

        // Use many reducers
        int reducers = conf.getInt("mapred.reduce.tasks", 0);
        if (reducers <= 0) {
            int MRNodes = MapReduceClusterHelper.getNodeNum(conf);
            reducers = MRNodes * 2;
            job.setNumReduceTasks(reducers);
        }
        LOG.info("Reducers : " + reducers);

        // Execute job and return status
        boolean result = job.waitForCompletion(true);

        jobs.add(job);

        // commit results
        if (result) {
            commitRoundIndexOutputFiles(roundInputFile, new Path(roundOutputPath),
                    new Path(ppConfig.getKmerIndexPath()), job.getConfiguration(), ppConfig.getKmerSize());

            // create index of index
            createIndexOfIndex(new Path(ppConfig.getKmerIndexPath()), roundInputFile, job.getConfiguration(),
                    ppConfig.getKmerSize());

            // create statistics of index
            createStatisticsOfIndex(new Path(ppConfig.getKmerStatisticsPath()), roundInputFile,
                    job.getConfiguration(), job.getCounters(), ppConfig.getKmerSize());
        }

        if (!result) {
            LOG.error("job failed at round " + round + " of " + inputFiles.length);
            job_result = false;
            break;
        }
    }

    // report
    if (ppConfig.getReportPath() != null && !ppConfig.getReportPath().isEmpty()) {
        Report report = new Report();
        report.addJob(jobs);
        report.writeTo(ppConfig.getReportPath());
    }

    return job_result ? 0 : 1;
}