List of usage examples for org.apache.hadoop.mapreduce Job setGroupingComparatorClass
public void setGroupingComparatorClass(Class<? extends RawComparator> cls) throws IllegalStateException
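Before the project-specific examples, a minimal sketch of the secondary-sort pattern that setGroupingComparatorClass supports. The CompositeKey, NaturalKeyPartitioner, and CompositeKeyComparator names below are hypothetical placeholders, not taken from any of the listed projects; the grouping comparator compares only the natural-key part of the composite key, so all values sharing that key reach a single reduce() call even though the full key also carries a secondary-sort field.

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;

// Hypothetical grouping comparator: group reduce input by the natural key only,
// ignoring the secondary-sort field of CompositeKey.
public class NaturalKeyGroupingComparator extends WritableComparator {

    public NaturalKeyGroupingComparator() {
        super(CompositeKey.class, true); // create key instances for deserialization
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        CompositeKey left = (CompositeKey) a;
        CompositeKey right = (CompositeKey) b;
        return left.getNaturalKey().compareTo(right.getNaturalKey());
    }
}

// Wiring, in the same style as the examples below:
Job job = Job.getInstance(conf, "secondary sort sketch");
job.setMapOutputKeyClass(CompositeKey.class);
job.setPartitionerClass(NaturalKeyPartitioner.class);       // route records by natural key
job.setSortComparatorClass(CompositeKeyComparator.class);    // sort by natural key, then secondary field
job.setGroupingComparatorClass(NaturalKeyGroupingComparator.class);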
From source file:it.crs4.seal.prq.PairReadsQSeq.java
License:Open Source License
@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();

    // defaults
    conf.set(PrqOptionParser.INPUT_FORMAT_CONF, PrqOptionParser.InputFormatDefault);

    // parse command line
    PrqOptionParser parser = new PrqOptionParser();
    parser.parse(conf, args);

    Job job = new Job(conf, "PairReadsQSeq " + parser.getInputPaths().get(0));
    job.setJarByClass(PairReadsQSeq.class);
    job.setInputFormatClass(FormatNameMap.getInputFormat(parser.getInputFormatName()));
    job.setOutputFormatClass(FormatNameMap.getOutputFormat(parser.getOutputFormatName("prq")));

    job.setMapperClass(PrqMapper.class);
    job.setMapOutputKeyClass(SequenceId.class);
    job.setMapOutputValueClass(Text.class);

    job.setPartitionerClass(FirstPartitioner.class);
    job.setGroupingComparatorClass(GroupByLocationComparator.class);

    job.setReducerClass(PrqReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(ReadPair.class);

    for (Path p : parser.getInputPaths())
        FileInputFormat.addInputPath(job, p);

    FileOutputFormat.setOutputPath(job, parser.getOutputPath());

    return (job.waitForCompletion(true) ? 0 : 1);
}
From source file:it.polito.dbdmg.searum.ARM.java
License:Apache License
/**
 * Run the rule aggregator job over mined rules.
 *
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public static void startRuleAggregating(Parameters params, Configuration conf)
        throws IOException, ClassNotFoundException, InterruptedException {
    conf.set("mapred.compress.map.output", "true");
    conf.set("mapred.output.compression.type", "BLOCK");

    Path input = new Path(params.get(OUTPUT), RULES);
    Job job = new Job(conf, "Rule aggregator driver running over input: " + input);
    job.setJarByClass(ARM.class);

    FileInputFormat.addInputPath(job, input);
    Path outPath = new Path(params.get(OUTPUT), RULESBYCONCLUSION);
    FileOutputFormat.setOutputPath(job, outPath);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setMapperClass(RuleAggregatorMapper.class);
    job.setReducerClass(RuleAggregatorReducer.class);
    job.setPartitionerClass(RulePartitionerByConclusion.class);
    job.setSortComparatorClass(RulesWritableComparator.class);
    job.setGroupingComparatorClass(RulesGroupingWritableComparator.class);

    HadoopUtil.delete(conf, outPath);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}
From source file:ldbc.snb.datagen.hadoop.HadoopPersonActivityGenerator.java
public void run(String inputFileName) throws AssertionError, Exception {
    FileSystem fs = FileSystem.get(conf);

    System.out.println("RANKING");
    String rankedFileName = conf.get("ldbc.snb.datagen.serializer.hadoopDir") + "/ranked";
    HadoopFileRanker hadoopFileRanker = new HadoopFileRanker(conf, TupleKey.class, Person.class, null);
    hadoopFileRanker.run(inputFileName, rankedFileName);

    System.out.println("GENERATING");
    int numThreads = Integer.parseInt(conf.get("ldbc.snb.datagen.generator.numThreads"));
    Job job = Job.getInstance(conf, "Person Activity Generator/Serializer");
    job.setMapOutputKeyClass(BlockKey.class);
    job.setMapOutputValueClass(Person.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Person.class);
    job.setJarByClass(HadoopBlockMapper.class);
    job.setMapperClass(HadoopBlockMapper.class);
    job.setReducerClass(HadoopPersonActivityGeneratorReducer.class);
    job.setNumReduceTasks(numThreads);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setSortComparatorClass(BlockKeyComparator.class);
    job.setGroupingComparatorClass(BlockKeyGroupComparator.class);
    job.setPartitionerClass(HadoopBlockPartitioner.class);

    /** PROFILING OPTIONS **/
    //job.setProfileEnabled(true);
    //job.setProfileParams("-agentlib:hprof=cpu=samples,heap=sites,depth=4,thread=y,format=b,file=%s");
    //job.setProfileTaskRange(true,"0-1");
    //job.setProfileTaskRange(false,"0-1");
    /****/

    FileInputFormat.setInputPaths(job, new Path(rankedFileName));
    FileOutputFormat.setOutputPath(job, new Path(conf.get("ldbc.snb.datagen.serializer.hadoopDir") + "/aux"));

    long start = System.currentTimeMillis();
    try {
        if (!job.waitForCompletion(true)) {
            throw new Exception();
        }
    } catch (AssertionError e) {
        throw e;
    }
    System.out.println("Real time to generate activity: " + (System.currentTimeMillis() - start) / 1000.0f);

    try {
        fs.delete(new Path(rankedFileName), true);
        fs.delete(new Path(conf.get("ldbc.snb.datagen.serializer.hadoopDir") + "/aux"), true);
    } catch (IOException e) {
        System.err.println(e.getMessage());
        e.printStackTrace();
    }
}
From source file:ldbc.socialnet.dbgen.generator.MRGenerateUsers.java
License:Open Source License
public int runGenerateJob(Configuration conf) throws Exception {
    FileSystem fs = FileSystem.get(conf);
    String hadoopDir = new String(conf.get("outputDir") + "/hadoop");
    String socialNetDir = new String(conf.get("outputDir") + "/social_network");
    int numThreads = Integer.parseInt(conf.get("numThreads"));
    System.out.println("NUMBER OF THREADS " + numThreads);

    /// --------- Execute Jobs ------
    long start = System.currentTimeMillis();

    /// --------------- First job Generating users----------------
    printProgress("Starting: Person generation");
    conf.set("pass", Integer.toString(0));
    Job job = new Job(conf, "SIB Generate Users & 1st Dimension");
    job.setMapOutputKeyClass(TupleKey.class);
    job.setMapOutputValueClass(ReducedUserProfile.class);
    job.setOutputKeyClass(TupleKey.class);
    job.setOutputValueClass(ReducedUserProfile.class);
    job.setJarByClass(GenerateUsersMapper.class);
    job.setMapperClass(GenerateUsersMapper.class);
    job.setNumReduceTasks(numThreads);
    job.setInputFormatClass(NLineInputFormat.class);
    conf.setInt("mapred.line.input.format.linespermap", 1);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileInputFormat.setInputPaths(job, new Path(hadoopDir) + "/mrInputFile");
    FileOutputFormat.setOutputPath(job, new Path(hadoopDir + "/sib"));
    job.waitForCompletion(true);

    /// --------------- Sorting by first dimension ----------------
    printProgress("Starting: Sorting by first dimension");
    HadoopFileRanker fileRanker = new HadoopFileRanker(conf, TupleKey.class, ReducedUserProfile.class);
    fileRanker.run(hadoopDir + "/sib", hadoopDir + "/sibSorting");
    fs.delete(new Path(hadoopDir + "/sib"), true);

    /// --------------- job Generating First dimension Friendships ----------------
    printProgress("Starting: Friendship generation 1.");
    conf.set("pass", Integer.toString(0));
    conf.set("dimension", Integer.toString(1));
    job = new Job(conf, "SIB Generate Friendship - Interest");
    job.setMapOutputKeyClass(ComposedKey.class);
    job.setMapOutputValueClass(ReducedUserProfile.class);
    job.setOutputKeyClass(TupleKey.class);
    job.setOutputValueClass(ReducedUserProfile.class);
    job.setJarByClass(HadoopBlockMapper.class);
    job.setMapperClass(HadoopBlockMapper.class);
    job.setReducerClass(DimensionReducer.class);
    job.setNumReduceTasks(numThreads);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setPartitionerClass(HadoopBlockPartitioner.class);
    job.setSortComparatorClass(ComposedKeyComparator.class);
    job.setGroupingComparatorClass(ComposedKeyGroupComparator.class);
    FileInputFormat.setInputPaths(job, new Path(hadoopDir + "/sibSorting"));
    FileOutputFormat.setOutputPath(job, new Path(hadoopDir + "/sib2"));
    job.waitForCompletion(true);
    fs.delete(new Path(hadoopDir + "/sibSorting"), true);

    /// --------------- Sorting phase 2 ----------------
    printProgress("Starting: Sorting by second dimension");
    fileRanker = new HadoopFileRanker(conf, TupleKey.class, ReducedUserProfile.class);
    fileRanker.run(hadoopDir + "/sib2", hadoopDir + "/sibSorting2");
    fs.delete(new Path(hadoopDir + "/sib2"), true);

    /// --------------- Second job Generating Friendships ----------------
    printProgress("Starting: Friendship generation 2.");
    conf.set("pass", Integer.toString(1));
    conf.set("dimension", Integer.toString(2));
    job = new Job(conf, "SIB Generate Friendship - Interest");
    job.setMapOutputKeyClass(ComposedKey.class);
    job.setMapOutputValueClass(ReducedUserProfile.class);
    job.setOutputKeyClass(TupleKey.class);
    job.setOutputValueClass(ReducedUserProfile.class);
    job.setJarByClass(HadoopBlockMapper.class);
    job.setMapperClass(HadoopBlockMapper.class);
    job.setReducerClass(DimensionReducer.class);
    job.setNumReduceTasks(numThreads);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setPartitionerClass(HadoopBlockPartitioner.class);
    job.setSortComparatorClass(ComposedKeyComparator.class);
    job.setGroupingComparatorClass(ComposedKeyGroupComparator.class);
    FileInputFormat.setInputPaths(job, new Path(hadoopDir + "/sibSorting2"));
    FileOutputFormat.setOutputPath(job, new Path(hadoopDir + "/sib3"));
    job.waitForCompletion(true);
    fs.delete(new Path(hadoopDir + "/sibSorting2"), true);

    /// --------------- Sorting phase 3--------------
    printProgress("Starting: Sorting by third dimension");
    fileRanker = new HadoopFileRanker(conf, TupleKey.class, ReducedUserProfile.class);
    fileRanker.run(hadoopDir + "/sib3", hadoopDir + "/sibSorting3");
    fs.delete(new Path(hadoopDir + "/sib3"), true);

    /// --------------- Third job Generating Friendships----------------
    printProgress("Starting: Friendship generation 3.");
    conf.set("pass", Integer.toString(2));
    conf.set("dimension", Integer.toString(2));
    job = new Job(conf, "SIB Generate Friendship - Random");
    job.setMapOutputKeyClass(ComposedKey.class);
    job.setMapOutputValueClass(ReducedUserProfile.class);
    job.setOutputKeyClass(TupleKey.class);
    job.setOutputValueClass(ReducedUserProfile.class);
    job.setJarByClass(HadoopBlockMapper.class);
    job.setMapperClass(HadoopBlockMapper.class);
    job.setReducerClass(DimensionReducer.class);
    job.setNumReduceTasks(numThreads);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setPartitionerClass(HadoopBlockPartitioner.class);
    job.setSortComparatorClass(ComposedKeyComparator.class);
    job.setGroupingComparatorClass(ComposedKeyGroupComparator.class);
    FileInputFormat.setInputPaths(job, new Path(hadoopDir + "/sibSorting3"));
    FileOutputFormat.setOutputPath(job, new Path(hadoopDir + "/sib4"));
    job.waitForCompletion(true);
    fs.delete(new Path(hadoopDir + "/sibSorting3"), true);

    /// --------------- Sorting phase 3--------------
    printProgress("Starting: Sorting by third dimension (for activity generation)");
    fileRanker = new HadoopFileRanker(conf, TupleKey.class, ReducedUserProfile.class);
    fileRanker.run(hadoopDir + "/sib4", hadoopDir + "/sibSorting4");
    fs.delete(new Path(hadoopDir + "/sib4"), true);

    /// --------------- Fourth job: Serialize static network ----------------
    printProgress("Starting: Generating person activity");
    job = new Job(conf, "Generate user activity");
    job.setMapOutputKeyClass(ComposedKey.class);
    job.setMapOutputValueClass(ReducedUserProfile.class);
    job.setOutputKeyClass(TupleKey.class);
    job.setOutputValueClass(ReducedUserProfile.class);
    job.setJarByClass(HadoopBlockMapper.class);
    job.setMapperClass(HadoopBlockMapper.class);
    job.setReducerClass(UserActivityReducer.class);
    job.setNumReduceTasks(numThreads);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setPartitionerClass(HadoopBlockPartitioner.class);
    job.setSortComparatorClass(ComposedKeyComparator.class);
    job.setGroupingComparatorClass(ComposedKeyGroupComparator.class);
    FileInputFormat.setInputPaths(job, new Path(hadoopDir + "/sibSorting4"));
    FileOutputFormat.setOutputPath(job, new Path(hadoopDir + "/sib5"));
    job.waitForCompletion(true);
    fs.delete(new Path(hadoopDir + "/sib5"), true);
    int numEvents = 0;
    long min = Long.MAX_VALUE;
    long max = Long.MIN_VALUE;
    if (conf.getBoolean("updateStreams", false)) {
        for (int i = 0; i < numThreads; ++i) {
            int numPartitions = conf.getInt("numUpdatePartitions", 1);
            for (int j = 0; j < numPartitions; ++j) {
                /// --------------- Fifth job: Sort update streams ----------------
                conf.setInt("mapred.line.input.format.linespermap", 1000000);
                conf.setInt("reducerId", i);
                conf.setInt("partitionId", j);
                conf.set("streamType", "forum");
                Job jobForum = new Job(conf, "Sorting update streams " + j + " of reducer " + i);
                jobForum.setMapOutputKeyClass(LongWritable.class);
                jobForum.setMapOutputValueClass(Text.class);
                jobForum.setOutputKeyClass(LongWritable.class);
                jobForum.setOutputValueClass(Text.class);
                jobForum.setJarByClass(UpdateEventMapper.class);
                jobForum.setMapperClass(UpdateEventMapper.class);
                jobForum.setReducerClass(UpdateEventReducer.class);
                jobForum.setNumReduceTasks(1);
                jobForum.setInputFormatClass(SequenceFileInputFormat.class);
                jobForum.setOutputFormatClass(SequenceFileOutputFormat.class);
                jobForum.setPartitionerClass(UpdateEventPartitioner.class);
                FileInputFormat.addInputPath(jobForum,
                        new Path(socialNetDir + "/temp_updateStream_" + i + "_" + j + "_forum"));
                FileOutputFormat.setOutputPath(jobForum, new Path(hadoopDir + "/sibEnd"));
                printProgress("Starting: Sorting update streams");
                jobForum.waitForCompletion(true);
                fs.delete(new Path(socialNetDir + "/temp_updateStream_" + i + "_" + j + "_forum"), false);
                fs.delete(new Path(hadoopDir + "/sibEnd"), true);

                conf.setInt("mapred.line.input.format.linespermap", 1000000);
                conf.setInt("reducerId", i);
                conf.setInt("partitionId", j);
                conf.set("streamType", "person");
                Job jobPerson = new Job(conf, "Sorting update streams " + j + " of reducer " + i);
                jobPerson.setMapOutputKeyClass(LongWritable.class);
                jobPerson.setMapOutputValueClass(Text.class);
                jobPerson.setOutputKeyClass(LongWritable.class);
                jobPerson.setOutputValueClass(Text.class);
                jobPerson.setJarByClass(UpdateEventMapper.class);
                jobPerson.setMapperClass(UpdateEventMapper.class);
                jobPerson.setReducerClass(UpdateEventReducer.class);
                jobPerson.setNumReduceTasks(1);
                jobPerson.setInputFormatClass(SequenceFileInputFormat.class);
                jobPerson.setOutputFormatClass(SequenceFileOutputFormat.class);
                jobPerson.setPartitionerClass(UpdateEventPartitioner.class);
                FileInputFormat.addInputPath(jobPerson,
                        new Path(socialNetDir + "/temp_updateStream_" + i + "_" + j + "_person"));
                FileOutputFormat.setOutputPath(jobPerson, new Path(hadoopDir + "/sibEnd"));
                printProgress("Starting: Sorting update streams");
                jobPerson.waitForCompletion(true);
                fs.delete(new Path(socialNetDir + "/temp_updateStream_" + i + "_" + j + "_person"), false);
                fs.delete(new Path(hadoopDir + "/sibEnd"), true);

                if (conf.getBoolean("updateStreams", false)) {
                    Properties properties = new Properties();
                    FSDataInputStream file = fs.open(new Path(conf.get("outputDir")
                            + "/social_network/updateStream_" + i + "_" + j + "_person.properties"));
                    properties.load(file);
                    if (properties.getProperty("min_write_event_start_time") != null) {
                        Long auxMin = Long.parseLong(properties.getProperty("min_write_event_start_time"));
                        min = auxMin < min ? auxMin : min;
                        Long auxMax = Long.parseLong(properties.getProperty("max_write_event_start_time"));
                        max = auxMax > max ? auxMax : max;
                        numEvents += Long.parseLong(properties.getProperty("num_events"));
                    }
                    file.close();
                    file = fs.open(new Path(conf.get("outputDir")
                            + "/social_network/updateStream_" + i + "_" + j + "_forum.properties"));
                    properties.load(file);
                    if (properties.getProperty("min_write_event_start_time") != null) {
                        Long auxMin = Long.parseLong(properties.getProperty("min_write_event_start_time"));
                        min = auxMin < min ? auxMin : min;
                        Long auxMax = Long.parseLong(properties.getProperty("max_write_event_start_time"));
                        max = auxMax > max ? auxMax : max;
                        numEvents += Long.parseLong(properties.getProperty("num_events"));
                    }
                    file.close();
                    fs.delete(new Path(conf.get("outputDir")
                            + "/social_network/updateStream_" + i + "_" + j + "_person.properties"), true);
                    fs.delete(new Path(conf.get("outputDir")
                            + "/social_network/updateStream_" + i + "_" + j + "_forum.properties"), true);
                }
            }
        }

        if (conf.getBoolean("updateStreams", false)) {
            OutputStream output = fs
                    .create(new Path(conf.get("outputDir") + "/social_network/updateStream.properties"));
            output.write(new String("ldbc.snb.interactive.gct_delta_duration:" + conf.get("deltaTime") + "\n")
                    .getBytes());
            output.write(
                    new String("ldbc.snb.interactive.min_write_event_start_time:" + min + "\n").getBytes());
            output.write(
                    new String("ldbc.snb.interactive.max_write_event_start_time:" + max + "\n").getBytes());
            output.write(new String("ldbc.snb.interactive.update_interleave:" + (max - min) / numEvents + "\n")
                    .getBytes());
            output.write(new String("ldbc.snb.interactive.num_events:" + numEvents).getBytes());
            output.close();
        }
    }

    /// --------------- Sixth job: Materialize the friends lists ----------------
    /*
    Job job6 = new Job(conf,"Dump the friends lists");
    job6.setMapOutputKeyClass(ComposedKey.class);
    job6.setMapOutputValueClass(ReducedUserProfile.class);
    job6.setOutputKeyClass(ComposedKey.class);
    job6.setOutputValueClass(ReducedUserProfile.class);
    job6.setJarByClass(HadoopBlockMapper.class);
    job6.setMapperClass(HadoopBlockMapper.class);
    job6.setReducerClass(FriendListOutputReducer.class);
    job6.setNumReduceTasks(numThreads);
    job6.setInputFormatClass(SequenceFileInputFormat.class);
    job6.setOutputFormatClass(SequenceFileOutputFormat.class);
    job6.setPartitionerClass(HadoopBlockPartitioner.class);
    job6.setSortComparatorClass(ComposedKeyComparator.class);
    job6.setGroupingComparatorClass(ComposedKeyGroupComparator.class);
    FileInputFormat.setInputPaths(job6, new Path(hadoopDir + "/sibSorting4"));
    FileOutputFormat.setOutputPath(job6, new Path(hadoopDir + "/job6") );
    printProgress("Starting: Materialize friends for substitution parameters");
    int resMaterializeFriends = job6.waitForCompletion(true) ? 0 : 1;
    fs.delete(new Path(hadoopDir + "/sibSorting3"),true);
    */

    long end = System.currentTimeMillis();
    System.out.println(((end - start) / 1000) + " total seconds");
    for (int i = 0; i < numThreads; ++i) {
        fs.copyToLocalFile(new Path(socialNetDir + "/m" + i + "factors.txt"), new Path("./"));
        fs.copyToLocalFile(new Path(socialNetDir + "/m0friendList" + i + ".csv"), new Path("./"));
    }
    return 0;
}
From source file:license.LicenseDriver.java
public static void main(String[] args) throws Exception {
    if (args.length != 3) {
        System.out.println("usage: [students dataset path] [grades dataset path] [output]");
        System.exit(-1);
    }
    Configuration configuration = new Configuration();
    configuration.setClass(ILicenseNameParsingStrategy.class.getName(),
            LicenseNameWritableParsingStrategy.class, IParsingStrategy.class);
    configuration.setClass(ILicenseTypeParsingStrategy.class.getName(),
            LicenseTypeWritableParsingStrategy.class, IParsingStrategy.class);

    Job job = Job.getInstance(configuration);
    job.setOutputKeyClass(LicenseKey.class);
    job.setOutputValueClass(JoinNameAndLicense.class);

    MultipleInputs.addInputPath(job, new Path(args[0]), NamesWritableInputFormat.class,
            NamesDetailsMapper.class);
    MultipleInputs.addInputPath(job, new Path(args[1]), LicensesWritableInputFormat.class,
            LicensesDetailsMapper.class);

    job.setReducerClass(LicenseReducer.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setPartitionerClass(LicenseKeyPartitioner.class);
    job.setGroupingComparatorClass(LicenseGroupingComparator.class);

    FileOutputFormat.setOutputPath(job, new Path(args[2]));
    job.setJarByClass(LicenseDriver.class);
    job.submit();
}
From source file:mvm.rya.joinselect.mr.JoinSelectAggregate.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    String inPath1 = conf.get(PROSPECTS_OUTPUTPATH);
    String inPath2 = conf.get(SPO_OUTPUTPATH);
    String auths = conf.get(AUTHS);
    String outPath = conf.get(OUTPUTPATH);

    assert inPath1 != null && inPath2 != null && outPath != null;

    Job job = new Job(conf, this.getClass().getSimpleName() + "_" + System.currentTimeMillis());
    job.setJarByClass(this.getClass());
    conf.setBoolean(MRJobConfig.MAPREDUCE_JOB_USER_CLASSPATH_FIRST, true);

    JoinSelectStatsUtil.initJoinMRJob(job, inPath1, inPath2, JoinSelectAggregateMapper.class, outPath, auths);

    job.setSortComparatorClass(JoinSelectSortComparator.class);
    job.setGroupingComparatorClass(JoinSelectGroupComparator.class);
    job.setPartitionerClass(JoinSelectPartitioner.class);
    job.setReducerClass(JoinReducer.class);
    job.setNumReduceTasks(32);
    job.waitForCompletion(true);

    return job.isSuccessful() ? 0 : 1;
}
From source file:name.abhijitsarkar.hadoop.join.ReduceSideJoinDriver.java
License:Open Source License
@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    Job job = new Job(conf, "reduce-side-join");
    job.setJarByClass(getClass());

    job.setPartitionerClass(KeyPartitioner.class);
    job.setGroupingComparatorClass(KeyGroupingComparator.class);

    job.setReducerClass(ReduceSideJoinReducer.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    MultipleInputs.addInputPath(job, new Path(args[0], "customers.txt"), TextInputFormat.class,
            CustomerMapper.class);
    MultipleInputs.addInputPath(job, new Path(args[0], "orders.txt"), TextInputFormat.class,
            OrderMapper.class);
    job.setMapOutputKeyClass(TaggedKey.class);
    job.setMapOutputValueClass(Text.class);

    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:nl.gridline.zieook.inx.movielens.RowSimilarityZieOok.java
License:Apache License
@Override
public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    addInputOption();
    addOutputOption();
    addOption("numberOfColumns", "r", "Number of columns in the input matrix");
    addOption("similarityClassname", "s",
            "Name of distributed similarity class to instantiate, alternatively use "
                    + "one of the predefined similarities (" + SimilarityType.listEnumNames() + ')');
    addOption("maxSimilaritiesPerRow", "m",
            "Number of maximum similarities per row (default: " + DEFAULT_MAX_SIMILARITIES_PER_ROW + ')',
            String.valueOf(DEFAULT_MAX_SIMILARITIES_PER_ROW));

    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    int numberOfColumns = Integer.parseInt(parsedArgs.get("--numberOfColumns"));
    String similarityClassnameArg = parsedArgs.get("--similarityClassname");
    String distributedSimilarityClassname;
    try {
        distributedSimilarityClassname = SimilarityType.valueOf(similarityClassnameArg)
                .getSimilarityImplementationClassName();
    } catch (IllegalArgumentException iae) {
        distributedSimilarityClassname = similarityClassnameArg;
    }

    int maxSimilaritiesPerRow = Integer.parseInt(parsedArgs.get("--maxSimilaritiesPerRow"));

    Path inputPath = getInputPath();
    Path outputPath = getOutputPath();
    Path tempDirPath = new Path(parsedArgs.get("--tempDir"));

    Path weightsPath = new Path(tempDirPath, "weights");
    Path pairwiseSimilarityPath = new Path(tempDirPath, "pairwiseSimilarity");

    AtomicInteger currentPhase = new AtomicInteger();

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job weights = prepareJob(inputPath, weightsPath, SequenceFileInputFormat.class, RowWeightMapper.class,
                VarIntWritable.class, WeightedOccurrence.class, WeightedOccurrencesPerColumnReducer.class,
                VarIntWritable.class, WeightedOccurrenceArray.class, SequenceFileOutputFormat.class);
        weights.getConfiguration().set(DISTRIBUTED_SIMILARITY_CLASSNAME, distributedSimilarityClassname);
        weights.waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job pairwiseSimilarity = prepareJob(weightsPath, pairwiseSimilarityPath, SequenceFileInputFormat.class,
                CooccurrencesMapper.class, WeightedRowPair.class, Cooccurrence.class, SimilarityReducer.class,
                SimilarityMatrixEntryKey.class, DistributedRowMatrix.MatrixEntryWritable.class,
                SequenceFileOutputFormat.class);
        Configuration pairwiseConf = pairwiseSimilarity.getConfiguration();
        pairwiseConf.set(DISTRIBUTED_SIMILARITY_CLASSNAME, distributedSimilarityClassname);
        pairwiseConf.setInt(NUMBER_OF_COLUMNS, numberOfColumns);
        pairwiseSimilarity.waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job asMatrix = prepareJob(pairwiseSimilarityPath, outputPath, SequenceFileInputFormat.class,
                Mapper.class, SimilarityMatrixEntryKey.class, DistributedRowMatrix.MatrixEntryWritable.class,
                EntriesToVectorsReducer.class, IntWritable.class, VectorWritable.class,
                SequenceFileOutputFormat.class);
        asMatrix.setPartitionerClass(HashPartitioner.class);
        asMatrix.setGroupingComparatorClass(
                SimilarityMatrixEntryKey.SimilarityMatrixEntryKeyGroupingComparator.class);
        asMatrix.getConfiguration().setInt(MAX_SIMILARITIES_PER_ROW, maxSimilaritiesPerRow);
        asMatrix.waitForCompletion(true);
    }

    return 0;
}
From source file:nl.gridline.zieook.runners.cf.ItemSimilarityJobZieook.java
License:Apache License
@Override
public int run(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    addInputOption();
    // addOutputOption(); // no output path, we use a table!
    addOption("outputtable", "ot", "Output table name");
    addOption("similarityClassname", "s",
            "Name of distributed similarity class to instantiate, alternatively use "
                    + "one of the predefined similarities (" + SimilarityType.listEnumNames() + ')');
    addOption("maxSimilaritiesPerItem", "m",
            "try to cap the number of similar items per item to this number " + "(default: "
                    + DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM + ')',
            String.valueOf(DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM));
    addOption("maxCooccurrencesPerItem", "mo",
            "try to cap the number of cooccurrences per item to this number " + "(default: "
                    + DEFAULT_MAX_COOCCURRENCES_PER_ITEM + ')',
            String.valueOf(DEFAULT_MAX_COOCCURRENCES_PER_ITEM));
    addOption("minPrefsPerUser", "mp",
            "ignore users with less preferences than this " + "(default: " + DEFAULT_MIN_PREFS_PER_USER + ')',
            String.valueOf(DEFAULT_MIN_PREFS_PER_USER));
    addOption("booleanData", "b", "Treat input as without pref values", Boolean.FALSE.toString());

    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    String similarityClassName = parsedArgs.get("--similarityClassname");
    int maxSimilarItemsPerItem = Integer.parseInt(parsedArgs.get("--maxSimilaritiesPerItem"));
    int maxCooccurrencesPerItem = Integer.parseInt(parsedArgs.get("--maxCooccurrencesPerItem"));
    int minPrefsPerUser = Integer.parseInt(parsedArgs.get("--minPrefsPerUser"));
    boolean booleanData = Boolean.valueOf(parsedArgs.get("--booleanData"));

    Path inputPath = getInputPath();
    // Path outputPath = getOutputPath();
    String outputTable = parsedArgs.get("--outputtable");
    Path tempDirPath = new Path(parsedArgs.get("--tempDir"));

    Path itemIDIndexPath = new Path(tempDirPath, "itemIDIndex");
    Path countUsersPath = new Path(tempDirPath, "countUsers");
    Path userVectorPath = new Path(tempDirPath, "userVectors");
    Path itemUserMatrixPath = new Path(tempDirPath, "itemUserMatrix");
    Path similarityMatrixPath = new Path(tempDirPath, "similarityMatrix");

    AtomicInteger currentPhase = new AtomicInteger();

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job itemIDIndex = prepareJob(inputPath, itemIDIndexPath, TextInputFormat.class, ItemIDIndexMapper.class,
                VarIntWritable.class, VarLongWritable.class, ItemIDIndexReducer.class, VarIntWritable.class,
                VarLongWritable.class, SequenceFileOutputFormat.class);
        itemIDIndex.setCombinerClass(ItemIDIndexReducer.class);
        task.setCurrentJob(itemIDIndex).waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job toUserVector = prepareJob(inputPath, userVectorPath, TextInputFormat.class, ToItemPrefsMapper.class,
                VarLongWritable.class, booleanData ? VarLongWritable.class : EntityPrefWritable.class,
                ToUserVectorReducer.class, VarLongWritable.class, VectorWritable.class,
                SequenceFileOutputFormat.class);
        toUserVector.getConfiguration().setBoolean(RecommenderJob.BOOLEAN_DATA, booleanData);
        toUserVector.getConfiguration().setInt(ToUserVectorReducer.MIN_PREFERENCES_PER_USER, minPrefsPerUser);
        task.setCurrentJob(toUserVector).waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job countUsers = prepareJob(userVectorPath, countUsersPath, SequenceFileInputFormat.class,
                CountUsersMapper.class, CountUsersKeyWritable.class, VarLongWritable.class,
                CountUsersReducer.class, VarIntWritable.class, NullWritable.class, TextOutputFormat.class);
        countUsers.setPartitionerClass(CountUsersKeyWritable.CountUsersPartitioner.class);
        countUsers.setGroupingComparatorClass(CountUsersKeyWritable.CountUsersGroupComparator.class);
        task.setCurrentJob(countUsers).waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job maybePruneAndTransponse = prepareJob(userVectorPath, itemUserMatrixPath,
                SequenceFileInputFormat.class, MaybePruneRowsMapper.class, IntWritable.class,
                DistributedRowMatrix.MatrixEntryWritable.class, ToItemVectorsReducer.class, IntWritable.class,
                VectorWritable.class, SequenceFileOutputFormat.class);
        maybePruneAndTransponse.getConfiguration().setInt(MaybePruneRowsMapper.MAX_COOCCURRENCES,
                maxCooccurrencesPerItem);
        task.setCurrentJob(maybePruneAndTransponse).waitForCompletion(true);
    }

    int numberOfUsers = TasteHadoopUtils.readIntFromFile(getConf(), countUsersPath);

    /*
     * Once DistributedRowMatrix uses the hadoop 0.20 API, we should refactor this call to something like
     * new DistributedRowMatrix(...).rowSimilarity(...)
     */
    try {
        ToolRunner.run(getConf(), new RowSimilarityZieOok(),
                new String[] { "-Dmapred.input.dir=" + itemUserMatrixPath,
                        "-Dmapred.output.dir=" + similarityMatrixPath,
                        "--numberOfColumns", String.valueOf(numberOfUsers),
                        "--similarityClassname", similarityClassName,
                        "--maxSimilaritiesPerRow", String.valueOf(maxSimilarItemsPerItem + 1),
                        "--tempDir", tempDirPath.toString() });
    } catch (Exception e) {
        throw new IllegalStateException("item-item-similarity computation failed", e);
    }

    // This step writes the data to a file, we don't want that, it should be written in HBase directly:
    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job mostSimilarItems = prepareMostSimilarItems(similarityMatrixPath, outputTable);

        // Configuration mostSimilarItemsConf = mostSimilarItems.getConfiguration();
        // mostSimilarItemsConf.set(ITEM_ID_INDEX_PATH_STR, itemIDIndexPath.toString());
        // mostSimilarItemsConf.setInt(MAX_SIMILARITIES_PER_ITEM, maxSimilarItemsPerItem);
        // mostSimilarItems.waitForCompletion(true);

        task.setCurrentJob(mostSimilarItems).waitForCompletion(Log.isDebugEnabled());

        // Job mostSimilarItems = prepareJob(similarityMatrixPath, outputPath, SequenceFileInputFormat.class,
        // MostSimilarItemPairsMapper.class, EntityEntityWritable.class, DoubleWritable.class,
        // MostSimilarItemPairsReducer.class, EntityEntityWritable.class, DoubleWritable.class,
        // TextOutputFormat.class);
        // Configuration mostSimilarItemsConf = mostSimilarItems.getConfiguration();
        // mostSimilarItemsConf.set(ITEM_ID_INDEX_PATH_STR, itemIDIndexPath.toString());
        // mostSimilarItemsConf.setInt(MAX_SIMILARITIES_PER_ITEM, maxSimilarItemsPerItem);
        // mostSimilarItems.setCombinerClass(MostSimilarItemPairsReducer.class);
        // mostSimilarItems.waitForCompletion(true);
    }
    return 0;
}
From source file:nl.gridline.zieook.runners.cf.RecommenderJobZieOok.java
License:Apache License
@Override
public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    addInputOption();
    addOutputOption();
    addOption("numRecommendations", "n", "Number of recommendations per user",
            String.valueOf(AggregateAndRecommendReducer.DEFAULT_NUM_RECOMMENDATIONS));
    addOption("usersFile", "u", "File of users to recommend for", null);
    addOption("itemsFile", "i", "File of items to recommend for", null);
    addOption("filterFile", "f",
            "File containing comma-separated userID,itemID pairs. Used to exclude the item from "
                    + "the recommendations for that user (optional)",
            null);
    addOption("booleanData", "b", "Treat input as without pref values", Boolean.FALSE.toString());
    addOption("maxPrefsPerUser", "mp",
            "Maximum number of preferences considered per user in final recommendation phase",
            String.valueOf(UserVectorSplitterMapper.DEFAULT_MAX_PREFS_PER_USER_CONSIDERED));
    addOption("minPrefsPerUser", "mp",
            "ignore users with less preferences than this in the similarity computation " + "(default: "
                    + DEFAULT_MIN_PREFS_PER_USER + ')',
            String.valueOf(DEFAULT_MIN_PREFS_PER_USER));
    addOption("maxSimilaritiesPerItem", "m", "Maximum number of similarities considered per item ",
            String.valueOf(DEFAULT_MAX_SIMILARITIES_PER_ITEM));
    addOption("maxCooccurrencesPerItem", "mo",
            "try to cap the number of cooccurrences per item to this " + "number (default: "
                    + DEFAULT_MAX_COOCCURRENCES_PER_ITEM + ')',
            String.valueOf(DEFAULT_MAX_COOCCURRENCES_PER_ITEM));
    addOption("similarityClassname", "s",
            "Name of distributed similarity class to instantiate, alternatively use "
                    + "one of the predefined similarities (" + SimilarityType.listEnumNames() + ')',
            String.valueOf(SimilarityType.SIMILARITY_COOCCURRENCE));

    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    Path inputPath = getInputPath();
    Path outputPath = getOutputPath();
    Path tempDirPath = new Path(parsedArgs.get("--tempDir"));
    int numRecommendations = Integer.parseInt(parsedArgs.get("--numRecommendations"));
    String usersFile = parsedArgs.get("--usersFile");
    String itemsFile = parsedArgs.get("--itemsFile");
    String filterFile = parsedArgs.get("--filterFile");
    boolean booleanData = Boolean.valueOf(parsedArgs.get("--booleanData"));
    int maxPrefsPerUser = Integer.parseInt(parsedArgs.get("--maxPrefsPerUser"));
    int minPrefsPerUser = Integer.parseInt(parsedArgs.get("--minPrefsPerUser"));
    int maxSimilaritiesPerItem = Integer.parseInt(parsedArgs.get("--maxSimilaritiesPerItem"));
    int maxCooccurrencesPerItem = Integer.parseInt(parsedArgs.get("--maxCooccurrencesPerItem"));
    String similarityClassname = parsedArgs.get("--similarityClassname");

    Path userVectorPath = new Path(tempDirPath, "userVectors");
    Path itemIDIndexPath = new Path(tempDirPath, "itemIDIndex");
    Path countUsersPath = new Path(tempDirPath, "countUsers");
    Path itemUserMatrixPath = new Path(tempDirPath, "itemUserMatrix");
    Path similarityMatrixPath = new Path(tempDirPath, "similarityMatrix");
    Path prePartialMultiplyPath1 = new Path(tempDirPath, "prePartialMultiply1");
    Path prePartialMultiplyPath2 = new Path(tempDirPath, "prePartialMultiply2");
    Path explicitFilterPath = new Path(tempDirPath, "explicitFilterPath");
    Path partialMultiplyPath = new Path(tempDirPath, "partialMultiply");

    AtomicInteger currentPhase = new AtomicInteger();

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job itemIDIndex = prepareJob(inputPath, itemIDIndexPath, TextInputFormat.class, ItemIDIndexMapper.class,
                VarIntWritable.class, VarLongWritable.class, ItemIDIndexReducer.class, VarIntWritable.class,
                VarLongWritable.class, SequenceFileOutputFormat.class);
        itemIDIndex.setCombinerClass(ItemIDIndexReducer.class);
        task.setCurrentJob(itemIDIndex).waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job toUserVector = prepareJob(inputPath, userVectorPath, TextInputFormat.class, ToItemPrefsMapper.class,
                VarLongWritable.class, booleanData ? VarLongWritable.class : EntityPrefWritable.class,
                ToUserVectorReducer.class, VarLongWritable.class, VectorWritable.class,
                SequenceFileOutputFormat.class);
        toUserVector.getConfiguration().setBoolean(BOOLEAN_DATA, booleanData);
        toUserVector.getConfiguration().setInt(ToUserVectorReducer.MIN_PREFERENCES_PER_USER, minPrefsPerUser);
        task.setCurrentJob(toUserVector).waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job countUsers = prepareJob(userVectorPath, countUsersPath, SequenceFileInputFormat.class,
                CountUsersMapper.class, CountUsersKeyWritable.class, VarLongWritable.class,
                CountUsersReducer.class, VarIntWritable.class, NullWritable.class, TextOutputFormat.class);
        countUsers.setPartitionerClass(CountUsersKeyWritable.CountUsersPartitioner.class);
        countUsers.setGroupingComparatorClass(CountUsersKeyWritable.CountUsersGroupComparator.class);
        task.setCurrentJob(countUsers).waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job maybePruneAndTransponse = prepareJob(userVectorPath, itemUserMatrixPath,
                SequenceFileInputFormat.class, MaybePruneRowsMapper.class, IntWritable.class,
                DistributedRowMatrix.MatrixEntryWritable.class, ToItemVectorsReducer.class, IntWritable.class,
                VectorWritable.class, SequenceFileOutputFormat.class);
        maybePruneAndTransponse.getConfiguration().setInt(MaybePruneRowsMapper.MAX_COOCCURRENCES,
                maxCooccurrencesPerItem);
        task.setCurrentJob(maybePruneAndTransponse).waitForCompletion(true);
    }

    int numberOfUsers = TasteHadoopUtils.readIntFromFile(getConf(), countUsersPath);

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        /*
         * Once DistributedRowMatrix uses the hadoop 0.20 API, we should refactor this call to something like
         * new DistributedRowMatrix(...).rowSimilarity(...)
         */
        try {
            ToolRunner.run(getConf(), new RowSimilarityZieOok(), new String[] { //
                    "--input", itemUserMatrixPath.toString(), //
                    "--output", similarityMatrixPath.toString(), //
                    "--numberOfColumns", String.valueOf(numberOfUsers), //
                    "--similarityClassname", similarityClassname, //
                    "--maxSimilaritiesPerRow", String.valueOf(maxSimilaritiesPerItem + 1), //
                    "--tempDir", tempDirPath.toString() });
        } catch (Exception e) {
            throw new IllegalStateException("item-item-similarity computation failed", e);
        }
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job prePartialMultiply1 = prepareJob(similarityMatrixPath, prePartialMultiplyPath1,
                SequenceFileInputFormat.class, SimilarityMatrixRowWrapperMapper.class, VarIntWritable.class,
                VectorOrPrefWritable.class, Reducer.class, VarIntWritable.class, VectorOrPrefWritable.class,
                SequenceFileOutputFormat.class);
        task.setCurrentJob(prePartialMultiply1).waitForCompletion(true);

        Job prePartialMultiply2 = prepareJob(userVectorPath, prePartialMultiplyPath2,
                SequenceFileInputFormat.class, UserVectorSplitterMapper.class, VarIntWritable.class,
                VectorOrPrefWritable.class, Reducer.class, VarIntWritable.class, VectorOrPrefWritable.class,
                SequenceFileOutputFormat.class);
        if (usersFile != null) {
            prePartialMultiply2.getConfiguration().set(UserVectorSplitterMapper.USERS_FILE, usersFile);
        }
        prePartialMultiply2.getConfiguration().setInt(UserVectorSplitterMapper.MAX_PREFS_PER_USER_CONSIDERED,
                maxPrefsPerUser);
        task.setCurrentJob(prePartialMultiply2).waitForCompletion(true);

        Job partialMultiply = prepareJob(new Path(prePartialMultiplyPath1 + "," + prePartialMultiplyPath2),
                partialMultiplyPath, SequenceFileInputFormat.class, Mapper.class, VarIntWritable.class,
                VectorOrPrefWritable.class, ToVectorAndPrefReducer.class, VarIntWritable.class,
                VectorAndPrefsWritable.class, SequenceFileOutputFormat.class);

        /* necessary to make this job (having a combined input path) work on Amazon S3 */
        Configuration partialMultiplyConf = partialMultiply.getConfiguration();
        FileSystem fs = FileSystem.get(tempDirPath.toUri(), partialMultiplyConf);
        prePartialMultiplyPath1 = prePartialMultiplyPath1.makeQualified(fs);
        prePartialMultiplyPath2 = prePartialMultiplyPath2.makeQualified(fs);
        FileInputFormat.setInputPaths(partialMultiply, prePartialMultiplyPath1, prePartialMultiplyPath2);
        task.setCurrentJob(partialMultiply).waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        /* convert the user/item pairs to filter if a filterfile has been specified */
        if (filterFile != null) {
            Job itemFiltering = prepareJob(new Path(filterFile), explicitFilterPath, TextInputFormat.class,
                    ItemFilterMapper.class, VarLongWritable.class, VarLongWritable.class,
                    ItemFilterAsVectorAndPrefsReducer.class, VarIntWritable.class, VectorAndPrefsWritable.class,
                    SequenceFileOutputFormat.class);
            task.setCurrentJob(itemFiltering).waitForCompletion(true);
        }

        String aggregateAndRecommendInput = partialMultiplyPath.toString();
        if (filterFile != null) {
            aggregateAndRecommendInput += "," + explicitFilterPath;
        }

        Job aggregateAndRecommend = prepareJob(new Path(aggregateAndRecommendInput), outputPath,
                SequenceFileInputFormat.class, PartialMultiplyMapper.class, VarLongWritable.class,
                PrefAndSimilarityColumnWritable.class, AggregateAndRecommendReducer.class,
                VarLongWritable.class, RecommendedItemsWritable.class, SequenceFileOutputFormat.class);
        Configuration aggregateAndRecommendConf = aggregateAndRecommend.getConfiguration();
        if (itemsFile != null) {
            aggregateAndRecommendConf.set(AggregateAndRecommendReducer.ITEMS_FILE, itemsFile);
        }

        if (filterFile != null) {
            /* necessary to make this job (having a combined input path) work on Amazon S3 */
            FileSystem fs = FileSystem.get(tempDirPath.toUri(), aggregateAndRecommendConf);
            partialMultiplyPath = partialMultiplyPath.makeQualified(fs);
            explicitFilterPath = explicitFilterPath.makeQualified(fs);
            FileInputFormat.setInputPaths(aggregateAndRecommend, partialMultiplyPath, explicitFilterPath);
        }
        setIOSort(aggregateAndRecommend);
        aggregateAndRecommendConf.set(AggregateAndRecommendReducer.ITEMID_INDEX_PATH,
                itemIDIndexPath.toString());
        aggregateAndRecommendConf.setInt(AggregateAndRecommendReducer.NUM_RECOMMENDATIONS, numRecommendations);
        aggregateAndRecommendConf.setBoolean(BOOLEAN_DATA, booleanData);
        task.setCurrentJob(aggregateAndRecommend).waitForCompletion(true);
    }
    return 0;
}