Example usage for org.apache.hadoop.mapreduce Job setCombinerClass

Introduction

On this page you can find example usage for org.apache.hadoop.mapreduce Job setCombinerClass.

Prototype

public void setCombinerClass(Class<? extends Reducer> cls) throws IllegalStateException 

Document

Set the combiner class for the job.
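
As a quick orientation before the collected examples, the sketch below shows where setCombinerClass normally sits in a job driver. It is a minimal, self-contained illustration and not taken from any of the projects listed here: TokenizerMapper stands in for a user-written Mapper<Object, Text, Text, IntWritable>, while IntSumReducer is the stock org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer;

public class CombinerExample {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "word count with combiner");
        job.setJarByClass(CombinerExample.class);
        job.setMapperClass(TokenizerMapper.class);   // emits (word, 1) pairs
        job.setCombinerClass(IntSumReducer.class);   // pre-aggregates counts on the map side
        job.setReducerClass(IntSumReducer.class);    // final aggregation across all map outputs
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Because the combiner's output is fed to the reducer, its output key/value types must match the map output types. As the prototype above notes, setCombinerClass throws IllegalStateException if the job is no longer in a configurable state (for example, after it has been submitted).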

Usage

From source file: nl.basjes.hadoop.examples.WordCount.java

License: Apache License

public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());
    job.setJobName("Wordcount");

    job.setJarByClass(getClass());
    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    if (args.length > 2 && "-s".equals(args[2])) {
        if (args.length < 4) {
            System.out.println("Invalid parameters. Usage: -s <split.maxsize>");
            return -1;
        }
        job.getConfiguration().set("io.compression.codecs", "nl.basjes.hadoop.io.compress.SplittableGzipCodec");
        job.getConfiguration().setLong("mapreduce.input.fileinputformat.split.minsize",
                Long.parseLong(args[3]) - 10000);
        job.getConfiguration().setLong("mapreduce.input.fileinputformat.split.maxsize",
                Long.parseLong(args[3]));
        job.setJobName("Wordcount-" + args[3]);
    }

    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(WordSplittingMapper.class);
    job.setCombinerClass(LongSumReducer.class);
    job.setReducerClass(LongSumReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class); // LongSumReducer expects LongWritable values

    job.setOutputFormatClass(TextOutputFormat.class);

    return (job.waitForCompletion(true) ? 1 : 0);

}

From source file: nl.basjes.hadoop.io.input.Wordcount.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: wordcount <in> <out>");
        return 2;
    }

    conf.set("nl.basjes.parse.apachehttpdlogline.format", logFormat);

    // A ',' separated list of fields
    conf.set("nl.basjes.parse.apachehttpdlogline.fields", "STRING:request.status.last");

    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(Wordcount.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));

    job.setInputFormatClass(ApacheHttpdLogfileInputFormat.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(LongSumReducer.class);
    job.setReducerClass(LongSumReducer.class);

    // the configuration should contain a reference to your namenode
    FileSystem fs = FileSystem.get(conf);
    // the 'true' flag deletes the given folder recursively
    Path outputPath = new Path(otherArgs[1]);
    fs.delete(outputPath, true);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    if (job.waitForCompletion(true)) {
        return 0;
    }
    return 1;
}
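
Both of the examples above reuse LongSumReducer as the combiner and the reducer. That works because the class's input and output types are identical (the key plus LongWritable) and because summation is associative and commutative, so partially reducing on the map side cannot change the final result. A hand-written reducer has to meet the same contract before it can double as a combiner; here is a minimal sketch, assuming Text keys and LongWritable counts:

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Usable as both combiner and reducer: its input and output types match the
// map output types, and the sum operation is associative and commutative.
public class SumCombinableReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
    private final LongWritable total = new LongWritable();

    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        long sum = 0;
        for (LongWritable value : values) {
            sum += value.get();
        }
        total.set(sum);
        context.write(key, total);
    }
}

Wiring it up mirrors the examples: job.setCombinerClass(SumCombinableReducer.class) together with job.setReducerClass(SumCombinableReducer.class).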

From source file: nl.cwi.hadoop.kba.stat.CountGenres.java

License: Apache License

@Override
public int run(String[] args) throws Exception {

    String in = null;
    String out = null;

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-i".equals(args[i])) {
                in = args[++i];
            } else if ("-o".equals(args[i])) {
                out = args[++i];
            } else {
                other_args.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }

    if (other_args.size() > 0) {
        return printUsage();
    }

    if (in == null || out == null)
        return printUsage();

    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - input path: " + in);
    LOG.info(" - output path: " + out);

    Configuration conf = getConf();
    Job job = new Job(conf, "Count genres");
    job.setJarByClass(CountGenres.class);

    // some weird issues with Thrift classes in the Hadoop distro.
    job.setUserClassesTakesPrecedence(true);

    job.setInputFormatClass(ThriftFileInputFormat.class);
    job.setMapperClass(MyMapper.class);
    FileInputFormat.addInputPath(job, new Path(in));

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setNumReduceTasks(1);

    FileSystem.get(conf).delete(new Path(out), true);
    TextOutputFormat.setOutputPath(job, new Path(out));
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file: nl.cwi.kba2013.apps.AnnotationExtractor.java

License: Apache License

@Override
public int run(String[] args) throws Exception {

    String in = null;
    String out = null;

    String annoFile = null;

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-i".equals(args[i])) {
                in = args[++i];
            } else if ("-o".equals(args[i])) {
                out = args[++i];
            } else if ("-q".equals(args[i])) {

            } else if ("-a".equals(args[i])) {
                annoFile = args[++i];

            } else if ("-h".equals(args[i]) || "--help".equals(args[i])) {
                return printUsage();
            } else {
                other_args.add(args[i]);
            }
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }

    if (other_args.size() > 0 || in == null || out == null)
        return printUsage();

    LOG.info("Tool: " + this.getClass().getName());
    LOG.info(" - input path: " + in);
    LOG.info(" - output path: " + out);

    Configuration conf = getConf();

    conf.set(ANNOFILEPATH_HDFS, new Path(annoFile).toUri().toString());

    // set time
    conf.setLong("mapred.task.timeout", 40 * 600000);
    conf.set("mapred.map.child.java.opts", "-Xmx4g -XX:-UseGCOverheadLimit");

    FileSystem fs = FileSystem.get(conf);
    // Lookup required data from the topic file

    Job job = new Job(conf, "Annotation Extractor");
    job.setJarByClass(AnnotationExtractor.class);

    // some weird issues with Thrift classes in the Hadoop distro.
    //job.setUserClassesTakesPrecedence(true);

    // make the query file available to each mapper.

    DistributedCache.addCacheFile(new URI(new Path(annoFile) + "#" + ANNOFILEPATH_HDFS),
            job.getConfiguration());
    DistributedCache.createSymlink(job.getConfiguration());

    job.setInputFormatClass(TextInputFormat.class);
    //job.setMapperClass(MyMapper.class);
    FileInputFormat.addInputPath(job, new Path(in));

    //job.setMapOutputKeyClass(Text.class);
    //job.setMapOutputValueClass(Text.class);

    job.setCombinerClass(MyReducer.class);
    //job.setReducerClass(MyReducer.class);
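    // Note: no reducer class is set, so the default identity Reducer runs in the
    // reduce phase; MyReducer is only applied as a combiner here.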
    job.setNumReduceTasks(1);

    FileSystem.get(conf).delete(new Path(out), true);
    TextOutputFormat.setOutputPath(job, new Path(out));
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // Let's go
    int status = job.waitForCompletion(true) ? 0 : 1;

    return status;

}

From source file: nl.cwi.kba2013.apps.KBANameVariantMatchTHERank.java

License: Apache License

@Override
public int run(String[] args) throws Exception {

    String in = null;
    String out = null;

    String labelsFile = null;

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-i".equals(args[i])) {
                in = args[++i];
            } else if ("-o".equals(args[i])) {
                out = args[++i];

            } else if ("-l".equals(args[i])) {
                labelsFile = args[++i];
            } else if ("-h".equals(args[i]) || "--help".equals(args[i])) {
                return printUsage();
            } else {
                other_args.add(args[i]);
            }
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }

    if (other_args.size() > 0 || in == null || out == null || labelsFile == null)
        return printUsage();

    LOG.info("Tool: " + this.getClass().getName());
    LOG.info(" - input path: " + in);
    LOG.info(" - output path: " + out);

    Configuration conf = getConf();

    conf.set(LABELSFILEPATH_HDFS, new Path(labelsFile).toUri().toString());

    // set time
    conf.setLong("mapred.task.timeout", 40 * 600000);
    conf.set("mapred.map.child.java.opts", "-Xmx4g -XX:-UseGCOverheadLimit");
    conf.set("mapred.child.java.opts", "-Xmx4096m");

    Job job = new Job(conf, "Feature Extractor");
    job.setJarByClass(KBANameVariantMatchTHERank.class);

    // some weird issues with Thrift classes in the Hadoop distro.
    job.setUserClassesTakesPrecedence(true);

    // make the query file available to each mapper.

    DistributedCache.addCacheFile(new URI(new Path(labelsFile) + "#" + LABELSFILEPATH_HDFS),
            job.getConfiguration());

    DistributedCache.createSymlink(job.getConfiguration());

    job.setInputFormatClass(ThriftFileInputFormat.class);
    //job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyReducer.class);
    job.setReducerClass(MyReducer.class);
    FileInputFormat.addInputPath(job, new Path(in));
    job.setNumReduceTasks(50);

    FileSystem.get(conf).delete(new Path(out), true);
    TextOutputFormat.setOutputPath(job, new Path(out));
    //job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    //job.setOutputValueClass(StreamItemWritable.class);
    job.setOutputValueClass(Text.class);
    //LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    int status = job.waitForCompletion(true) ? 0 : 1;

    return status;

}

From source file: nl.cwi.kba2013.apps.KBANER.java

License: Apache License

@Override
public int run(String[] args) throws Exception {

    String in = null;
    String out = null;

    String labelsFile = null;

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-i".equals(args[i])) {
                in = args[++i];
            } else if ("-o".equals(args[i])) {
                out = args[++i];

            } else if ("-l".equals(args[i])) {
                labelsFile = args[++i];
            } else if ("-h".equals(args[i]) || "--help".equals(args[i])) {
                return printUsage();
            } else {
                other_args.add(args[i]);
            }
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }

    if (other_args.size() > 0 || in == null || out == null || labelsFile == null)
        return printUsage();

    LOG.info("Tool: " + this.getClass().getName());
    LOG.info(" - input path: " + in);
    LOG.info(" - output path: " + out);

    Configuration conf = getConf();

    conf.set(LABELSFILEPATH_HDFS, new Path(labelsFile).toUri().toString());

    // set time
    conf.setLong("mapred.task.timeout", 40 * 600000);
    conf.set("mapred.map.child.java.opts", "-Xmx4g -XX:-UseGCOverheadLimit");
    conf.set("mapred.child.java.opts", "-Xmx4096m");

    Job job = new Job(conf, "Feature Extractor");
    job.setJarByClass(KBANER.class);

    // some weird issues with Thrift classes in the Hadoop distro.
    job.setUserClassesTakesPrecedence(true);

    // make the query file available to each mapper.

    DistributedCache.addCacheFile(new URI(new Path(labelsFile) + "#" + LABELSFILEPATH_HDFS),
            job.getConfiguration());

    DistributedCache.createSymlink(job.getConfiguration());

    job.setInputFormatClass(ThriftFileInputFormat.class);
    //job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyReducer.class);
    job.setReducerClass(MyReducer.class);
    FileInputFormat.addInputPath(job, new Path(in));
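    // Note: with zero reduce tasks the job is map-only, so the combiner and
    // reducer configured above are never invoked.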
    job.setNumReduceTasks(0);

    FileSystem.get(conf).delete(new Path(out), true);
    TextOutputFormat.setOutputPath(job, new Path(out));
    //job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    //job.setOutputValueClass(StreamItemWritable.class);
    job.setOutputValueClass(Text.class);
    //LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    int status = job.waitForCompletion(true) ? 0 : 1;

    return status;

}

From source file: nl.gridline.zieook.runners.cf.ItemSimilarityJobZieook.java

License: Apache License

@Override
public int run(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

    addInputOption();

    // addOutputOption(); // no output path, we use a table!
    addOption("outputtable", "ot", "Output table name");

    addOption("similarityClassname", "s",
            "Name of distributed similarity class to instantiate, alternatively use "
                    + "one of the predefined similarities (" + SimilarityType.listEnumNames() + ')');
    addOption("maxSimilaritiesPerItem", "m",
            "try to cap the number of similar items per item to this number " + "(default: "
                    + DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM + ')',
            String.valueOf(DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM));
    addOption("maxCooccurrencesPerItem", "mo",
            "try to cap the number of cooccurrences per item to this number " + "(default: "
                    + DEFAULT_MAX_COOCCURRENCES_PER_ITEM + ')',
            String.valueOf(DEFAULT_MAX_COOCCURRENCES_PER_ITEM));
    addOption("minPrefsPerUser", "mp",
            "ignore users with less preferences than this " + "(default: " + DEFAULT_MIN_PREFS_PER_USER + ')',
            String.valueOf(DEFAULT_MIN_PREFS_PER_USER));
    addOption("booleanData", "b", "Treat input as without pref values", Boolean.FALSE.toString());

    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    String similarityClassName = parsedArgs.get("--similarityClassname");
    int maxSimilarItemsPerItem = Integer.parseInt(parsedArgs.get("--maxSimilaritiesPerItem"));
    int maxCooccurrencesPerItem = Integer.parseInt(parsedArgs.get("--maxCooccurrencesPerItem"));
    int minPrefsPerUser = Integer.parseInt(parsedArgs.get("--minPrefsPerUser"));
    boolean booleanData = Boolean.valueOf(parsedArgs.get("--booleanData"));

    Path inputPath = getInputPath();
    // Path outputPath = getOutputPath();
    String outputTable = parsedArgs.get("--outputtable");
    Path tempDirPath = new Path(parsedArgs.get("--tempDir"));

    Path itemIDIndexPath = new Path(tempDirPath, "itemIDIndex");
    Path countUsersPath = new Path(tempDirPath, "countUsers");
    Path userVectorPath = new Path(tempDirPath, "userVectors");
    Path itemUserMatrixPath = new Path(tempDirPath, "itemUserMatrix");
    Path similarityMatrixPath = new Path(tempDirPath, "similarityMatrix");

    AtomicInteger currentPhase = new AtomicInteger();

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job itemIDIndex = prepareJob(inputPath, itemIDIndexPath, TextInputFormat.class, ItemIDIndexMapper.class,
                VarIntWritable.class, VarLongWritable.class, ItemIDIndexReducer.class, VarIntWritable.class,
                VarLongWritable.class, SequenceFileOutputFormat.class);
        itemIDIndex.setCombinerClass(ItemIDIndexReducer.class);
        task.setCurrentJob(itemIDIndex).waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job toUserVector = prepareJob(inputPath, userVectorPath, TextInputFormat.class, ToItemPrefsMapper.class,
                VarLongWritable.class, booleanData ? VarLongWritable.class : EntityPrefWritable.class,
                ToUserVectorReducer.class, VarLongWritable.class, VectorWritable.class,
                SequenceFileOutputFormat.class);
        toUserVector.getConfiguration().setBoolean(RecommenderJob.BOOLEAN_DATA, booleanData);
        toUserVector.getConfiguration().setInt(ToUserVectorReducer.MIN_PREFERENCES_PER_USER, minPrefsPerUser);
        task.setCurrentJob(toUserVector).waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job countUsers = prepareJob(userVectorPath, countUsersPath, SequenceFileInputFormat.class,
                CountUsersMapper.class, CountUsersKeyWritable.class, VarLongWritable.class,
                CountUsersReducer.class, VarIntWritable.class, NullWritable.class, TextOutputFormat.class);
        countUsers.setPartitionerClass(CountUsersKeyWritable.CountUsersPartitioner.class);
        countUsers.setGroupingComparatorClass(CountUsersKeyWritable.CountUsersGroupComparator.class);
        task.setCurrentJob(countUsers).waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job maybePruneAndTransponse = prepareJob(userVectorPath, itemUserMatrixPath,
                SequenceFileInputFormat.class, MaybePruneRowsMapper.class, IntWritable.class,
                DistributedRowMatrix.MatrixEntryWritable.class, ToItemVectorsReducer.class, IntWritable.class,
                VectorWritable.class, SequenceFileOutputFormat.class);
        maybePruneAndTransponse.getConfiguration().setInt(MaybePruneRowsMapper.MAX_COOCCURRENCES,
                maxCooccurrencesPerItem);
        task.setCurrentJob(maybePruneAndTransponse).waitForCompletion(true);
    }

    int numberOfUsers = TasteHadoopUtils.readIntFromFile(getConf(), countUsersPath);

    /*
     * Once DistributedRowMatrix uses the hadoop 0.20 API, we should refactor this call to something like
     * new DistributedRowMatrix(...).rowSimilarity(...)
     */
    try {
        ToolRunner.run(getConf(), new RowSimilarityZieOok(),
                new String[] { "-Dmapred.input.dir=" + itemUserMatrixPath,
                        "-Dmapred.output.dir=" + similarityMatrixPath, "--numberOfColumns",
                        String.valueOf(numberOfUsers), "--similarityClassname", similarityClassName,
                        "--maxSimilaritiesPerRow", String.valueOf(maxSimilarItemsPerItem + 1), "--tempDir",
                        tempDirPath.toString() });
    } catch (Exception e) {
        throw new IllegalStateException("item-item-similarity computation failed", e);
    }

    // This step writes the data to a file, we don't want that, it should be written in HBase directly:
    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job mostSimilarItems = prepareMostSimilarItems(similarityMatrixPath, outputTable);

        // Configuration mostSimilarItemsConf = mostSimilarItems.getConfiguration();

        // mostSimilarItemsConf.set(ITEM_ID_INDEX_PATH_STR, itemIDIndexPath.toString());
        // mostSimilarItemsConf.setInt(MAX_SIMILARITIES_PER_ITEM, maxSimilarItemsPerItem);

        // mostSimilarItems.waitForCompletion(true);

        task.setCurrentJob(mostSimilarItems).waitForCompletion(Log.isDebugEnabled());

        // Job mostSimilarItems = prepareJob(similarityMatrixPath, outputPath, SequenceFileInputFormat.class,
        // MostSimilarItemPairsMapper.class, EntityEntityWritable.class, DoubleWritable.class,
        // MostSimilarItemPairsReducer.class, EntityEntityWritable.class, DoubleWritable.class,
        // TextOutputFormat.class);
        // Configuration mostSimilarItemsConf = mostSimilarItems.getConfiguration();
        // mostSimilarItemsConf.set(ITEM_ID_INDEX_PATH_STR, itemIDIndexPath.toString());
        // mostSimilarItemsConf.setInt(MAX_SIMILARITIES_PER_ITEM, maxSimilarItemsPerItem);
        // mostSimilarItems.setCombinerClass(MostSimilarItemPairsReducer.class);
        // mostSimilarItems.waitForCompletion(true);
    }

    return 0;
}

From source file: nl.gridline.zieook.runners.cf.RecommenderJobZieOok.java

License: Apache License

@Override
public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    addInputOption();
    addOutputOption();
    addOption("numRecommendations", "n", "Number of recommendations per user",
            String.valueOf(AggregateAndRecommendReducer.DEFAULT_NUM_RECOMMENDATIONS));
    addOption("usersFile", "u", "File of users to recommend for", null);
    addOption("itemsFile", "i", "File of items to recommend for", null);
    addOption("filterFile", "f",
            "File containing comma-separated userID,itemID pairs. Used to exclude the item from "
                    + "the recommendations for that user (optional)",
            null);
    addOption("booleanData", "b", "Treat input as without pref values", Boolean.FALSE.toString());
    addOption("maxPrefsPerUser", "mp",
            "Maximum number of preferences considered per user in final recommendation phase",
            String.valueOf(UserVectorSplitterMapper.DEFAULT_MAX_PREFS_PER_USER_CONSIDERED));
    addOption("minPrefsPerUser", "mp",
            "ignore users with less preferences than this in the similarity computation " + "(default: "
                    + DEFAULT_MIN_PREFS_PER_USER + ')',
            String.valueOf(DEFAULT_MIN_PREFS_PER_USER));
    addOption("maxSimilaritiesPerItem", "m", "Maximum number of similarities considered per item ",
            String.valueOf(DEFAULT_MAX_SIMILARITIES_PER_ITEM));
    addOption("maxCooccurrencesPerItem", "mo",
            "try to cap the number of cooccurrences per item to this " + "number (default: "
                    + DEFAULT_MAX_COOCCURRENCES_PER_ITEM + ')',
            String.valueOf(DEFAULT_MAX_COOCCURRENCES_PER_ITEM));
    addOption("similarityClassname", "s",
            "Name of distributed similarity class to instantiate, alternatively use "
                    + "one of the predefined similarities (" + SimilarityType.listEnumNames() + ')',
            String.valueOf(SimilarityType.SIMILARITY_COOCCURRENCE));

    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    Path inputPath = getInputPath();
    Path outputPath = getOutputPath();
    Path tempDirPath = new Path(parsedArgs.get("--tempDir"));
    int numRecommendations = Integer.parseInt(parsedArgs.get("--numRecommendations"));
    String usersFile = parsedArgs.get("--usersFile");
    String itemsFile = parsedArgs.get("--itemsFile");
    String filterFile = parsedArgs.get("--filterFile");
    boolean booleanData = Boolean.valueOf(parsedArgs.get("--booleanData"));
    int maxPrefsPerUser = Integer.parseInt(parsedArgs.get("--maxPrefsPerUser"));
    int minPrefsPerUser = Integer.parseInt(parsedArgs.get("--minPrefsPerUser"));
    int maxSimilaritiesPerItem = Integer.parseInt(parsedArgs.get("--maxSimilaritiesPerItem"));
    int maxCooccurrencesPerItem = Integer.parseInt(parsedArgs.get("--maxCooccurrencesPerItem"));
    String similarityClassname = parsedArgs.get("--similarityClassname");

    Path userVectorPath = new Path(tempDirPath, "userVectors");
    Path itemIDIndexPath = new Path(tempDirPath, "itemIDIndex");
    Path countUsersPath = new Path(tempDirPath, "countUsers");
    Path itemUserMatrixPath = new Path(tempDirPath, "itemUserMatrix");
    Path similarityMatrixPath = new Path(tempDirPath, "similarityMatrix");
    Path prePartialMultiplyPath1 = new Path(tempDirPath, "prePartialMultiply1");
    Path prePartialMultiplyPath2 = new Path(tempDirPath, "prePartialMultiply2");
    Path explicitFilterPath = new Path(tempDirPath, "explicitFilterPath");
    Path partialMultiplyPath = new Path(tempDirPath, "partialMultiply");

    AtomicInteger currentPhase = new AtomicInteger();

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job itemIDIndex = prepareJob(inputPath, itemIDIndexPath, TextInputFormat.class, ItemIDIndexMapper.class,
                VarIntWritable.class, VarLongWritable.class, ItemIDIndexReducer.class, VarIntWritable.class,
                VarLongWritable.class, SequenceFileOutputFormat.class);
        itemIDIndex.setCombinerClass(ItemIDIndexReducer.class);
        task.setCurrentJob(itemIDIndex).waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job toUserVector = prepareJob(inputPath, userVectorPath, TextInputFormat.class, ToItemPrefsMapper.class,
                VarLongWritable.class, booleanData ? VarLongWritable.class : EntityPrefWritable.class,
                ToUserVectorReducer.class, VarLongWritable.class, VectorWritable.class,
                SequenceFileOutputFormat.class);
        toUserVector.getConfiguration().setBoolean(BOOLEAN_DATA, booleanData);
        toUserVector.getConfiguration().setInt(ToUserVectorReducer.MIN_PREFERENCES_PER_USER, minPrefsPerUser);
        task.setCurrentJob(toUserVector).waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job countUsers = prepareJob(userVectorPath, countUsersPath, SequenceFileInputFormat.class,
                CountUsersMapper.class, CountUsersKeyWritable.class, VarLongWritable.class,
                CountUsersReducer.class, VarIntWritable.class, NullWritable.class, TextOutputFormat.class);
        countUsers.setPartitionerClass(CountUsersKeyWritable.CountUsersPartitioner.class);
        countUsers.setGroupingComparatorClass(CountUsersKeyWritable.CountUsersGroupComparator.class);
        task.setCurrentJob(countUsers).waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job maybePruneAndTransponse = prepareJob(userVectorPath, itemUserMatrixPath,
                SequenceFileInputFormat.class, MaybePruneRowsMapper.class, IntWritable.class,
                DistributedRowMatrix.MatrixEntryWritable.class, ToItemVectorsReducer.class, IntWritable.class,
                VectorWritable.class, SequenceFileOutputFormat.class);
        maybePruneAndTransponse.getConfiguration().setInt(MaybePruneRowsMapper.MAX_COOCCURRENCES,
                maxCooccurrencesPerItem);
        task.setCurrentJob(maybePruneAndTransponse).waitForCompletion(true);
    }

    int numberOfUsers = TasteHadoopUtils.readIntFromFile(getConf(), countUsersPath);

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        /*
         * Once DistributedRowMatrix uses the hadoop 0.20 API, we should refactor this call to something like
         * new DistributedRowMatrix(...).rowSimilarity(...)
         */
        try {
            ToolRunner.run(getConf(), new RowSimilarityZieOok(), new String[] { //
                    "--input", itemUserMatrixPath.toString(), //
                    "--output", similarityMatrixPath.toString(), //
                    "--numberOfColumns", String.valueOf(numberOfUsers), //
                    "--similarityClassname", similarityClassname, //
                    "--maxSimilaritiesPerRow", String.valueOf(maxSimilaritiesPerItem + 1), //
                    "--tempDir", tempDirPath.toString() });
        } catch (Exception e) {
            throw new IllegalStateException("item-item-similarity computation failed", e);
        }
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job prePartialMultiply1 = prepareJob(similarityMatrixPath, prePartialMultiplyPath1,
                SequenceFileInputFormat.class, SimilarityMatrixRowWrapperMapper.class, VarIntWritable.class,
                VectorOrPrefWritable.class, Reducer.class, VarIntWritable.class, VectorOrPrefWritable.class,
                SequenceFileOutputFormat.class);
        task.setCurrentJob(prePartialMultiply1).waitForCompletion(true);

        Job prePartialMultiply2 = prepareJob(userVectorPath, prePartialMultiplyPath2,
                SequenceFileInputFormat.class, UserVectorSplitterMapper.class, VarIntWritable.class,
                VectorOrPrefWritable.class, Reducer.class, VarIntWritable.class, VectorOrPrefWritable.class,
                SequenceFileOutputFormat.class);
        if (usersFile != null) {
            prePartialMultiply2.getConfiguration().set(UserVectorSplitterMapper.USERS_FILE, usersFile);
        }
        prePartialMultiply2.getConfiguration().setInt(UserVectorSplitterMapper.MAX_PREFS_PER_USER_CONSIDERED,
                maxPrefsPerUser);
        task.setCurrentJob(prePartialMultiply2).waitForCompletion(true);

        Job partialMultiply = prepareJob(new Path(prePartialMultiplyPath1 + "," + prePartialMultiplyPath2),
                partialMultiplyPath, SequenceFileInputFormat.class, Mapper.class, VarIntWritable.class,
                VectorOrPrefWritable.class, ToVectorAndPrefReducer.class, VarIntWritable.class,
                VectorAndPrefsWritable.class, SequenceFileOutputFormat.class);

        /* necessary to make this job (having a combined input path) work on Amazon S3 */
        Configuration partialMultiplyConf = partialMultiply.getConfiguration();
        FileSystem fs = FileSystem.get(tempDirPath.toUri(), partialMultiplyConf);
        prePartialMultiplyPath1 = prePartialMultiplyPath1.makeQualified(fs);
        prePartialMultiplyPath2 = prePartialMultiplyPath2.makeQualified(fs);
        FileInputFormat.setInputPaths(partialMultiply, prePartialMultiplyPath1, prePartialMultiplyPath2);
        task.setCurrentJob(partialMultiply).waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {

        /* convert the user/item pairs to filter if a filterfile has been specified */
        if (filterFile != null) {
            Job itemFiltering = prepareJob(new Path(filterFile), explicitFilterPath, TextInputFormat.class,
                    ItemFilterMapper.class, VarLongWritable.class, VarLongWritable.class,
                    ItemFilterAsVectorAndPrefsReducer.class, VarIntWritable.class, VectorAndPrefsWritable.class,
                    SequenceFileOutputFormat.class);
            task.setCurrentJob(itemFiltering).waitForCompletion(true);
        }

        String aggregateAndRecommendInput = partialMultiplyPath.toString();
        if (filterFile != null) {
            aggregateAndRecommendInput += "," + explicitFilterPath;
        }

        Job aggregateAndRecommend = prepareJob(new Path(aggregateAndRecommendInput), outputPath,
                SequenceFileInputFormat.class, PartialMultiplyMapper.class, VarLongWritable.class,
                PrefAndSimilarityColumnWritable.class, AggregateAndRecommendReducer.class,
                VarLongWritable.class, RecommendedItemsWritable.class, SequenceFileOutputFormat.class);
        Configuration aggregateAndRecommendConf = aggregateAndRecommend.getConfiguration();
        if (itemsFile != null) {
            aggregateAndRecommendConf.set(AggregateAndRecommendReducer.ITEMS_FILE, itemsFile);
        }

        if (filterFile != null) {
            /* necessary to make this job (having a combined input path) work on Amazon S3 */
            FileSystem fs = FileSystem.get(tempDirPath.toUri(), aggregateAndRecommendConf);
            partialMultiplyPath = partialMultiplyPath.makeQualified(fs);
            explicitFilterPath = explicitFilterPath.makeQualified(fs);
            FileInputFormat.setInputPaths(aggregateAndRecommend, partialMultiplyPath, explicitFilterPath);
        }
        setIOSort(aggregateAndRecommend);
        aggregateAndRecommendConf.set(AggregateAndRecommendReducer.ITEMID_INDEX_PATH,
                itemIDIndexPath.toString());
        aggregateAndRecommendConf.setInt(AggregateAndRecommendReducer.NUM_RECOMMENDATIONS, numRecommendations);
        aggregateAndRecommendConf.setBoolean(BOOLEAN_DATA, booleanData);
        task.setCurrentJob(aggregateAndRecommend).waitForCompletion(true);
    }

    return 0;
}

From source file: nl.utwente.bigdata.GoalPlayerCount.java

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 2) {
        System.err.println("Usage: GoalPlayerCount <in> [<in>...] <out>");
        System.exit(2);
    }
    Job job = new Job(conf, "GoldenBoot GoalPlayerCount");
    job.setJarByClass(GoalPlayerCount.class);
    job.setMapperClass(CountMapper.class);
    job.setCombinerClass(CountReducer.class);
    job.setReducerClass(CountReducer.class);
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputKeyClass(Text.class); // keys are Text; TextInputFormat is an input format, not a key type
    job.setOutputValueClass(IntWritable.class);
    for (int i = 0; i < otherArgs.length - 1; ++i) {
        FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
    }
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file: nl.utwente.bigdata.GoalScorerDefiner.java

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 2) {
        System.err.println("Usage: userCount <in> [<in>...] <out>");
        System.exit(2);
    }
    Job job = new Job(conf, "GoalScorerDefiner");
    job.setJarByClass(GoalScorerDefiner.class);
    job.setMapperClass(GoalScorerDefiner.ScoreMapper.class);
    job.setCombinerClass(GoalScorerDefiner.ScoreReducer.class);
    job.setReducerClass(GoalScorerDefiner.ScoreReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    for (int i = 0; i < otherArgs.length - 1; ++i) {
        FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
    }
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}