Example usage for org.apache.hadoop.mapreduce Job setPartitionerClass

Introduction

On this page you can find example usage for org.apache.hadoop.mapreduce Job setPartitionerClass.

Prototype

public void setPartitionerClass(Class<? extends Partitioner> cls) throws IllegalStateException 

Document

Set the Partitioner for the job.
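
The method is normally used together with a custom Partitioner subclass. The snippet below is a minimal, self-contained sketch of that pattern; the AlphabetPartitionerDemo and AlphabetPartitioner classes and the alphabetical split rule are hypothetical and are not taken from the examples further down this page.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Partitioner;

public class AlphabetPartitionerDemo {

    // Keys starting with 'a'..'m' go to partition 0, all other keys to partition 1.
    public static class AlphabetPartitioner extends Partitioner<Text, IntWritable> {
        @Override
        public int getPartition(Text key, IntWritable value, int numPartitions) {
            if (numPartitions < 2 || key.getLength() == 0) {
                return 0;
            }
            char first = Character.toLowerCase(key.toString().charAt(0));
            return (first >= 'a' && first <= 'm') ? 0 : 1;
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "alphabet partitioner demo");
        job.setJarByClass(AlphabetPartitionerDemo.class);
        job.setNumReduceTasks(2);
        // Must be called while the job is still being defined; after submission
        // setPartitionerClass throws IllegalStateException.
        job.setPartitionerClass(AlphabetPartitioner.class);
        // ... mapper, reducer, input/output formats and paths would be configured here ...
    }
}

As the prototype above indicates, setPartitionerClass throws IllegalStateException if it is called after the job has been submitted, which is why every example below sets the partitioner before calling submit() or waitForCompletion().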

Usage

From source file:license.LicenseDriver.java

public static void main(String[] args) throws Exception {
    if (args.length != 3) {
        System.out.println("usage: [students dataset path] [grades dataset path] [output]");
        System.exit(-1);
    }
    Configuration configuration = new Configuration();
    configuration.setClass(ILicenseNameParsingStrategy.class.getName(),
            LicenseNameWritableParsingStrategy.class, IParsingStrategy.class);
    configuration.setClass(ILicenseTypeParsingStrategy.class.getName(),
            LicenseTypeWritableParsingStrategy.class, IParsingStrategy.class);

    Job job = Job.getInstance(configuration);
    job.setOutputKeyClass(LicenseKey.class);
    job.setOutputValueClass(JoinNameAndLicense.class);
    MultipleInputs.addInputPath(job, new Path(args[0]), NamesWritableInputFormat.class,
            NamesDetailsMapper.class);
    MultipleInputs.addInputPath(job, new Path(args[1]), LicensesWritableInputFormat.class,
            LicensesDetailsMapper.class);
    job.setReducerClass(LicenseReducer.class);

    job.setOutputFormatClass(TextOutputFormat.class);
    // The custom partitioner and grouping comparator route records that share the same
    // join key to the same reducer and group them into a single reduce call.
    job.setPartitionerClass(LicenseKeyPartitioner.class);
    job.setGroupingComparatorClass(LicenseGroupingComparator.class);
    FileOutputFormat.setOutputPath(job, new Path(args[2]));
    job.setJarByClass(LicenseDriver.class);
    job.submit();
}

From source file:model.AutoCoder.java

License:Apache License

/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers")
            .create(NUM_REDUCERS));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();

    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT) + "/part*";
    String outputPath = cmdline.getOptionValue(OUTPUT);
    //String inputPath = "mingled_v2/part*";
    //String outputPath = "output";
    String dataPath = cmdline.getOptionValue(INPUT) + "/common";
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS))
            : 1;

    LOG.info("Tool: " + AutoCoder.class.getSimpleName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - number of reducers: " + reduceTasks);
    Configuration conf = getConf();
    initialParameters(conf);

    conf.set("dataPath", dataPath);

    conf.set("mapreduce.map.memory.mb", "2048");
    conf.set("mapreduce.map.java.opts", "-Xmx2048m");
    conf.set("mapreduce.reduce.memory.mb", "2048");
    conf.set("mapreduce.reduce.java.opts", "-Xmx2048m");

    Job job = Job.getInstance(conf);
    job.setJobName(AutoCoder.class.getSimpleName());
    job.setJarByClass(AutoCoder.class);
    // set the path of the information of k clusters in this iteration
    job.getConfiguration().set("sidepath", inputPath + "/side_output");
    job.setNumReduceTasks(reduceTasks);

    dataShuffle();

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    FileInputFormat.setMaxInputSplitSize(job, 1000 * 1024 * 1024);
    FileInputFormat.setMinInputSplitSize(job, 1000 * 1024 * 1024);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(ModelNode.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(SuperModel.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);
    job.setPartitionerClass(MyPartitioner.class);

    // Delete the output directory if it exists already.
    Path outputDir = new Path(outputPath);
    FileSystem.get(getConf()).delete(outputDir, true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    //prepareNextIteration(inputPath0, outputPath,iterations,conf,reduceTasks);

    return 0;
}

From source file:mvm.rya.accumulo.mr.fileinput.BulkNtripsInputTool.java

License:Apache License

@Override
public int run(final String[] args) throws Exception {
    final Configuration conf = getConf();
    try {
        //conf
        zk = conf.get(MRUtils.AC_ZK_PROP, zk);
        ttl = conf.get(MRUtils.AC_TTL_PROP, ttl);
        instance = conf.get(MRUtils.AC_INSTANCE_PROP, instance);
        userName = conf.get(MRUtils.AC_USERNAME_PROP, userName);
        pwd = conf.get(MRUtils.AC_PWD_PROP, pwd);
        workDirBase = conf.get(WORKDIR_PROP, workDirBase);
        format = conf.get(MRUtils.FORMAT_PROP, format);
        conf.set(MRUtils.FORMAT_PROP, format);
        final String inputDir = args[0];

        ZooKeeperInstance zooKeeperInstance = new ZooKeeperInstance(instance, zk);
        Connector connector = zooKeeperInstance.getConnector(userName, new PasswordToken(pwd));
        TableOperations tableOperations = connector.tableOperations();

        if (conf.get(AccumuloRdfConfiguration.CONF_ADDITIONAL_INDEXERS) != null) {
            throw new IllegalArgumentException("Cannot use Bulk N Trips tool with Additional Indexers");
        }

        String tablePrefix = conf.get(MRUtils.TABLE_PREFIX_PROPERTY, null);
        if (tablePrefix != null)
            RdfCloudTripleStoreConstants.prefixTables(tablePrefix);
        String[] tables = { tablePrefix + RdfCloudTripleStoreConstants.TBL_OSP_SUFFIX,
                tablePrefix + RdfCloudTripleStoreConstants.TBL_SPO_SUFFIX,
                tablePrefix + RdfCloudTripleStoreConstants.TBL_PO_SUFFIX };
        Collection<Job> jobs = new ArrayList<Job>();
        for (final String tableName : tables) {
            PrintStream out = null;
            try {
                String workDir = workDirBase + "/" + tableName;
                System.out.println("Loading data into table[" + tableName + "]");

                Job job = new Job(new Configuration(conf),
                        "Bulk Ingest load data to Generic RDF Table[" + tableName + "]");
                job.setJarByClass(this.getClass());
                //setting long job
                Configuration jobConf = job.getConfiguration();
                jobConf.setBoolean("mapred.map.tasks.speculative.execution", false);
                jobConf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
                jobConf.set("io.sort.mb", jobConf.get("io.sort.mb", "256"));
                jobConf.setBoolean("mapred.compress.map.output", true);
                //                    jobConf.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec"); //TODO: I would like LZO compression

                job.setInputFormatClass(TextInputFormat.class);

                job.setMapperClass(ParseNtripsMapper.class);
                job.setMapOutputKeyClass(Key.class);
                job.setMapOutputValueClass(Value.class);

                job.setCombinerClass(OutStmtMutationsReducer.class);
                job.setReducerClass(OutStmtMutationsReducer.class);
                job.setOutputFormatClass(AccumuloFileOutputFormat.class);
                // AccumuloFileOutputFormat.setZooKeeperInstance(jobConf, instance, zk);

                jobConf.set(ParseNtripsMapper.TABLE_PROPERTY, tableName);

                TextInputFormat.setInputPaths(job, new Path(inputDir));

                FileSystem fs = FileSystem.get(conf);
                Path workPath = new Path(workDir);
                if (fs.exists(workPath))
                    fs.delete(workPath, true);

                //make failures dir
                Path failures = new Path(workDir, "failures");
                fs.delete(failures, true);
                fs.mkdirs(new Path(workDir, "failures"));

                AccumuloFileOutputFormat.setOutputPath(job, new Path(workDir + "/files"));

                out = new PrintStream(new BufferedOutputStream(fs.create(new Path(workDir + "/splits.txt"))));

                if (!tableOperations.exists(tableName))
                    tableOperations.create(tableName);
                Collection<Text> splits = tableOperations.getSplits(tableName, Integer.MAX_VALUE);
                for (Text split : splits)
                    out.println(new String(Base64.encodeBase64(TextUtil.getBytes(split))));

                job.setNumReduceTasks(splits.size() + 1);
                out.close();

                // Partition the map output by the table's existing split points so that
                // each reducer writes a file covering a single tablet's key range.
                job.setPartitionerClass(KeyRangePartitioner.class);
                RangePartitioner.setSplitFile(job, workDir + "/splits.txt");

                jobConf.set(WORKDIR_PROP, workDir);

                job.submit();
                jobs.add(job);

            } catch (Exception re) {
                throw new RuntimeException(re);
            } finally {
                if (out != null)
                    out.close();
            }
        }

        for (Job job : jobs) {
            while (!job.isComplete()) {
                Thread.sleep(1000);
            }
        }

        for (String tableName : tables) {
            String workDir = workDirBase + "/" + tableName;
            String filesDir = workDir + "/files";
            String failuresDir = workDir + "/failures";

            FileSystem fs = FileSystem.get(conf);

            //make sure the "accumulo" user can read/write/execute these directories
            fs.setPermission(new Path(filesDir), new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL));
            fs.setPermission(new Path(failuresDir), new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL));

            tableOperations.importDirectory(tableName, filesDir, failuresDir, false);

        }

    } catch (Exception e) {
        throw new RuntimeException(e);
    }

    return 0;
}

From source file:mvm.rya.joinselect.mr.JoinSelectAggregate.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    String inPath1 = conf.get(PROSPECTS_OUTPUTPATH);
    String inPath2 = conf.get(SPO_OUTPUTPATH);
    String auths = conf.get(AUTHS);
    String outPath = conf.get(OUTPUTPATH);

    assert inPath1 != null && inPath2 != null && outPath != null;

    Job job = new Job(conf, this.getClass().getSimpleName() + "_" + System.currentTimeMillis());
    job.setJarByClass(this.getClass());
    conf.setBoolean(MRJobConfig.MAPREDUCE_JOB_USER_CLASSPATH_FIRST, true);

    JoinSelectStatsUtil.initJoinMRJob(job, inPath1, inPath2, JoinSelectAggregateMapper.class, outPath, auths);

    job.setSortComparatorClass(JoinSelectSortComparator.class);
    job.setGroupingComparatorClass(JoinSelectGroupComparator.class);
    job.setPartitionerClass(JoinSelectPartitioner.class);
    job.setReducerClass(JoinReducer.class);
    job.setNumReduceTasks(32);
    job.waitForCompletion(true);

    return job.isSuccessful() ? 0 : 1;

}

From source file:name.abhijitsarkar.hadoop.join.ReduceSideJoinDriver.java

License:Open Source License

@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    Job job = new Job(conf, "reduce-side-join");
    job.setJarByClass(getClass());

    // Partition on the join key and group tagged keys together so that customer
    // and order records for the same key arrive in a single reduce call.
    job.setPartitionerClass(KeyPartitioner.class);
    job.setGroupingComparatorClass(KeyGroupingComparator.class);

    job.setReducerClass(ReduceSideJoinReducer.class);

    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    MultipleInputs.addInputPath(job, new Path(args[0], "customers.txt"), TextInputFormat.class,
            CustomerMapper.class);
    MultipleInputs.addInputPath(job, new Path(args[0], "orders.txt"), TextInputFormat.class, OrderMapper.class);

    job.setMapOutputKeyClass(TaggedKey.class);
    job.setMapOutputValueClass(Text.class);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:nl.gridline.zieook.inx.movielens.RowSimilarityZieOok.java

License:Apache License

@Override
public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

    addInputOption();
    addOutputOption();
    addOption("numberOfColumns", "r", "Number of columns in the input matrix");
    addOption("similarityClassname", "s",
            "Name of distributed similarity class to instantiate, alternatively use "
                    + "one of the predefined similarities (" + SimilarityType.listEnumNames() + ')');
    addOption("maxSimilaritiesPerRow", "m",
            "Number of maximum similarities per row (default: " + DEFAULT_MAX_SIMILARITIES_PER_ROW + ')',
            String.valueOf(DEFAULT_MAX_SIMILARITIES_PER_ROW));

    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    int numberOfColumns = Integer.parseInt(parsedArgs.get("--numberOfColumns"));
    String similarityClassnameArg = parsedArgs.get("--similarityClassname");
    String distributedSimilarityClassname;
    try {
        distributedSimilarityClassname = SimilarityType.valueOf(similarityClassnameArg)
                .getSimilarityImplementationClassName();
    } catch (IllegalArgumentException iae) {
        distributedSimilarityClassname = similarityClassnameArg;
    }

    int maxSimilaritiesPerRow = Integer.parseInt(parsedArgs.get("--maxSimilaritiesPerRow"));

    Path inputPath = getInputPath();
    Path outputPath = getOutputPath();
    Path tempDirPath = new Path(parsedArgs.get("--tempDir"));

    Path weightsPath = new Path(tempDirPath, "weights");
    Path pairwiseSimilarityPath = new Path(tempDirPath, "pairwiseSimilarity");

    AtomicInteger currentPhase = new AtomicInteger();

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job weights = prepareJob(inputPath, weightsPath, SequenceFileInputFormat.class, RowWeightMapper.class,
                VarIntWritable.class, WeightedOccurrence.class, WeightedOccurrencesPerColumnReducer.class,
                VarIntWritable.class, WeightedOccurrenceArray.class, SequenceFileOutputFormat.class);

        weights.getConfiguration().set(DISTRIBUTED_SIMILARITY_CLASSNAME, distributedSimilarityClassname);
        weights.waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job pairwiseSimilarity = prepareJob(weightsPath, pairwiseSimilarityPath, SequenceFileInputFormat.class,
                CooccurrencesMapper.class, WeightedRowPair.class, Cooccurrence.class, SimilarityReducer.class,
                SimilarityMatrixEntryKey.class, DistributedRowMatrix.MatrixEntryWritable.class,
                SequenceFileOutputFormat.class);

        Configuration pairwiseConf = pairwiseSimilarity.getConfiguration();
        pairwiseConf.set(DISTRIBUTED_SIMILARITY_CLASSNAME, distributedSimilarityClassname);
        pairwiseConf.setInt(NUMBER_OF_COLUMNS, numberOfColumns);
        pairwiseSimilarity.waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job asMatrix = prepareJob(pairwiseSimilarityPath, outputPath, SequenceFileInputFormat.class,
                Mapper.class, SimilarityMatrixEntryKey.class, DistributedRowMatrix.MatrixEntryWritable.class,
                EntriesToVectorsReducer.class, IntWritable.class, VectorWritable.class,
                SequenceFileOutputFormat.class);
        asMatrix.setPartitionerClass(HashPartitioner.class);
        asMatrix.setGroupingComparatorClass(
                SimilarityMatrixEntryKey.SimilarityMatrixEntryKeyGroupingComparator.class);
        asMatrix.getConfiguration().setInt(MAX_SIMILARITIES_PER_ROW, maxSimilaritiesPerRow);
        asMatrix.waitForCompletion(true);
    }

    return 0;
}

From source file:nl.gridline.zieook.runners.cf.ItemSimilarityJobZieook.java

License:Apache License

@Override
public int run(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

    addInputOption();

    // addOutputOption(); // no output path, we use a table!
    addOption("outputtable", "ot", "Output table name");

    addOption("similarityClassname", "s",
            "Name of distributed similarity class to instantiate, alternatively use "
                    + "one of the predefined similarities (" + SimilarityType.listEnumNames() + ')');
    addOption("maxSimilaritiesPerItem", "m",
            "try to cap the number of similar items per item to this number " + "(default: "
                    + DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM + ')',
            String.valueOf(DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM));
    addOption("maxCooccurrencesPerItem", "mo",
            "try to cap the number of cooccurrences per item to this number " + "(default: "
                    + DEFAULT_MAX_COOCCURRENCES_PER_ITEM + ')',
            String.valueOf(DEFAULT_MAX_COOCCURRENCES_PER_ITEM));
    addOption("minPrefsPerUser", "mp",
            "ignore users with less preferences than this " + "(default: " + DEFAULT_MIN_PREFS_PER_USER + ')',
            String.valueOf(DEFAULT_MIN_PREFS_PER_USER));
    addOption("booleanData", "b", "Treat input as without pref values", Boolean.FALSE.toString());

    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    String similarityClassName = parsedArgs.get("--similarityClassname");
    int maxSimilarItemsPerItem = Integer.parseInt(parsedArgs.get("--maxSimilaritiesPerItem"));
    int maxCooccurrencesPerItem = Integer.parseInt(parsedArgs.get("--maxCooccurrencesPerItem"));
    int minPrefsPerUser = Integer.parseInt(parsedArgs.get("--minPrefsPerUser"));
    boolean booleanData = Boolean.valueOf(parsedArgs.get("--booleanData"));

    Path inputPath = getInputPath();
    // Path outputPath = getOutputPath();
    String outputTable = parsedArgs.get("--outputtable");
    Path tempDirPath = new Path(parsedArgs.get("--tempDir"));

    Path itemIDIndexPath = new Path(tempDirPath, "itemIDIndex");
    Path countUsersPath = new Path(tempDirPath, "countUsers");
    Path userVectorPath = new Path(tempDirPath, "userVectors");
    Path itemUserMatrixPath = new Path(tempDirPath, "itemUserMatrix");
    Path similarityMatrixPath = new Path(tempDirPath, "similarityMatrix");

    AtomicInteger currentPhase = new AtomicInteger();

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job itemIDIndex = prepareJob(inputPath, itemIDIndexPath, TextInputFormat.class, ItemIDIndexMapper.class,
                VarIntWritable.class, VarLongWritable.class, ItemIDIndexReducer.class, VarIntWritable.class,
                VarLongWritable.class, SequenceFileOutputFormat.class);
        itemIDIndex.setCombinerClass(ItemIDIndexReducer.class);
        task.setCurrentJob(itemIDIndex).waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job toUserVector = prepareJob(inputPath, userVectorPath, TextInputFormat.class, ToItemPrefsMapper.class,
                VarLongWritable.class, booleanData ? VarLongWritable.class : EntityPrefWritable.class,
                ToUserVectorReducer.class, VarLongWritable.class, VectorWritable.class,
                SequenceFileOutputFormat.class);
        toUserVector.getConfiguration().setBoolean(RecommenderJob.BOOLEAN_DATA, booleanData);
        toUserVector.getConfiguration().setInt(ToUserVectorReducer.MIN_PREFERENCES_PER_USER, minPrefsPerUser);
        task.setCurrentJob(toUserVector).waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job countUsers = prepareJob(userVectorPath, countUsersPath, SequenceFileInputFormat.class,
                CountUsersMapper.class, CountUsersKeyWritable.class, VarLongWritable.class,
                CountUsersReducer.class, VarIntWritable.class, NullWritable.class, TextOutputFormat.class);
        countUsers.setPartitionerClass(CountUsersKeyWritable.CountUsersPartitioner.class);
        countUsers.setGroupingComparatorClass(CountUsersKeyWritable.CountUsersGroupComparator.class);
        task.setCurrentJob(countUsers).waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job maybePruneAndTransponse = prepareJob(userVectorPath, itemUserMatrixPath,
                SequenceFileInputFormat.class, MaybePruneRowsMapper.class, IntWritable.class,
                DistributedRowMatrix.MatrixEntryWritable.class, ToItemVectorsReducer.class, IntWritable.class,
                VectorWritable.class, SequenceFileOutputFormat.class);
        maybePruneAndTransponse.getConfiguration().setInt(MaybePruneRowsMapper.MAX_COOCCURRENCES,
                maxCooccurrencesPerItem);
        task.setCurrentJob(maybePruneAndTransponse).waitForCompletion(true);
    }

    int numberOfUsers = TasteHadoopUtils.readIntFromFile(getConf(), countUsersPath);

    /*
     * Once DistributedRowMatrix uses the hadoop 0.20 API, we should refactor this call to something like
     * new DistributedRowMatrix(...).rowSimilarity(...)
     */
    try {
        ToolRunner.run(getConf(), new RowSimilarityZieOok(),
                new String[] { "-Dmapred.input.dir=" + itemUserMatrixPath,
                        "-Dmapred.output.dir=" + similarityMatrixPath, "--numberOfColumns",
                        String.valueOf(numberOfUsers), "--similarityClassname", similarityClassName,
                        "--maxSimilaritiesPerRow", String.valueOf(maxSimilarItemsPerItem + 1), "--tempDir",
                        tempDirPath.toString() });
    } catch (Exception e) {
        throw new IllegalStateException("item-item-similarity computation failed", e);
    }

    // This step writes the data to a file, we don't want that, it should be written in HBase directly:
    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job mostSimilarItems = prepareMostSimilarItems(similarityMatrixPath, outputTable);

        // Configuration mostSimilarItemsConf = mostSimilarItems.getConfiguration();

        // mostSimilarItemsConf.set(ITEM_ID_INDEX_PATH_STR, itemIDIndexPath.toString());
        // mostSimilarItemsConf.setInt(MAX_SIMILARITIES_PER_ITEM, maxSimilarItemsPerItem);

        // mostSimilarItems.waitForCompletion(true);

        task.setCurrentJob(mostSimilarItems).waitForCompletion(Log.isDebugEnabled());

        // Job mostSimilarItems = prepareJob(similarityMatrixPath, outputPath, SequenceFileInputFormat.class,
        // MostSimilarItemPairsMapper.class, EntityEntityWritable.class, DoubleWritable.class,
        // MostSimilarItemPairsReducer.class, EntityEntityWritable.class, DoubleWritable.class,
        // TextOutputFormat.class);
        // Configuration mostSimilarItemsConf = mostSimilarItems.getConfiguration();
        // mostSimilarItemsConf.set(ITEM_ID_INDEX_PATH_STR, itemIDIndexPath.toString());
        // mostSimilarItemsConf.setInt(MAX_SIMILARITIES_PER_ITEM, maxSimilarItemsPerItem);
        // mostSimilarItems.setCombinerClass(MostSimilarItemPairsReducer.class);
        // mostSimilarItems.waitForCompletion(true);
    }

    return 0;
}

From source file:nl.gridline.zieook.runners.cf.RecommenderJobZieOok.java

License:Apache License

@Override
public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    addInputOption();
    addOutputOption();
    addOption("numRecommendations", "n", "Number of recommendations per user",
            String.valueOf(AggregateAndRecommendReducer.DEFAULT_NUM_RECOMMENDATIONS));
    addOption("usersFile", "u", "File of users to recommend for", null);
    addOption("itemsFile", "i", "File of items to recommend for", null);
    addOption("filterFile", "f",
            "File containing comma-separated userID,itemID pairs. Used to exclude the item from "
                    + "the recommendations for that user (optional)",
            null);
    addOption("booleanData", "b", "Treat input as without pref values", Boolean.FALSE.toString());
    addOption("maxPrefsPerUser", "mp",
            "Maximum number of preferences considered per user in final recommendation phase",
            String.valueOf(UserVectorSplitterMapper.DEFAULT_MAX_PREFS_PER_USER_CONSIDERED));
    addOption("minPrefsPerUser", "mp",
            "ignore users with less preferences than this in the similarity computation " + "(default: "
                    + DEFAULT_MIN_PREFS_PER_USER + ')',
            String.valueOf(DEFAULT_MIN_PREFS_PER_USER));
    addOption("maxSimilaritiesPerItem", "m", "Maximum number of similarities considered per item ",
            String.valueOf(DEFAULT_MAX_SIMILARITIES_PER_ITEM));
    addOption("maxCooccurrencesPerItem", "mo",
            "try to cap the number of cooccurrences per item to this " + "number (default: "
                    + DEFAULT_MAX_COOCCURRENCES_PER_ITEM + ')',
            String.valueOf(DEFAULT_MAX_COOCCURRENCES_PER_ITEM));
    addOption("similarityClassname", "s",
            "Name of distributed similarity class to instantiate, alternatively use "
                    + "one of the predefined similarities (" + SimilarityType.listEnumNames() + ')',
            String.valueOf(SimilarityType.SIMILARITY_COOCCURRENCE));

    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    Path inputPath = getInputPath();
    Path outputPath = getOutputPath();
    Path tempDirPath = new Path(parsedArgs.get("--tempDir"));
    int numRecommendations = Integer.parseInt(parsedArgs.get("--numRecommendations"));
    String usersFile = parsedArgs.get("--usersFile");
    String itemsFile = parsedArgs.get("--itemsFile");
    String filterFile = parsedArgs.get("--filterFile");
    boolean booleanData = Boolean.valueOf(parsedArgs.get("--booleanData"));
    int maxPrefsPerUser = Integer.parseInt(parsedArgs.get("--maxPrefsPerUser"));
    int minPrefsPerUser = Integer.parseInt(parsedArgs.get("--minPrefsPerUser"));
    int maxSimilaritiesPerItem = Integer.parseInt(parsedArgs.get("--maxSimilaritiesPerItem"));
    int maxCooccurrencesPerItem = Integer.parseInt(parsedArgs.get("--maxCooccurrencesPerItem"));
    String similarityClassname = parsedArgs.get("--similarityClassname");

    Path userVectorPath = new Path(tempDirPath, "userVectors");
    Path itemIDIndexPath = new Path(tempDirPath, "itemIDIndex");
    Path countUsersPath = new Path(tempDirPath, "countUsers");
    Path itemUserMatrixPath = new Path(tempDirPath, "itemUserMatrix");
    Path similarityMatrixPath = new Path(tempDirPath, "similarityMatrix");
    Path prePartialMultiplyPath1 = new Path(tempDirPath, "prePartialMultiply1");
    Path prePartialMultiplyPath2 = new Path(tempDirPath, "prePartialMultiply2");
    Path explicitFilterPath = new Path(tempDirPath, "explicitFilterPath");
    Path partialMultiplyPath = new Path(tempDirPath, "partialMultiply");

    AtomicInteger currentPhase = new AtomicInteger();

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job itemIDIndex = prepareJob(inputPath, itemIDIndexPath, TextInputFormat.class, ItemIDIndexMapper.class,
                VarIntWritable.class, VarLongWritable.class, ItemIDIndexReducer.class, VarIntWritable.class,
                VarLongWritable.class, SequenceFileOutputFormat.class);
        itemIDIndex.setCombinerClass(ItemIDIndexReducer.class);
        task.setCurrentJob(itemIDIndex).waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job toUserVector = prepareJob(inputPath, userVectorPath, TextInputFormat.class, ToItemPrefsMapper.class,
                VarLongWritable.class, booleanData ? VarLongWritable.class : EntityPrefWritable.class,
                ToUserVectorReducer.class, VarLongWritable.class, VectorWritable.class,
                SequenceFileOutputFormat.class);
        toUserVector.getConfiguration().setBoolean(BOOLEAN_DATA, booleanData);
        toUserVector.getConfiguration().setInt(ToUserVectorReducer.MIN_PREFERENCES_PER_USER, minPrefsPerUser);
        task.setCurrentJob(toUserVector).waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job countUsers = prepareJob(userVectorPath, countUsersPath, SequenceFileInputFormat.class,
                CountUsersMapper.class, CountUsersKeyWritable.class, VarLongWritable.class,
                CountUsersReducer.class, VarIntWritable.class, NullWritable.class, TextOutputFormat.class);
        countUsers.setPartitionerClass(CountUsersKeyWritable.CountUsersPartitioner.class);
        countUsers.setGroupingComparatorClass(CountUsersKeyWritable.CountUsersGroupComparator.class);
        task.setCurrentJob(countUsers).waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job maybePruneAndTransponse = prepareJob(userVectorPath, itemUserMatrixPath,
                SequenceFileInputFormat.class, MaybePruneRowsMapper.class, IntWritable.class,
                DistributedRowMatrix.MatrixEntryWritable.class, ToItemVectorsReducer.class, IntWritable.class,
                VectorWritable.class, SequenceFileOutputFormat.class);
        maybePruneAndTransponse.getConfiguration().setInt(MaybePruneRowsMapper.MAX_COOCCURRENCES,
                maxCooccurrencesPerItem);
        task.setCurrentJob(maybePruneAndTransponse).waitForCompletion(true);
    }

    int numberOfUsers = TasteHadoopUtils.readIntFromFile(getConf(), countUsersPath);

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        /*
         * Once DistributedRowMatrix uses the hadoop 0.20 API, we should refactor this call to something like
         * new DistributedRowMatrix(...).rowSimilarity(...)
         */
        try {
            ToolRunner.run(getConf(), new RowSimilarityZieOok(), new String[] { //
                    "--input", itemUserMatrixPath.toString(), //
                    "--output", similarityMatrixPath.toString(), //
                    "--numberOfColumns", String.valueOf(numberOfUsers), //
                    "--similarityClassname", similarityClassname, //
                    "--maxSimilaritiesPerRow", String.valueOf(maxSimilaritiesPerItem + 1), //
                    "--tempDir", tempDirPath.toString() });
        } catch (Exception e) {
            throw new IllegalStateException("item-item-similarity computation failed", e);
        }
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job prePartialMultiply1 = prepareJob(similarityMatrixPath, prePartialMultiplyPath1,
                SequenceFileInputFormat.class, SimilarityMatrixRowWrapperMapper.class, VarIntWritable.class,
                VectorOrPrefWritable.class, Reducer.class, VarIntWritable.class, VectorOrPrefWritable.class,
                SequenceFileOutputFormat.class);
        task.setCurrentJob(prePartialMultiply1).waitForCompletion(true);

        Job prePartialMultiply2 = prepareJob(userVectorPath, prePartialMultiplyPath2,
                SequenceFileInputFormat.class, UserVectorSplitterMapper.class, VarIntWritable.class,
                VectorOrPrefWritable.class, Reducer.class, VarIntWritable.class, VectorOrPrefWritable.class,
                SequenceFileOutputFormat.class);
        if (usersFile != null) {
            prePartialMultiply2.getConfiguration().set(UserVectorSplitterMapper.USERS_FILE, usersFile);
        }
        prePartialMultiply2.getConfiguration().setInt(UserVectorSplitterMapper.MAX_PREFS_PER_USER_CONSIDERED,
                maxPrefsPerUser);
        task.setCurrentJob(prePartialMultiply2).waitForCompletion(true);

        Job partialMultiply = prepareJob(new Path(prePartialMultiplyPath1 + "," + prePartialMultiplyPath2),
                partialMultiplyPath, SequenceFileInputFormat.class, Mapper.class, VarIntWritable.class,
                VectorOrPrefWritable.class, ToVectorAndPrefReducer.class, VarIntWritable.class,
                VectorAndPrefsWritable.class, SequenceFileOutputFormat.class);

        /* necessary to make this job (having a combined input path) work on Amazon S3 */
        Configuration partialMultiplyConf = partialMultiply.getConfiguration();
        FileSystem fs = FileSystem.get(tempDirPath.toUri(), partialMultiplyConf);
        prePartialMultiplyPath1 = prePartialMultiplyPath1.makeQualified(fs);
        prePartialMultiplyPath2 = prePartialMultiplyPath2.makeQualified(fs);
        FileInputFormat.setInputPaths(partialMultiply, prePartialMultiplyPath1, prePartialMultiplyPath2);
        task.setCurrentJob(partialMultiply).waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {

        /* convert the user/item pairs to filter if a filterfile has been specified */
        if (filterFile != null) {
            Job itemFiltering = prepareJob(new Path(filterFile), explicitFilterPath, TextInputFormat.class,
                    ItemFilterMapper.class, VarLongWritable.class, VarLongWritable.class,
                    ItemFilterAsVectorAndPrefsReducer.class, VarIntWritable.class, VectorAndPrefsWritable.class,
                    SequenceFileOutputFormat.class);
            task.setCurrentJob(itemFiltering).waitForCompletion(true);
        }

        String aggregateAndRecommendInput = partialMultiplyPath.toString();
        if (filterFile != null) {
            aggregateAndRecommendInput += "," + explicitFilterPath;
        }

        Job aggregateAndRecommend = prepareJob(new Path(aggregateAndRecommendInput), outputPath,
                SequenceFileInputFormat.class, PartialMultiplyMapper.class, VarLongWritable.class,
                PrefAndSimilarityColumnWritable.class, AggregateAndRecommendReducer.class,
                VarLongWritable.class, RecommendedItemsWritable.class, SequenceFileOutputFormat.class);
        Configuration aggregateAndRecommendConf = aggregateAndRecommend.getConfiguration();
        if (itemsFile != null) {
            aggregateAndRecommendConf.set(AggregateAndRecommendReducer.ITEMS_FILE, itemsFile);
        }

        if (filterFile != null) {
            /* necessary to make this job (having a combined input path) work on Amazon S3 */
            FileSystem fs = FileSystem.get(tempDirPath.toUri(), aggregateAndRecommendConf);
            partialMultiplyPath = partialMultiplyPath.makeQualified(fs);
            explicitFilterPath = explicitFilterPath.makeQualified(fs);
            FileInputFormat.setInputPaths(aggregateAndRecommend, partialMultiplyPath, explicitFilterPath);
        }
        setIOSort(aggregateAndRecommend);
        aggregateAndRecommendConf.set(AggregateAndRecommendReducer.ITEMID_INDEX_PATH,
                itemIDIndexPath.toString());
        aggregateAndRecommendConf.setInt(AggregateAndRecommendReducer.NUM_RECOMMENDATIONS, numRecommendations);
        aggregateAndRecommendConf.setBoolean(BOOLEAN_DATA, booleanData);
        task.setCurrentJob(aggregateAndRecommend).waitForCompletion(true);
    }

    return 0;
}

From source file:nl.sanoma.hdt.report.generator.ReportGeneratorDriver.java

License:Open Source License

/**
 * Job to join the data with the metadata from the distributed cache and to
 * calculate the revenue by quarter and the most popular product category for each user
 *
 * @param dBPath the path of the import MapFile
 * @param inputPath the path of the logs directory
 * @param outputPath the path of the output directory
 * @return returns the exitCode of the job
 * @throws IOException
 * @throws URISyntaxException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public Boolean generateReport(String dBPath, String inputPath, String outputPath)
        throws IOException, URISyntaxException, InterruptedException, ClassNotFoundException {
    Job job = new Job(getConf());
    Configuration conf = job.getConfiguration();

    job.setJobName("Repor Generator");
    DistributedCache.addCacheFile(new URI(dBPath), conf);
    job.setJarByClass(ReportGeneratorDriver.class);
    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    job.setPartitionerClass(KeyDataPartitioner.class);
    job.setGroupingComparatorClass(KeyDataGroupingComparator.class);
    job.setSortComparatorClass(KeyDataComparator.class);
    job.setMapperClass(ReportGeneratorMapper.class);
    job.setMapOutputKeyClass(KeyData.class);
    job.setMapOutputValueClass(ValueData.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);
    job.setReducerClass(ReportGeneratorReducer.class);
    job.setNumReduceTasks(1);

    return job.waitForCompletion(true);
}

From source file:nl.utwente.bigdata.shouting.Sorter.java

License:Apache License

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 2) {
        System.err.println("Usage: exampleTwitter <in> [<in>...] <out>");
        System.exit(2);
    }
    Job job = new Job(conf, "Sorter");
    job.setJarByClass(Sorter.class);
    job.setMapperClass(MapReducers.SorterMapper.class);
    job.setReducerClass(MapReducers.SorterReducer.class);
    job.setPartitionerClass(MapReducers.SorterPartitioner.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    for (int i = 0; i < otherArgs.length - 1; ++i) {
        FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
    }
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}