Example usage for org.apache.mahout.common HadoopUtil readInt

List of usage examples for org.apache.mahout.common HadoopUtil readInt

Introduction

On this page you can find example usage of org.apache.mahout.common HadoopUtil readInt.

Prototype

public static int readInt(Path path, Configuration configuration) throws IOException 

Source Link

Usage

From source file:com.pocketx.gravity.recommender.cf.similarity.job.ItemSimilarityJob.java

License:Apache License

/**
 * Driver for the distributed item-item similarity computation. Runs up to
 * three map/reduce phases, each gated by {@code shouldRunNextPhase}:
 * 1) PreparePreferenceMatrixJob builds the rating matrix under a temp dir,
 * 2) RowSimilarityJob computes pairwise item similarities,
 * 3) a final job maps similarity-matrix rows back to entity ids as text output.
 *
 * @param args CLI arguments; accepted flags are registered via the addOption calls below
 * @return 0 on success, -1 on argument-parse failure or if the final job fails
 * @throws Exception propagated from the underlying Hadoop jobs
 */
@Override
public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();
    addOption("similarityClassname", "s", "Name of distributed similarity measures class to instantiate, "
            + "alternatively use one of the predefined similarities (" + VectorSimilarityMeasures.list() + ')');
    addOption("maxSimilaritiesPerItem", "m",
            "try to cap the number of similar items per item to this number " + "(default: "
                    + DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM + ')',
            String.valueOf(DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM));
    addOption("maxPrefsPerUser", "mppu", "max number of preferences to consider per user, "
            + "users with more preferences will be sampled down (default: " + DEFAULT_MAX_PREFS_PER_USER + ')',
            String.valueOf(DEFAULT_MAX_PREFS_PER_USER));
    addOption("minPrefsPerUser", "mp",
            "ignore users with less preferences than this " + "(default: " + DEFAULT_MIN_PREFS_PER_USER + ')',
            String.valueOf(DEFAULT_MIN_PREFS_PER_USER));
    addOption("booleanData", "b", "Treat input as without pref values", String.valueOf(Boolean.FALSE));
    addOption("threshold", "tr", "discard item pairs with a similarity value below this", false);

    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        // parseArguments has already reported the usage error
        return -1;
    }

    String similarityClassName = getOption("similarityClassname");
    int maxSimilarItemsPerItem = Integer.parseInt(getOption("maxSimilaritiesPerItem"));
    int maxPrefsPerUser = Integer.parseInt(getOption("maxPrefsPerUser"));
    int minPrefsPerUser = Integer.parseInt(getOption("minPrefsPerUser"));
    boolean booleanData = Boolean.valueOf(getOption("booleanData"));

    // NO_THRESHOLD means keep every item pair regardless of similarity value
    double threshold = hasOption("threshold") ? Double.parseDouble(getOption("threshold"))
            : RowSimilarityJob.NO_THRESHOLD;

    Path similarityMatrixPath = getTempPath("similarityMatrix");
    Path prepPath = getTempPath("prepareRatingMatrix");

    // phase counter consumed by shouldRunNextPhase; the calls below must stay in order
    AtomicInteger currentPhase = new AtomicInteger();

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        ToolRunner.run(getConf(), new PreparePreferenceMatrixJob(),
                new String[] { "--input", getInputPath().toString(), "--output", prepPath.toString(),
                        "--maxPrefsPerUser", String.valueOf(maxPrefsPerUser), "--minPrefsPerUser",
                        String.valueOf(minPrefsPerUser), "--booleanData", String.valueOf(booleanData),
                        "--tempDir", getTempPath().toString() });
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        // user count was persisted by the prepare phase as a single-int file
        int numberOfUsers = HadoopUtil.readInt(new Path(prepPath, PreparePreferenceMatrixJob.NUM_USERS),
                getConf());

        ToolRunner.run(getConf(), new RowSimilarityJob(), new String[] { "--input",
                new Path(prepPath, PreparePreferenceMatrixJob.RATING_MATRIX).toString(), "--output",
                similarityMatrixPath.toString(), "--numberOfColumns", String.valueOf(numberOfUsers),
                "--similarityClassname", similarityClassName, "--maxSimilaritiesPerRow",
                String.valueOf(maxSimilarItemsPerItem), "--excludeSelfSimilarity", String.valueOf(Boolean.TRUE),
                "--threshold", String.valueOf(threshold), "--tempDir", getTempPath().toString() });
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        // translate similarity-matrix rows back to (itemA, itemB, similarity) text pairs
        Job mostSimilarItems = prepareJob(similarityMatrixPath, getOutputPath(), SequenceFileInputFormat.class,
                MostSimilarItemPairsMapper.class, EntityEntityWritable.class, DoubleWritable.class,
                MostSimilarItemPairsReducer.class, EntityEntityWritable.class, DoubleWritable.class,
                TextOutputFormat.class);
        Configuration mostSimilarItemsConf = mostSimilarItems.getConfiguration();
        mostSimilarItemsConf.set(ITEM_ID_INDEX_PATH_STR,
                new Path(prepPath, PreparePreferenceMatrixJob.ITEMID_INDEX).toString());
        mostSimilarItemsConf.setInt(MAX_SIMILARITIES_PER_ITEM, maxSimilarItemsPerItem);
        boolean succeeded = mostSimilarItems.waitForCompletion(true);
        if (!succeeded) {
            return -1;
        }
    }

    return 0;
}

From source file:finderbots.recommenders.hadoop.RecommenderUpdateJob.java

License:Apache License

/**
 * Runs the full recommender update pipeline:
 * 1) parse CLI args into {@code options}, clean output dirs,
 * 2) split the raw action log into per-action preference files and build
 *    user/item id indexes (ActionSplitterJob),
 * 3) read back the user/item counts the splitter persisted,
 * 4) run the primary RecommenderJob and, optionally, the cross-action
 *    XRecommenderJob,
 * 5) join history and similarity matrices by id and write them as Solr docs,
 * 6) move matrices out of the temp dirs.
 *
 * @param args command-line arguments, bound to {@code options} via args4j
 * @return 0 on success, -1 if the command line cannot be parsed
 * @throws Exception propagated from the Hadoop jobs run via ToolRunner
 */
@Override
public int run(String[] args) throws Exception {
    options = new Options();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        parser.printUsage(System.err);
        return -1;
    }

    cleanOutputDirs(options);
    Path prefFilesRootDir = new Path(options.getOutputDir());
    Path indexesPath = new Path(prefFilesRootDir, options.getIndexesDir());
    Path prefsPath = new Path(prefFilesRootDir, options.getPrefsDir());
    options.setPrefsDir(prefsPath.toString());

    // split into actions and store in subdirs
    // create an index/dictionary for users and items
    // this job cleans out the output dir first
    ActionSplitterJob aj = new ActionSplitterJob();
    ToolRunner.run(getConf(), aj,
            new String[] { "--input", options.getInputDir(), "--output", prefsPath.toString(), "--indexDir",
                    indexesPath.toString(), "--inputFilePattern", options.getFileNamePatternString(),
                    "--action1", options.getAction1(), "--action2", options.getAction2(), "--inputDelim",
                    options.getInputDelim(), "--outputDelim", options.getOutputDelim(), "--actionIDCol",
                    Integer.toString(options.getActionColumn()), "--itemIDCol",
                    Integer.toString(options.getItemIDColumn()), "--userIDCol",
                    Integer.toString(options.getUserIDColumn()), });

    // need to get the number of users and items from the splitter, which also creates indexes;
    // these are single-value binary files written with
    // HadoopUtil.writeInt(this.numberOfUsers, getOutputPath(NUM_USERS), getConf());
    this.numberOfUsers = HadoopUtil.readInt(new Path(indexesPath, aj.getOptions().getNumUsersFile()),
            getConf());
    this.numberOfItems = HadoopUtil.readInt(new Path(indexesPath, aj.getOptions().getNumItemsFile()),
            getConf());

    options.setInputDir(prefFilesRootDir.toString());

    String action1PrefsPath = new Path(new Path(options.getPrefsDir()), aj.getOptions().getAction1Dir())
            .toString();
    String action2PrefsPath = new Path(new Path(options.getPrefsDir()), aj.getOptions().getAction2Dir())
            .toString();

    ToolRunner.run(getConf(), new RecommenderJob(),
            new String[] { "--input", action1PrefsPath, "--output", options.getPrimaryRecsPath(),
                    "--similarityClassname", options.getSimilairtyType(),
                    // need the seqfile for the similarity matrix even if output goes to Solr;
                    // this job puts it in the temp dir
                    "--tempDir", options.getPrimaryTempDir(), "--sequencefileOutput" });
    // Now move the similarity matrix to the p-recs/sims location rather than leaving it in the tmp dir.
    // It will be written to Solr if specified in the options.

    if (options.getDoXRecommender()) {
        // note: similarity class is not used, co-occurrence only for now
        ToolRunner.run(getConf(), new XRecommenderJob(),
                new String[] { "--input", options.getAllActionsDir(), "--output",
                        options.getSecondaryOutputDir(), "--similarityClassname", "SIMILARITY_LOGLIKELIHOOD",
                        "--outputPathForSimilarityMatrix", options.getSecondarySimilarityMatrixPath(),
                        "--tempDir", options.getSecondaryTempDir(), "--numUsers",
                        Integer.toString(this.numberOfUsers), "--numItems",
                        Integer.toString(this.numberOfItems), "--primaryPrefs", action1PrefsPath,
                        "--secondaryPrefs", action2PrefsPath, });
    }

    Path bBSimilarityMatrixDRM = new Path(options.getPrimarySimilarityMatrixPath());
    Path bASimilarityMatrixDRM = new Path(options.getSecondarySimilarityMatrixPath());
    Path primaryActionDRM = new Path(new Path(options.getPrimaryTempDir(), RecommenderJob.DEFAULT_PREPARE_PATH),
            PreparePreferenceMatrixJob.USER_VECTORS);
    Path secondaryActionDRM = new Path(
            new Path(options.getSecondaryTempDir(), XRecommenderJob.DEFAULT_PREPARE_DIR),
            PrepareActionMatricesJob.USER_VECTORS_A);

    if (options.getDoXRecommender()) {
        // Next step is to take the history and similarity matrices, join them by id and write to solr docs
        LOGGER.info("\n===========\n\n\n" + "  About to call WriteToSolr with cross-recommendations:\n"
                + "    B matrix path: " + primaryActionDRM.toString() + "\n" + "    A matrix path: "
                + secondaryActionDRM.toString() + "\n" + "    [B'B] matrix path: "
                + bBSimilarityMatrixDRM.toString() + "\n" + "    [B'A] matrix path: "
                + bASimilarityMatrixDRM.toString() + "\n" + "    Output path: " + options.getOutputDir() + "\n"
                + "\n\n===========\n");
        ToolRunner.run(getConf(), new WriteToSolrJob(),
                new String[] { "--itemCrossSimilarityMatrixDir", bASimilarityMatrixDRM.toString(), "--indexDir",
                        indexesPath.toString(), "--itemSimilarityMatrixDir", bBSimilarityMatrixDRM.toString(),
                        "--usersPrimaryHistoryDir", primaryActionDRM.toString(), "--usersSecondaryHistoryDir",
                        secondaryActionDRM.toString(), "--output", options.getOutputDir(), });
    } else {
        LOGGER.info("\n===========\n\n\n" + "  About to call WriteToSolr with single actions recommendations:\n"
                + "    B matrix path: " + primaryActionDRM.toString() + "\n" + "    [B'B] matrix path: "
                + bBSimilarityMatrixDRM.toString() + "\n" + "    Output path: " + options.getOutputDir() + "\n"
                + "\n\n===========\n");
        ToolRunner.run(getConf(), new WriteToSolrJob(),
                new String[] { "--indexDir", indexesPath.toString(), "--itemSimilarityMatrixDir",
                        bBSimilarityMatrixDRM.toString(), "--usersPrimaryHistoryDir",
                        primaryActionDRM.toString(), "--output", options.getOutputDir(), });
    }

    // move user history and similarity matrices
    // move stuff out of temp for now, may not need all these
    moveMatrices();

    return 0;
}

From source file:hadoop.api.RecommenderJob.java

License:Apache License

/**
 * Calculate the user vector matrix by running PreparePreferenceMatrixJob,
 * then read back the number of users that job persisted.
 *
 * @param args Information about the input path, output path, minPrefsPerUser, booleanData, tempDir
 * @return the number of users, or -1 if the count file could not be read
 */
public int uservector(String[] args) {
    args = formatArray(args);

    try {
        prepareRecommender(args);
    } catch (IOException e) {
        // best-effort: keep going so the caller still receives the -1 sentinel on failure
        e.printStackTrace();
    }

    int minPrefsPerUser = Integer.parseInt(getOption("minPrefsPerUser"));
    // parseBoolean avoids the needless boxing of Boolean.valueOf(...)
    boolean booleanData = Boolean.parseBoolean(getOption("booleanData"));

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        try {
            ToolRunner.run(getConf(), new PreparePreferenceMatrixJob(),
                    new String[] { "--input",
                            new Path(prepPath, "wikipediaToCSV/part-r-00000").toUri().toString(), "--output",
                            prepPath.toString(), "--minPrefsPerUser", String.valueOf(minPrefsPerUser),
                            "--booleanData", String.valueOf(booleanData), "--tempDir",
                            prepPath.toUri().toString(), "--outputType", String.valueOf(outputType), });
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    // sentinel meaning "user count unavailable"
    int userVector = -1;

    try {
        // single-int file written by the prepare job
        userVector = HadoopUtil.readInt(new Path(prepPath, PreparePreferenceMatrixJob.NUM_USERS), getConf());
    } catch (IOException e) {
        e.printStackTrace();
    }

    return userVector;
}

From source file:hadoop.api.RecommenderJob.java

License:Apache License

/**
 * Calculate the co-occurrence (item-item similarity) matrix via RowSimilarityJob,
 * and optionally export it as a text file.
 *
 * @param args          Information about the input path, numberOfColumns, similarityClassname, maxObservationsPerRow
 * @param numberOfUsers Number of Users (overwritten by the count persisted by the prepare phase, when readable)
 * @return Similarities Per Item (the effective maxSimilaritiesPerItem option value)
 */
public int rowSimilarity(String[] args, int numberOfUsers) {
    try {
        prepareRecommender(args);
    } catch (IOException e) {
        // best-effort: continue with whatever state prepareRecommender managed to set up
        e.printStackTrace();
    }

    try {
        // single-int file written by PreparePreferenceMatrixJob; overrides the caller-supplied value
        numberOfUsers = HadoopUtil.readInt(new Path(prepPath, PreparePreferenceMatrixJob.NUM_USERS), getConf());
    } catch (IOException e) {
        e.printStackTrace();
    }

    int maxPrefsInItemSimilarity = Integer.parseInt(getOption("maxPrefsInItemSimilarity"));
    int maxSimilaritiesPerItem = Integer.parseInt(getOption("maxSimilaritiesPerItem"));
    String similarityClassname = getOption("similarityClassname");
    double threshold = hasOption("threshold") ? Double.parseDouble(getOption("threshold"))
            : RowSimilarityJob.NO_THRESHOLD;
    long randomSeed = hasOption("randomSeed") ? Long.parseLong(getOption("randomSeed"))
            : RowSimilarityJob.NO_FIXED_RANDOM_SEED;

    try {
        ToolRunner.run(getConf(), new RowSimilarityJob(), new String[] { "--input",
                new Path(prepPath, PreparePreferenceMatrixJob.RATING_MATRIX).toString(), "--output",
                new Path(prepPath, "similarityMatrix").toUri().toString(), "--numberOfColumns",
                String.valueOf(numberOfUsers), "--similarityClassname", similarityClassname,
                "--maxObservationsPerRow", String.valueOf(maxPrefsInItemSimilarity),
                "--maxObservationsPerColumn", String.valueOf(maxPrefsInItemSimilarity),
                "--maxSimilaritiesPerRow", String.valueOf(maxSimilaritiesPerItem), "--excludeSelfSimilarity",
                String.valueOf(Boolean.TRUE), "--threshold", String.valueOf(threshold), "--randomSeed",
                String.valueOf(randomSeed), "--tempDir", prepPath.toString() });
    } catch (Exception e) {
        e.printStackTrace();
    }

    // write out the similarity matrix if the user specified that behavior
    if (hasOption("outputPathForSimilarityMatrix")) {
        Path outputPathForSimilarityMatrix = new Path(getOption("outputPathForSimilarityMatrix"));

        Job outputSimilarityMatrix = null;
        try {
            outputSimilarityMatrix = prepareJob(getTempPath("similarityMatrix"), outputPathForSimilarityMatrix,
                    SequenceFileInputFormat.class, ItemSimilarityJob.MostSimilarItemPairsMapper.class,
                    EntityEntityWritable.class, DoubleWritable.class,
                    ItemSimilarityJob.MostSimilarItemPairsReducer.class, EntityEntityWritable.class,
                    DoubleWritable.class, TextOutputFormat.class);
        } catch (IOException e) {
            e.printStackTrace();
        }

        // BUGFIX: prepareJob can fail (IOException caught above), leaving the job null;
        // the original dereferenced it unconditionally and threw NullPointerException here.
        if (outputSimilarityMatrix != null) {
            Configuration mostSimilarItemsConf = outputSimilarityMatrix.getConfiguration();
            mostSimilarItemsConf.set(ItemSimilarityJob.ITEM_ID_INDEX_PATH_STR,
                    new Path(getTempPath(DEFAULT_PREPARE_PATH), PreparePreferenceMatrixJob.ITEMID_INDEX)
                            .toString());
            mostSimilarItemsConf.setInt(ItemSimilarityJob.MAX_SIMILARITIES_PER_ITEM, maxSimilaritiesPerItem);
            try {
                outputSimilarityMatrix.waitForCompletion(true);
            } catch (IOException | ClassNotFoundException e) {
                e.printStackTrace();
            } catch (InterruptedException e) {
                // restore the interrupt flag so callers can observe the interruption
                Thread.currentThread().interrupt();
                e.printStackTrace();
            }
        }
    }
    return maxSimilaritiesPerItem;
}

From source file:org.gpfvic.mahout.cf.taste.hadoop.item.RecommenderJob.java

License:Apache License

/**
 * Driver for the distributed item-based recommender. Runs up to four phases,
 * each gated by {@code shouldRunNextPhase}:
 * 1) PreparePreferenceMatrixJob builds user vectors and the rating matrix,
 * 2) RowSimilarityJob computes the item-item similarity (co-occurrence) matrix
 *    (optionally exported as text),
 * 3) a partial-multiply job joins similarity rows with user vectors,
 * 4) an aggregate-and-recommend job emits the top-N recommendations per user.
 *
 * @param args CLI arguments; accepted flags are registered via the addOption/addFlag calls below
 * @return 0 on success, -1 on argument-parse failure or if a phase's job fails
 * @throws Exception propagated from the underlying Hadoop jobs
 */
public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();
    addOption("numRecommendations", "n", "Number of recommendations per user",
            String.valueOf(AggregateAndRecommendReducer.DEFAULT_NUM_RECOMMENDATIONS));
    addOption("usersFile", null, "File of users to recommend for", null);
    addOption("itemsFile", null, "File of items to recommend for", null);
    addOption("filterFile", "f",
            "File containing comma-separated userID,itemID pairs. Used to exclude the item from "
                    + "the recommendations for that user (optional)",
            null);
    addOption("userItemFile", "uif",
            "File containing comma-separated userID,itemID pairs (optional). "
                    + "Used to include only these items into recommendations. "
                    + "Cannot be used together with usersFile or itemsFile",
            null);
    addOption("booleanData", "b", "Treat input as without pref values", Boolean.FALSE.toString());
    addOption("maxPrefsPerUser", "mxp",
            "Maximum number of preferences considered per user in final recommendation phase",
            String.valueOf(UserVectorSplitterMapper.DEFAULT_MAX_PREFS_PER_USER_CONSIDERED));
    addOption("minPrefsPerUser", "mp",
            "ignore users with less preferences than this in the similarity computation " + "(default: "
                    + DEFAULT_MIN_PREFS_PER_USER + ')',
            String.valueOf(DEFAULT_MIN_PREFS_PER_USER));
    addOption("maxSimilaritiesPerItem", "m", "Maximum number of similarities considered per item ",
            String.valueOf(DEFAULT_MAX_SIMILARITIES_PER_ITEM));
    addOption("maxPrefsInItemSimilarity", "mpiis",
            "max number of preferences to consider per user or item in the "
                    + "item similarity computation phase, users or items with more preferences will be sampled down (default: "
                    + DEFAULT_MAX_PREFS + ')',
            String.valueOf(DEFAULT_MAX_PREFS));
    addOption("similarityClassname", "s", "Name of distributed similarity measures class to instantiate, "
            + "alternatively use one of the predefined similarities (" + VectorSimilarityMeasures.list() + ')',
            true);
    addOption("threshold", "tr", "discard item pairs with a similarity value below this", false);
    addOption("outputPathForSimilarityMatrix", "opfsm",
            "write the item similarity matrix to this path (optional)", false);
    addOption("randomSeed", null, "use this seed for sampling", false);
    addFlag("sequencefileOutput", null, "write the output into a SequenceFile instead of a text file");

    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        // parseArguments has already reported the usage error
        return -1;
    }

    Path outputPath = getOutputPath();
    int numRecommendations = Integer.parseInt(getOption("numRecommendations"));
    String usersFile = getOption("usersFile");
    String itemsFile = getOption("itemsFile");
    String filterFile = getOption("filterFile");
    String userItemFile = getOption("userItemFile");
    boolean booleanData = Boolean.valueOf(getOption("booleanData"));
    int maxPrefsPerUser = Integer.parseInt(getOption("maxPrefsPerUser"));
    int minPrefsPerUser = Integer.parseInt(getOption("minPrefsPerUser"));
    int maxPrefsInItemSimilarity = Integer.parseInt(getOption("maxPrefsInItemSimilarity"));
    int maxSimilaritiesPerItem = Integer.parseInt(getOption("maxSimilaritiesPerItem"));
    String similarityClassname = getOption("similarityClassname");
    // NO_THRESHOLD / NO_FIXED_RANDOM_SEED are the "option absent" defaults
    double threshold = hasOption("threshold") ? Double.parseDouble(getOption("threshold"))
            : RowSimilarityJob.NO_THRESHOLD;
    long randomSeed = hasOption("randomSeed") ? Long.parseLong(getOption("randomSeed"))
            : RowSimilarityJob.NO_FIXED_RANDOM_SEED;

    Path prepPath = getTempPath(DEFAULT_PREPARE_PATH);
    Path similarityMatrixPath = getTempPath("similarityMatrix");
    Path explicitFilterPath = getTempPath("explicitFilterPath");
    Path partialMultiplyPath = getTempPath("partialMultiply");

    // phase counter consumed by shouldRunNextPhase; the phase blocks below must stay in order
    AtomicInteger currentPhase = new AtomicInteger();

    int numberOfUsers = -1;

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        ToolRunner.run(getConf(), new PreparePreferenceMatrixJob(),
                new String[] { "--input", getInputPath().toString(), "--output", prepPath.toString(),
                        "--minPrefsPerUser", String.valueOf(minPrefsPerUser), "--booleanData",
                        String.valueOf(booleanData), "--tempDir", getTempPath().toString(), });

        // user count was persisted by the prepare phase as a single-int file
        numberOfUsers = HadoopUtil.readInt(new Path(prepPath, PreparePreferenceMatrixJob.NUM_USERS), getConf());
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {

        /* special behavior if phase 1 is skipped */
        if (numberOfUsers == -1) {
            numberOfUsers = (int) HadoopUtil.countRecords(
                    new Path(prepPath, PreparePreferenceMatrixJob.USER_VECTORS), PathType.LIST, null,
                    getConf());
        }

        //calculate the co-occurrence matrix
        ToolRunner.run(getConf(), new RowSimilarityJob(),
                new String[] { "--input",
                        new Path(prepPath, PreparePreferenceMatrixJob.RATING_MATRIX).toString(), "--output",
                        similarityMatrixPath.toString(), "--numberOfColumns", String.valueOf(numberOfUsers),
                        "--similarityClassname", similarityClassname, "--maxObservationsPerRow",
                        String.valueOf(maxPrefsInItemSimilarity), "--maxObservationsPerColumn",
                        String.valueOf(maxPrefsInItemSimilarity), "--maxSimilaritiesPerRow",
                        String.valueOf(maxSimilaritiesPerItem), "--excludeSelfSimilarity",
                        String.valueOf(Boolean.TRUE), "--threshold", String.valueOf(threshold), "--randomSeed",
                        String.valueOf(randomSeed), "--tempDir", getTempPath().toString(), });

        // write out the similarity matrix if the user specified that behavior
        if (hasOption("outputPathForSimilarityMatrix")) {
            Path outputPathForSimilarityMatrix = new Path(getOption("outputPathForSimilarityMatrix"));

            Job outputSimilarityMatrix = prepareJob(similarityMatrixPath, outputPathForSimilarityMatrix,
                    SequenceFileInputFormat.class, ItemSimilarityJob.MostSimilarItemPairsMapper.class,
                    EntityEntityWritable.class, DoubleWritable.class,
                    ItemSimilarityJob.MostSimilarItemPairsReducer.class, EntityEntityWritable.class,
                    DoubleWritable.class, TextOutputFormat.class);

            Configuration mostSimilarItemsConf = outputSimilarityMatrix.getConfiguration();
            mostSimilarItemsConf.set(ItemSimilarityJob.ITEM_ID_INDEX_PATH_STR,
                    new Path(prepPath, PreparePreferenceMatrixJob.ITEMID_INDEX).toString());
            mostSimilarItemsConf.setInt(ItemSimilarityJob.MAX_SIMILARITIES_PER_ITEM, maxSimilaritiesPerItem);
            outputSimilarityMatrix.waitForCompletion(true);
        }
    }

    //start the multiplication of the co-occurrence matrix by the user vectors
    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job partialMultiply = new Job(getConf(), "partialMultiply");
        Configuration partialMultiplyConf = partialMultiply.getConfiguration();

        // two inputs feeding one reducer: similarity rows and split user vectors
        MultipleInputs.addInputPath(partialMultiply, similarityMatrixPath, SequenceFileInputFormat.class,
                SimilarityMatrixRowWrapperMapper.class);
        MultipleInputs.addInputPath(partialMultiply,
                new Path(prepPath, PreparePreferenceMatrixJob.USER_VECTORS), SequenceFileInputFormat.class,
                UserVectorSplitterMapper.class);
        partialMultiply.setJarByClass(ToVectorAndPrefReducer.class);
        partialMultiply.setMapOutputKeyClass(VarIntWritable.class);
        partialMultiply.setMapOutputValueClass(VectorOrPrefWritable.class);
        partialMultiply.setReducerClass(ToVectorAndPrefReducer.class);
        partialMultiply.setOutputFormatClass(SequenceFileOutputFormat.class);
        partialMultiply.setOutputKeyClass(VarIntWritable.class);
        partialMultiply.setOutputValueClass(VectorAndPrefsWritable.class);
        partialMultiplyConf.setBoolean("mapred.compress.map.output", true);
        partialMultiplyConf.set("mapred.output.dir", partialMultiplyPath.toString());

        if (usersFile != null) {
            partialMultiplyConf.set(UserVectorSplitterMapper.USERS_FILE, usersFile);
        }

        if (userItemFile != null) {
            partialMultiplyConf.set(IDReader.USER_ITEM_FILE, userItemFile);
        }

        partialMultiplyConf.setInt(UserVectorSplitterMapper.MAX_PREFS_PER_USER_CONSIDERED, maxPrefsPerUser);

        boolean succeeded = partialMultiply.waitForCompletion(true);
        if (!succeeded) {
            return -1;
        }
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        //filter out any users we don't care about
        /* convert the user/item pairs to filter if a filterfile has been specified */
        if (filterFile != null) {
            Job itemFiltering = prepareJob(new Path(filterFile), explicitFilterPath, TextInputFormat.class,
                    ItemFilterMapper.class, VarLongWritable.class, VarLongWritable.class,
                    ItemFilterAsVectorAndPrefsReducer.class, VarIntWritable.class, VectorAndPrefsWritable.class,
                    SequenceFileOutputFormat.class);
            boolean succeeded = itemFiltering.waitForCompletion(true);
            if (!succeeded) {
                return -1;
            }
        }

        // comma-separated multi-path input when the filter output is included
        String aggregateAndRecommendInput = partialMultiplyPath.toString();
        if (filterFile != null) {
            aggregateAndRecommendInput += "," + explicitFilterPath;
        }

        Class<? extends OutputFormat> outputFormat = parsedArgs.containsKey("--sequencefileOutput")
                ? SequenceFileOutputFormat.class
                : TextOutputFormat.class;

        //extract out the recommendations
        Job aggregateAndRecommend = prepareJob(new Path(aggregateAndRecommendInput), outputPath,
                SequenceFileInputFormat.class, PartialMultiplyMapper.class, VarLongWritable.class,
                PrefAndSimilarityColumnWritable.class, AggregateAndRecommendReducer.class,
                VarLongWritable.class, RecommendedItemsWritable.class, outputFormat);
        Configuration aggregateAndRecommendConf = aggregateAndRecommend.getConfiguration();
        if (itemsFile != null) {
            aggregateAndRecommendConf.set(AggregateAndRecommendReducer.ITEMS_FILE, itemsFile);
        }

        if (userItemFile != null) {
            aggregateAndRecommendConf.set(IDReader.USER_ITEM_FILE, userItemFile);
        }

        if (filterFile != null) {
            setS3SafeCombinedInputPath(aggregateAndRecommend, getTempPath(), partialMultiplyPath,
                    explicitFilterPath);
        }
        setIOSort(aggregateAndRecommend);
        aggregateAndRecommendConf.set(AggregateAndRecommendReducer.ITEMID_INDEX_PATH,
                new Path(prepPath, PreparePreferenceMatrixJob.ITEMID_INDEX).toString());
        aggregateAndRecommendConf.setInt(AggregateAndRecommendReducer.NUM_RECOMMENDATIONS, numRecommendations);
        aggregateAndRecommendConf.setBoolean(BOOLEAN_DATA, booleanData);
        boolean succeeded = aggregateAndRecommend.waitForCompletion(true);
        if (!succeeded) {
            return -1;
        }
    }

    return 0;
}

From source file:org.gpfvic.mahout.cf.taste.hadoop.similarity.item.ItemSimilarityJob.java

License:Apache License

/**
 * Driver for the distributed item-item similarity computation (variant with
 * observation sampling and an optional fixed random seed). Runs up to three
 * phases gated by {@code shouldRunNextPhase}:
 * 1) PreparePreferenceMatrixJob builds the rating matrix,
 * 2) RowSimilarityJob computes pairwise item similarities,
 * 3) a final job maps similarity-matrix rows back to entity ids as text output.
 *
 * @param args CLI arguments; accepted flags are registered via the addOption calls below
 * @return 0 on success, -1 on argument-parse failure or if the final job fails
 * @throws Exception propagated from the underlying Hadoop jobs
 */
@Override
public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();
    addOption("similarityClassname", "s", "Name of distributed similarity measures class to instantiate, "
            + "alternatively use one of the predefined similarities (" + VectorSimilarityMeasures.list() + ')');
    addOption("maxSimilaritiesPerItem", "m",
            "try to cap the number of similar items per item to this number " + "(default: "
                    + DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM + ')',
            String.valueOf(DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM));
    addOption("maxPrefs", "mppu", "max number of preferences to consider per user or item, "
            + "users or items with more preferences will be sampled down (default: " + DEFAULT_MAX_PREFS + ')',
            String.valueOf(DEFAULT_MAX_PREFS));
    addOption("minPrefsPerUser", "mp",
            "ignore users with less preferences than this " + "(default: " + DEFAULT_MIN_PREFS_PER_USER + ')',
            String.valueOf(DEFAULT_MIN_PREFS_PER_USER));
    addOption("booleanData", "b", "Treat input as without pref values", String.valueOf(Boolean.FALSE));
    addOption("threshold", "tr", "discard item pairs with a similarity value below this", false);
    addOption("randomSeed", null, "use this seed for sampling", false);

    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        // parseArguments has already reported the usage error
        return -1;
    }

    String similarityClassName = getOption("similarityClassname");
    int maxSimilarItemsPerItem = Integer.parseInt(getOption("maxSimilaritiesPerItem"));
    int maxPrefs = Integer.parseInt(getOption("maxPrefs"));
    int minPrefsPerUser = Integer.parseInt(getOption("minPrefsPerUser"));
    boolean booleanData = Boolean.valueOf(getOption("booleanData"));

    // NO_THRESHOLD / NO_FIXED_RANDOM_SEED are the "option absent" defaults
    double threshold = hasOption("threshold") ? Double.parseDouble(getOption("threshold"))
            : RowSimilarityJob.NO_THRESHOLD;
    long randomSeed = hasOption("randomSeed") ? Long.parseLong(getOption("randomSeed"))
            : RowSimilarityJob.NO_FIXED_RANDOM_SEED;

    Path similarityMatrixPath = getTempPath("similarityMatrix");
    Path prepPath = getTempPath("prepareRatingMatrix");

    // phase counter consumed by shouldRunNextPhase; the phase blocks below must stay in order
    AtomicInteger currentPhase = new AtomicInteger();

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        ToolRunner.run(getConf(), new PreparePreferenceMatrixJob(),
                new String[] { "--input", getInputPath().toString(), "--output", prepPath.toString(),
                        "--minPrefsPerUser", String.valueOf(minPrefsPerUser), "--booleanData",
                        String.valueOf(booleanData), "--tempDir", getTempPath().toString(), });
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        // user count was persisted by the prepare phase as a single-int file
        int numberOfUsers = HadoopUtil.readInt(new Path(prepPath, PreparePreferenceMatrixJob.NUM_USERS),
                getConf());

        ToolRunner.run(getConf(), new RowSimilarityJob(), new String[] { "--input",
                new Path(prepPath, PreparePreferenceMatrixJob.RATING_MATRIX).toString(), "--output",
                similarityMatrixPath.toString(), "--numberOfColumns", String.valueOf(numberOfUsers),
                "--similarityClassname", similarityClassName, "--maxObservationsPerRow",
                String.valueOf(maxPrefs), "--maxObservationsPerColumn", String.valueOf(maxPrefs),
                "--maxSimilaritiesPerRow", String.valueOf(maxSimilarItemsPerItem), "--excludeSelfSimilarity",
                String.valueOf(Boolean.TRUE), "--threshold", String.valueOf(threshold), "--randomSeed",
                String.valueOf(randomSeed), "--tempDir", getTempPath().toString(), });
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        // translate similarity-matrix rows back to (itemA, itemB, similarity) text pairs
        Job mostSimilarItems = prepareJob(similarityMatrixPath, getOutputPath(), SequenceFileInputFormat.class,
                MostSimilarItemPairsMapper.class, EntityEntityWritable.class, DoubleWritable.class,
                MostSimilarItemPairsReducer.class, EntityEntityWritable.class, DoubleWritable.class,
                TextOutputFormat.class);
        Configuration mostSimilarItemsConf = mostSimilarItems.getConfiguration();
        mostSimilarItemsConf.set(ITEM_ID_INDEX_PATH_STR,
                new Path(prepPath, PreparePreferenceMatrixJob.ITEMID_INDEX).toString());
        mostSimilarItemsConf.setInt(MAX_SIMILARITIES_PER_ITEM, maxSimilarItemsPerItem);
        boolean succeeded = mostSimilarItems.waitForCompletion(true);
        if (!succeeded) {
            return -1;
        }
    }

    return 0;
}

From source file:org.hf.mls.mahout.cf.taste.hadoop.similarity.item.ItemSimilarityJob.java

License:Apache License

/**
 * Runs the three-phase item-similarity pipeline:
 * (1) prepare the item/user rating matrix from raw preferences,
 * (2) compute the pairwise row-similarity matrix, and
 * (3) map matrix indices back to item ids and emit the most similar item pairs as text.
 *
 * @param args command-line options; see the {@code addOption(...)} calls below for the accepted flags
 * @return 0 on success, -1 on unparsable arguments or any failed phase
 * @throws Exception if a launched Hadoop job throws
 */
@Override
public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();
    addOption("similarityClassname", "s", "Name of distributed similarity measures class to instantiate, "
            + "alternatively use one of the predefined similarities (" + VectorSimilarityMeasures.list() + ')');
    addOption("maxSimilaritiesPerItem", "m",
            "try to cap the number of similar items per item to this number " + "(default: "
                    + DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM + ')',
            String.valueOf(DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM));
    addOption("maxPrefsPerUser", "mppu", "max number of preferences to consider per user, "
            + "users with more preferences will be sampled down (default: " + DEFAULT_MAX_PREFS_PER_USER + ')',
            String.valueOf(DEFAULT_MAX_PREFS_PER_USER));
    addOption("minPrefsPerUser", "mp",
            "ignore users with less preferences than this " + "(default: " + DEFAULT_MIN_PREFS_PER_USER + ')',
            String.valueOf(DEFAULT_MIN_PREFS_PER_USER));
    addOption("booleanData", "b", "Treat input as without pref values", String.valueOf(Boolean.FALSE));
    addOption("threshold", "tr", "discard item pairs with a similarity value below this", false);

    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    String similarityClassName = getOption("similarityClassname");
    int maxSimilarItemsPerItem = Integer.parseInt(getOption("maxSimilaritiesPerItem"));
    int maxPrefsPerUser = Integer.parseInt(getOption("maxPrefsPerUser"));
    int minPrefsPerUser = Integer.parseInt(getOption("minPrefsPerUser"));
    // parseBoolean yields the primitive directly; Boolean.valueOf would box needlessly
    boolean booleanData = Boolean.parseBoolean(getOption("booleanData"));

    double threshold = hasOption("threshold") ? Double.parseDouble(getOption("threshold"))
            : RowSimilarityJob.NO_THRESHOLD;

    Path similarityMatrixPath = getTempPath("similarityMatrix");
    Path prepPath = getTempPath("prepareRatingMatrix");

    AtomicInteger currentPhase = new AtomicInteger();

    // Phase 1: convert raw preferences into the rating matrix used downstream.
    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        int prepExitCode = ToolRunner.run(getConf(), new PreparePreferenceMatrixJob(),
                new String[] { "--input", getInputPath().toString(), "--output", prepPath.toString(),
                        "--maxPrefsPerUser", String.valueOf(maxPrefsPerUser), "--minPrefsPerUser",
                        String.valueOf(minPrefsPerUser), "--booleanData", String.valueOf(booleanData),
                        "--tempDir", getTempPath().toString(), });
        // ToolRunner.run reports failure via a non-zero exit code. Propagate it
        // instead of silently launching the remaining phases — the final phase
        // already checks its job status, so this keeps error handling consistent.
        if (prepExitCode != 0) {
            return -1;
        }
    }

    // Phase 2: pairwise similarities between the rows of the rating matrix.
    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        // Number of users == number of columns of the rating matrix,
        // persisted by the preparation phase.
        int numberOfUsers = HadoopUtil.readInt(new Path(prepPath, PreparePreferenceMatrixJob.NUM_USERS),
                getConf());

        int similarityExitCode = ToolRunner.run(getConf(), new RowSimilarityJob(), new String[] { "--input",
                new Path(prepPath, PreparePreferenceMatrixJob.RATING_MATRIX).toString(), "--output",
                similarityMatrixPath.toString(), "--numberOfColumns", String.valueOf(numberOfUsers),
                "--similarityClassname", similarityClassName, "--maxSimilaritiesPerRow",
                String.valueOf(maxSimilarItemsPerItem), "--excludeSelfSimilarity", String.valueOf(Boolean.TRUE),
                "--threshold", String.valueOf(threshold), "--tempDir", getTempPath().toString(), });
        if (similarityExitCode != 0) {
            return -1;
        }
    }

    // Phase 3: translate matrix indices back to item ids and write the
    // top similar item pairs as text output.
    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job mostSimilarItems = prepareJob(similarityMatrixPath, getOutputPath(), SequenceFileInputFormat.class,
                MostSimilarItemPairsMapper.class, EntityEntityWritable.class, DoubleWritable.class,
                MostSimilarItemPairsReducer.class, EntityEntityWritable.class, DoubleWritable.class,
                TextOutputFormat.class);
        Configuration mostSimilarItemsConf = mostSimilarItems.getConfiguration();
        mostSimilarItemsConf.set(ITEM_ID_INDEX_PATH_STR,
                new Path(prepPath, PreparePreferenceMatrixJob.ITEMID_INDEX).toString());
        mostSimilarItemsConf.setInt(MAX_SIMILARITIES_PER_ITEM, maxSimilarItemsPerItem);
        boolean succeeded = mostSimilarItems.waitForCompletion(true);
        if (!succeeded) {
            return -1;
        }
    }

    return 0;
}