Example usage of the list() method of org.apache.mahout.math.hadoop.similarity.cooccurrence.measures.VectorSimilarityMeasures

Introduction

On this page you can find example usages of the list() method of org.apache.mahout.math.hadoop.similarity.cooccurrence.measures.VectorSimilarityMeasures.

Prototype

public static String list()

Source Link

Usage

From source file:com.pocketx.gravity.recommender.cf.similarity.job.ItemSimilarityJob.java

License:Apache License

/**
 * Registers and parses the command-line options, then runs the item-similarity pipeline in up to
 * three phases, each gated by {@code shouldRunNextPhase}:
 * <ol>
 *   <li>{@code PreparePreferenceMatrixJob}, writing to the {@code prepareRatingMatrix} temp path;</li>
 *   <li>{@code RowSimilarityJob}, writing to the {@code similarityMatrix} temp path;</li>
 *   <li>a job built from {@code MostSimilarItemPairsMapper} and {@code MostSimilarItemPairsReducer}
 *       that writes text output to the job's output path.</li>
 * </ol>
 *
 * @param args command-line arguments
 * @return 0 on success; -1 if the arguments could not be parsed or any phase reported failure
 * @throws Exception if option parsing or one of the launched jobs throws
 */
@Override
public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();
    addOption("similarityClassname", "s", "Name of distributed similarity measures class to instantiate, "
            + "alternatively use one of the predefined similarities (" + VectorSimilarityMeasures.list() + ')');
    addOption("maxSimilaritiesPerItem", "m",
            "try to cap the number of similar items per item to this number " + "(default: "
                    + DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM + ')',
            String.valueOf(DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM));
    addOption("maxPrefsPerUser", "mppu", "max number of preferences to consider per user, "
            + "users with more preferences will be sampled down (default: " + DEFAULT_MAX_PREFS_PER_USER + ')',
            String.valueOf(DEFAULT_MAX_PREFS_PER_USER));
    addOption("minPrefsPerUser", "mp",
            "ignore users with less preferences than this " + "(default: " + DEFAULT_MIN_PREFS_PER_USER + ')',
            String.valueOf(DEFAULT_MIN_PREFS_PER_USER));
    addOption("booleanData", "b", "Treat input as without pref values", String.valueOf(Boolean.FALSE));
    addOption("threshold", "tr", "discard item pairs with a similarity value below this", false);

    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    String similarityClassName = getOption("similarityClassname");
    int maxSimilarItemsPerItem = Integer.parseInt(getOption("maxSimilaritiesPerItem"));
    int maxPrefsPerUser = Integer.parseInt(getOption("maxPrefsPerUser"));
    int minPrefsPerUser = Integer.parseInt(getOption("minPrefsPerUser"));
    boolean booleanData = Boolean.parseBoolean(getOption("booleanData"));

    // Fall back to the "no threshold" sentinel when the option was not supplied.
    double threshold = hasOption("threshold") ? Double.parseDouble(getOption("threshold"))
            : RowSimilarityJob.NO_THRESHOLD;

    Path similarityMatrixPath = getTempPath("similarityMatrix");
    Path prepPath = getTempPath("prepareRatingMatrix");

    AtomicInteger currentPhase = new AtomicInteger();

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        int prepareExitCode = ToolRunner.run(getConf(), new PreparePreferenceMatrixJob(),
                new String[] { "--input", getInputPath().toString(), "--output", prepPath.toString(),
                        "--maxPrefsPerUser", String.valueOf(maxPrefsPerUser), "--minPrefsPerUser",
                        String.valueOf(minPrefsPerUser), "--booleanData", String.valueOf(booleanData),
                        "--tempDir", getTempPath().toString() });
        // Do not continue with later phases when the preparation step reported failure.
        if (prepareExitCode != 0) {
            return -1;
        }
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        int numberOfUsers = HadoopUtil.readInt(new Path(prepPath, PreparePreferenceMatrixJob.NUM_USERS),
                getConf());

        int similarityExitCode = ToolRunner.run(getConf(), new RowSimilarityJob(), new String[] { "--input",
                new Path(prepPath, PreparePreferenceMatrixJob.RATING_MATRIX).toString(), "--output",
                similarityMatrixPath.toString(), "--numberOfColumns", String.valueOf(numberOfUsers),
                "--similarityClassname", similarityClassName, "--maxSimilaritiesPerRow",
                String.valueOf(maxSimilarItemsPerItem), "--excludeSelfSimilarity", String.valueOf(Boolean.TRUE),
                "--threshold", String.valueOf(threshold), "--tempDir", getTempPath().toString() });
        // The final phase reads the similarity matrix, so stop if it was not produced.
        if (similarityExitCode != 0) {
            return -1;
        }
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job mostSimilarItems = prepareJob(similarityMatrixPath, getOutputPath(), SequenceFileInputFormat.class,
                MostSimilarItemPairsMapper.class, EntityEntityWritable.class, DoubleWritable.class,
                MostSimilarItemPairsReducer.class, EntityEntityWritable.class, DoubleWritable.class,
                TextOutputFormat.class);
        Configuration mostSimilarItemsConf = mostSimilarItems.getConfiguration();
        mostSimilarItemsConf.set(ITEM_ID_INDEX_PATH_STR,
                new Path(prepPath, PreparePreferenceMatrixJob.ITEMID_INDEX).toString());
        mostSimilarItemsConf.setInt(MAX_SIMILARITIES_PER_ITEM, maxSimilarItemsPerItem);
        boolean succeeded = mostSimilarItems.waitForCompletion(true);
        if (!succeeded) {
            return -1;
        }
    }

    return 0;
}

From source file:com.pocketx.gravity.recommender.cf.similarity.job.RowSimilarityJob.java

License:Apache License

/**
 * Registers and parses the command-line options, then computes row similarities in up to three
 * phases, each gated by {@code shouldRunNextPhase}:
 * <ol>
 *   <li>{@code normsAndTranspose}: {@code VectorNormMapper} / {@code MergeVectorsReducer}, reading the
 *       input path and writing to the {@code weights} temp path;</li>
 *   <li>{@code pairwiseSimilarity}: {@code CooccurrencesMapper} / {@code SimilarityReducer}, writing to
 *       the {@code pairwiseSimilarity} temp path;</li>
 *   <li>{@code asMatrix}: {@code UnsymmetrifyMapper} / {@code MergeToTopKSimilaritiesReducer}, writing
 *       to the job's output path.</li>
 * </ol>
 *
 * @param args command-line arguments
 * @return 0 on success; -1 if the arguments could not be parsed or a phase did not complete successfully
 * @throws IllegalArgumentException if no {@code similarityClassname} value is available
 * @throws Exception if option parsing or one of the launched jobs throws
 */
@Override
public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();
    addOption("numberOfColumns", "r", "Number of columns in the input matrix", false);
    addOption("similarityClassname", "s",
            "Name of distributed similarity class to instantiate, alternatively use "
                    + "one of the predefined similarities (" + VectorSimilarityMeasures.list() + ')');
    addOption("maxSimilaritiesPerRow", "m",
            "Number of maximum similarities per row (default: " + DEFAULT_MAX_SIMILARITIES_PER_ROW + ')',
            String.valueOf(DEFAULT_MAX_SIMILARITIES_PER_ROW));
    addOption("excludeSelfSimilarity", "ess", "compute similarity of rows to themselves?",
            String.valueOf(false));
    addOption("threshold", "tr", "discard row pairs with a similarity value below this", false);
    addOption(DefaultOptionCreator.overwriteOption().create());

    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    int numberOfColumns;

    if (hasOption("numberOfColumns")) {
        // Number of columns explicitly specified via CLI
        numberOfColumns = Integer.parseInt(getOption("numberOfColumns"));
    } else {
        // else get the number of columns by determining the cardinality of a vector in the input matrix
        numberOfColumns = getDimensions(getInputPath());
    }

    String similarityClassnameArg = getOption("similarityClassname");
    if (similarityClassnameArg == null) {
        // The option has no default; Enum.valueOf(null) would throw a bare NullPointerException
        // that the IllegalArgumentException handler below does not catch.
        throw new IllegalArgumentException("Missing similarityClassname: specify a class name or "
                + "one of the predefined similarities (" + VectorSimilarityMeasures.list() + ')');
    }
    String similarityClassname;
    try {
        // Accept the name of a predefined measure and translate it to its implementing class name...
        similarityClassname = VectorSimilarityMeasures.valueOf(similarityClassnameArg).getClassname();
    } catch (IllegalArgumentException iae) {
        // ...otherwise treat the argument itself as the class name.
        similarityClassname = similarityClassnameArg;
    }

    // Clear the output and temp paths if the overwrite option has been set
    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
        // Clear the temp path
        HadoopUtil.delete(getConf(), getTempPath());
        // Clear the output path
        HadoopUtil.delete(getConf(), getOutputPath());
    }

    int maxSimilaritiesPerRow = Integer.parseInt(getOption("maxSimilaritiesPerRow"));
    boolean excludeSelfSimilarity = Boolean.parseBoolean(getOption("excludeSelfSimilarity"));
    double threshold = hasOption("threshold") ? Double.parseDouble(getOption("threshold")) : NO_THRESHOLD;

    Path weightsPath = getTempPath("weights");
    Path normsPath = getTempPath("norms.bin");
    Path numNonZeroEntriesPath = getTempPath("numNonZeroEntries.bin");
    Path maxValuesPath = getTempPath("maxValues.bin");
    Path pairwiseSimilarityPath = getTempPath("pairwiseSimilarity");

    AtomicInteger currentPhase = new AtomicInteger();

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job normsAndTranspose = prepareJob(getInputPath(), weightsPath, VectorNormMapper.class,
                IntWritable.class, VectorWritable.class, MergeVectorsReducer.class, IntWritable.class,
                VectorWritable.class);
        normsAndTranspose.setCombinerClass(MergeVectorsCombiner.class);
        Configuration normsAndTransposeConf = normsAndTranspose.getConfiguration();
        normsAndTransposeConf.set(THRESHOLD, String.valueOf(threshold));
        normsAndTransposeConf.set(NORMS_PATH, normsPath.toString());
        normsAndTransposeConf.set(NUM_NON_ZERO_ENTRIES_PATH, numNonZeroEntriesPath.toString());
        normsAndTransposeConf.set(MAXVALUES_PATH, maxValuesPath.toString());
        normsAndTransposeConf.set(SIMILARITY_CLASSNAME, similarityClassname);
        boolean succeeded = normsAndTranspose.waitForCompletion(true);
        if (!succeeded) {
            return -1;
        }
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job pairwiseSimilarity = prepareJob(weightsPath, pairwiseSimilarityPath, CooccurrencesMapper.class,
                IntWritable.class, VectorWritable.class, SimilarityReducer.class, IntWritable.class,
                VectorWritable.class);
        pairwiseSimilarity.setCombinerClass(VectorSumReducer.class);
        Configuration pairwiseConf = pairwiseSimilarity.getConfiguration();
        pairwiseConf.set(THRESHOLD, String.valueOf(threshold));
        pairwiseConf.set(NORMS_PATH, normsPath.toString());
        pairwiseConf.set(NUM_NON_ZERO_ENTRIES_PATH, numNonZeroEntriesPath.toString());
        pairwiseConf.set(MAXVALUES_PATH, maxValuesPath.toString());
        pairwiseConf.set(SIMILARITY_CLASSNAME, similarityClassname);
        pairwiseConf.setInt(NUMBER_OF_COLUMNS, numberOfColumns);
        pairwiseConf.setBoolean(EXCLUDE_SELF_SIMILARITY, excludeSelfSimilarity);
        boolean succeeded = pairwiseSimilarity.waitForCompletion(true);
        if (!succeeded) {
            return -1;
        }
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job asMatrix = prepareJob(pairwiseSimilarityPath, getOutputPath(), UnsymmetrifyMapper.class,
                IntWritable.class, VectorWritable.class, MergeToTopKSimilaritiesReducer.class,
                IntWritable.class, VectorWritable.class);
        // The same class serves as both combiner and reducer for this phase.
        asMatrix.setCombinerClass(MergeToTopKSimilaritiesReducer.class);
        asMatrix.getConfiguration().setInt(MAX_SIMILARITIES_PER_ROW, maxSimilaritiesPerRow);
        boolean succeeded = asMatrix.waitForCompletion(true);
        if (!succeeded) {
            return -1;
        }
    }

    return 0;
}

From source file:hadoop.api.RecommenderJob.java

License:Apache License

/**
 * Registers the recommender's command-line options, parses {@code args} into {@code parsedArgs}
 * (declared outside this method), and reads the requested number of recommendations.
 *
 * @param args command-line arguments (input path, output path, booleanData, similarity, ...)
 * @return the integer value of the {@code numRecommendations} option, or -1 if the arguments
 *         could not be parsed
 * @throws IOException declared by the signature; presumably raised while parsing the arguments
 */
public int prepareRecommender(String[] args) throws IOException {
    addInputOption();
    addOutputOption();
    addOption("numRecommendations", "n", "Number of recommendations per user",
            String.valueOf(hadoop.api.AggregateAndRecommendReducer.DEFAULT_NUM_RECOMMENDATIONS));
    addOption("usersFile", null, "File of users to recommend for", null);
    addOption("itemsFile", null, "File of items to recommend for", null);
    addOption("filterFile", "f",
            "File containing comma-separated userID,itemID pairs. Used to exclude the item from "
                    + "the recommendations for that user (optional)",
            null);
    addOption("booleanData", "b", "Treat input as without pref values", Boolean.FALSE.toString());
    addOption("maxPrefsPerUser", "mxp",
            "Maximum number of preferences considered per user in final recommendation phase",
            String.valueOf(UserVectorSplitterMapper.DEFAULT_MAX_PREFS_PER_USER_CONSIDERED));
    addOption("minPrefsPerUser", "mp",
            "ignore users with less preferences than this in the similarity computation " + "(default: "
                    + DEFAULT_MIN_PREFS_PER_USER + ')',
            String.valueOf(DEFAULT_MIN_PREFS_PER_USER));
    addOption("maxSimilaritiesPerItem", "m", "Maximum number of similarities considered per item ",
            String.valueOf(DEFAULT_MAX_SIMILARITIES_PER_ITEM));
    addOption("maxPrefsInItemSimilarity", "mpiis",
            "max number of preferences to consider per user or item in the "
                    + "item similarity computation phase, users or items with more preferences will be sampled down (default: "
                    + DEFAULT_MAX_PREFS + ')',
            String.valueOf(DEFAULT_MAX_PREFS));
    addOption("similarityClassname", "s", "Name of distributed similarity measures class to instantiate, "
            + "alternatively use one of the predefined similarities (" + VectorSimilarityMeasures.list() + ')',
            true);
    addOption("threshold", "tr", "discard item pairs with a similarity value below this", false);
    addOption("outputPathForSimilarityMatrix", "opfsm",
            "write the item similarity matrix to this path (optional)", false);
    addOption("randomSeed", null, "use this seed for sampling", false);
    //addOption("outputType", "ot", "Output Type", "TextOutputFormat");
    addFlag("sequencefileOutput", null, "write the output into a SequenceFile instead of a text file");

    parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        // Same failure sentinel as the job drivers; callers must not treat it as a count.
        return -1;
    }

    return Integer.parseInt(getOption("numRecommendations"));
}

From source file:org.gpfvic.mahout.cf.taste.hadoop.item.RecommenderJob.java

License:Apache License

/**
 * Registers and parses the command-line options, then runs the item-based recommendation
 * pipeline in up to four phases, each gated by {@code shouldRunNextPhase}:
 * <ol>
 *   <li>{@code PreparePreferenceMatrixJob}, writing to the prepare temp path, after which the
 *       number of users is read back;</li>
 *   <li>{@code RowSimilarityJob}, writing to the {@code similarityMatrix} temp path, optionally
 *       followed by a job that writes the similarity matrix as text to
 *       {@code outputPathForSimilarityMatrix};</li>
 *   <li>the {@code partialMultiply} job, combining the similarity matrix with the user vectors;</li>
 *   <li>an optional item-filter job (when {@code filterFile} is given) and the final
 *       aggregate-and-recommend job that writes to the output path.</li>
 * </ol>
 *
 * @param args command-line arguments
 * @return 0 on success; -1 if the arguments could not be parsed or any launched job reported failure
 * @throws Exception if option parsing or one of the launched jobs throws
 */
public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();
    addOption("numRecommendations", "n", "Number of recommendations per user",
            String.valueOf(AggregateAndRecommendReducer.DEFAULT_NUM_RECOMMENDATIONS));
    addOption("usersFile", null, "File of users to recommend for", null);
    addOption("itemsFile", null, "File of items to recommend for", null);
    addOption("filterFile", "f",
            "File containing comma-separated userID,itemID pairs. Used to exclude the item from "
                    + "the recommendations for that user (optional)",
            null);
    addOption("userItemFile", "uif",
            "File containing comma-separated userID,itemID pairs (optional). "
                    + "Used to include only these items into recommendations. "
                    + "Cannot be used together with usersFile or itemsFile",
            null);
    addOption("booleanData", "b", "Treat input as without pref values", Boolean.FALSE.toString());
    addOption("maxPrefsPerUser", "mxp",
            "Maximum number of preferences considered per user in final recommendation phase",
            String.valueOf(UserVectorSplitterMapper.DEFAULT_MAX_PREFS_PER_USER_CONSIDERED));
    addOption("minPrefsPerUser", "mp",
            "ignore users with less preferences than this in the similarity computation " + "(default: "
                    + DEFAULT_MIN_PREFS_PER_USER + ')',
            String.valueOf(DEFAULT_MIN_PREFS_PER_USER));
    addOption("maxSimilaritiesPerItem", "m", "Maximum number of similarities considered per item ",
            String.valueOf(DEFAULT_MAX_SIMILARITIES_PER_ITEM));
    addOption("maxPrefsInItemSimilarity", "mpiis",
            "max number of preferences to consider per user or item in the "
                    + "item similarity computation phase, users or items with more preferences will be sampled down (default: "
                    + DEFAULT_MAX_PREFS + ')',
            String.valueOf(DEFAULT_MAX_PREFS));
    addOption("similarityClassname", "s", "Name of distributed similarity measures class to instantiate, "
            + "alternatively use one of the predefined similarities (" + VectorSimilarityMeasures.list() + ')',
            true);
    addOption("threshold", "tr", "discard item pairs with a similarity value below this", false);
    addOption("outputPathForSimilarityMatrix", "opfsm",
            "write the item similarity matrix to this path (optional)", false);
    addOption("randomSeed", null, "use this seed for sampling", false);
    addFlag("sequencefileOutput", null, "write the output into a SequenceFile instead of a text file");

    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    Path outputPath = getOutputPath();
    int numRecommendations = Integer.parseInt(getOption("numRecommendations"));
    String usersFile = getOption("usersFile");
    String itemsFile = getOption("itemsFile");
    String filterFile = getOption("filterFile");
    String userItemFile = getOption("userItemFile");
    boolean booleanData = Boolean.parseBoolean(getOption("booleanData"));
    int maxPrefsPerUser = Integer.parseInt(getOption("maxPrefsPerUser"));
    int minPrefsPerUser = Integer.parseInt(getOption("minPrefsPerUser"));
    int maxPrefsInItemSimilarity = Integer.parseInt(getOption("maxPrefsInItemSimilarity"));
    int maxSimilaritiesPerItem = Integer.parseInt(getOption("maxSimilaritiesPerItem"));
    String similarityClassname = getOption("similarityClassname");
    // Optional numeric options fall back to the sentinels defined by RowSimilarityJob.
    double threshold = hasOption("threshold") ? Double.parseDouble(getOption("threshold"))
            : RowSimilarityJob.NO_THRESHOLD;
    long randomSeed = hasOption("randomSeed") ? Long.parseLong(getOption("randomSeed"))
            : RowSimilarityJob.NO_FIXED_RANDOM_SEED;

    Path prepPath = getTempPath(DEFAULT_PREPARE_PATH);
    Path similarityMatrixPath = getTempPath("similarityMatrix");
    Path explicitFilterPath = getTempPath("explicitFilterPath");
    Path partialMultiplyPath = getTempPath("partialMultiply");

    AtomicInteger currentPhase = new AtomicInteger();

    // -1 marks "not yet known"; it is filled in by the preparation phase or lazily below.
    int numberOfUsers = -1;

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        int prepareExitCode = ToolRunner.run(getConf(), new PreparePreferenceMatrixJob(),
                new String[] { "--input", getInputPath().toString(), "--output", prepPath.toString(),
                        "--minPrefsPerUser", String.valueOf(minPrefsPerUser), "--booleanData",
                        String.valueOf(booleanData), "--tempDir", getTempPath().toString() });
        // Do not read the user count or run later phases when preparation reported failure.
        if (prepareExitCode != 0) {
            return -1;
        }

        numberOfUsers = HadoopUtil.readInt(new Path(prepPath, PreparePreferenceMatrixJob.NUM_USERS), getConf());
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {

        /* special behavior if the preparation phase was skipped: count the user vectors instead */
        if (numberOfUsers == -1) {
            numberOfUsers = (int) HadoopUtil.countRecords(
                    new Path(prepPath, PreparePreferenceMatrixJob.USER_VECTORS), PathType.LIST, null,
                    getConf());
        }

        //calculate the co-occurrence matrix
        int similarityExitCode = ToolRunner.run(getConf(), new RowSimilarityJob(),
                new String[] { "--input",
                        new Path(prepPath, PreparePreferenceMatrixJob.RATING_MATRIX).toString(), "--output",
                        similarityMatrixPath.toString(), "--numberOfColumns", String.valueOf(numberOfUsers),
                        "--similarityClassname", similarityClassname, "--maxObservationsPerRow",
                        String.valueOf(maxPrefsInItemSimilarity), "--maxObservationsPerColumn",
                        String.valueOf(maxPrefsInItemSimilarity), "--maxSimilaritiesPerRow",
                        String.valueOf(maxSimilaritiesPerItem), "--excludeSelfSimilarity",
                        String.valueOf(Boolean.TRUE), "--threshold", String.valueOf(threshold), "--randomSeed",
                        String.valueOf(randomSeed), "--tempDir", getTempPath().toString() });
        // Later phases read the similarity matrix, so stop if it was not produced.
        if (similarityExitCode != 0) {
            return -1;
        }

        // write out the similarity matrix if the user specified that behavior
        if (hasOption("outputPathForSimilarityMatrix")) {
            Path outputPathForSimilarityMatrix = new Path(getOption("outputPathForSimilarityMatrix"));

            Job outputSimilarityMatrix = prepareJob(similarityMatrixPath, outputPathForSimilarityMatrix,
                    SequenceFileInputFormat.class, ItemSimilarityJob.MostSimilarItemPairsMapper.class,
                    EntityEntityWritable.class, DoubleWritable.class,
                    ItemSimilarityJob.MostSimilarItemPairsReducer.class, EntityEntityWritable.class,
                    DoubleWritable.class, TextOutputFormat.class);

            Configuration mostSimilarItemsConf = outputSimilarityMatrix.getConfiguration();
            mostSimilarItemsConf.set(ItemSimilarityJob.ITEM_ID_INDEX_PATH_STR,
                    new Path(prepPath, PreparePreferenceMatrixJob.ITEMID_INDEX).toString());
            mostSimilarItemsConf.setInt(ItemSimilarityJob.MAX_SIMILARITIES_PER_ITEM, maxSimilaritiesPerItem);
            // The user explicitly asked for this output, so report its failure like any other job.
            boolean succeeded = outputSimilarityMatrix.waitForCompletion(true);
            if (!succeeded) {
                return -1;
            }
        }
    }

    //start the multiplication of the co-occurrence matrix by the user vectors
    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job partialMultiply = new Job(getConf(), "partialMultiply");
        Configuration partialMultiplyConf = partialMultiply.getConfiguration();

        // Two inputs with different mappers feed the same reducer.
        MultipleInputs.addInputPath(partialMultiply, similarityMatrixPath, SequenceFileInputFormat.class,
                SimilarityMatrixRowWrapperMapper.class);
        MultipleInputs.addInputPath(partialMultiply,
                new Path(prepPath, PreparePreferenceMatrixJob.USER_VECTORS), SequenceFileInputFormat.class,
                UserVectorSplitterMapper.class);
        partialMultiply.setJarByClass(ToVectorAndPrefReducer.class);
        partialMultiply.setMapOutputKeyClass(VarIntWritable.class);
        partialMultiply.setMapOutputValueClass(VectorOrPrefWritable.class);
        partialMultiply.setReducerClass(ToVectorAndPrefReducer.class);
        partialMultiply.setOutputFormatClass(SequenceFileOutputFormat.class);
        partialMultiply.setOutputKeyClass(VarIntWritable.class);
        partialMultiply.setOutputValueClass(VectorAndPrefsWritable.class);
        partialMultiplyConf.setBoolean("mapred.compress.map.output", true);
        partialMultiplyConf.set("mapred.output.dir", partialMultiplyPath.toString());

        if (usersFile != null) {
            partialMultiplyConf.set(UserVectorSplitterMapper.USERS_FILE, usersFile);
        }

        if (userItemFile != null) {
            partialMultiplyConf.set(IDReader.USER_ITEM_FILE, userItemFile);
        }

        partialMultiplyConf.setInt(UserVectorSplitterMapper.MAX_PREFS_PER_USER_CONSIDERED, maxPrefsPerUser);

        boolean succeeded = partialMultiply.waitForCompletion(true);
        if (!succeeded) {
            return -1;
        }
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        //filter out any users we don't care about
        /* convert the user/item pairs to filter if a filterfile has been specified */
        if (filterFile != null) {
            Job itemFiltering = prepareJob(new Path(filterFile), explicitFilterPath, TextInputFormat.class,
                    ItemFilterMapper.class, VarLongWritable.class, VarLongWritable.class,
                    ItemFilterAsVectorAndPrefsReducer.class, VarIntWritable.class, VectorAndPrefsWritable.class,
                    SequenceFileOutputFormat.class);
            boolean succeeded = itemFiltering.waitForCompletion(true);
            if (!succeeded) {
                return -1;
            }
        }

        String aggregateAndRecommendInput = partialMultiplyPath.toString();
        if (filterFile != null) {
            aggregateAndRecommendInput += "," + explicitFilterPath;
        }

        Class<? extends OutputFormat> outputFormat = parsedArgs.containsKey("--sequencefileOutput")
                ? SequenceFileOutputFormat.class
                : TextOutputFormat.class;

        //extract out the recommendations
        Job aggregateAndRecommend = prepareJob(new Path(aggregateAndRecommendInput), outputPath,
                SequenceFileInputFormat.class, PartialMultiplyMapper.class, VarLongWritable.class,
                PrefAndSimilarityColumnWritable.class, AggregateAndRecommendReducer.class,
                VarLongWritable.class, RecommendedItemsWritable.class, outputFormat);
        Configuration aggregateAndRecommendConf = aggregateAndRecommend.getConfiguration();
        if (itemsFile != null) {
            aggregateAndRecommendConf.set(AggregateAndRecommendReducer.ITEMS_FILE, itemsFile);
        }

        if (userItemFile != null) {
            aggregateAndRecommendConf.set(IDReader.USER_ITEM_FILE, userItemFile);
        }

        if (filterFile != null) {
            setS3SafeCombinedInputPath(aggregateAndRecommend, getTempPath(), partialMultiplyPath,
                    explicitFilterPath);
        }
        setIOSort(aggregateAndRecommend);
        aggregateAndRecommendConf.set(AggregateAndRecommendReducer.ITEMID_INDEX_PATH,
                new Path(prepPath, PreparePreferenceMatrixJob.ITEMID_INDEX).toString());
        aggregateAndRecommendConf.setInt(AggregateAndRecommendReducer.NUM_RECOMMENDATIONS, numRecommendations);
        aggregateAndRecommendConf.setBoolean(BOOLEAN_DATA, booleanData);
        boolean succeeded = aggregateAndRecommend.waitForCompletion(true);
        if (!succeeded) {
            return -1;
        }
    }

    return 0;
}

From source file:org.gpfvic.mahout.cf.taste.hadoop.similarity.item.ItemSimilarityJob.java

License:Apache License

/**
 * Registers and parses the command-line options, then runs the item-similarity pipeline in up to
 * three phases, each gated by {@code shouldRunNextPhase}:
 * <ol>
 *   <li>{@code PreparePreferenceMatrixJob}, writing to the {@code prepareRatingMatrix} temp path;</li>
 *   <li>{@code RowSimilarityJob}, writing to the {@code similarityMatrix} temp path;</li>
 *   <li>a job built from {@code MostSimilarItemPairsMapper} and {@code MostSimilarItemPairsReducer}
 *       that writes text output to the job's output path.</li>
 * </ol>
 *
 * @param args command-line arguments
 * @return 0 on success; -1 if the arguments could not be parsed or any phase reported failure
 * @throws Exception if option parsing or one of the launched jobs throws
 */
@Override
public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();
    addOption("similarityClassname", "s", "Name of distributed similarity measures class to instantiate, "
            + "alternatively use one of the predefined similarities (" + VectorSimilarityMeasures.list() + ')');
    addOption("maxSimilaritiesPerItem", "m",
            "try to cap the number of similar items per item to this number " + "(default: "
                    + DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM + ')',
            String.valueOf(DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM));
    addOption("maxPrefs", "mppu", "max number of preferences to consider per user or item, "
            + "users or items with more preferences will be sampled down (default: " + DEFAULT_MAX_PREFS + ')',
            String.valueOf(DEFAULT_MAX_PREFS));
    addOption("minPrefsPerUser", "mp",
            "ignore users with less preferences than this " + "(default: " + DEFAULT_MIN_PREFS_PER_USER + ')',
            String.valueOf(DEFAULT_MIN_PREFS_PER_USER));
    addOption("booleanData", "b", "Treat input as without pref values", String.valueOf(Boolean.FALSE));
    addOption("threshold", "tr", "discard item pairs with a similarity value below this", false);
    addOption("randomSeed", null, "use this seed for sampling", false);

    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    String similarityClassName = getOption("similarityClassname");
    int maxSimilarItemsPerItem = Integer.parseInt(getOption("maxSimilaritiesPerItem"));
    int maxPrefs = Integer.parseInt(getOption("maxPrefs"));
    int minPrefsPerUser = Integer.parseInt(getOption("minPrefsPerUser"));
    boolean booleanData = Boolean.parseBoolean(getOption("booleanData"));

    // Optional numeric options fall back to the sentinels defined by RowSimilarityJob.
    double threshold = hasOption("threshold") ? Double.parseDouble(getOption("threshold"))
            : RowSimilarityJob.NO_THRESHOLD;
    long randomSeed = hasOption("randomSeed") ? Long.parseLong(getOption("randomSeed"))
            : RowSimilarityJob.NO_FIXED_RANDOM_SEED;

    Path similarityMatrixPath = getTempPath("similarityMatrix");
    Path prepPath = getTempPath("prepareRatingMatrix");

    AtomicInteger currentPhase = new AtomicInteger();

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        int prepareExitCode = ToolRunner.run(getConf(), new PreparePreferenceMatrixJob(),
                new String[] { "--input", getInputPath().toString(), "--output", prepPath.toString(),
                        "--minPrefsPerUser", String.valueOf(minPrefsPerUser), "--booleanData",
                        String.valueOf(booleanData), "--tempDir", getTempPath().toString() });
        // Do not continue with later phases when the preparation step reported failure.
        if (prepareExitCode != 0) {
            return -1;
        }
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        int numberOfUsers = HadoopUtil.readInt(new Path(prepPath, PreparePreferenceMatrixJob.NUM_USERS),
                getConf());

        // maxPrefs caps observations for both rows and columns of the rating matrix.
        int similarityExitCode = ToolRunner.run(getConf(), new RowSimilarityJob(), new String[] { "--input",
                new Path(prepPath, PreparePreferenceMatrixJob.RATING_MATRIX).toString(), "--output",
                similarityMatrixPath.toString(), "--numberOfColumns", String.valueOf(numberOfUsers),
                "--similarityClassname", similarityClassName, "--maxObservationsPerRow",
                String.valueOf(maxPrefs), "--maxObservationsPerColumn", String.valueOf(maxPrefs),
                "--maxSimilaritiesPerRow", String.valueOf(maxSimilarItemsPerItem), "--excludeSelfSimilarity",
                String.valueOf(Boolean.TRUE), "--threshold", String.valueOf(threshold), "--randomSeed",
                String.valueOf(randomSeed), "--tempDir", getTempPath().toString() });
        // The final phase reads the similarity matrix, so stop if it was not produced.
        if (similarityExitCode != 0) {
            return -1;
        }
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job mostSimilarItems = prepareJob(similarityMatrixPath, getOutputPath(), SequenceFileInputFormat.class,
                MostSimilarItemPairsMapper.class, EntityEntityWritable.class, DoubleWritable.class,
                MostSimilarItemPairsReducer.class, EntityEntityWritable.class, DoubleWritable.class,
                TextOutputFormat.class);
        Configuration mostSimilarItemsConf = mostSimilarItems.getConfiguration();
        mostSimilarItemsConf.set(ITEM_ID_INDEX_PATH_STR,
                new Path(prepPath, PreparePreferenceMatrixJob.ITEMID_INDEX).toString());
        mostSimilarItemsConf.setInt(MAX_SIMILARITIES_PER_ITEM, maxSimilarItemsPerItem);
        boolean succeeded = mostSimilarItems.waitForCompletion(true);
        if (!succeeded) {
            return -1;
        }
    }

    return 0;
}

From source file:org.hf.mls.mahout.cf.taste.hadoop.similarity.item.ItemSimilarityJob.java

License:Apache License

/**
 * Computes item-item similarities in up to three phases, each of which is
 * executed only when {@code shouldRunNextPhase} allows it:
 * <ol>
 *   <li>{@code PreparePreferenceMatrixJob} turns the input preferences into a
 *       rating matrix under the {@code prepareRatingMatrix} temp directory;</li>
 *   <li>{@code RowSimilarityJob} computes the pairwise similarities of the rows of
 *       that matrix into the {@code similarityMatrix} temp directory;</li>
 *   <li>a final MapReduce job ({@code MostSimilarItemPairsMapper} /
 *       {@code MostSimilarItemPairsReducer}) writes the result as text to the
 *       output path.</li>
 * </ol>
 *
 * @param args command-line arguments; the accepted options are the ones registered
 *             at the top of this method
 * @return {@code 0} on success; {@code -1} if the arguments could not be parsed or
 *         if any executed phase reported a failure
 * @throws Exception if option parsing, a sub-job, or the final job throws
 */
@Override
public int run(String[] args) throws Exception {

    addInputOption();
    addOutputOption();
    addOption("similarityClassname", "s", "Name of distributed similarity measures class to instantiate, "
            + "alternatively use one of the predefined similarities (" + VectorSimilarityMeasures.list() + ')');
    addOption("maxSimilaritiesPerItem", "m",
            "try to cap the number of similar items per item to this number " + "(default: "
                    + DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM + ')',
            String.valueOf(DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM));
    addOption("maxPrefsPerUser", "mppu", "max number of preferences to consider per user, "
            + "users with more preferences will be sampled down (default: " + DEFAULT_MAX_PREFS_PER_USER + ')',
            String.valueOf(DEFAULT_MAX_PREFS_PER_USER));
    addOption("minPrefsPerUser", "mp",
            "ignore users with less preferences than this " + "(default: " + DEFAULT_MIN_PREFS_PER_USER + ')',
            String.valueOf(DEFAULT_MIN_PREFS_PER_USER));
    addOption("booleanData", "b", "Treat input as without pref values", String.valueOf(Boolean.FALSE));
    addOption("threshold", "tr", "discard item pairs with a similarity value below this", false);

    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    String similarityClassName = getOption("similarityClassname");
    int maxSimilarItemsPerItem = Integer.parseInt(getOption("maxSimilaritiesPerItem"));
    int maxPrefsPerUser = Integer.parseInt(getOption("maxPrefsPerUser"));
    int minPrefsPerUser = Integer.parseInt(getOption("minPrefsPerUser"));
    // parseBoolean yields the primitive directly; valueOf would box and then auto-unbox.
    boolean booleanData = Boolean.parseBoolean(getOption("booleanData"));

    // Without an explicit --threshold, fall back to RowSimilarityJob's NO_THRESHOLD value.
    double threshold = hasOption("threshold") ? Double.parseDouble(getOption("threshold"))
            : RowSimilarityJob.NO_THRESHOLD;

    Path similarityMatrixPath = getTempPath("similarityMatrix");
    Path prepPath = getTempPath("prepareRatingMatrix");

    AtomicInteger currentPhase = new AtomicInteger();

    // Phase 1: build the rating matrix from the raw preferences.
    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        int prepareExitCode = ToolRunner.run(getConf(), new PreparePreferenceMatrixJob(),
                new String[] { "--input", getInputPath().toString(), "--output", prepPath.toString(),
                        "--maxPrefsPerUser", String.valueOf(maxPrefsPerUser), "--minPrefsPerUser",
                        String.valueOf(minPrefsPerUser), "--booleanData", String.valueOf(booleanData),
                        "--tempDir", getTempPath().toString(), });
        // The next phase reads files written by this one, so stop instead of continuing on a failure.
        if (prepareExitCode != 0) {
            return -1;
        }
    }

    // Phase 2: pairwise row similarities of the rating matrix.
    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        int numberOfUsers = HadoopUtil.readInt(new Path(prepPath, PreparePreferenceMatrixJob.NUM_USERS),
                getConf());

        int similarityExitCode = ToolRunner.run(getConf(), new RowSimilarityJob(), new String[] { "--input",
                new Path(prepPath, PreparePreferenceMatrixJob.RATING_MATRIX).toString(), "--output",
                similarityMatrixPath.toString(), "--numberOfColumns", String.valueOf(numberOfUsers),
                "--similarityClassname", similarityClassName, "--maxSimilaritiesPerRow",
                String.valueOf(maxSimilarItemsPerItem), "--excludeSelfSimilarity", String.valueOf(Boolean.TRUE),
                "--threshold", String.valueOf(threshold), "--tempDir", getTempPath().toString(), });
        // The final job consumes similarityMatrixPath; do not run it on missing or partial data.
        if (similarityExitCode != 0) {
            return -1;
        }
    }

    // Phase 3: write the most similar item pairs to the output path as text.
    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job mostSimilarItems = prepareJob(similarityMatrixPath, getOutputPath(), SequenceFileInputFormat.class,
                MostSimilarItemPairsMapper.class, EntityEntityWritable.class, DoubleWritable.class,
                MostSimilarItemPairsReducer.class, EntityEntityWritable.class, DoubleWritable.class,
                TextOutputFormat.class);
        Configuration mostSimilarItemsConf = mostSimilarItems.getConfiguration();
        mostSimilarItemsConf.set(ITEM_ID_INDEX_PATH_STR,
                new Path(prepPath, PreparePreferenceMatrixJob.ITEMID_INDEX).toString());
        mostSimilarItemsConf.setInt(MAX_SIMILARITIES_PER_ITEM, maxSimilarItemsPerItem);
        boolean succeeded = mostSimilarItems.waitForCompletion(true);
        if (!succeeded) {
            return -1;
        }
    }

    return 0;
}

From source file:org.hf.mls.mahout.math.hadoop.similarity.cooccurrence.RowSimilarityJob.java

License:Apache License

/**
 * Builds, without submitting them, the MapReduce jobs of the row-similarity
 * computation and wraps each one in a {@link ControlledJob} so that a caller can
 * schedule them with their dependencies.
 *
 * <p>Up to three jobs are created, each only when {@code shouldRunNextPhase}
 * allows it, and each chained to the previously created one (if any):
 * <ol>
 *   <li>{@code "normsAndTranspose"}: reads {@code <input>/ratingMatrix}, writes the
 *       {@code weights} temp path;</li>
 *   <li>{@code "pairwiseSimilarity"}: reads {@code weights}, writes the
 *       {@code pairwiseSimilarity} temp path;</li>
 *   <li>{@code "asMatrix"}: reads {@code pairwiseSimilarity}, writes the output path.</li>
 * </ol>
 *
 * <p>When the overwrite option is set, the temp and output paths are deleted
 * before the jobs are configured.
 *
 * @param args command-line arguments; the accepted options are the ones registered
 *             at the top of this method
 * @return the created jobs keyed by the names listed above (phases that were skipped
 *         have no entry), or {@code null} if the arguments could not be parsed
 * @throws IllegalArgumentException if no value was supplied for {@code --similarityClassname}
 * @throws Exception if option parsing, path deletion, or job preparation throws
 */
public Map<String, ControlledJob> getJobs(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption("numberOfColumns", "r", "Number of columns in the input matrix", false);
    addOption("similarityClassname", "s",
            "Name of distributed similarity class to instantiate, alternatively use "
                    + "one of the predefined similarities (" + VectorSimilarityMeasures.list() + ')');
    addOption("maxSimilaritiesPerRow", "m",
            "Number of maximum similarities per row (default: " + DEFAULT_MAX_SIMILARITIES_PER_ROW + ')',
            String.valueOf(DEFAULT_MAX_SIMILARITIES_PER_ROW));
    addOption("excludeSelfSimilarity", "ess", "compute similarity of rows to themselves?",
            String.valueOf(false));
    addOption("threshold", "tr", "discard row pairs with a similarity value below this", false);
    addOption(DefaultOptionCreator.overwriteOption().create());

    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return null;
    }

    String similarityClassnameArg = getOption("similarityClassname");
    if (similarityClassnameArg == null) {
        // Enum valueOf(null) throws a bare NullPointerException that the catch below does not
        // handle; fail with a message that names the missing option instead.
        throw new IllegalArgumentException("No value given for option --similarityClassname");
    }
    // Accept either the name of a predefined measure or a fully qualified class name.
    String similarityClassname;
    try {
        similarityClassname = VectorSimilarityMeasures.valueOf(similarityClassnameArg).getClassname();
    } catch (IllegalArgumentException iae) {
        similarityClassname = similarityClassnameArg;
    }

    // Clear the output and temp paths if the overwrite option has been set
    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
        HadoopUtil.delete(getConf(), getTempPath());
        HadoopUtil.delete(getConf(), getOutputPath());
    }

    int maxSimilaritiesPerRow = Integer.parseInt(getOption("maxSimilaritiesPerRow"));
    boolean excludeSelfSimilarity = Boolean.parseBoolean(getOption("excludeSelfSimilarity"));
    double threshold = hasOption("threshold") ? Double.parseDouble(getOption("threshold")) : NO_THRESHOLD;

    Path weightsPath = getTempPath("weights");
    Path normsPath = getTempPath("norms.bin");
    Path numNonZeroEntriesPath = getTempPath("numNonZeroEntries.bin");
    Path maxValuesPath = getTempPath("maxValues.bin");
    Path pairwiseSimilarityPath = getTempPath("pairwiseSimilarity");

    Map<String, ControlledJob> cJobs = new HashMap<String, ControlledJob>();
    // Each remains null when its phase is skipped, in which case the following job gets no dependency.
    ControlledJob cNormsAndTranspose = null;
    ControlledJob cPairwiseSimilarity = null;

    AtomicInteger currentPhase = new AtomicInteger();

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job normsAndTranspose = prepareJob(new Path(getInputPath(), "ratingMatrix"), weightsPath,
                VectorNormMapper.class, IntWritable.class, VectorWritable.class, MergeVectorsReducer.class,
                IntWritable.class, VectorWritable.class);
        normsAndTranspose.setCombinerClass(MergeVectorsCombiner.class);
        Configuration normsAndTransposeConf = normsAndTranspose.getConfiguration();
        normsAndTransposeConf.set(THRESHOLD, String.valueOf(threshold));
        normsAndTransposeConf.set(NORMS_PATH, normsPath.toString());
        normsAndTransposeConf.set(NUM_NON_ZERO_ENTRIES_PATH, numNonZeroEntriesPath.toString());
        normsAndTransposeConf.set(MAXVALUES_PATH, maxValuesPath.toString());
        normsAndTransposeConf.set(SIMILARITY_CLASSNAME, similarityClassname);
        // First job of the chain: no dependency is registered for it in this method.
        cNormsAndTranspose = asControlledJob(normsAndTranspose, null);
        cJobs.put("normsAndTranspose", cNormsAndTranspose);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job pairwiseSimilarity = prepareJob(weightsPath, pairwiseSimilarityPath, CooccurrencesMapper.class,
                IntWritable.class, VectorWritable.class, SimilarityReducer.class, IntWritable.class,
                VectorWritable.class);
        pairwiseSimilarity.setCombinerClass(VectorSumReducer.class);
        Configuration pairwiseConf = pairwiseSimilarity.getConfiguration();
        pairwiseConf.set(THRESHOLD, String.valueOf(threshold));
        pairwiseConf.set(NORMS_PATH, normsPath.toString());
        pairwiseConf.set(NUM_NON_ZERO_ENTRIES_PATH, numNonZeroEntriesPath.toString());
        pairwiseConf.set(MAXVALUES_PATH, maxValuesPath.toString());
        pairwiseConf.set(SIMILARITY_CLASSNAME, similarityClassname);
        // Expose this job's input path to the tasks under the "prepPath" key.
        pairwiseConf.set("prepPath", getInputPath().toString());
        pairwiseConf.setBoolean(EXCLUDE_SELF_SIMILARITY, excludeSelfSimilarity);
        // Depends on the normsAndTranspose job when that phase was created.
        cPairwiseSimilarity = asControlledJob(pairwiseSimilarity, cNormsAndTranspose);
        cJobs.put("pairwiseSimilarity", cPairwiseSimilarity);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job asMatrix = prepareJob(pairwiseSimilarityPath, getOutputPath(), UnsymmetrifyMapper.class,
                IntWritable.class, VectorWritable.class, MergeToTopKSimilaritiesReducer.class,
                IntWritable.class, VectorWritable.class);
        asMatrix.setCombinerClass(MergeToTopKSimilaritiesReducer.class);
        asMatrix.getConfiguration().setInt(MAX_SIMILARITIES_PER_ROW, maxSimilaritiesPerRow);
        // Depends on the pairwiseSimilarity job when that phase was created.
        cJobs.put("asMatrix", asControlledJob(asMatrix, cPairwiseSimilarity));
    }

    return cJobs;
}

/**
 * Wraps {@code job} in a new {@link ControlledJob} and, when {@code dependency} is
 * non-null, registers it as a job the new one depends on.
 */
private static ControlledJob asControlledJob(Job job, ControlledJob dependency) throws java.io.IOException {
    ControlledJob controlled = new ControlledJob(new Configuration());
    controlled.setJob(job);
    if (dependency != null) {
        controlled.addDependingJob(dependency);
    }
    return controlled;
}