List of usage examples for org.apache.mahout.math.hadoop.similarity.cooccurrence.measures VectorSimilarityMeasures list
public static String list()
From source file:com.pocketx.gravity.recommender.cf.similarity.job.ItemSimilarityJob.java
License:Apache License
@Override public int run(String[] args) throws Exception { addInputOption();//from ww w . ja v a2 s . c o m addOutputOption(); addOption("similarityClassname", "s", "Name of distributed similarity measures class to instantiate, " + "alternatively use one of the predefined similarities (" + VectorSimilarityMeasures.list() + ')'); addOption("maxSimilaritiesPerItem", "m", "try to cap the number of similar items per item to this number " + "(default: " + DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM + ')', String.valueOf(DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM)); addOption("maxPrefsPerUser", "mppu", "max number of preferences to consider per user, " + "users with more preferences will be sampled down (default: " + DEFAULT_MAX_PREFS_PER_USER + ')', String.valueOf(DEFAULT_MAX_PREFS_PER_USER)); addOption("minPrefsPerUser", "mp", "ignore users with less preferences than this " + "(default: " + DEFAULT_MIN_PREFS_PER_USER + ')', String.valueOf(DEFAULT_MIN_PREFS_PER_USER)); addOption("booleanData", "b", "Treat input as without pref values", String.valueOf(Boolean.FALSE)); addOption("threshold", "tr", "discard item pairs with a similarity value below this", false); Map<String, List<String>> parsedArgs = parseArguments(args); if (parsedArgs == null) { return -1; } String similarityClassName = getOption("similarityClassname"); int maxSimilarItemsPerItem = Integer.parseInt(getOption("maxSimilaritiesPerItem")); int maxPrefsPerUser = Integer.parseInt(getOption("maxPrefsPerUser")); int minPrefsPerUser = Integer.parseInt(getOption("minPrefsPerUser")); boolean booleanData = Boolean.valueOf(getOption("booleanData")); double threshold = hasOption("threshold") ? Double.parseDouble(getOption("threshold")) : RowSimilarityJob.NO_THRESHOLD; Path similarityMatrixPath = getTempPath("similarityMatrix"); Path prepPath = getTempPath("prepareRatingMatrix"); AtomicInteger currentPhase = new AtomicInteger(); if (shouldRunNextPhase(parsedArgs, currentPhase)) { ToolRunner.run(getConf(), new PreparePreferenceMatrixJob(), new String[] { "--input", getInputPath().toString(), "--output", prepPath.toString(), "--maxPrefsPerUser", String.valueOf(maxPrefsPerUser), "--minPrefsPerUser", String.valueOf(minPrefsPerUser), "--booleanData", String.valueOf(booleanData), "--tempDir", getTempPath().toString() }); } if (shouldRunNextPhase(parsedArgs, currentPhase)) { int numberOfUsers = HadoopUtil.readInt(new Path(prepPath, PreparePreferenceMatrixJob.NUM_USERS), getConf()); ToolRunner.run(getConf(), new RowSimilarityJob(), new String[] { "--input", new Path(prepPath, PreparePreferenceMatrixJob.RATING_MATRIX).toString(), "--output", similarityMatrixPath.toString(), "--numberOfColumns", String.valueOf(numberOfUsers), "--similarityClassname", similarityClassName, "--maxSimilaritiesPerRow", String.valueOf(maxSimilarItemsPerItem), "--excludeSelfSimilarity", String.valueOf(Boolean.TRUE), "--threshold", String.valueOf(threshold), "--tempDir", getTempPath().toString() }); } if (shouldRunNextPhase(parsedArgs, currentPhase)) { Job mostSimilarItems = prepareJob(similarityMatrixPath, getOutputPath(), SequenceFileInputFormat.class, MostSimilarItemPairsMapper.class, EntityEntityWritable.class, DoubleWritable.class, MostSimilarItemPairsReducer.class, EntityEntityWritable.class, DoubleWritable.class, TextOutputFormat.class); Configuration mostSimilarItemsConf = mostSimilarItems.getConfiguration(); mostSimilarItemsConf.set(ITEM_ID_INDEX_PATH_STR, new Path(prepPath, PreparePreferenceMatrixJob.ITEMID_INDEX).toString()); mostSimilarItemsConf.setInt(MAX_SIMILARITIES_PER_ITEM, maxSimilarItemsPerItem); boolean succeeded = mostSimilarItems.waitForCompletion(true); if (!succeeded) { return -1; } } return 0; }
From source file:com.pocketx.gravity.recommender.cf.similarity.job.RowSimilarityJob.java
License:Apache License
@Override public int run(String[] args) throws Exception { addInputOption();/*from ww w . j a v a 2s .c o m*/ addOutputOption(); addOption("numberOfColumns", "r", "Number of columns in the input matrix", false); addOption("similarityClassname", "s", "Name of distributed similarity class to instantiate, alternatively use " + "one of the predefined similarities (" + VectorSimilarityMeasures.list() + ')'); addOption("maxSimilaritiesPerRow", "m", "Number of maximum similarities per row (default: " + DEFAULT_MAX_SIMILARITIES_PER_ROW + ')', String.valueOf(DEFAULT_MAX_SIMILARITIES_PER_ROW)); addOption("excludeSelfSimilarity", "ess", "compute similarity of rows to themselves?", String.valueOf(false)); addOption("threshold", "tr", "discard row pairs with a similarity value below this", false); addOption(DefaultOptionCreator.overwriteOption().create()); Map<String, List<String>> parsedArgs = parseArguments(args); if (parsedArgs == null) { return -1; } int numberOfColumns; if (hasOption("numberOfColumns")) { // Number of columns explicitly specified via CLI numberOfColumns = Integer.parseInt(getOption("numberOfColumns")); } else { // else get the number of columns by determining the cardinality of a vector in the input matrix numberOfColumns = getDimensions(getInputPath()); } String similarityClassnameArg = getOption("similarityClassname"); String similarityClassname; try { similarityClassname = VectorSimilarityMeasures.valueOf(similarityClassnameArg).getClassname(); } catch (IllegalArgumentException iae) { similarityClassname = similarityClassnameArg; } // Clear the output and temp paths if the overwrite option has been set if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) { // Clear the temp path HadoopUtil.delete(getConf(), getTempPath()); // Clear the output path HadoopUtil.delete(getConf(), getOutputPath()); } int maxSimilaritiesPerRow = Integer.parseInt(getOption("maxSimilaritiesPerRow")); boolean excludeSelfSimilarity = Boolean.parseBoolean(getOption("excludeSelfSimilarity")); double threshold = hasOption("threshold") ? Double.parseDouble(getOption("threshold")) : NO_THRESHOLD; Path weightsPath = getTempPath("weights"); Path normsPath = getTempPath("norms.bin"); Path numNonZeroEntriesPath = getTempPath("numNonZeroEntries.bin"); Path maxValuesPath = getTempPath("maxValues.bin"); Path pairwiseSimilarityPath = getTempPath("pairwiseSimilarity"); AtomicInteger currentPhase = new AtomicInteger(); if (shouldRunNextPhase(parsedArgs, currentPhase)) { Job normsAndTranspose = prepareJob(getInputPath(), weightsPath, VectorNormMapper.class, IntWritable.class, VectorWritable.class, MergeVectorsReducer.class, IntWritable.class, VectorWritable.class); normsAndTranspose.setCombinerClass(MergeVectorsCombiner.class); Configuration normsAndTransposeConf = normsAndTranspose.getConfiguration(); normsAndTransposeConf.set(THRESHOLD, String.valueOf(threshold)); normsAndTransposeConf.set(NORMS_PATH, normsPath.toString()); normsAndTransposeConf.set(NUM_NON_ZERO_ENTRIES_PATH, numNonZeroEntriesPath.toString()); normsAndTransposeConf.set(MAXVALUES_PATH, maxValuesPath.toString()); normsAndTransposeConf.set(SIMILARITY_CLASSNAME, similarityClassname); boolean succeeded = normsAndTranspose.waitForCompletion(true); if (!succeeded) { return -1; } } if (shouldRunNextPhase(parsedArgs, currentPhase)) { Job pairwiseSimilarity = prepareJob(weightsPath, pairwiseSimilarityPath, CooccurrencesMapper.class, IntWritable.class, VectorWritable.class, SimilarityReducer.class, IntWritable.class, VectorWritable.class); pairwiseSimilarity.setCombinerClass(VectorSumReducer.class); Configuration pairwiseConf = pairwiseSimilarity.getConfiguration(); pairwiseConf.set(THRESHOLD, String.valueOf(threshold)); pairwiseConf.set(NORMS_PATH, normsPath.toString()); pairwiseConf.set(NUM_NON_ZERO_ENTRIES_PATH, numNonZeroEntriesPath.toString()); pairwiseConf.set(MAXVALUES_PATH, maxValuesPath.toString()); pairwiseConf.set(SIMILARITY_CLASSNAME, similarityClassname); pairwiseConf.setInt(NUMBER_OF_COLUMNS, numberOfColumns); pairwiseConf.setBoolean(EXCLUDE_SELF_SIMILARITY, excludeSelfSimilarity); boolean succeeded = pairwiseSimilarity.waitForCompletion(true); if (!succeeded) { return -1; } } if (shouldRunNextPhase(parsedArgs, currentPhase)) { Job asMatrix = prepareJob(pairwiseSimilarityPath, getOutputPath(), UnsymmetrifyMapper.class, IntWritable.class, VectorWritable.class, MergeToTopKSimilaritiesReducer.class, IntWritable.class, VectorWritable.class); asMatrix.setCombinerClass(MergeToTopKSimilaritiesReducer.class); asMatrix.getConfiguration().setInt(MAX_SIMILARITIES_PER_ROW, maxSimilaritiesPerRow); boolean succeeded = asMatrix.waitForCompletion(true); if (!succeeded) { return -1; } } return 0; }
From source file:hadoop.api.RecommenderJob.java
License:Apache License
/** * Get the args and set the Option//from www . j a v a 2s .com * * @param args Information about the input path, output path, booleanData, and similarity * @return Number of Recommendations * @throws IOException */ public int prepareRecommender(String[] args) throws IOException { addInputOption(); addOutputOption(); addOption("numRecommendations", "n", "Number of recommendations per user", String.valueOf(hadoop.api.AggregateAndRecommendReducer.DEFAULT_NUM_RECOMMENDATIONS)); addOption("usersFile", null, "File of users to recommend for", null); addOption("itemsFile", null, "File of items to recommend for", null); addOption("filterFile", "f", "File containing comma-separated userID,itemID pairs. Used to exclude the item from " + "the recommendations for that user (optional)", null); addOption("booleanData", "b", "Treat input as without pref values", Boolean.FALSE.toString()); addOption("maxPrefsPerUser", "mxp", "Maximum number of preferences considered per user in final recommendation phase", String.valueOf(UserVectorSplitterMapper.DEFAULT_MAX_PREFS_PER_USER_CONSIDERED)); addOption("minPrefsPerUser", "mp", "ignore users with less preferences than this in the similarity computation " + "(default: " + DEFAULT_MIN_PREFS_PER_USER + ')', String.valueOf(DEFAULT_MIN_PREFS_PER_USER)); addOption("maxSimilaritiesPerItem", "m", "Maximum number of similarities considered per item ", String.valueOf(DEFAULT_MAX_SIMILARITIES_PER_ITEM)); addOption("maxPrefsInItemSimilarity", "mpiis", "max number of preferences to consider per user or item in the " + "item similarity computation phase, users or items with more preferences will be sampled down (default: " + DEFAULT_MAX_PREFS + ')', String.valueOf(DEFAULT_MAX_PREFS)); addOption("similarityClassname", "s", "Name of distributed similarity measures class to instantiate, " + "alternatively use one of the predefined similarities (" + VectorSimilarityMeasures.list() + ')', true); addOption("threshold", "tr", "discard item pairs with a similarity value below this", false); addOption("outputPathForSimilarityMatrix", "opfsm", "write the item similarity matrix to this path (optional)", false); addOption("randomSeed", null, "use this seed for sampling", false); //addOption("outputType", "ot", "Output Type", "TextOutputFormat"); addFlag("sequencefileOutput", null, "write the output into a SequenceFile instead of a text file"); parsedArgs = parseArguments(args); if (parsedArgs == null) { return -1; } int numRecommendations = Integer.parseInt(getOption("numRecommendations")); Path explicitFilterPath = getTempPath("explicitFilterPath"); return numRecommendations; }
From source file:org.gpfvic.mahout.cf.taste.hadoop.item.RecommenderJob.java
License:Apache License
public int run(String[] args) throws Exception { addInputOption();/*from w w w . ja v a 2 s .c o m*/ addOutputOption(); addOption("numRecommendations", "n", "Number of recommendations per user", String.valueOf(AggregateAndRecommendReducer.DEFAULT_NUM_RECOMMENDATIONS)); addOption("usersFile", null, "File of users to recommend for", null); addOption("itemsFile", null, "File of items to recommend for", null); addOption("filterFile", "f", "File containing comma-separated userID,itemID pairs. Used to exclude the item from " + "the recommendations for that user (optional)", null); addOption("userItemFile", "uif", "File containing comma-separated userID,itemID pairs (optional). " + "Used to include only these items into recommendations. " + "Cannot be used together with usersFile or itemsFile", null); addOption("booleanData", "b", "Treat input as without pref values", Boolean.FALSE.toString()); addOption("maxPrefsPerUser", "mxp", "Maximum number of preferences considered per user in final recommendation phase", String.valueOf(UserVectorSplitterMapper.DEFAULT_MAX_PREFS_PER_USER_CONSIDERED)); addOption("minPrefsPerUser", "mp", "ignore users with less preferences than this in the similarity computation " + "(default: " + DEFAULT_MIN_PREFS_PER_USER + ')', String.valueOf(DEFAULT_MIN_PREFS_PER_USER)); addOption("maxSimilaritiesPerItem", "m", "Maximum number of similarities considered per item ", String.valueOf(DEFAULT_MAX_SIMILARITIES_PER_ITEM)); addOption("maxPrefsInItemSimilarity", "mpiis", "max number of preferences to consider per user or item in the " + "item similarity computation phase, users or items with more preferences will be sampled down (default: " + DEFAULT_MAX_PREFS + ')', String.valueOf(DEFAULT_MAX_PREFS)); addOption("similarityClassname", "s", "Name of distributed similarity measures class to instantiate, " + "alternatively use one of the predefined similarities (" + VectorSimilarityMeasures.list() + ')', true); addOption("threshold", "tr", "discard item pairs with a similarity value below this", false); addOption("outputPathForSimilarityMatrix", "opfsm", "write the item similarity matrix to this path (optional)", false); addOption("randomSeed", null, "use this seed for sampling", false); addFlag("sequencefileOutput", null, "write the output into a SequenceFile instead of a text file"); Map<String, List<String>> parsedArgs = parseArguments(args); if (parsedArgs == null) { return -1; } Path outputPath = getOutputPath(); int numRecommendations = Integer.parseInt(getOption("numRecommendations")); String usersFile = getOption("usersFile"); String itemsFile = getOption("itemsFile"); String filterFile = getOption("filterFile"); String userItemFile = getOption("userItemFile"); boolean booleanData = Boolean.valueOf(getOption("booleanData")); int maxPrefsPerUser = Integer.parseInt(getOption("maxPrefsPerUser")); int minPrefsPerUser = Integer.parseInt(getOption("minPrefsPerUser")); int maxPrefsInItemSimilarity = Integer.parseInt(getOption("maxPrefsInItemSimilarity")); int maxSimilaritiesPerItem = Integer.parseInt(getOption("maxSimilaritiesPerItem")); String similarityClassname = getOption("similarityClassname"); double threshold = hasOption("threshold") ? Double.parseDouble(getOption("threshold")) : RowSimilarityJob.NO_THRESHOLD; long randomSeed = hasOption("randomSeed") ? Long.parseLong(getOption("randomSeed")) : RowSimilarityJob.NO_FIXED_RANDOM_SEED; Path prepPath = getTempPath(DEFAULT_PREPARE_PATH); Path similarityMatrixPath = getTempPath("similarityMatrix"); Path explicitFilterPath = getTempPath("explicitFilterPath"); Path partialMultiplyPath = getTempPath("partialMultiply"); AtomicInteger currentPhase = new AtomicInteger(); int numberOfUsers = -1; if (shouldRunNextPhase(parsedArgs, currentPhase)) { ToolRunner.run(getConf(), new PreparePreferenceMatrixJob(), new String[] { "--input", getInputPath().toString(), "--output", prepPath.toString(), "--minPrefsPerUser", String.valueOf(minPrefsPerUser), "--booleanData", String.valueOf(booleanData), "--tempDir", getTempPath().toString(), }); numberOfUsers = HadoopUtil.readInt(new Path(prepPath, PreparePreferenceMatrixJob.NUM_USERS), getConf()); } if (shouldRunNextPhase(parsedArgs, currentPhase)) { /* special behavior if phase 1 is skipped */ if (numberOfUsers == -1) { numberOfUsers = (int) HadoopUtil.countRecords( new Path(prepPath, PreparePreferenceMatrixJob.USER_VECTORS), PathType.LIST, null, getConf()); } //calculate the co-occurrence matrix ToolRunner.run(getConf(), new RowSimilarityJob(), new String[] { "--input", new Path(prepPath, PreparePreferenceMatrixJob.RATING_MATRIX).toString(), "--output", similarityMatrixPath.toString(), "--numberOfColumns", String.valueOf(numberOfUsers), "--similarityClassname", similarityClassname, "--maxObservationsPerRow", String.valueOf(maxPrefsInItemSimilarity), "--maxObservationsPerColumn", String.valueOf(maxPrefsInItemSimilarity), "--maxSimilaritiesPerRow", String.valueOf(maxSimilaritiesPerItem), "--excludeSelfSimilarity", String.valueOf(Boolean.TRUE), "--threshold", String.valueOf(threshold), "--randomSeed", String.valueOf(randomSeed), "--tempDir", getTempPath().toString(), }); // write out the similarity matrix if the user specified that behavior if (hasOption("outputPathForSimilarityMatrix")) { Path outputPathForSimilarityMatrix = new Path(getOption("outputPathForSimilarityMatrix")); Job outputSimilarityMatrix = prepareJob(similarityMatrixPath, outputPathForSimilarityMatrix, SequenceFileInputFormat.class, ItemSimilarityJob.MostSimilarItemPairsMapper.class, EntityEntityWritable.class, DoubleWritable.class, ItemSimilarityJob.MostSimilarItemPairsReducer.class, EntityEntityWritable.class, DoubleWritable.class, TextOutputFormat.class); Configuration mostSimilarItemsConf = outputSimilarityMatrix.getConfiguration(); mostSimilarItemsConf.set(ItemSimilarityJob.ITEM_ID_INDEX_PATH_STR, new Path(prepPath, PreparePreferenceMatrixJob.ITEMID_INDEX).toString()); mostSimilarItemsConf.setInt(ItemSimilarityJob.MAX_SIMILARITIES_PER_ITEM, maxSimilaritiesPerItem); outputSimilarityMatrix.waitForCompletion(true); } } //start the multiplication of the co-occurrence matrix by the user vectors if (shouldRunNextPhase(parsedArgs, currentPhase)) { Job partialMultiply = new Job(getConf(), "partialMultiply"); Configuration partialMultiplyConf = partialMultiply.getConfiguration(); MultipleInputs.addInputPath(partialMultiply, similarityMatrixPath, SequenceFileInputFormat.class, SimilarityMatrixRowWrapperMapper.class); MultipleInputs.addInputPath(partialMultiply, new Path(prepPath, PreparePreferenceMatrixJob.USER_VECTORS), SequenceFileInputFormat.class, UserVectorSplitterMapper.class); partialMultiply.setJarByClass(ToVectorAndPrefReducer.class); partialMultiply.setMapOutputKeyClass(VarIntWritable.class); partialMultiply.setMapOutputValueClass(VectorOrPrefWritable.class); partialMultiply.setReducerClass(ToVectorAndPrefReducer.class); partialMultiply.setOutputFormatClass(SequenceFileOutputFormat.class); partialMultiply.setOutputKeyClass(VarIntWritable.class); partialMultiply.setOutputValueClass(VectorAndPrefsWritable.class); partialMultiplyConf.setBoolean("mapred.compress.map.output", true); partialMultiplyConf.set("mapred.output.dir", partialMultiplyPath.toString()); if (usersFile != null) { partialMultiplyConf.set(UserVectorSplitterMapper.USERS_FILE, usersFile); } if (userItemFile != null) { partialMultiplyConf.set(IDReader.USER_ITEM_FILE, userItemFile); } partialMultiplyConf.setInt(UserVectorSplitterMapper.MAX_PREFS_PER_USER_CONSIDERED, maxPrefsPerUser); boolean succeeded = partialMultiply.waitForCompletion(true); if (!succeeded) { return -1; } } if (shouldRunNextPhase(parsedArgs, currentPhase)) { //filter out any users we don't care about /* convert the user/item pairs to filter if a filterfile has been specified */ if (filterFile != null) { Job itemFiltering = prepareJob(new Path(filterFile), explicitFilterPath, TextInputFormat.class, ItemFilterMapper.class, VarLongWritable.class, VarLongWritable.class, ItemFilterAsVectorAndPrefsReducer.class, VarIntWritable.class, VectorAndPrefsWritable.class, SequenceFileOutputFormat.class); boolean succeeded = itemFiltering.waitForCompletion(true); if (!succeeded) { return -1; } } String aggregateAndRecommendInput = partialMultiplyPath.toString(); if (filterFile != null) { aggregateAndRecommendInput += "," + explicitFilterPath; } Class<? extends OutputFormat> outputFormat = parsedArgs.containsKey("--sequencefileOutput") ? SequenceFileOutputFormat.class : TextOutputFormat.class; //extract out the recommendations Job aggregateAndRecommend = prepareJob(new Path(aggregateAndRecommendInput), outputPath, SequenceFileInputFormat.class, PartialMultiplyMapper.class, VarLongWritable.class, PrefAndSimilarityColumnWritable.class, AggregateAndRecommendReducer.class, VarLongWritable.class, RecommendedItemsWritable.class, outputFormat); Configuration aggregateAndRecommendConf = aggregateAndRecommend.getConfiguration(); if (itemsFile != null) { aggregateAndRecommendConf.set(AggregateAndRecommendReducer.ITEMS_FILE, itemsFile); } if (userItemFile != null) { aggregateAndRecommendConf.set(IDReader.USER_ITEM_FILE, userItemFile); } if (filterFile != null) { setS3SafeCombinedInputPath(aggregateAndRecommend, getTempPath(), partialMultiplyPath, explicitFilterPath); } setIOSort(aggregateAndRecommend); aggregateAndRecommendConf.set(AggregateAndRecommendReducer.ITEMID_INDEX_PATH, new Path(prepPath, PreparePreferenceMatrixJob.ITEMID_INDEX).toString()); aggregateAndRecommendConf.setInt(AggregateAndRecommendReducer.NUM_RECOMMENDATIONS, numRecommendations); aggregateAndRecommendConf.setBoolean(BOOLEAN_DATA, booleanData); boolean succeeded = aggregateAndRecommend.waitForCompletion(true); if (!succeeded) { return -1; } } return 0; }
From source file:org.gpfvic.mahout.cf.taste.hadoop.similarity.item.ItemSimilarityJob.java
License:Apache License
@Override public int run(String[] args) throws Exception { addInputOption();/*w w w . ja va 2 s. c o m*/ addOutputOption(); addOption("similarityClassname", "s", "Name of distributed similarity measures class to instantiate, " + "alternatively use one of the predefined similarities (" + VectorSimilarityMeasures.list() + ')'); addOption("maxSimilaritiesPerItem", "m", "try to cap the number of similar items per item to this number " + "(default: " + DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM + ')', String.valueOf(DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM)); addOption("maxPrefs", "mppu", "max number of preferences to consider per user or item, " + "users or items with more preferences will be sampled down (default: " + DEFAULT_MAX_PREFS + ')', String.valueOf(DEFAULT_MAX_PREFS)); addOption("minPrefsPerUser", "mp", "ignore users with less preferences than this " + "(default: " + DEFAULT_MIN_PREFS_PER_USER + ')', String.valueOf(DEFAULT_MIN_PREFS_PER_USER)); addOption("booleanData", "b", "Treat input as without pref values", String.valueOf(Boolean.FALSE)); addOption("threshold", "tr", "discard item pairs with a similarity value below this", false); addOption("randomSeed", null, "use this seed for sampling", false); Map<String, List<String>> parsedArgs = parseArguments(args); if (parsedArgs == null) { return -1; } String similarityClassName = getOption("similarityClassname"); int maxSimilarItemsPerItem = Integer.parseInt(getOption("maxSimilaritiesPerItem")); int maxPrefs = Integer.parseInt(getOption("maxPrefs")); int minPrefsPerUser = Integer.parseInt(getOption("minPrefsPerUser")); boolean booleanData = Boolean.valueOf(getOption("booleanData")); double threshold = hasOption("threshold") ? Double.parseDouble(getOption("threshold")) : RowSimilarityJob.NO_THRESHOLD; long randomSeed = hasOption("randomSeed") ? Long.parseLong(getOption("randomSeed")) : RowSimilarityJob.NO_FIXED_RANDOM_SEED; Path similarityMatrixPath = getTempPath("similarityMatrix"); Path prepPath = getTempPath("prepareRatingMatrix"); AtomicInteger currentPhase = new AtomicInteger(); if (shouldRunNextPhase(parsedArgs, currentPhase)) { ToolRunner.run(getConf(), new PreparePreferenceMatrixJob(), new String[] { "--input", getInputPath().toString(), "--output", prepPath.toString(), "--minPrefsPerUser", String.valueOf(minPrefsPerUser), "--booleanData", String.valueOf(booleanData), "--tempDir", getTempPath().toString(), }); } if (shouldRunNextPhase(parsedArgs, currentPhase)) { int numberOfUsers = HadoopUtil.readInt(new Path(prepPath, PreparePreferenceMatrixJob.NUM_USERS), getConf()); ToolRunner.run(getConf(), new RowSimilarityJob(), new String[] { "--input", new Path(prepPath, PreparePreferenceMatrixJob.RATING_MATRIX).toString(), "--output", similarityMatrixPath.toString(), "--numberOfColumns", String.valueOf(numberOfUsers), "--similarityClassname", similarityClassName, "--maxObservationsPerRow", String.valueOf(maxPrefs), "--maxObservationsPerColumn", String.valueOf(maxPrefs), "--maxSimilaritiesPerRow", String.valueOf(maxSimilarItemsPerItem), "--excludeSelfSimilarity", String.valueOf(Boolean.TRUE), "--threshold", String.valueOf(threshold), "--randomSeed", String.valueOf(randomSeed), "--tempDir", getTempPath().toString(), }); } if (shouldRunNextPhase(parsedArgs, currentPhase)) { Job mostSimilarItems = prepareJob(similarityMatrixPath, getOutputPath(), SequenceFileInputFormat.class, MostSimilarItemPairsMapper.class, EntityEntityWritable.class, DoubleWritable.class, MostSimilarItemPairsReducer.class, EntityEntityWritable.class, DoubleWritable.class, TextOutputFormat.class); Configuration mostSimilarItemsConf = mostSimilarItems.getConfiguration(); mostSimilarItemsConf.set(ITEM_ID_INDEX_PATH_STR, new Path(prepPath, PreparePreferenceMatrixJob.ITEMID_INDEX).toString()); mostSimilarItemsConf.setInt(MAX_SIMILARITIES_PER_ITEM, maxSimilarItemsPerItem); boolean succeeded = mostSimilarItems.waitForCompletion(true); if (!succeeded) { return -1; } } return 0; }
From source file:org.hf.mls.mahout.cf.taste.hadoop.similarity.item.ItemSimilarityJob.java
License:Apache License
@Override public int run(String[] args) throws Exception { addInputOption();//from w w w. j a v a2s . c o m addOutputOption(); addOption("similarityClassname", "s", "Name of distributed similarity measures class to instantiate, " + "alternatively use one of the predefined similarities (" + VectorSimilarityMeasures.list() + ')'); addOption("maxSimilaritiesPerItem", "m", "try to cap the number of similar items per item to this number " + "(default: " + DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM + ')', String.valueOf(DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM)); addOption("maxPrefsPerUser", "mppu", "max number of preferences to consider per user, " + "users with more preferences will be sampled down (default: " + DEFAULT_MAX_PREFS_PER_USER + ')', String.valueOf(DEFAULT_MAX_PREFS_PER_USER)); addOption("minPrefsPerUser", "mp", "ignore users with less preferences than this " + "(default: " + DEFAULT_MIN_PREFS_PER_USER + ')', String.valueOf(DEFAULT_MIN_PREFS_PER_USER)); addOption("booleanData", "b", "Treat input as without pref values", String.valueOf(Boolean.FALSE)); addOption("threshold", "tr", "discard item pairs with a similarity value below this", false); Map<String, List<String>> parsedArgs = parseArguments(args); if (parsedArgs == null) { return -1; } String similarityClassName = getOption("similarityClassname"); int maxSimilarItemsPerItem = Integer.parseInt(getOption("maxSimilaritiesPerItem")); int maxPrefsPerUser = Integer.parseInt(getOption("maxPrefsPerUser")); int minPrefsPerUser = Integer.parseInt(getOption("minPrefsPerUser")); boolean booleanData = Boolean.valueOf(getOption("booleanData")); double threshold = hasOption("threshold") ? Double.parseDouble(getOption("threshold")) : RowSimilarityJob.NO_THRESHOLD; Path similarityMatrixPath = getTempPath("similarityMatrix"); Path prepPath = getTempPath("prepareRatingMatrix"); AtomicInteger currentPhase = new AtomicInteger(); if (shouldRunNextPhase(parsedArgs, currentPhase)) { ToolRunner.run(getConf(), new PreparePreferenceMatrixJob(), new String[] { "--input", getInputPath().toString(), "--output", prepPath.toString(), "--maxPrefsPerUser", String.valueOf(maxPrefsPerUser), "--minPrefsPerUser", String.valueOf(minPrefsPerUser), "--booleanData", String.valueOf(booleanData), "--tempDir", getTempPath().toString(), }); } if (shouldRunNextPhase(parsedArgs, currentPhase)) { int numberOfUsers = HadoopUtil.readInt(new Path(prepPath, PreparePreferenceMatrixJob.NUM_USERS), getConf()); ToolRunner.run(getConf(), new RowSimilarityJob(), new String[] { "--input", new Path(prepPath, PreparePreferenceMatrixJob.RATING_MATRIX).toString(), "--output", similarityMatrixPath.toString(), "--numberOfColumns", String.valueOf(numberOfUsers), "--similarityClassname", similarityClassName, "--maxSimilaritiesPerRow", String.valueOf(maxSimilarItemsPerItem), "--excludeSelfSimilarity", String.valueOf(Boolean.TRUE), "--threshold", String.valueOf(threshold), "--tempDir", getTempPath().toString(), }); } if (shouldRunNextPhase(parsedArgs, currentPhase)) { Job mostSimilarItems = prepareJob(similarityMatrixPath, getOutputPath(), SequenceFileInputFormat.class, MostSimilarItemPairsMapper.class, EntityEntityWritable.class, DoubleWritable.class, MostSimilarItemPairsReducer.class, EntityEntityWritable.class, DoubleWritable.class, TextOutputFormat.class); Configuration mostSimilarItemsConf = mostSimilarItems.getConfiguration(); mostSimilarItemsConf.set(ITEM_ID_INDEX_PATH_STR, new Path(prepPath, PreparePreferenceMatrixJob.ITEMID_INDEX).toString()); mostSimilarItemsConf.setInt(MAX_SIMILARITIES_PER_ITEM, maxSimilarItemsPerItem); boolean succeeded = mostSimilarItems.waitForCompletion(true); if (!succeeded) { return -1; } } return 0; }
From source file:org.hf.mls.mahout.math.hadoop.similarity.cooccurrence.RowSimilarityJob.java
License:Apache License
public Map<String, ControlledJob> getJobs(String[] args) throws Exception { Map<String, ControlledJob> cJobs = new HashMap<String, ControlledJob>(); ControlledJob cNormsAndTranspose = null; ControlledJob cPairwiseSimilarity = null; ControlledJob cAsMatrix = null;/* w w w . j a v a 2 s . c o m*/ addInputOption(); addOutputOption(); addOption("numberOfColumns", "r", "Number of columns in the input matrix", false); addOption("similarityClassname", "s", "Name of distributed similarity class to instantiate, alternatively use " + "one of the predefined similarities (" + VectorSimilarityMeasures.list() + ')'); addOption("maxSimilaritiesPerRow", "m", "Number of maximum similarities per row (default: " + DEFAULT_MAX_SIMILARITIES_PER_ROW + ')', String.valueOf(DEFAULT_MAX_SIMILARITIES_PER_ROW)); addOption("excludeSelfSimilarity", "ess", "compute similarity of rows to themselves?", String.valueOf(false)); addOption("threshold", "tr", "discard row pairs with a similarity value below this", false); addOption(DefaultOptionCreator.overwriteOption().create()); Map<String, List<String>> parsedArgs = parseArguments(args); if (parsedArgs == null) { return null; } String similarityClassnameArg = getOption("similarityClassname"); String similarityClassname; try { similarityClassname = VectorSimilarityMeasures.valueOf(similarityClassnameArg).getClassname(); } catch (IllegalArgumentException iae) { similarityClassname = similarityClassnameArg; } // Clear the output and temp paths if the overwrite option has been set if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) { // Clear the temp path HadoopUtil.delete(getConf(), getTempPath()); // Clear the output path HadoopUtil.delete(getConf(), getOutputPath()); } int maxSimilaritiesPerRow = Integer.parseInt(getOption("maxSimilaritiesPerRow")); boolean excludeSelfSimilarity = Boolean.parseBoolean(getOption("excludeSelfSimilarity")); double threshold = hasOption("threshold") ? Double.parseDouble(getOption("threshold")) : NO_THRESHOLD; Path weightsPath = getTempPath("weights"); Path normsPath = getTempPath("norms.bin"); Path numNonZeroEntriesPath = getTempPath("numNonZeroEntries.bin"); Path maxValuesPath = getTempPath("maxValues.bin"); Path pairwiseSimilarityPath = getTempPath("pairwiseSimilarity"); AtomicInteger currentPhase = new AtomicInteger(); if (shouldRunNextPhase(parsedArgs, currentPhase)) { Job normsAndTranspose = prepareJob(new Path(getInputPath(), "ratingMatrix"), weightsPath, VectorNormMapper.class, IntWritable.class, VectorWritable.class, MergeVectorsReducer.class, IntWritable.class, VectorWritable.class); normsAndTranspose.setCombinerClass(MergeVectorsCombiner.class); Configuration normsAndTransposeConf = normsAndTranspose.getConfiguration(); normsAndTransposeConf.set(THRESHOLD, String.valueOf(threshold)); normsAndTransposeConf.set(NORMS_PATH, normsPath.toString()); normsAndTransposeConf.set(NUM_NON_ZERO_ENTRIES_PATH, numNonZeroEntriesPath.toString()); normsAndTransposeConf.set(MAXVALUES_PATH, maxValuesPath.toString()); normsAndTransposeConf.set(SIMILARITY_CLASSNAME, similarityClassname); /** * this job is depending the last job --- countObservations */ cNormsAndTranspose = new ControlledJob(new Configuration()); cNormsAndTranspose.setJob(normsAndTranspose); cJobs.put("normsAndTranspose", cNormsAndTranspose); } if (shouldRunNextPhase(parsedArgs, currentPhase)) { Job pairwiseSimilarity = prepareJob(weightsPath, pairwiseSimilarityPath, CooccurrencesMapper.class, IntWritable.class, VectorWritable.class, SimilarityReducer.class, IntWritable.class, VectorWritable.class); pairwiseSimilarity.setCombinerClass(VectorSumReducer.class); Configuration pairwiseConf = pairwiseSimilarity.getConfiguration(); pairwiseConf.set(THRESHOLD, String.valueOf(threshold)); pairwiseConf.set(NORMS_PATH, normsPath.toString()); pairwiseConf.set(NUM_NON_ZERO_ENTRIES_PATH, numNonZeroEntriesPath.toString()); pairwiseConf.set(MAXVALUES_PATH, maxValuesPath.toString()); pairwiseConf.set(SIMILARITY_CLASSNAME, similarityClassname); //add prePath pairwiseConf.set("prepPath", getInputPath().toString()); pairwiseConf.setBoolean(EXCLUDE_SELF_SIMILARITY, excludeSelfSimilarity); /** * depending on normsAndTranspose job */ cPairwiseSimilarity = new ControlledJob(new Configuration()); cPairwiseSimilarity.setJob(pairwiseSimilarity); if (null != cNormsAndTranspose) { cPairwiseSimilarity.addDependingJob(cNormsAndTranspose); } cJobs.put("pairwiseSimilarity", cPairwiseSimilarity); } if (shouldRunNextPhase(parsedArgs, currentPhase)) { Job asMatrix = prepareJob(pairwiseSimilarityPath, getOutputPath(), UnsymmetrifyMapper.class, IntWritable.class, VectorWritable.class, MergeToTopKSimilaritiesReducer.class, IntWritable.class, VectorWritable.class); asMatrix.setCombinerClass(MergeToTopKSimilaritiesReducer.class); asMatrix.getConfiguration().setInt(MAX_SIMILARITIES_PER_ROW, maxSimilaritiesPerRow); /** * depending on pairwiseSimilarity job */ cAsMatrix = new ControlledJob(new Configuration()); cAsMatrix.setJob(asMatrix); if (null != cPairwiseSimilarity) { cAsMatrix.addDependingJob(cPairwiseSimilarity); } cJobs.put("asMatrix", cAsMatrix); } return cJobs; }